Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 2965 |
1 file changed, 1629 insertions(+), 1336 deletions(-)
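For orientation before the raw diff: one of the larger changes below moves per-instruction memory-widening decisions into LoopVectorizationCostModel and caches them per (instruction, VF) pair (see the InstWidening enum and the DecisionList map in the hunks that follow). The sketch below is a minimal, self-contained illustration of that caching pattern only; it is not LLVM code, the class and member names are invented stand-ins, and std::map stands in for llvm::DenseMap.

// Minimal sketch of a per-(instruction, VF) widening-decision cache,
// modeled on the DecisionList / setWideningDecision / getWideningDecision
// additions in this patch. "Instruction" is an opaque stand-in type.
#include <cassert>
#include <map>
#include <utility>

struct Instruction; // stand-in for llvm::Instruction

enum class InstWidening { Unknown, Widen, Interleave, GatherScatter, Scalarize };

class WideningDecisionCache {
  // Keyed by (instruction, vectorization factor); stores the decision and its cost.
  std::map<std::pair<const Instruction *, unsigned>,
           std::pair<InstWidening, unsigned>> Decisions;

public:
  void set(const Instruction *I, unsigned VF, InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "decisions are only cached for vector VFs");
    Decisions[{I, VF}] = {W, Cost};
  }

  // Returns Unknown if the instruction never went through cost modeling.
  InstWidening get(const Instruction *I, unsigned VF) const {
    auto It = Decisions.find({I, VF});
    return It == Decisions.end() ? InstWidening::Unknown : It->second.first;
  }

  unsigned cost(const Instruction *I, unsigned VF) const {
    auto It = Decisions.find({I, VF});
    assert(It != Decisions.end() && "cost was never computed for this VF");
    return It->second.second;
  }
};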
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index dac7032fa08f..595b2ec88943 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -50,6 +50,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -92,6 +93,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Transforms/Vectorize.h" @@ -266,21 +268,6 @@ static bool hasCyclesInLoopBody(const Loop &L) { return false; } -/// \brief This modifies LoopAccessReport to initialize message with -/// loop-vectorizer-specific part. -class VectorizationReport : public LoopAccessReport { -public: - VectorizationReport(Instruction *I = nullptr) - : LoopAccessReport("loop not vectorized: ", I) {} - - /// \brief This allows promotion of the loop-access analysis report into the - /// loop-vectorizer report. It modifies the message to add the - /// loop-vectorizer-specific part of the message. - explicit VectorizationReport(const LoopAccessReport &R) - : LoopAccessReport(Twine("loop not vectorized: ") + R.str(), - R.getInstr()) {} -}; - /// A helper function for converting Scalar types to vector types. /// If the incoming type is void, we return void. If the VF is 1, we return /// the scalar type. @@ -290,31 +277,9 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) { return VectorType::get(Scalar, VF); } -/// A helper function that returns GEP instruction and knows to skip a -/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination -/// pointee types of the 'bitcast' have the same size. -/// For example: -/// bitcast double** %var to i64* - can be skipped -/// bitcast double** %var to i8* - can not -static GetElementPtrInst *getGEPInstruction(Value *Ptr) { - - if (isa<GetElementPtrInst>(Ptr)) - return cast<GetElementPtrInst>(Ptr); - - if (isa<BitCastInst>(Ptr) && - isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) { - Type *BitcastTy = Ptr->getType(); - Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy(); - if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy)) - return nullptr; - Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType(); - Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType(); - const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout(); - if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty)) - return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0)); - } - return nullptr; -} +// FIXME: The following helper functions have multiple implementations +// in the project. They can be effectively organized in a common Load/Store +// utilities unit. /// A helper function that returns the pointer operand of a load or store /// instruction. @@ -326,6 +291,34 @@ static Value *getPointerOperand(Value *I) { return nullptr; } +/// A helper function that returns the type of loaded or stored value. 
+static Type *getMemInstValueType(Value *I) { + assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && + "Expected Load or Store instruction"); + if (auto *LI = dyn_cast<LoadInst>(I)) + return LI->getType(); + return cast<StoreInst>(I)->getValueOperand()->getType(); +} + +/// A helper function that returns the alignment of load or store instruction. +static unsigned getMemInstAlignment(Value *I) { + assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && + "Expected Load or Store instruction"); + if (auto *LI = dyn_cast<LoadInst>(I)) + return LI->getAlignment(); + return cast<StoreInst>(I)->getAlignment(); +} + +/// A helper function that returns the address space of the pointer operand of +/// load or store instruction. +static unsigned getMemInstAddressSpace(Value *I) { + assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && + "Expected Load or Store instruction"); + if (auto *LI = dyn_cast<LoadInst>(I)) + return LI->getPointerAddressSpace(); + return cast<StoreInst>(I)->getPointerAddressSpace(); +} + /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type at the given vectorization factor. @@ -351,6 +344,23 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { /// we always assume predicated blocks have a 50% chance of executing. static unsigned getReciprocalPredBlockProb() { return 2; } +/// A helper function that adds a 'fast' flag to floating-point operations. +static Value *addFastMathFlag(Value *V) { + if (isa<FPMathOperator>(V)) { + FastMathFlags Flags; + Flags.setUnsafeAlgebra(); + cast<Instruction>(V)->setFastMathFlags(Flags); + } + return V; +} + +/// A helper function that returns an integer or floating-point constant with +/// value C. +static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { + return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) + : ConstantFP::get(Ty, C); +} + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -428,10 +438,17 @@ protected: /// Copy and widen the instructions from the old loop. virtual void vectorizeLoop(); + /// Handle all cross-iteration phis in the header. + void fixCrossIterationPHIs(); + /// Fix a first-order recurrence. This is the second phase of vectorizing /// this phi node. void fixFirstOrderRecurrence(PHINode *Phi); + /// Fix a reduction cross-iteration phi. This is the second phase of + /// vectorizing this phi node. + void fixReduction(PHINode *Phi); + /// \brief The Loop exit block may have single value PHI nodes where the /// incoming value is 'Undef'. While vectorizing we only handled real values /// that were defined inside the loop. Here we fix the 'undef case'. @@ -448,7 +465,8 @@ protected: /// Collect the instructions from the original loop that would be trivially /// dead in the vectorized loop if generated. - void collectTriviallyDeadInstructions(); + void collectTriviallyDeadInstructions( + SmallPtrSetImpl<Instruction *> &DeadInstructions); /// Shrinks vector element sizes to the smallest bitwidth they can be legally /// represented as. @@ -462,14 +480,14 @@ protected: /// and DST. VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); - /// A helper function to vectorize a single BB within the innermost loop. 
- void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV); + /// A helper function to vectorize a single instruction within the innermost + /// loop. + void vectorizeInstruction(Instruction &I); /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF, - PhiVector *PV); + void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); /// Insert the new loop to the loop hierarchy and pass manager /// and update the analysis passes. @@ -504,20 +522,21 @@ protected: /// \p EntryVal is the value from the original loop that maps to the steps. /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it /// can be a truncate instruction). - void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal); - - /// Create a vector induction phi node based on an existing scalar one. This - /// currently only works for integer induction variables with a constant - /// step. \p EntryVal is the value from the original loop that maps to the - /// vector phi node. If \p EntryVal is a truncate instruction, instead of - /// widening the original IV, we widen a version of the IV truncated to \p - /// EntryVal's type. - void createVectorIntInductionPHI(const InductionDescriptor &II, - Instruction *EntryVal); - - /// Widen an integer induction variable \p IV. If \p Trunc is provided, the - /// induction variable will first be truncated to the corresponding type. - void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr); + void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal, + const InductionDescriptor &ID); + + /// Create a vector induction phi node based on an existing scalar one. \p + /// EntryVal is the value from the original loop that maps to the vector phi + /// node, and \p Step is the loop-invariant step. If \p EntryVal is a + /// truncate instruction, instead of widening the original IV, we widen a + /// version of the IV truncated to \p EntryVal's type. + void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, + Value *Step, Instruction *EntryVal); + + /// Widen an integer or floating-point induction variable \p IV. If \p Trunc + /// is provided, the integer induction variable will first be truncated to + /// the corresponding type. + void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); /// Returns true if an instruction \p I should be scalarized instead of /// vectorized for the chosen vectorization factor. @@ -583,6 +602,10 @@ protected: /// vector of instructions. void addMetadata(ArrayRef<Value *> To, Instruction *From); + /// \brief Set the debug location in the builder using the debug location in + /// the instruction. + void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); + /// This is a helper class for maintaining vectorization state. It's used for /// mapping values from the original loop to their corresponding values in /// the new loop. Two mappings are maintained: one for vectorized values and @@ -777,14 +800,6 @@ protected: // Record whether runtime checks are added. bool AddedSafetyChecks; - // Holds instructions from the original loop whose counterparts in the - // vectorized loop would be trivially dead if generated. For example, - // original induction update instructions can become dead because we - // separately emit induction "steps" when generating code for the new loop. 
- // Similarly, we create a new latch condition when setting up the structure - // of the new loop, so the old one can become dead. - SmallPtrSet<Instruction *, 4> DeadInstructions; - // Holds the end values for each induction variable. We save the end values // so we can later fix-up the external users of the induction variables. DenseMap<PHINode *, Value *> IVEndValues; @@ -803,8 +818,6 @@ public: UnrollFactor, LVL, CM) {} private: - void scalarizeInstruction(Instruction *Instr, - bool IfPredicateInstr = false) override; void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; Value *getStepVector(Value *Val, int StartIdx, Value *Step, @@ -832,12 +845,14 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { return I; } -/// \brief Set the debug location in the builder using the debug location in the -/// instruction. -static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { - if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) - B.SetCurrentDebugLocation(Inst->getDebugLoc()); - else +void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { + if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { + const DILocation *DIL = Inst->getDebugLoc(); + if (DIL && Inst->getFunction()->isDebugInfoForProfiling()) + B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF)); + else + B.SetCurrentDebugLocation(DIL); + } else B.SetCurrentDebugLocation(DebugLoc()); } @@ -1497,14 +1512,6 @@ private: OptimizationRemarkEmitter &ORE; }; -static void emitAnalysisDiag(const Loop *TheLoop, - const LoopVectorizeHints &Hints, - OptimizationRemarkEmitter &ORE, - const LoopAccessReport &Message) { - const char *Name = Hints.vectorizeAnalysisPassName(); - LoopAccessReport::emitAnalysis(Message, TheLoop, Name, ORE); -} - static void emitMissedWarning(Function *F, Loop *L, const LoopVectorizeHints &LH, OptimizationRemarkEmitter *ORE) { @@ -1512,13 +1519,17 @@ static void emitMissedWarning(Function *F, Loop *L, if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { if (LH.getWidth() != 1) - emitLoopVectorizeWarning( - F->getContext(), *F, L->getStartLoc(), - "failed explicitly specified loop vectorization"); + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedVectorization", + L->getStartLoc(), L->getHeader()) + << "loop not vectorized: " + << "failed explicitly specified loop vectorization"); else if (LH.getInterleave() != 1) - emitLoopInterleaveWarning( - F->getContext(), *F, L->getStartLoc(), - "failed explicitly specified loop interleaving"); + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(), + L->getHeader()) + << "loop not interleaved: " + << "failed explicitly specified loop interleaving"); } } @@ -1546,7 +1557,7 @@ public: LoopVectorizeHints *H) : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI), - Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), + PrimaryInduction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R), Hints(H) {} /// ReductionList contains the reduction descriptors for all @@ -1566,8 +1577,8 @@ public: /// loop, only that it is legal to do so. bool canVectorize(); - /// Returns the Induction variable. - PHINode *getInduction() { return Induction; } + /// Returns the primary induction variable. 
+ PHINode *getPrimaryInduction() { return PrimaryInduction; } /// Returns the reduction variables found in the loop. ReductionList *getReductionVars() { return &Reductions; } @@ -1607,12 +1618,6 @@ public: /// Returns true if the value V is uniform within the loop. bool isUniform(Value *V); - /// Returns true if \p I is known to be uniform after vectorization. - bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); } - - /// Returns true if \p I is known to be scalar after vectorization. - bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); } - /// Returns the information that we collected about runtime memory check. const RuntimePointerChecking *getRuntimePointerChecking() const { return LAI->getRuntimePointerChecking(); @@ -1689,15 +1694,9 @@ public: /// instructions that may divide by zero. bool isScalarWithPredication(Instruction *I); - /// Returns true if \p I is a memory instruction that has a consecutive or - /// consecutive-like pointer operand. Consecutive-like pointers are pointers - /// that are treated like consecutive pointers during vectorization. The - /// pointer operands of interleaved accesses are an example. - bool hasConsecutiveLikePtrOperand(Instruction *I); - - /// Returns true if \p I is a memory instruction that must be scalarized - /// during vectorization. - bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1); + /// Returns true if \p I is a memory instruction with consecutive memory + /// access that can be widened. + bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); private: /// Check if a single basic block loop is vectorizable. @@ -1715,24 +1714,6 @@ private: /// transformation. bool canVectorizeWithIfConvert(); - /// Collect the instructions that are uniform after vectorization. An - /// instruction is uniform if we represent it with a single scalar value in - /// the vectorized loop corresponding to each vector iteration. Examples of - /// uniform instructions include pointer operands of consecutive or - /// interleaved memory accesses. Note that although uniformity implies an - /// instruction will be scalar, the reverse is not true. In general, a - /// scalarized instruction will be represented by VF scalar values in the - /// vectorized loop, each corresponding to an iteration of the original - /// scalar loop. - void collectLoopUniforms(); - - /// Collect the instructions that are scalar after vectorization. An - /// instruction is scalar if it is known to be uniform or will be scalarized - /// during vectorization. Non-uniform scalarized instructions will be - /// represented by VF values in the vectorized loop, each corresponding to an - /// iteration of the original scalar loop. - void collectLoopScalars(); - /// Return true if all of the instructions in the block can be speculatively /// executed. \p SafePtrs is a list of addresses that are known to be legal /// and we know that we can read from them without segfault. @@ -1744,14 +1725,6 @@ private: void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID, SmallPtrSetImpl<Value *> &AllowedExit); - /// Report an analysis message to assist the user in diagnosing loops that are - /// not vectorized. These are handled as LoopAccessReport rather than - /// VectorizationReport because the << operator of VectorizationReport returns - /// LoopAccessReport. 
- void emitAnalysis(const LoopAccessReport &Message) const { - emitAnalysisDiag(TheLoop, *Hints, *ORE, Message); - } - /// Create an analysis remark that explains why vectorization failed /// /// \p RemarkName is the identifier for the remark. If \p I is passed it is @@ -1804,9 +1777,9 @@ private: // --- vectorization state --- // - /// Holds the integer induction variable. This is the counter of the + /// Holds the primary induction variable. This is the counter of the /// loop. - PHINode *Induction; + PHINode *PrimaryInduction; /// Holds the reduction variables. ReductionList Reductions; /// Holds all of the induction variables that we found in the loop. @@ -1822,12 +1795,6 @@ private: /// vars which can be accessed from outside the loop. SmallPtrSet<Value *, 4> AllowedExit; - /// Holds the instructions known to be uniform after vectorization. - SmallPtrSet<Instruction *, 4> Uniforms; - - /// Holds the instructions known to be scalar after vectorization. - SmallPtrSet<Instruction *, 4> Scalars; - /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; @@ -1861,16 +1828,26 @@ public: : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {} + /// \return An upper bound for the vectorization factor, or None if + /// vectorization should be avoided up front. + Optional<unsigned> computeMaxVF(bool OptForSize); + /// Information about vectorization costs struct VectorizationFactor { unsigned Width; // Vector width with best cost unsigned Cost; // Cost of the loop with that width }; /// \return The most profitable vectorization factor and the cost of that VF. - /// This method checks every power of two up to VF. If UserVF is not ZERO + /// This method checks every power of two up to MaxVF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. - VectorizationFactor selectVectorizationFactor(bool OptForSize); + VectorizationFactor selectVectorizationFactor(unsigned MaxVF); + + /// Setup cost-based decisions for user vectorization factor. + void selectUserVectorizationFactor(unsigned UserVF) { + collectUniformsAndScalars(UserVF); + collectInstsToScalarize(UserVF); + } /// \return The size (in bits) of the smallest and widest types in the code /// that needs to be vectorized. We ignore values that remain scalar such as @@ -1884,6 +1861,15 @@ public: unsigned selectInterleaveCount(bool OptForSize, unsigned VF, unsigned LoopCost); + /// Memory access instruction may be vectorized in more than one way. + /// Form of instruction after vectorization depends on cost. + /// This function takes cost-based decisions for Load/Store instructions + /// and collects them in a map. This decisions map is used for building + /// the lists of loop-uniform and loop-scalar instructions. + /// The calculated cost is saved with widening decision in order to + /// avoid redundant calculations. + void setCostBasedWideningDecision(unsigned VF); + /// \brief A struct that represents some properties of the register usage /// of a loop. struct RegisterUsage { @@ -1918,14 +1904,118 @@ public: return Scalars->second.count(I); } + /// Returns true if \p I is known to be uniform after vectorization. 
+ bool isUniformAfterVectorization(Instruction *I, unsigned VF) const { + if (VF == 1) + return true; + assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity"); + auto UniformsPerVF = Uniforms.find(VF); + return UniformsPerVF->second.count(I); + } + + /// Returns true if \p I is known to be scalar after vectorization. + bool isScalarAfterVectorization(Instruction *I, unsigned VF) const { + if (VF == 1) + return true; + assert(Scalars.count(VF) && "Scalar values are not calculated for VF"); + auto ScalarsPerVF = Scalars.find(VF); + return ScalarsPerVF->second.count(I); + } + /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const { return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) && - !Legal->isScalarAfterVectorization(I); + !isScalarAfterVectorization(I, VF); + } + + /// Decision that was taken during cost calculation for memory instruction. + enum InstWidening { + CM_Unknown, + CM_Widen, + CM_Interleave, + CM_GatherScatter, + CM_Scalarize + }; + + /// Save vectorization decision \p W and \p Cost taken by the cost model for + /// instruction \p I and vector width \p VF. + void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, + unsigned Cost) { + assert(VF >= 2 && "Expected VF >=2"); + WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); + } + + /// Save vectorization decision \p W and \p Cost taken by the cost model for + /// interleaving group \p Grp and vector width \p VF. + void setWideningDecision(const InterleaveGroup *Grp, unsigned VF, + InstWidening W, unsigned Cost) { + assert(VF >= 2 && "Expected VF >=2"); + /// Broadcast this decicion to all instructions inside the group. + /// But the cost will be assigned to one instruction only. + for (unsigned i = 0; i < Grp->getFactor(); ++i) { + if (auto *I = Grp->getMember(i)) { + if (Grp->getInsertPos() == I) + WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); + else + WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); + } + } + } + + /// Return the cost model decision for the given instruction \p I and vector + /// width \p VF. Return CM_Unknown if this instruction did not pass + /// through the cost modeling. + InstWidening getWideningDecision(Instruction *I, unsigned VF) { + assert(VF >= 2 && "Expected VF >=2"); + std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); + auto Itr = WideningDecisions.find(InstOnVF); + if (Itr == WideningDecisions.end()) + return CM_Unknown; + return Itr->second.first; + } + + /// Return the vectorization cost for the given instruction \p I and vector + /// width \p VF. + unsigned getWideningCost(Instruction *I, unsigned VF) { + assert(VF >= 2 && "Expected VF >=2"); + std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); + assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated"); + return WideningDecisions[InstOnVF].second; + } + + /// Return True if instruction \p I is an optimizable truncate whose operand + /// is an induction variable. Such a truncate will be removed by adding a new + /// induction variable with the destination type. + bool isOptimizableIVTruncate(Instruction *I, unsigned VF) { + + // If the instruction is not a truncate, return false. + auto *Trunc = dyn_cast<TruncInst>(I); + if (!Trunc) + return false; + + // Get the source and destination types of the truncate. 
+ Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); + Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); + + // If the truncate is free for the given types, return false. Replacing a + // free truncate with an induction variable would add an induction variable + // update instruction to each iteration of the loop. We exclude from this + // check the primary induction variable since it will need an update + // instruction regardless. + Value *Op = Trunc->getOperand(0); + if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) + return false; + + // If the truncated value is not an induction variable, return false. + return Legal->isInductionVariable(Op); } private: + /// \return An upper bound for the vectorization factor, larger than zero. + /// One is returned if vectorization should best be avoided due to cost. + unsigned computeFeasibleMaxVF(bool OptForSize); + /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually /// operate on @@ -1949,6 +2039,26 @@ private: /// the vector type as an output parameter. unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy); + /// Calculate vectorization cost of memory instruction \p I. + unsigned getMemoryInstructionCost(Instruction *I, unsigned VF); + + /// The cost computation for scalarized memory instruction. + unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF); + + /// The cost computation for interleaving group of memory instructions. + unsigned getInterleaveGroupCost(Instruction *I, unsigned VF); + + /// The cost computation for Gather/Scatter instruction. + unsigned getGatherScatterCost(Instruction *I, unsigned VF); + + /// The cost computation for widening instruction \p I with consecutive + /// memory access. + unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF); + + /// The cost calculation for Load instruction \p I with uniform pointer - + /// scalar load + broadcast. + unsigned getUniformMemOpCost(Instruction *I, unsigned VF); + /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. bool isConsecutiveLoadOrStore(Instruction *I); @@ -1972,12 +2082,24 @@ private: /// pairs. typedef DenseMap<Instruction *, unsigned> ScalarCostsTy; + /// A set containing all BasicBlocks that are known to present after + /// vectorization as a predicated block. + SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; + /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated /// vectorization factor. The entries are VF-ScalarCostTy pairs. DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; + /// Holds the instructions known to be uniform after vectorization. + /// The data is collected per VF. + DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms; + + /// Holds the instructions known to be scalar after vectorization. + /// The data is collected per VF. + DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; + /// Returns the expected difference in cost from scalarizing the expression /// feeding a predicated instruction \p PredInst. The instructions to /// scalarize and their scalar costs are collected in \p ScalarCosts. A @@ -1990,6 +2112,44 @@ private: /// the loop. 
void collectInstsToScalarize(unsigned VF); + /// Collect the instructions that are uniform after vectorization. An + /// instruction is uniform if we represent it with a single scalar value in + /// the vectorized loop corresponding to each vector iteration. Examples of + /// uniform instructions include pointer operands of consecutive or + /// interleaved memory accesses. Note that although uniformity implies an + /// instruction will be scalar, the reverse is not true. In general, a + /// scalarized instruction will be represented by VF scalar values in the + /// vectorized loop, each corresponding to an iteration of the original + /// scalar loop. + void collectLoopUniforms(unsigned VF); + + /// Collect the instructions that are scalar after vectorization. An + /// instruction is scalar if it is known to be uniform or will be scalarized + /// during vectorization. Non-uniform scalarized instructions will be + /// represented by VF values in the vectorized loop, each corresponding to an + /// iteration of the original scalar loop. + void collectLoopScalars(unsigned VF); + + /// Collect Uniform and Scalar values for the given \p VF. + /// The sets depend on CM decision for Load/Store instructions + /// that may be vectorized as interleave, gather-scatter or scalarized. + void collectUniformsAndScalars(unsigned VF) { + // Do the analysis once. + if (VF == 1 || Uniforms.count(VF)) + return; + setCostBasedWideningDecision(VF); + collectLoopUniforms(VF); + collectLoopScalars(VF); + } + + /// Keeps cost model vectorization decision and cost for instructions. + /// Right now it is used for memory instructions only. + typedef DenseMap<std::pair<Instruction *, unsigned>, + std::pair<InstWidening, unsigned>> + DecisionList; + + DecisionList WideningDecisions; + public: /// The loop that we evaluate. Loop *TheLoop; @@ -2019,6 +2179,23 @@ public: SmallPtrSet<const Value *, 16> VecValuesToIgnore; }; +/// LoopVectorizationPlanner - drives the vectorization process after having +/// passed Legality checks. +class LoopVectorizationPlanner { +public: + LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {} + + ~LoopVectorizationPlanner() {} + + /// Plan how to best vectorize, return the best VF and its cost. + LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize, + unsigned UserVF); + +private: + /// The profitablity analysis. + LoopVectorizationCostModel &CM; +}; + /// \brief This holds vectorization requirements that must be verified late in /// the process. The requirements are set by legalize and costmodel. Once /// vectorization has been determined to be possible and profitable the @@ -2134,8 +2311,6 @@ struct LoopVectorize : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); - AU.addRequiredID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); AU.addRequired<BlockFrequencyInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); @@ -2156,7 +2331,7 @@ struct LoopVectorize : public FunctionPass { //===----------------------------------------------------------------------===// // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and -// LoopVectorizationCostModel. +// LoopVectorizationCostModel and LoopVectorizationPlanner. 
//===----------------------------------------------------------------------===// Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { @@ -2176,27 +2351,51 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } -void InnerLoopVectorizer::createVectorIntInductionPHI( - const InductionDescriptor &II, Instruction *EntryVal) { +void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( + const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { Value *Start = II.getStartValue(); - ConstantInt *Step = II.getConstIntStepValue(); - assert(Step && "Can not widen an IV with a non-constant step"); // Construct the initial value of the vector IV in the vector loop preheader auto CurrIP = Builder.saveIP(); Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); if (isa<TruncInst>(EntryVal)) { + assert(Start->getType()->isIntegerTy() && + "Truncation requires an integer type"); auto *TruncType = cast<IntegerType>(EntryVal->getType()); - Step = ConstantInt::getSigned(TruncType, Step->getSExtValue()); + Step = Builder.CreateTrunc(Step, TruncType); Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); } Value *SplatStart = Builder.CreateVectorSplat(VF, Start); - Value *SteppedStart = getStepVector(SplatStart, 0, Step); + Value *SteppedStart = + getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); + + // We create vector phi nodes for both integer and floating-point induction + // variables. Here, we determine the kind of arithmetic we will perform. + Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + if (Step->getType()->isIntegerTy()) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = II.getInductionOpcode(); + MulOp = Instruction::FMul; + } + + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); + Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); + + // Create a vector splat to use in the induction update. + // + // FIXME: If the step is non-constant, we create the vector splat with + // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't + // handle a constant vector splat. + Value *SplatVF = isa<Constant>(Mul) + ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(VF, Mul); Builder.restoreIP(CurrIP); - Value *SplatVF = - ConstantVector::getSplat(VF, ConstantInt::getSigned(Start->getType(), - VF * Step->getSExtValue())); // We may need to add the step a number of times, depending on the unroll // factor. The last of those goes into the PHI. 
PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", @@ -2205,8 +2404,8 @@ void InnerLoopVectorizer::createVectorIntInductionPHI( VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part] = LastInduction; - LastInduction = cast<Instruction>( - Builder.CreateAdd(LastInduction, SplatVF, "step.add")); + LastInduction = cast<Instruction>(addFastMathFlag( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); } VectorLoopValueMap.initVector(EntryVal, Entry); if (isa<TruncInst>(EntryVal)) @@ -2225,7 +2424,7 @@ void InnerLoopVectorizer::createVectorIntInductionPHI( } bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { - return Legal->isScalarAfterVectorization(I) || + return Cost->isScalarAfterVectorization(I, VF) || Cost->isProfitableToScalarize(I, VF); } @@ -2239,7 +2438,10 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { return any_of(IV->users(), isScalarInst); } -void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) { +void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { + + assert((IV->getType()->isIntegerTy() || IV != OldInduction) && + "Primary induction variable must have an integer type"); auto II = Legal->getInductionVars()->find(IV); assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); @@ -2251,9 +2453,6 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) { // induction variable. Value *ScalarIV = nullptr; - // The step of the induction. - Value *Step = nullptr; - // The value from the original loop to which we are mapping the new induction // variable. Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; @@ -2266,45 +2465,49 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) { // least one user in the loop that is not widened. auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); - // If the induction variable has a constant integer step value, go ahead and - // get it now. - if (ID.getConstIntStepValue()) - Step = ID.getConstIntStepValue(); + // Generate code for the induction step. Note that induction steps are + // required to be loop-invariant + assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && + "Induction step should be loop invariant"); + auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + Value *Step = nullptr; + if (PSE.getSE()->isSCEVable(IV->getType())) { + SCEVExpander Exp(*PSE.getSE(), DL, "induction"); + Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), + LoopVectorPreHeader->getTerminator()); + } else { + Step = cast<SCEVUnknown>(ID.getStep())->getValue(); + } // Try to create a new independent vector induction variable. If we can't // create the phi node, we will splat the scalar induction variable in each // loop iteration. - if (VF > 1 && IV->getType() == Induction->getType() && Step && - !shouldScalarizeInstruction(EntryVal)) { - createVectorIntInductionPHI(ID, EntryVal); + if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { + createVectorIntOrFpInductionPHI(ID, Step, EntryVal); VectorizedIV = true; } // If we haven't yet vectorized the induction variable, or if we will create // a scalar one, we need to define the scalar induction variable and step // values. If we were given a truncation type, truncate the canonical - // induction variable and constant step. Otherwise, derive these values from - // the induction descriptor. + // induction variable and step. 
Otherwise, derive these values from the + // induction descriptor. if (!VectorizedIV || NeedsScalarIV) { + ScalarIV = Induction; + if (IV != OldInduction) { + ScalarIV = IV->getType()->isIntegerTy() + ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) + : Builder.CreateCast(Instruction::SIToFP, Induction, + IV->getType()); + ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL); + ScalarIV->setName("offset.idx"); + } if (Trunc) { auto *TruncType = cast<IntegerType>(Trunc->getType()); - assert(Step && "Truncation requires constant integer step"); - auto StepInt = cast<ConstantInt>(Step)->getSExtValue(); - ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType); - Step = ConstantInt::getSigned(TruncType, StepInt); - } else { - ScalarIV = Induction; - auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); - if (IV != OldInduction) { - ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType()); - ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL); - ScalarIV->setName("offset.idx"); - } - if (!Step) { - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), - &*Builder.GetInsertPoint()); - } + assert(Step->getType()->isIntegerTy() && + "Truncation requires an integer step"); + ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); + Step = Builder.CreateTrunc(Step, TruncType); } } @@ -2314,7 +2517,8 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); + Entry[Part] = + getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); VectorLoopValueMap.initVector(EntryVal, Entry); if (Trunc) addMetadata(Entry, Trunc); @@ -2327,7 +2531,7 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) { // in the loop in the common case prior to InstCombine. We will be trading // one vector extract for each scalar step. if (NeedsScalarIV) - buildScalarSteps(ScalarIV, Step, EntryVal); + buildScalarSteps(ScalarIV, Step, EntryVal, ID); } Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, @@ -2387,30 +2591,43 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, } void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, - Value *EntryVal) { + Value *EntryVal, + const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. assert(VF > 1 && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); - assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() && - "Val and Step should have the same integer type"); + assert(ScalarIVTy == Step->getType() && + "Val and Step should have the same type"); + + // We build scalar steps for both integer and floating-point induction + // variables. Here, we determine the kind of arithmetic we will perform. + Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + if (ScalarIVTy->isIntegerTy()) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = ID.getInductionOpcode(); + MulOp = Instruction::FMul; + } // Determine the number of scalars we need to generate for each unroll // iteration. If EntryVal is uniform, we only need to generate the first // lane. 
Otherwise, we generate all VF values. unsigned Lanes = - Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ? 1 : VF; + Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF; // Compute the scalar steps and save the results in VectorLoopValueMap. ScalarParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part].resize(VF); for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane); - auto *Mul = Builder.CreateMul(StartIdx, Step); - auto *Add = Builder.CreateAdd(ScalarIV, Mul); + auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); + auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); + auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); Entry[Part][Lane] = Add; } } @@ -2469,7 +2686,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) { // known to be uniform after vectorization, this corresponds to lane zero // of the last unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the last unroll iteration. - unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1; + unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane)); // Set the insert point after the last scalarized instruction. This ensures @@ -2486,7 +2703,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) { // VectorLoopValueMap, we will only generate the insertelements once. for (unsigned Part = 0; Part < UF; ++Part) { Value *VectorValue = nullptr; - if (Legal->isUniformAfterVectorization(I)) { + if (Cost->isUniformAfterVectorization(I, VF)) { VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0)); } else { VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); @@ -2515,8 +2732,9 @@ Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part, if (OrigLoop->isLoopInvariant(V)) return V; - assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast<Instruction>(V)) - : true && "Uniform values only have lane zero"); + assert(Lane > 0 ? + !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) + : true && "Uniform values only have lane zero"); // If the value from the original loop has not been vectorized, it is // represented by UF x VF scalar values in the new loop. Return the requested @@ -2551,102 +2769,6 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { "reverse"); } -// Get a mask to interleave \p NumVec vectors into a wide vector. -// I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...> -// E.g. For 2 interleaved vectors, if VF is 4, the mask is: -// <0, 4, 1, 5, 2, 6, 3, 7> -static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF, - unsigned NumVec) { - SmallVector<Constant *, 16> Mask; - for (unsigned i = 0; i < VF; i++) - for (unsigned j = 0; j < NumVec; j++) - Mask.push_back(Builder.getInt32(j * VF + i)); - - return ConstantVector::get(Mask); -} - -// Get the strided mask starting from index \p Start. -// I.e. 
<Start, Start + Stride, ..., Start + Stride*(VF-1)> -static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, - unsigned Stride, unsigned VF) { - SmallVector<Constant *, 16> Mask; - for (unsigned i = 0; i < VF; i++) - Mask.push_back(Builder.getInt32(Start + i * Stride)); - - return ConstantVector::get(Mask); -} - -// Get a mask of two parts: The first part consists of sequential integers -// starting from 0, The second part consists of UNDEFs. -// I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef> -static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, - unsigned NumUndef) { - SmallVector<Constant *, 16> Mask; - for (unsigned i = 0; i < NumInt; i++) - Mask.push_back(Builder.getInt32(i)); - - Constant *Undef = UndefValue::get(Builder.getInt32Ty()); - for (unsigned i = 0; i < NumUndef; i++) - Mask.push_back(Undef); - - return ConstantVector::get(Mask); -} - -// Concatenate two vectors with the same element type. The 2nd vector should -// not have more elements than the 1st vector. If the 2nd vector has less -// elements, extend it with UNDEFs. -static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, - Value *V2) { - VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType()); - VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType()); - assert(VecTy1 && VecTy2 && - VecTy1->getScalarType() == VecTy2->getScalarType() && - "Expect two vectors with the same element type"); - - unsigned NumElts1 = VecTy1->getNumElements(); - unsigned NumElts2 = VecTy2->getNumElements(); - assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements"); - - if (NumElts1 > NumElts2) { - // Extend with UNDEFs. - Constant *ExtMask = - getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2); - V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask); - } - - Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0); - return Builder.CreateShuffleVector(V1, V2, Mask); -} - -// Concatenate vectors in the given list. All vectors have the same type. -static Value *ConcatenateVectors(IRBuilder<> &Builder, - ArrayRef<Value *> InputList) { - unsigned NumVec = InputList.size(); - assert(NumVec > 1 && "Should be at least two vectors"); - - SmallVector<Value *, 8> ResList; - ResList.append(InputList.begin(), InputList.end()); - do { - SmallVector<Value *, 8> TmpList; - for (unsigned i = 0; i < NumVec - 1; i += 2) { - Value *V0 = ResList[i], *V1 = ResList[i + 1]; - assert((V0->getType() == V1->getType() || i == NumVec - 2) && - "Only the last vector may have a different type"); - - TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1)); - } - - // Push the last vector if the total number of vectors is odd. - if (NumVec % 2 != 0) - TmpList.push_back(ResList[NumVec - 1]); - - ResList = TmpList; - NumVec = ResList.size(); - } while (NumVec > 1); - - return ResList[0]; -} - // Try to vectorize the interleave group that \p Instr belongs to. // // E.g. Translate following interleaved load group (factor = 3): @@ -2683,15 +2805,13 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { if (Instr != Group->getInsertPos()) return; - LoadInst *LI = dyn_cast<LoadInst>(Instr); - StoreInst *SI = dyn_cast<StoreInst>(Instr); Value *Ptr = getPointerOperand(Instr); // Prepare for the vector type of the interleaved load/store. - Type *ScalarTy = LI ? 
LI->getType() : SI->getValueOperand()->getType(); + Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); - Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr)); // Prepare for the new pointers. setDebugLocFromInst(Builder, Ptr); @@ -2731,7 +2851,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { Value *UndefVec = UndefValue::get(VecTy); // Vectorize the interleaved load group. - if (LI) { + if (isa<LoadInst>(Instr)) { // For each unroll part, create a wide load for the group. SmallVector<Value *, 2> NewLoads; @@ -2752,7 +2872,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { continue; VectorParts Entry(UF); - Constant *StrideMask = getStridedMask(Builder, I, InterleaveFactor, VF); + Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( NewLoads[Part], UndefVec, StrideMask, "strided.vec"); @@ -2796,10 +2916,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { } // Concatenate all vectors into a wide vector. - Value *WideVec = ConcatenateVectors(Builder, StoredVecs); + Value *WideVec = concatenateVectors(Builder, StoredVecs); // Interleave the elements in the wide vector. - Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor); + Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, "interleaved.vec"); @@ -2816,103 +2936,44 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { assert((LI || SI) && "Invalid Load/Store instruction"); - // Try to vectorize the interleave group if this access is interleaved. - if (Legal->isAccessInterleaved(Instr)) + LoopVectorizationCostModel::InstWidening Decision = + Cost->getWideningDecision(Instr, VF); + assert(Decision != LoopVectorizationCostModel::CM_Unknown && + "CM decision should be taken at this point"); + if (Decision == LoopVectorizationCostModel::CM_Interleave) return vectorizeInterleaveGroup(Instr); - Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + Type *ScalarDataTy = getMemInstValueType(Instr); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = getPointerOperand(Instr); - unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); + unsigned Alignment = getMemInstAlignment(Instr); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. const DataLayout &DL = Instr->getModule()->getDataLayout(); if (!Alignment) Alignment = DL.getABITypeAlignment(ScalarDataTy); - unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); + unsigned AddressSpace = getMemInstAddressSpace(Instr); // Scalarize the memory instruction if necessary. - if (Legal->memoryInstructionMustBeScalarized(Instr, VF)) + if (Decision == LoopVectorizationCostModel::CM_Scalarize) return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr)); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; - - // Determine if either a gather or scatter operation is legal. 
bool CreateGatherScatter = - !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr); + (Decision == LoopVectorizationCostModel::CM_GatherScatter); VectorParts VectorGep; // Handle consecutive loads/stores. - GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (ConsecutiveStride) { - if (Gep) { - unsigned NumOperands = Gep->getNumOperands(); -#ifndef NDEBUG - // The original GEP that identified as a consecutive memory access - // should have only one loop-variant operand. - unsigned NumOfLoopVariantOps = 0; - for (unsigned i = 0; i < NumOperands; ++i) - if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), - OrigLoop)) - NumOfLoopVariantOps++; - assert(NumOfLoopVariantOps == 1 && - "Consecutive GEP should have only one loop-variant operand"); -#endif - GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - Gep2->setName("gep.indvar"); - - // A new GEP is created for a 0-lane value of the first unroll iteration. - // The GEPs for the rest of the unroll iterations are computed below as an - // offset from this GEP. - for (unsigned i = 0; i < NumOperands; ++i) - // We can apply getScalarValue() for all GEP indices. It returns an - // original value for loop-invariant operand and 0-lane for consecutive - // operand. - Gep2->setOperand(i, getScalarValue(Gep->getOperand(i), - 0, /* First unroll iteration */ - 0 /* 0-lane of the vector */ )); - setDebugLocFromInst(Builder, Gep); - Ptr = Builder.Insert(Gep2); - - } else { // No GEP - setDebugLocFromInst(Builder, Ptr); - Ptr = getScalarValue(Ptr, 0, 0); - } + Ptr = getScalarValue(Ptr, 0, 0); } else { // At this point we should vector version of GEP for Gather or Scatter assert(CreateGatherScatter && "The instruction should be scalarized"); - if (Gep) { - // Vectorizing GEP, across UF parts. We want to get a vector value for base - // and each index that's defined inside the loop, even if it is - // loop-invariant but wasn't hoisted out. Otherwise we want to keep them - // scalar. - SmallVector<VectorParts, 4> OpsV; - for (Value *Op : Gep->operands()) { - Instruction *SrcInst = dyn_cast<Instruction>(Op); - if (SrcInst && OrigLoop->contains(SrcInst)) - OpsV.push_back(getVectorValue(Op)); - else - OpsV.push_back(VectorParts(UF, Op)); - } - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Value *, 4> Ops; - Value *GEPBasePtr = OpsV[0][Part]; - for (unsigned i = 1; i < Gep->getNumOperands(); i++) - Ops.push_back(OpsV[i][Part]); - Value *NewGep = Builder.CreateGEP(GEPBasePtr, Ops, "VectorGep"); - cast<GetElementPtrInst>(NewGep)->setIsInBounds(Gep->isInBounds()); - assert(NewGep->getType()->isVectorTy() && "Expected vector GEP"); - - NewGep = - Builder.CreateBitCast(NewGep, VectorType::get(Ptr->getType(), VF)); - VectorGep.push_back(NewGep); - } - } else - VectorGep = getVectorValue(Ptr); + VectorGep = getVectorValue(Ptr); } VectorParts Mask = createBlockInMask(Instr->getParent()); @@ -3027,7 +3088,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF; + unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF; // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { @@ -3038,7 +3099,9 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // Start if-block. 
Value *Cmp = nullptr; if (IfPredicateInstr) { - Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Lane)); + Cmp = Cond[Part]; + if (Cmp->getType()->isVectorTy()) + Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane)); Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); } @@ -3346,7 +3409,7 @@ void InnerLoopVectorizer::createEmptyLoop() { // - counts from zero, stepping by one // - is the size of the widest induction variable type // then we create a new one. - OldInduction = Legal->getInduction(); + OldInduction = Legal->getPrimaryInduction(); Type *IdxTy = Legal->getWidestInductionType(); // Split the single block loop into the two loop structure described above. @@ -3543,7 +3606,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, namespace { struct CSEDenseMapInfo { - static bool canHandle(Instruction *I) { + static bool canHandle(const Instruction *I) { return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); } @@ -3553,12 +3616,12 @@ struct CSEDenseMapInfo { static inline Instruction *getTombstoneKey() { return DenseMapInfo<Instruction *>::getTombstoneKey(); } - static unsigned getHashValue(Instruction *I) { + static unsigned getHashValue(const Instruction *I) { assert(canHandle(I) && "Unknown instruction!"); return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), I->value_op_end())); } - static bool isEqual(Instruction *LHS, Instruction *RHS) { + static bool isEqual(const Instruction *LHS, const Instruction *RHS) { if (LHS == getEmptyKey() || RHS == getEmptyKey() || LHS == getTombstoneKey() || RHS == getTombstoneKey()) return LHS == RHS; @@ -3589,51 +3652,6 @@ static void cse(BasicBlock *BB) { } } -/// \brief Adds a 'fast' flag to floating point operations. -static Value *addFastMathFlag(Value *V) { - if (isa<FPMathOperator>(V)) { - FastMathFlags Flags; - Flags.setUnsafeAlgebra(); - cast<Instruction>(V)->setFastMathFlags(Flags); - } - return V; -} - -/// \brief Estimate the overhead of scalarizing a value based on its type. -/// Insert and Extract are set if the result needs to be inserted and/or -/// extracted from vectors. -static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, - const TargetTransformInfo &TTI) { - if (Ty->isVoidTy()) - return 0; - - assert(Ty->isVectorTy() && "Can only scalarize vectors"); - unsigned Cost = 0; - - for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) { - if (Extract) - Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I); - if (Insert) - Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I); - } - - return Cost; -} - -/// \brief Estimate the overhead of scalarizing an Instruction based on the -/// types of its operands and return value. -static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys, - Type *RetTy, - const TargetTransformInfo &TTI) { - unsigned ScalarizationCost = - getScalarizationOverhead(RetTy, true, false, TTI); - - for (Type *Ty : OpTys) - ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI); - - return ScalarizationCost; -} - /// \brief Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. 
static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, @@ -3641,14 +3659,24 @@ static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, if (VF == 1) return 0; + unsigned Cost = 0; Type *RetTy = ToVectorTy(I->getType(), VF); + if (!RetTy->isVoidTy() && + (!isa<LoadInst>(I) || + !TTI.supportsEfficientVectorElementLoadStore())) + Cost += TTI.getScalarizationOverhead(RetTy, true, false); - SmallVector<Type *, 4> OpTys; - unsigned OperandsNum = I->getNumOperands(); - for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd) - OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF)); + if (CallInst *CI = dyn_cast<CallInst>(I)) { + SmallVector<const Value *, 4> Operands(CI->arg_operands()); + Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); + } + else if (!isa<StoreInst>(I) || + !TTI.supportsEfficientVectorElementLoadStore()) { + SmallVector<const Value *, 4> Operands(I->operand_values()); + Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); + } - return getScalarizationOverhead(OpTys, RetTy, TTI); + return Cost; } // Estimate cost of a call instruction CI if it were vectorized with factor VF. @@ -3681,7 +3709,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF, // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. - unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, TTI); + unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI); unsigned Cost = ScalarCallCost * VF + ScalarizationCost; @@ -3709,16 +3737,12 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); - Type *RetTy = ToVectorTy(CI->getType(), VF); - SmallVector<Type *, 4> Tys; - for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); - FastMathFlags FMF; if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) FMF = FPMO->getFastMathFlags(); - return TTI.getIntrinsicInstrCost(ID, RetTy, Tys, FMF); + SmallVector<Value *, 4> Operands(CI->arg_operands()); + return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF); } static Type *smallestIntegerVectorType(Type *T1, Type *T2) { @@ -3861,30 +3885,27 @@ void InnerLoopVectorizer::vectorizeLoop() { // the cost-model. // //===------------------------------------------------===// - Constant *Zero = Builder.getInt32(0); - // In order to support recurrences we need to be able to vectorize Phi nodes. - // Phi nodes have cycles, so we need to vectorize them in two stages. First, - // we create a new vector PHI node with no incoming edges. We use this value - // when we vectorize all of the instructions that use the PHI. Next, after - // all of the instructions in the block are complete we add the new incoming - // edges to the PHI. At this point all of the instructions in the basic block - // are vectorized, so we can use them to construct the PHI. - PhiVector PHIsToFix; - - // Collect instructions from the original loop that will become trivially - // dead in the vectorized loop. We don't need to vectorize these - // instructions. - collectTriviallyDeadInstructions(); + // Collect instructions from the original loop that will become trivially dead + // in the vectorized loop. We don't need to vectorize these instructions. For + // example, original induction update instructions can become dead because we + // separately emit induction "steps" when generating code for the new loop. 
+ // Similarly, we create a new latch condition when setting up the structure + // of the new loop, so the old one can become dead. + SmallPtrSet<Instruction *, 4> DeadInstructions; + collectTriviallyDeadInstructions(DeadInstructions); // Scan the loop in a topological order to ensure that defs are vectorized // before users. LoopBlocksDFS DFS(OrigLoop); DFS.perform(LI); - // Vectorize all of the blocks in the original loop. + // Vectorize all instructions in the original loop that will not become + // trivially dead when vectorized. for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) - vectorizeBlockInLoop(BB, &PHIsToFix); + for (Instruction &I : *BB) + if (!DeadInstructions.count(&I)) + vectorizeInstruction(I); // Insert truncates and extends for any truncated instructions as hints to // InstCombine. @@ -3892,224 +3913,10 @@ void InnerLoopVectorizer::vectorizeLoop() { truncateToMinimalBitwidths(); // At this point every instruction in the original loop is widened to a - // vector form. Now we need to fix the recurrences in PHIsToFix. These PHI + // vector form. Now we need to fix the recurrences in the loop. These PHI // nodes are currently empty because we did not want to introduce cycles. // This is the second stage of vectorizing recurrences. - for (PHINode *Phi : PHIsToFix) { - assert(Phi && "Unable to recover vectorized PHI"); - - // Handle first-order recurrences that need to be fixed. - if (Legal->isFirstOrderRecurrence(Phi)) { - fixFirstOrderRecurrence(Phi); - continue; - } - - // If the phi node is not a first-order recurrence, it must be a reduction. - // Get it's reduction variable descriptor. - assert(Legal->isReductionVariable(Phi) && - "Unable to find the reduction variable"); - RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi]; - - RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); - TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); - Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = - RdxDesc.getMinMaxRecurrenceKind(); - setDebugLocFromInst(Builder, ReductionStartValue); - - // We need to generate a reduction vector from the incoming scalar. - // To do so, we need to generate the 'identity' vector and override - // one of the elements with the incoming scalar reduction. We need - // to do it in the vector-loop preheader. - Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); - - // This is the vector-clone of the value that leaves the loop. - const VectorParts &VectorExit = getVectorValue(LoopExitInst); - Type *VecTy = VectorExit[0]->getType(); - - // Find the reduction identity variable. Zero for addition, or, xor, - // one for multiplication, -1 for And. - Value *Identity; - Value *VectorStart; - if (RK == RecurrenceDescriptor::RK_IntegerMinMax || - RK == RecurrenceDescriptor::RK_FloatMinMax) { - // MinMax reduction have the start value as their identify. - if (VF == 1) { - VectorStart = Identity = ReductionStartValue; - } else { - VectorStart = Identity = - Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); - } - } else { - // Handle other reduction kinds: - Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( - RK, VecTy->getScalarType()); - if (VF == 1) { - Identity = Iden; - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. 
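
For reference, the identity/start vector built in this code (removed here and reinstated below in fixReduction) seeds lane 0 with the scalar start value and every other lane with the reduction identity, so the final horizontal reduction folds the start value in exactly once. A small standalone illustration for an integer add reduction with VF = 4 (plain C++ arrays standing in for vector values; not IR):

#include <array>
#include <cstdio>

// Lane 0 carries the scalar start value; the other lanes carry the identity
// (0 for add), so summing the lanes reproduces the start value.
int main() {
  const int Start = 42;
  std::array<int, 4> VectorStart = {Start, 0, 0, 0}; // insert Start into the identity splat
  int Sum = 0;
  for (int Lane : VectorStart)
    Sum += Lane;
  std::printf("reduced seed = %d\n", Sum); // 42
  return 0;
}
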
- VectorStart = ReductionStartValue; - } else { - Identity = ConstantVector::getSplat(VF, Iden); - - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - VectorStart = - Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); - } - } - - // Fix the vector-loop phi. - - // Reductions do not have to start at zero. They can start with - // any loop invariant values. - const VectorParts &VecRdxPhi = getVectorValue(Phi); - BasicBlock *Latch = OrigLoop->getLoopLatch(); - Value *LoopVal = Phi->getIncomingValueForBlock(Latch); - const VectorParts &Val = getVectorValue(LoopVal); - for (unsigned part = 0; part < UF; ++part) { - // Make sure to add the reduction stat value only to the - // first unroll part. - Value *StartVal = (part == 0) ? VectorStart : Identity; - cast<PHINode>(VecRdxPhi[part]) - ->addIncoming(StartVal, LoopVectorPreHeader); - cast<PHINode>(VecRdxPhi[part]) - ->addIncoming(Val[part], LoopVectorBody); - } - - // Before each round, move the insertion point right between - // the PHIs and the values we are going to write. - // This allows us to write both PHINodes and the extractelement - // instructions. - Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - - VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst); - setDebugLocFromInst(Builder, LoopExitInst); - - // If the vector reduction can be performed in a smaller type, we truncate - // then extend the loop exit value to enable InstCombine to evaluate the - // entire expression in the smaller type. - if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { - Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); - Builder.SetInsertPoint(LoopVectorBody->getTerminator()); - for (unsigned part = 0; part < UF; ++part) { - Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy); - Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) - : Builder.CreateZExt(Trunc, VecTy); - for (Value::user_iterator UI = RdxParts[part]->user_begin(); - UI != RdxParts[part]->user_end();) - if (*UI != Trunc) { - (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd); - RdxParts[part] = Extnd; - } else { - ++UI; - } - } - Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - for (unsigned part = 0; part < UF; ++part) - RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy); - } - - // Reduce all of the unrolled parts into a single vector. - Value *ReducedPartRdx = RdxParts[0]; - unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); - setDebugLocFromInst(Builder, ReducedPartRdx); - for (unsigned part = 1; part < UF; ++part) { - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - // Floating point operations had to be 'fast' to enable the reduction. - ReducedPartRdx = addFastMathFlag( - Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], - ReducedPartRdx, "bin.rdx")); - else - ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp( - Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]); - } - - if (VF > 1) { - // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles - // and vector ops, reducing the set of values being computed by half each - // round. - assert(isPowerOf2_32(VF) && - "Reduction emission only supported for pow2 vectors!"); - Value *TmpVec = ReducedPartRdx; - SmallVector<Constant *, 32> ShuffleMask(VF, nullptr); - for (unsigned i = VF; i != 1; i >>= 1) { - // Move the upper half of the vector to the lower half. 
- for (unsigned j = 0; j != i / 2; ++j) - ShuffleMask[j] = Builder.getInt32(i / 2 + j); - - // Fill the rest of the mask with undef. - std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), - UndefValue::get(Builder.getInt32Ty())); - - Value *Shuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), - ConstantVector::get(ShuffleMask), "rdx.shuf"); - - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - // Floating point operations had to be 'fast' to enable the reduction. - TmpVec = addFastMathFlag(Builder.CreateBinOp( - (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); - else - TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, - TmpVec, Shuf); - } - - // The result is in the first element of the vector. - ReducedPartRdx = - Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); - - // If the reduction can be performed in a smaller type, we need to extend - // the reduction to the wider type before we branch to the original loop. - if (Phi->getType() != RdxDesc.getRecurrenceType()) - ReducedPartRdx = - RdxDesc.isSigned() - ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) - : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); - } - - // Create a phi node that merges control-flow from the backedge-taken check - // block and the middle block. - PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", - LoopScalarPreHeader->getTerminator()); - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) - BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); - BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); - - // Now, we need to fix the users of the reduction variable - // inside and outside of the scalar remainder loop. - // We know that the loop is in LCSSA form. We need to update the - // PHI nodes in the exit blocks. - for (BasicBlock::iterator LEI = LoopExitBlock->begin(), - LEE = LoopExitBlock->end(); - LEI != LEE; ++LEI) { - PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); - if (!LCSSAPhi) - break; - - // All PHINodes need to have a single entry edge, or two if - // we already fixed them. - assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); - - // We found our reduction value exit-PHI. Update it with the - // incoming bypass edge. - if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) { - // Add an edge coming from the bypass. - LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); - break; - } - } // end of the LCSSA phi scan. - - // Fix the scalar loop reduction variable with the incoming reduction sum - // from the vector body and from the backedge value. - int IncomingEdgeBlockIdx = - Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); - assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); - // Pick the other block. - int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); - Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); - } // end of for each Phi in PHIsToFix. + fixCrossIterationPHIs(); // Update the dominator tree. // @@ -4134,6 +3941,25 @@ void InnerLoopVectorizer::vectorizeLoop() { cse(LoopVectorBody); } +void InnerLoopVectorizer::fixCrossIterationPHIs() { + // In order to support recurrences we need to be able to vectorize Phi nodes. + // Phi nodes have cycles, so we need to vectorize them in two stages. This is + // stage #2: We now need to fix the recurrences by adding incoming edges to + // the currently empty PHI nodes. 
At this point every instruction in the + // original loop is widened to a vector form so we can use them to construct + // the incoming edges. + for (Instruction &I : *OrigLoop->getHeader()) { + PHINode *Phi = dyn_cast<PHINode>(&I); + if (!Phi) + break; + // Handle first-order recurrences and reductions that need to be fixed. + if (Legal->isFirstOrderRecurrence(Phi)) + fixFirstOrderRecurrence(Phi); + else if (Legal->isReductionVariable(Phi)) + fixReduction(Phi); + } +} + void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // This is the second phase of vectorizing first-order recurrences. An @@ -4212,15 +4038,17 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); - // Get the vectorized previous value. We ensured the previous values was an - // instruction when detecting the recurrence. + // Get the vectorized previous value. auto &PreviousParts = getVectorValue(Previous); - // Set the insertion point to be after this instruction. We ensured the - // previous value dominated all uses of the phi when detecting the - // recurrence. - Builder.SetInsertPoint( - &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1]))); + // Set the insertion point after the previous value if it is an instruction. + // Note that the previous value may have been constant-folded so it is not + // guaranteed to be an instruction in the vector loop. + if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1])) + Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); + else + Builder.SetInsertPoint( + &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1]))); // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. @@ -4251,18 +4079,33 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Extract the last vector element in the middle block. This will be the // initial value for the recurrence when jumping to the scalar loop. - auto *Extract = Incoming; + auto *ExtractForScalar = Incoming; if (VF > 1) { Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1), - "vector.recur.extract"); - } + ExtractForScalar = Builder.CreateExtractElement( + ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); + } + // Extract the second last element in the middle block if the + // Phi is used outside the loop. We need to extract the phi itself + // and not the last element (the phi update in the current iteration). This + // will be the value when jumping to the exit block from the LoopMiddleBlock, + // when the scalar loop is not run at all. + Value *ExtractForPhiUsedOutsideLoop = nullptr; + if (VF > 1) + ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( + Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); + // When loop is unrolled without vectorizing, initialize + // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of + // `Incoming`. This is analogous to the vectorized case above: extracting the + // second last element when VF > 1. + else if (UF > 1) + ExtractForPhiUsedOutsideLoop = PreviousParts[UF - 2]; // Fix the initial value of the original recurrence in the scalar loop. 
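
The shuffle described above selects the last lane of the previous iteration's vector followed by the first VF-1 lanes of the current one, i.e. mask {VF-1, VF, ..., 2*VF-2} over the concatenation [Previous, Current]. A standalone illustration for VF = 4, with plain arrays standing in for the vector values (names are invented for the example):

#include <array>
#include <cstdio>

// Build the first-order-recurrence vector for VF = 4: per lane it holds the
// value from one iteration earlier.
int main() {
  constexpr unsigned VF = 4;
  std::array<int, VF> Previous = {0, 1, 2, 3}; // prior vector iteration's values
  std::array<int, VF> Current = {4, 5, 6, 7};  // this vector iteration's values

  std::array<int, 2 * VF> Concat{};
  for (unsigned i = 0; i < VF; ++i) {
    Concat[i] = Previous[i];
    Concat[VF + i] = Current[i];
  }

  std::array<int, VF> Recurrence{};
  for (unsigned i = 0; i < VF; ++i)
    Recurrence[i] = Concat[VF - 1 + i]; // mask {3, 4, 5, 6}

  for (int V : Recurrence)
    std::printf("%d ", V); // prints 3 4 5 6
  std::printf("\n");
  return 0;
}
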
Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); for (auto *BB : predecessors(LoopScalarPreHeader)) { - auto *Incoming = BB == LoopMiddleBlock ? Extract : ScalarInit; + auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; Start->addIncoming(Incoming, BB); } @@ -4279,12 +4122,218 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { if (!LCSSAPhi) break; if (LCSSAPhi->getIncomingValue(0) == Phi) { - LCSSAPhi->addIncoming(Extract, LoopMiddleBlock); + LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); break; } } } +void InnerLoopVectorizer::fixReduction(PHINode *Phi) { + Constant *Zero = Builder.getInt32(0); + + // Get it's reduction variable descriptor. + assert(Legal->isReductionVariable(Phi) && + "Unable to find the reduction variable"); + RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi]; + + RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); + TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); + Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); + RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = + RdxDesc.getMinMaxRecurrenceKind(); + setDebugLocFromInst(Builder, ReductionStartValue); + + // We need to generate a reduction vector from the incoming scalar. + // To do so, we need to generate the 'identity' vector and override + // one of the elements with the incoming scalar reduction. We need + // to do it in the vector-loop preheader. + Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); + + // This is the vector-clone of the value that leaves the loop. + const VectorParts &VectorExit = getVectorValue(LoopExitInst); + Type *VecTy = VectorExit[0]->getType(); + + // Find the reduction identity variable. Zero for addition, or, xor, + // one for multiplication, -1 for And. + Value *Identity; + Value *VectorStart; + if (RK == RecurrenceDescriptor::RK_IntegerMinMax || + RK == RecurrenceDescriptor::RK_FloatMinMax) { + // MinMax reduction have the start value as their identify. + if (VF == 1) { + VectorStart = Identity = ReductionStartValue; + } else { + VectorStart = Identity = + Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); + } + } else { + // Handle other reduction kinds: + Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( + RK, VecTy->getScalarType()); + if (VF == 1) { + Identity = Iden; + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + VectorStart = ReductionStartValue; + } else { + Identity = ConstantVector::getSplat(VF, Iden); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + VectorStart = + Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); + } + } + + // Fix the vector-loop phi. + + // Reductions do not have to start at zero. They can start with + // any loop invariant values. + const VectorParts &VecRdxPhi = getVectorValue(Phi); + BasicBlock *Latch = OrigLoop->getLoopLatch(); + Value *LoopVal = Phi->getIncomingValueForBlock(Latch); + const VectorParts &Val = getVectorValue(LoopVal); + for (unsigned part = 0; part < UF; ++part) { + // Make sure to add the reduction stat value only to the + // first unroll part. + Value *StartVal = (part == 0) ? 
VectorStart : Identity; + cast<PHINode>(VecRdxPhi[part]) + ->addIncoming(StartVal, LoopVectorPreHeader); + cast<PHINode>(VecRdxPhi[part]) + ->addIncoming(Val[part], LI->getLoopFor(LoopVectorBody)->getLoopLatch()); + } + + // Before each round, move the insertion point right between + // the PHIs and the values we are going to write. + // This allows us to write both PHINodes and the extractelement + // instructions. + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); + + VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst); + setDebugLocFromInst(Builder, LoopExitInst); + + // If the vector reduction can be performed in a smaller type, we truncate + // then extend the loop exit value to enable InstCombine to evaluate the + // entire expression in the smaller type. + if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { + Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); + Builder.SetInsertPoint(LoopVectorBody->getTerminator()); + for (unsigned part = 0; part < UF; ++part) { + Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy); + Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) + : Builder.CreateZExt(Trunc, VecTy); + for (Value::user_iterator UI = RdxParts[part]->user_begin(); + UI != RdxParts[part]->user_end();) + if (*UI != Trunc) { + (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd); + RdxParts[part] = Extnd; + } else { + ++UI; + } + } + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); + for (unsigned part = 0; part < UF; ++part) + RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy); + } + + // Reduce all of the unrolled parts into a single vector. + Value *ReducedPartRdx = RdxParts[0]; + unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); + setDebugLocFromInst(Builder, ReducedPartRdx); + for (unsigned part = 1; part < UF; ++part) { + if (Op != Instruction::ICmp && Op != Instruction::FCmp) + // Floating point operations had to be 'fast' to enable the reduction. + ReducedPartRdx = addFastMathFlag( + Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], + ReducedPartRdx, "bin.rdx")); + else + ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp( + Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]); + } + + if (VF > 1) { + // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles + // and vector ops, reducing the set of values being computed by half each + // round. + assert(isPowerOf2_32(VF) && + "Reduction emission only supported for pow2 vectors!"); + Value *TmpVec = ReducedPartRdx; + SmallVector<Constant *, 32> ShuffleMask(VF, nullptr); + for (unsigned i = VF; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. + for (unsigned j = 0; j != i / 2; ++j) + ShuffleMask[j] = Builder.getInt32(i / 2 + j); + + // Fill the rest of the mask with undef. + std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), + UndefValue::get(Builder.getInt32Ty())); + + Value *Shuf = Builder.CreateShuffleVector( + TmpVec, UndefValue::get(TmpVec->getType()), + ConstantVector::get(ShuffleMask), "rdx.shuf"); + + if (Op != Instruction::ICmp && Op != Instruction::FCmp) + // Floating point operations had to be 'fast' to enable the reduction. + TmpVec = addFastMathFlag(Builder.CreateBinOp( + (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); + else + TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, + TmpVec, Shuf); + } + + // The result is in the first element of the vector. 
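
The reduction loop above runs log2(VF) rounds, each one shuffling the upper half of the live lanes down onto the lower half and combining, so lane 0 ends up holding the full result. A standalone scalar model of the same halving schedule (an integer add reduction with VF = 8; the array stands in for the vector value, and the in-place adds stand in for the shuffle + binop pairs):

#include <array>
#include <cstdio>

// Tree reduction in log2(VF) steps; VF must be a power of two, as asserted
// in the pass.
int main() {
  constexpr unsigned VF = 8;
  std::array<int, VF> Vec = {1, 2, 3, 4, 5, 6, 7, 8};

  for (unsigned i = VF; i != 1; i >>= 1)
    for (unsigned j = 0; j != i / 2; ++j)
      Vec[j] += Vec[i / 2 + j]; // "move the upper half down" and add

  std::printf("reduction = %d\n", Vec[0]); // 36
  return 0;
}
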
+ ReducedPartRdx = + Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); + + // If the reduction can be performed in a smaller type, we need to extend + // the reduction to the wider type before we branch to the original loop. + if (Phi->getType() != RdxDesc.getRecurrenceType()) + ReducedPartRdx = + RdxDesc.isSigned() + ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) + : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); + } + + // Create a phi node that merges control-flow from the backedge-taken check + // block and the middle block. + PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", + LoopScalarPreHeader->getTerminator()); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); + BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); + + // Now, we need to fix the users of the reduction variable + // inside and outside of the scalar remainder loop. + // We know that the loop is in LCSSA form. We need to update the + // PHI nodes in the exit blocks. + for (BasicBlock::iterator LEI = LoopExitBlock->begin(), + LEE = LoopExitBlock->end(); + LEI != LEE; ++LEI) { + PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); + if (!LCSSAPhi) + break; + + // All PHINodes need to have a single entry edge, or two if + // we already fixed them. + assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); + + // We found a reduction value exit-PHI. Update it with the + // incoming bypass edge. + if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) + LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); + } // end of the LCSSA phi scan. + + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. + int IncomingEdgeBlockIdx = + Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); + assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); + // Pick the other block. + int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); + Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); + Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); +} + void InnerLoopVectorizer::fixLCSSAPHIs() { for (Instruction &LEI : *LoopExitBlock) { auto *LCSSAPhi = dyn_cast<PHINode>(&LEI); @@ -4296,7 +4345,8 @@ void InnerLoopVectorizer::fixLCSSAPHIs() { } } -void InnerLoopVectorizer::collectTriviallyDeadInstructions() { +void InnerLoopVectorizer::collectTriviallyDeadInstructions( + SmallPtrSetImpl<Instruction *> &DeadInstructions) { BasicBlock *Latch = OrigLoop->getLoopLatch(); // We create new control-flow for the vectorized loop, so the original @@ -4563,9 +4613,12 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { } void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, - unsigned VF, PhiVector *PV) { + unsigned VF) { PHINode *P = cast<PHINode>(PN); - // Handle recurrences. + // In order to support recurrences we need to be able to vectorize Phi nodes. + // Phi nodes have cycles, so we need to vectorize them in two stages. This is + // stage #1: We create a new vector PHI node with no incoming edges. We'll use + // this value when we vectorize all of the instructions that use the PHI. 
if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { VectorParts Entry(UF); for (unsigned part = 0; part < UF; ++part) { @@ -4576,7 +4629,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); } VectorLoopValueMap.initVector(P, Entry); - PV->push_back(P); return; } @@ -4631,7 +4683,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, case InductionDescriptor::IK_NoInduction: llvm_unreachable("Unknown induction"); case InductionDescriptor::IK_IntInduction: - return widenIntInduction(P); + case InductionDescriptor::IK_FpInduction: + return widenIntOrFpInduction(P); case InductionDescriptor::IK_PtrInduction: { // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); @@ -4641,7 +4694,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF; + unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; // These are the scalar results. Notice that we don't generate vector GEPs // because scalar GEPs result in better code. ScalarParts Entry(UF); @@ -4658,30 +4711,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, VectorLoopValueMap.initScalar(P, Entry); return; } - case InductionDescriptor::IK_FpInduction: { - assert(P->getType() == II.getStartValue()->getType() && - "Types must match"); - // Handle other induction variables that are now based on the - // canonical one. - assert(P != OldInduction && "Primary induction can be integer only"); - - Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType()); - V = II.transform(Builder, V, PSE.getSE(), DL); - V->setName("fp.offset.idx"); - - // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal - - Value *Broadcasted = getBroadcastInstrs(V); - // After broadcasting the induction variable we need to make the vector - // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc. - Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue(); - VectorParts Entry(UF); - for (unsigned part = 0; part < UF; ++part) - Entry[part] = getStepVector(Broadcasted, VF * part, StepVal, - II.getInductionOpcode()); - VectorLoopValueMap.initVector(P, Entry); - return; - } } } @@ -4703,269 +4732,323 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { - // For each instruction in the old loop. - for (Instruction &I : *BB) { - - // If the instruction will become trivially dead when vectorized, we don't - // need to generate it. - if (DeadInstructions.count(&I)) - continue; +void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { + // Scalarize instructions that should remain scalar after vectorization. + if (VF > 1 && + !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) && + shouldScalarizeInstruction(&I)) { + scalarizeInstruction(&I, Legal->isScalarWithPredication(&I)); + return; + } - // Scalarize instructions that should remain scalar after vectorization. 
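
The mayDivideByZero helper referenced above reports that a division-like instruction cannot be widened unconditionally unless its divisor is a known non-zero constant; the UDiv/SDiv/SRem/URem cases in vectorizeInstruction combine that with block predication to decide on predicated scalarization. A simplified standalone model of that policy, with the divisor represented as an optional compile-time constant (nullopt meaning "unknown"); the names are illustrative:

#include <cstdio>
#include <optional>

// Scalarize with predication when the block is conditionally executed and the
// divisor is not a known non-zero constant.
static bool mayDivideByZeroSketch(std::optional<long long> ConstDivisor) {
  return !ConstDivisor || *ConstDivisor == 0;
}

static bool needsPredicatedScalarization(bool BlockNeedsPredication,
                                         std::optional<long long> ConstDivisor) {
  return BlockNeedsPredication && mayDivideByZeroSketch(ConstDivisor);
}

int main() {
  std::printf("%d %d\n",
              needsPredicatedScalarization(true, std::nullopt), // 1: predicate
              needsPredicatedScalarization(true, 8));           // 0: safe to widen
  return 0;
}
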
- if (VF > 1 && - !(isa<BranchInst>(&I) || isa<PHINode>(&I) || - isa<DbgInfoIntrinsic>(&I)) && - shouldScalarizeInstruction(&I)) { - scalarizeInstruction(&I, Legal->isScalarWithPredication(&I)); - continue; - } + switch (I.getOpcode()) { + case Instruction::Br: + // Nothing to do for PHIs and BR, since we already took care of the + // loop control flow instructions. + break; + case Instruction::PHI: { + // Vectorize PHINodes. + widenPHIInstruction(&I, UF, VF); + break; + } // End of PHI. + case Instruction::GetElementPtr: { + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + auto *GEP = cast<GetElementPtrInst>(&I); + VectorParts Entry(UF); - switch (I.getOpcode()) { - case Instruction::Br: - // Nothing to do for PHIs and BR, since we already took care of the - // loop control flow instructions. - continue; - case Instruction::PHI: { - // Vectorize PHINodes. - widenPHIInstruction(&I, UF, VF, PV); - continue; - } // End of PHI. - - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - // Scalarize with predication if this instruction may divide by zero and - // block execution is conditional, otherwise fallthrough. - if (Legal->isScalarWithPredication(&I)) { - scalarizeInstruction(&I, true); - continue; - } - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen binops. - auto *BinOp = cast<BinaryOperator>(&I); - setDebugLocFromInst(Builder, BinOp); - const VectorParts &A = getVectorValue(BinOp->getOperand(0)); - const VectorParts &B = getVectorValue(BinOp->getOperand(1)); - - // Use this vector value for all users of the original instruction. - VectorParts Entry(UF); + if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = Builder.CreateVectorSplat(VF, Clone); + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). 
Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. for (unsigned Part = 0; Part < UF; ++Part) { - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); - if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V)) - VecOp->copyIRFlags(BinOp); + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand()) + ? GEP->getPointerOperand() + : getVectorValue(GEP->getPointerOperand())[Part]; + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector<Value *, 4> Indices; + for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { + if (OrigLoop->isLoopInvariant(U.get())) + Indices.push_back(U.get()); + else + Indices.push_back(getVectorValue(U.get())[Part]); + } - Entry[Part] = V; + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = GEP->isInBounds() + ? Builder.CreateInBoundsGEP(Ptr, Indices) + : Builder.CreateGEP(Ptr, Indices); + assert((VF == 1 || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + Entry[Part] = NewGEP; } + } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, BinOp); + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, GEP); + break; + } + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + // Scalarize with predication if this instruction may divide by zero and + // block execution is conditional, otherwise fallthrough. + if (Legal->isScalarWithPredication(&I)) { + scalarizeInstruction(&I, true); break; } - case Instruction::Select: { - // Widen selects. - // If the selector is loop invariant we can create a select - // instruction with a scalar condition. Otherwise, use vector-select. - auto *SE = PSE.getSE(); - bool InvariantCond = - SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); - setDebugLocFromInst(Builder, &I); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - const VectorParts &Cond = getVectorValue(I.getOperand(0)); - const VectorParts &Op0 = getVectorValue(I.getOperand(1)); - const VectorParts &Op1 = getVectorValue(I.getOperand(2)); - - auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0); + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen binops. + auto *BinOp = cast<BinaryOperator>(&I); + setDebugLocFromInst(Builder, BinOp); + const VectorParts &A = getVectorValue(BinOp->getOperand(0)); + const VectorParts &B = getVectorValue(BinOp->getOperand(1)); + + // Use this vector value for all users of the original instruction. 
+ VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); + + if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V)) + VecOp->copyIRFlags(BinOp); + + Entry[Part] = V; + } - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) { - Entry[Part] = Builder.CreateSelect( - InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]); - } + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, BinOp); + break; + } + case Instruction::Select: { + // Widen selects. + // If the selector is loop invariant we can create a select + // instruction with a scalar condition. Otherwise, use vector-select. + auto *SE = PSE.getSE(); + bool InvariantCond = + SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); + setDebugLocFromInst(Builder, &I); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + const VectorParts &Cond = getVectorValue(I.getOperand(0)); + const VectorParts &Op0 = getVectorValue(I.getOperand(1)); + const VectorParts &Op1 = getVectorValue(I.getOperand(2)); + + auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0); - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); - break; + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part] = Builder.CreateSelect( + InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]); } - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = dyn_cast<CmpInst>(&I); - setDebugLocFromInst(Builder, Cmp); - const VectorParts &A = getVectorValue(Cmp->getOperand(0)); - const VectorParts &B = getVectorValue(Cmp->getOperand(1)); - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *C = nullptr; - if (FCmp) { - C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); - cast<FCmpInst>(C)->copyFastMathFlags(Cmp); - } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); - } - Entry[Part] = C; + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, &I); + break; + } + + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. 
+ bool FCmp = (I.getOpcode() == Instruction::FCmp); + auto *Cmp = dyn_cast<CmpInst>(&I); + setDebugLocFromInst(Builder, Cmp); + const VectorParts &A = getVectorValue(Cmp->getOperand(0)); + const VectorParts &B = getVectorValue(Cmp->getOperand(1)); + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *C = nullptr; + if (FCmp) { + C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); + cast<FCmpInst>(C)->copyFastMathFlags(Cmp); + } else { + C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); } + Entry[Part] = C; + } + + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, &I); + break; + } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); + case Instruction::Store: + case Instruction::Load: + vectorizeMemoryInstruction(&I); + break; + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + auto *CI = dyn_cast<CastInst>(&I); + setDebugLocFromInst(Builder, CI); + + // Optimize the special case where the source is a constant integer + // induction variable. Notice that we can only optimize the 'trunc' case + // because (a) FP conversions lose precision, (b) sext/zext may wrap, and + // (c) other casts depend on pointer size. + if (Cost->isOptimizableIVTruncate(CI, VF)) { + widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)), + cast<TruncInst>(CI)); break; } - case Instruction::Store: - case Instruction::Load: - vectorizeMemoryInstruction(&I); + /// Vectorize casts. + Type *DestTy = + (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); + + const VectorParts &A = getVectorValue(CI->getOperand(0)); + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, &I); + break; + } + + case Instruction::Call: { + // Ignore dbg intrinsics. + if (isa<DbgInfoIntrinsic>(I)) break; - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - auto *CI = dyn_cast<CastInst>(&I); - setDebugLocFromInst(Builder, CI); - - // Optimize the special case where the source is a constant integer - // induction variable. Notice that we can only optimize the 'trunc' case - // because (a) FP conversions lose precision, (b) sext/zext may wrap, and - // (c) other casts depend on pointer size. - auto ID = Legal->getInductionVars()->lookup(OldInduction); - if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction && - ID.getConstIntStepValue()) { - widenIntInduction(OldInduction, cast<TruncInst>(CI)); - break; - } + setDebugLocFromInst(Builder, &I); - /// Vectorize casts. - Type *DestTy = - (VF == 1) ? 
CI->getType() : VectorType::get(CI->getType(), VF); + Module *M = I.getParent()->getParent()->getParent(); + auto *CI = cast<CallInst>(&I); - const VectorParts &A = getVectorValue(CI->getOperand(0)); - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); + StringRef FnName = CI->getCalledFunction()->getName(); + Function *F = CI->getCalledFunction(); + Type *RetTy = ToVectorTy(CI->getType(), VF); + SmallVector<Type *, 4> Tys; + for (Value *ArgOperand : CI->arg_operands()) + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); + + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || + ID == Intrinsic::lifetime_start)) { + scalarizeInstruction(&I); + break; + } + // The flag shows whether we use Intrinsic or a usual Call for vectorized + // version of the instruction. + // Is it beneficial to perform intrinsic call compared to lib call? + bool NeedToScalarize; + unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); + bool UseVectorIntrinsic = + ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; + if (!UseVectorIntrinsic && NeedToScalarize) { + scalarizeInstruction(&I); break; } - case Instruction::Call: { - // Ignore dbg intrinsics. - if (isa<DbgInfoIntrinsic>(I)) - break; - setDebugLocFromInst(Builder, &I); - - Module *M = BB->getParent()->getParent(); - auto *CI = cast<CallInst>(&I); - - StringRef FnName = CI->getCalledFunction()->getName(); - Function *F = CI->getCalledFunction(); - Type *RetTy = ToVectorTy(CI->getType(), VF); - SmallVector<Type *, 4> Tys; - for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); - - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || - ID == Intrinsic::lifetime_start)) { - scalarizeInstruction(&I); - break; - } - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? - bool NeedToScalarize; - unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); - bool UseVectorIntrinsic = - ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; - if (!UseVectorIntrinsic && NeedToScalarize) { - scalarizeInstruction(&I); - break; - } - - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Value *, 4> Args; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - Value *Arg = CI->getArgOperand(i); - // Some intrinsics have a scalar argument - don't replace it with a - // vector. - if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) { - const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); - Arg = VectorArg[Part]; - } - Args.push_back(Arg); + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<Value *, 4> Args; + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + Value *Arg = CI->getArgOperand(i); + // Some intrinsics have a scalar argument - don't replace it with a + // vector. 
+ if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) { + const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); + Arg = VectorArg[Part]; } + Args.push_back(Arg); + } - Function *VectorF; - if (UseVectorIntrinsic) { - // Use vector version of the intrinsic. - Type *TysForDecl[] = {CI->getType()}; - if (VF > 1) - TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); - VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); - } else { - // Use vector version of the library call. - StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); - assert(!VFnName.empty() && "Vector function name is empty."); - VectorF = M->getFunction(VFnName); - if (!VectorF) { - // Generate a declaration - FunctionType *FTy = FunctionType::get(RetTy, Tys, false); - VectorF = - Function::Create(FTy, Function::ExternalLinkage, VFnName, M); - VectorF->copyAttributesFrom(F); - } + Function *VectorF; + if (UseVectorIntrinsic) { + // Use vector version of the intrinsic. + Type *TysForDecl[] = {CI->getType()}; + if (VF > 1) + TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); + VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); + } else { + // Use vector version of the library call. + StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); + assert(!VFnName.empty() && "Vector function name is empty."); + VectorF = M->getFunction(VFnName); + if (!VectorF) { + // Generate a declaration + FunctionType *FTy = FunctionType::get(RetTy, Tys, false); + VectorF = + Function::Create(FTy, Function::ExternalLinkage, VFnName, M); + VectorF->copyAttributesFrom(F); } - assert(VectorF && "Can't create vector function."); - - SmallVector<OperandBundleDef, 1> OpBundles; - CI->getOperandBundlesAsDefs(OpBundles); - CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); + } + assert(VectorF && "Can't create vector function."); - if (isa<FPMathOperator>(V)) - V->copyFastMathFlags(CI); + SmallVector<OperandBundleDef, 1> OpBundles; + CI->getOperandBundlesAsDefs(OpBundles); + CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); - Entry[Part] = V; - } + if (isa<FPMathOperator>(V)) + V->copyFastMathFlags(CI); - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); - break; + Entry[Part] = V; } - default: - // All other instructions are unsupported. Scalarize them. - scalarizeInstruction(&I); - break; - } // end of switch. - } // end of for_each instr. + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, &I); + break; + } + + default: + // All other instructions are unsupported. Scalarize them. + scalarizeInstruction(&I); + break; + } // end of switch. } void InnerLoopVectorizer::updateAnalysis() { @@ -4976,11 +5059,10 @@ void InnerLoopVectorizer::updateAnalysis() { assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && "Entry does not dominate exit."); - // We don't predicate stores by this point, so the vector body should be a - // single loop. 
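
The Call case above picks between three strategies: loosely, getVectorCallCost weighs scalarizing the call VF times (plus pack/unpack overhead) against a vectorized library routine, and that result is then compared with a vector intrinsic if one exists. A simplified decision model with made-up costs standing in for the TTI/TLI estimates (the function and enum below are invented for illustration, not the pass's API):

#include <algorithm>
#include <climits>
#include <cstdio>

enum class CallWidening { Intrinsic, LibraryCall, Scalarize };

static CallWidening chooseCallWidening(unsigned ScalarizedCost,
                                       bool HasVectorLib, unsigned LibCallCost,
                                       bool HasIntrinsic, unsigned IntrinsicCost) {
  // Take the cheaper of scalarizing and the vector library call, and remember
  // which one won (a rough stand-in for getVectorCallCost/NeedToScalarize).
  bool NeedToScalarize = !HasVectorLib || ScalarizedCost < LibCallCost;
  unsigned CallCost = std::min(ScalarizedCost, HasVectorLib ? LibCallCost : UINT_MAX);

  // An intrinsic wins if it is no more expensive than the best call-based
  // alternative, mirroring the UseVectorIntrinsic test above.
  if (HasIntrinsic && IntrinsicCost <= CallCost)
    return CallWidening::Intrinsic;
  return NeedToScalarize ? CallWidening::Scalarize : CallWidening::LibraryCall;
}

int main() {
  CallWidening W = chooseCallWidening(/*ScalarizedCost=*/24, /*HasVectorLib=*/true,
                                      /*LibCallCost=*/10, /*HasIntrinsic=*/true,
                                      /*IntrinsicCost=*/6);
  std::printf("decision = %d\n", static_cast<int>(W)); // 0: use the intrinsic
  return 0;
}
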
- DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); - - DT->addNewBlock(LoopMiddleBlock, LoopVectorBody); + DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(), + LoopVectorPreHeader); + DT->addNewBlock(LoopMiddleBlock, + LI->getLoopFor(LoopVectorBody)->getLoopLatch()); DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); @@ -5145,12 +5227,6 @@ bool LoopVectorizationLegality::canVectorize() { if (UseInterleaved) InterleaveInfo.analyzeInterleaving(*getSymbolicStrides()); - // Collect all instructions that are known to be uniform after vectorization. - collectLoopUniforms(); - - // Collect all instructions that are known to be scalar after vectorization. - collectLoopScalars(); - unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; @@ -5234,8 +5310,8 @@ void LoopVectorizationLegality::addInductionPhi( // one if there are multiple (no good reason for doing this other // than it is expedient). We've checked that it begins at zero and // steps by one, so this is a canonical induction variable. - if (!Induction || PhiTy == WidestIndTy) - Induction = Phi; + if (!PrimaryInduction || PhiTy == WidestIndTy) + PrimaryInduction = Phi; } // Both the PHI node itself, and the "post-increment" value feeding @@ -5398,7 +5474,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } // next instr. } - if (!Induction) { + if (!PrimaryInduction) { DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); if (Inductions.empty()) { ORE->emit(createMissedAnalysis("NoInductionVariable") @@ -5410,46 +5486,166 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Now we know the widest induction type, check if our found induction // is the same size. If it's not, unset it here and InnerLoopVectorizer // will create another. - if (Induction && WidestIndTy != Induction->getType()) - Induction = nullptr; + if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType()) + PrimaryInduction = nullptr; return true; } -void LoopVectorizationLegality::collectLoopScalars() { +void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { + + // We should not collect Scalars more than once per VF. Right now, this + // function is called from collectUniformsAndScalars(), which already does + // this check. Collecting Scalars for VF=1 does not make any sense. + assert(VF >= 2 && !Scalars.count(VF) && + "This function should not be visited twice for the same VF"); + + SmallSetVector<Instruction *, 8> Worklist; + + // These sets are used to seed the analysis with pointers used by memory + // accesses that will remain scalar. + SmallSetVector<Instruction *, 8> ScalarPtrs; + SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; + + // A helper that returns true if the use of Ptr by MemAccess will be scalar. + // The pointer operands of loads and stores will be scalar as long as the + // memory access is not a gather or scatter operation. The value operand of a + // store will remain scalar if the store is scalarized. 
+ auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { + InstWidening WideningDecision = getWideningDecision(MemAccess, VF); + assert(WideningDecision != CM_Unknown && + "Widening decision should be ready at this moment"); + if (auto *Store = dyn_cast<StoreInst>(MemAccess)) + if (Ptr == Store->getValueOperand()) + return WideningDecision == CM_Scalarize; + assert(Ptr == getPointerOperand(MemAccess) && + "Ptr is neither a value or pointer operand"); + return WideningDecision != CM_GatherScatter; + }; + + // A helper that returns true if the given value is a bitcast or + // getelementptr instruction contained in the loop. + auto isLoopVaryingBitCastOrGEP = [&](Value *V) { + return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || + isa<GetElementPtrInst>(V)) && + !TheLoop->isLoopInvariant(V); + }; - // If an instruction is uniform after vectorization, it will remain scalar. - Scalars.insert(Uniforms.begin(), Uniforms.end()); + // A helper that evaluates a memory access's use of a pointer. If the use + // will be a scalar use, and the pointer is only used by memory accesses, we + // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in + // PossibleNonScalarPtrs. + auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { - // Collect the getelementptr instructions that will not be vectorized. A - // getelementptr instruction is only vectorized if it is used for a legal - // gather or scatter operation. + // We only care about bitcast and getelementptr instructions contained in + // the loop. + if (!isLoopVaryingBitCastOrGEP(Ptr)) + return; + + // If the pointer has already been identified as scalar (e.g., if it was + // also identified as uniform), there's nothing to do. + auto *I = cast<Instruction>(Ptr); + if (Worklist.count(I)) + return; + + // If the use of the pointer will be a scalar use, and all users of the + // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, + // place the pointer in PossibleNonScalarPtrs. + if (isScalarUse(MemAccess, Ptr) && all_of(I->users(), [&](User *U) { + return isa<LoadInst>(U) || isa<StoreInst>(U); + })) + ScalarPtrs.insert(I); + else + PossibleNonScalarPtrs.insert(I); + }; + + // We seed the scalars analysis with three classes of instructions: (1) + // instructions marked uniform-after-vectorization, (2) bitcast and + // getelementptr instructions used by memory accesses requiring a scalar use, + // and (3) pointer induction variables and their update instructions (we + // currently only scalarize these). + // + // (1) Add to the worklist all instructions that have been identified as + // uniform-after-vectorization. + Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); + + // (2) Add to the worklist all bitcast and getelementptr instructions used by + // memory accesses requiring a scalar use. The pointer operands of loads and + // stores will be scalar as long as the memory accesses is not a gather or + // scatter operation. The value operand of a store will remain scalar if the + // store is scalarized. 
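
A compact model of the isScalarUse test defined above, with the per-VF widening decision reduced to a plain enum; the real pass queries the cost model's getWideningDecision for the memory access, and the names below are illustrative:

#include <cstdio>

enum class WideningDecision { Widen, Interleave, GatherScatter, Scalarize };

static bool isScalarUseSketch(WideningDecision Decision, bool PtrIsStoredValue) {
  if (PtrIsStoredValue)
    // The value operand of a store stays scalar only if the store itself is
    // scalarized.
    return Decision == WideningDecision::Scalarize;
  // The pointer operand stays scalar for anything but a gather/scatter, which
  // needs a vector of pointers.
  return Decision != WideningDecision::GatherScatter;
}

int main() {
  std::printf("%d %d\n",
              isScalarUseSketch(WideningDecision::Widen, /*PtrIsStoredValue=*/false),
              isScalarUseSketch(WideningDecision::GatherScatter, /*PtrIsStoredValue=*/false));
  return 0; // prints "1 0"
}
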
for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { - if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { - Scalars.insert(GEP); - continue; + if (auto *Load = dyn_cast<LoadInst>(&I)) { + evaluatePtrUse(Load, Load->getPointerOperand()); + } else if (auto *Store = dyn_cast<StoreInst>(&I)) { + evaluatePtrUse(Store, Store->getPointerOperand()); + evaluatePtrUse(Store, Store->getValueOperand()); } - auto *Ptr = getPointerOperand(&I); - if (!Ptr) - continue; - auto *GEP = getGEPInstruction(Ptr); - if (GEP && isLegalGatherOrScatter(&I)) - Scalars.erase(GEP); + } + for (auto *I : ScalarPtrs) + if (!PossibleNonScalarPtrs.count(I)) { + DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); + Worklist.insert(I); } + // (3) Add to the worklist all pointer induction variables and their update + // instructions. + // + // TODO: Once we are able to vectorize pointer induction variables we should + // no longer insert them into the worklist here. + auto *Latch = TheLoop->getLoopLatch(); + for (auto &Induction : *Legal->getInductionVars()) { + auto *Ind = Induction.first; + auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); + if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) + continue; + Worklist.insert(Ind); + Worklist.insert(IndUpdate); + DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); + DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n"); + } + + // Expand the worklist by looking through any bitcasts and getelementptr + // instructions we've already identified as scalar. This is similar to the + // expansion step in collectLoopUniforms(); however, here we're only + // expanding to include additional bitcasts and getelementptr instructions. + unsigned Idx = 0; + while (Idx != Worklist.size()) { + Instruction *Dst = Worklist[Idx++]; + if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) + continue; + auto *Src = cast<Instruction>(Dst->getOperand(0)); + if (all_of(Src->users(), [&](User *U) -> bool { + auto *J = cast<Instruction>(U); + return !TheLoop->contains(J) || Worklist.count(J) || + ((isa<LoadInst>(J) || isa<StoreInst>(J)) && + isScalarUse(J, Src)); + })) { + Worklist.insert(Src); + DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); + } + } + // An induction variable will remain scalar if all users of the induction // variable and induction variable update remain scalar. - auto *Latch = TheLoop->getLoopLatch(); - for (auto &Induction : *getInductionVars()) { + for (auto &Induction : *Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); + // We already considered pointer induction variables, so there's no reason + // to look at their users again. + // + // TODO: Once we are able to vectorize pointer induction variables we + // should no longer skip over them here. + if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) + continue; + // Determine if all users of the induction variable are scalar after // vectorization. auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); - return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I); + return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); }); if (!ScalarInd) continue; @@ -5458,23 +5654,19 @@ void LoopVectorizationLegality::collectLoopScalars() { // scalar after vectorization. 
auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); - return I == Ind || !TheLoop->contains(I) || Scalars.count(I); + return I == Ind || !TheLoop->contains(I) || Worklist.count(I); }); if (!ScalarIndUpdate) continue; // The induction variable and its update instruction will remain scalar. - Scalars.insert(Ind); - Scalars.insert(IndUpdate); + Worklist.insert(Ind); + Worklist.insert(IndUpdate); + DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); + DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n"); } -} -bool LoopVectorizationLegality::hasConsecutiveLikePtrOperand(Instruction *I) { - if (isAccessInterleaved(I)) - return true; - if (auto *Ptr = getPointerOperand(I)) - return isConsecutivePtr(Ptr); - return false; + Scalars[VF].insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) { @@ -5494,48 +5686,48 @@ bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) { return false; } -bool LoopVectorizationLegality::memoryInstructionMustBeScalarized( - Instruction *I, unsigned VF) { - - // If the memory instruction is in an interleaved group, it will be - // vectorized and its pointer will remain uniform. - if (isAccessInterleaved(I)) - return false; - +bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I, + unsigned VF) { // Get and ensure we have a valid memory instruction. LoadInst *LI = dyn_cast<LoadInst>(I); StoreInst *SI = dyn_cast<StoreInst>(I); assert((LI || SI) && "Invalid memory instruction"); - // If the pointer operand is uniform (loop invariant), the memory instruction - // will be scalarized. auto *Ptr = getPointerOperand(I); - if (LI && isUniform(Ptr)) - return true; - // If the pointer operand is non-consecutive and neither a gather nor a - // scatter operation is legal, the memory instruction will be scalarized. - if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I)) - return true; + // In order to be widened, the pointer should be consecutive, first of all. + if (!isConsecutivePtr(Ptr)) + return false; // If the instruction is a store located in a predicated block, it will be // scalarized. if (isScalarWithPredication(I)) - return true; + return false; // If the instruction's allocated size doesn't equal it's type size, it // requires padding and will be scalarized. auto &DL = I->getModule()->getDataLayout(); auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); if (hasIrregularType(ScalarTy, DL, VF)) - return true; + return false; - // Otherwise, the memory instruction should be vectorized if the rest of the - // loop is. - return false; + return true; } -void LoopVectorizationLegality::collectLoopUniforms() { +void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { + + // We should not collect Uniforms more than once per VF. Right now, + // this function is called from collectUniformsAndScalars(), which + // already does this check. Collecting Uniforms for VF=1 does not make any + // sense. + + assert(VF >= 2 && !Uniforms.count(VF) && + "This function should not be visited twice for the same VF"); + + // Visit the list of Uniforms. If we'll not find any uniform value, we'll + // not analyze again. Uniforms.count(VF) will return 1. + Uniforms[VF].clear(); + // We now know that the loop is vectorizable! // Collect instructions inside the loop that will remain uniform after // vectorization. 
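For illustration, the scalars collection above is a seed-and-expand fixed point: the worklist is seeded with uniform-after-vectorization instructions, scalar-used pointers, and pointer induction variables, and is then grown through loop-varying address computations whose users are all already known to be scalar. The following is a minimal standalone sketch of that expansion under simplified assumptions; Inst, IsAddressLike, and IsMemAccess are illustrative stand-ins, not LLVM types, and the real code additionally consults loop membership and the per-VF widening decisions.

    #include <cstddef>
    #include <unordered_set>
    #include <vector>

    // Hypothetical stand-in for llvm::Instruction, reduced to what the sketch needs.
    struct Inst {
      std::vector<Inst *> Operands; // e.g. the GEP/bitcast feeding a memory access
      std::vector<Inst *> Users;    // instructions that consume this value
      bool IsMemAccess = false;     // load or store
      bool IsAddressLike = false;   // loop-varying GEP or bitcast
    };

    // Seed-and-expand: start from instructions already known to stay scalar and
    // grow the set through address computations whose users are all scalar
    // memory uses or already-collected scalars.
    std::unordered_set<Inst *> collectScalars(const std::vector<Inst *> &Seeds) {
      std::vector<Inst *> Worklist(Seeds.begin(), Seeds.end());
      std::unordered_set<Inst *> Scalars(Seeds.begin(), Seeds.end());

      for (std::size_t Idx = 0; Idx < Worklist.size(); ++Idx) {
        for (Inst *Src : Worklist[Idx]->Operands) {
          if (!Src->IsAddressLike || Scalars.count(Src))
            continue;
          bool AllUsersScalar = true;
          for (Inst *U : Src->Users)
            AllUsersScalar &= (Scalars.count(U) != 0 || U->IsMemAccess);
          if (AllUsersScalar) {
            Scalars.insert(Src);
            Worklist.push_back(Src); // keep expanding through its operands
          }
        }
      }
      return Scalars;
    }

The same shape (seed, then index through a growing worklist rather than iterating a container that is being mutated) is what the patch uses for both the scalars and the uniforms analyses.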
@@ -5568,6 +5760,14 @@ void LoopVectorizationLegality::collectLoopUniforms() { // Holds pointer operands of instructions that are possibly non-uniform. SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; + auto isUniformDecision = [&](Instruction *I, unsigned VF) { + InstWidening WideningDecision = getWideningDecision(I, VF); + assert(WideningDecision != CM_Unknown && + "Widening decision should be ready at this moment"); + + return (WideningDecision == CM_Widen || + WideningDecision == CM_Interleave); + }; // Iterate over the instructions in the loop, and collect all // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible // that a consecutive-like pointer operand will be scalarized, we collect it @@ -5590,25 +5790,18 @@ void LoopVectorizationLegality::collectLoopUniforms() { return getPointerOperand(U) == Ptr; }); - // Ensure the memory instruction will not be scalarized, making its - // pointer operand non-uniform. If the pointer operand is used by some - // instruction other than a memory access, we're not going to check if - // that other instruction may be scalarized here. Thus, conservatively - // assume the pointer operand may be non-uniform. - if (!UsersAreMemAccesses || memoryInstructionMustBeScalarized(&I)) + // Ensure the memory instruction will not be scalarized or used by + // gather/scatter, making its pointer operand non-uniform. If the pointer + // operand is used by any instruction other than a memory access, we + // conservatively assume the pointer operand may be non-uniform. + if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) PossibleNonUniformPtrs.insert(Ptr); // If the memory instruction will be vectorized and its pointer operand - // is consecutive-like, the pointer operand should remain uniform. - else if (hasConsecutiveLikePtrOperand(&I)) - ConsecutiveLikePtrs.insert(Ptr); - - // Otherwise, if the memory instruction will be vectorized and its - // pointer operand is non-consecutive-like, the memory instruction should - // be a gather or scatter operation. Its pointer operand will be - // non-uniform. + // is consecutive-like, or interleaving - the pointer operand should + // remain uniform. else - PossibleNonUniformPtrs.insert(Ptr); + ConsecutiveLikePtrs.insert(Ptr); } // Add to the Worklist all consecutive and consecutive-like pointers that @@ -5632,7 +5825,9 @@ void LoopVectorizationLegality::collectLoopUniforms() { continue; auto *OI = cast<Instruction>(OV); if (all_of(OI->users(), [&](User *U) -> bool { - return isOutOfScope(U) || Worklist.count(cast<Instruction>(U)); + auto *J = cast<Instruction>(U); + return !TheLoop->contains(J) || Worklist.count(J) || + (OI == getPointerOperand(J) && isUniformDecision(J, VF)); })) { Worklist.insert(OI); DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); @@ -5643,7 +5838,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { // Returns true if Ptr is the pointer operand of a memory access instruction // I, and I is known to not require scalarization. auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { - return getPointerOperand(I) == Ptr && !memoryInstructionMustBeScalarized(I); + return getPointerOperand(I) == Ptr && isUniformDecision(I, VF); }; // For an instruction to be added into Worklist above, all its users inside @@ -5652,7 +5847,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { // nodes separately. An induction variable will remain uniform if all users // of the induction variable and induction variable update remain uniform. 
// The code below handles both pointer and non-pointer induction variables. - for (auto &Induction : Inductions) { + for (auto &Induction : *Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); @@ -5683,7 +5878,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n"); } - Uniforms.insert(Worklist.begin(), Worklist.end()); + Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationLegality::canVectorizeMemory() { @@ -5823,7 +6018,7 @@ void InterleavedAccessInfo::collectConstStrideAccesses( uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()); // An alignment of 0 means target ABI alignment. - unsigned Align = LI ? LI->getAlignment() : SI->getAlignment(); + unsigned Align = getMemInstAlignment(&I); if (!Align) Align = DL.getABITypeAlignment(PtrTy->getElementType()); @@ -5978,6 +6173,11 @@ void InterleavedAccessInfo::analyzeInterleaving( if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size) continue; + // Ignore A if the memory object of A and B don't belong to the same + // address space + if (getMemInstAddressSpace(A) != getMemInstAddressSpace(B)) + continue; + // Calculate the distance from A to B. const SCEVConstant *DistToB = dyn_cast<SCEVConstant>( PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev)); @@ -6020,36 +6220,36 @@ void InterleavedAccessInfo::analyzeInterleaving( if (Group->getNumMembers() != Group->getFactor()) releaseGroup(Group); - // Remove interleaved groups with gaps (currently only loads) whose memory - // accesses may wrap around. We have to revisit the getPtrStride analysis, - // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does + // Remove interleaved groups with gaps (currently only loads) whose memory + // accesses may wrap around. We have to revisit the getPtrStride analysis, + // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does // not check wrapping (see documentation there). - // FORNOW we use Assume=false; - // TODO: Change to Assume=true but making sure we don't exceed the threshold + // FORNOW we use Assume=false; + // TODO: Change to Assume=true but making sure we don't exceed the threshold // of runtime SCEV assumptions checks (thereby potentially failing to - // vectorize altogether). + // vectorize altogether). // Additional optional optimizations: - // TODO: If we are peeling the loop and we know that the first pointer doesn't + // TODO: If we are peeling the loop and we know that the first pointer doesn't // wrap then we can deduce that all pointers in the group don't wrap. - // This means that we can forcefully peel the loop in order to only have to - // check the first pointer for no-wrap. When we'll change to use Assume=true + // This means that we can forcefully peel the loop in order to only have to + // check the first pointer for no-wrap. When we'll change to use Assume=true // we'll only need at most one runtime check per interleaved group. // for (InterleaveGroup *Group : LoadGroups) { // Case 1: A full group. Can Skip the checks; For full groups, if the wide - // load would wrap around the address space we would do a memory access at - // nullptr even without the transformation. - if (Group->getNumMembers() == Group->getFactor()) + // load would wrap around the address space we would do a memory access at + // nullptr even without the transformation. 
+ if (Group->getNumMembers() == Group->getFactor()) continue; - // Case 2: If first and last members of the group don't wrap this implies + // Case 2: If first and last members of the group don't wrap this implies // that all the pointers in the group don't wrap. // So we check only group member 0 (which is always guaranteed to exist), - // and group member Factor - 1; If the latter doesn't exist we rely on + // and group member Factor - 1; If the latter doesn't exist we rely on // peeling (if it is a non-reveresed accsess -- see Case 3). Value *FirstMemberPtr = getPointerOperand(Group->getMember(0)); - if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false, + if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false, /*ShouldCheckWrap=*/true)) { DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " "first group member potentially pointer-wrapping.\n"); @@ -6065,8 +6265,7 @@ void InterleavedAccessInfo::analyzeInterleaving( "last group member potentially pointer-wrapping.\n"); releaseGroup(Group); } - } - else { + } else { // Case 3: A non-reversed interleaved load group with gaps: We need // to execute at least one scalar epilogue iteration. This will ensure // we don't speculatively access memory out-of-bounds. We only need @@ -6082,27 +6281,62 @@ void InterleavedAccessInfo::analyzeInterleaving( } } -LoopVectorizationCostModel::VectorizationFactor -LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { - // Width 1 means no vectorize - VectorizationFactor Factor = {1U, 0U}; - if (OptForSize && Legal->getRuntimePointerChecking()->Need) { +Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { + if (!EnableCondStoresVectorization && Legal->getNumPredStores()) { + ORE->emit(createMissedAnalysis("ConditionalStore") + << "store that is conditionally executed prevents vectorization"); + DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n"); + return None; + } + + if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize. + return computeFeasibleMaxVF(OptForSize); + + if (Legal->getRuntimePointerChecking()->Need) { ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") << "runtime pointer checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " "compiling with -Os/-Oz"); DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); - return Factor; + return None; } - if (!EnableCondStoresVectorization && Legal->getNumPredStores()) { - ORE->emit(createMissedAnalysis("ConditionalStore") - << "store that is conditionally executed prevents vectorization"); - DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n"); - return Factor; + // If we optimize the program for size, avoid creating the tail loop. + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); + DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); + + // If we don't know the precise trip count, don't try to vectorize. + if (TC < 2) { + ORE->emit( + createMissedAnalysis("UnknownLoopCountComplexCFG") + << "unable to calculate the loop count due to complex control flow"); + DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); + return None; } + unsigned MaxVF = computeFeasibleMaxVF(OptForSize); + + if (TC % MaxVF != 0) { + // If the trip count that we found modulo the vectorization factor is not + // zero then we require a tail. 
+ // FIXME: look for a smaller MaxVF that does divide TC rather than give up. + // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a + // smaller MaxVF that does not require a scalar epilog. + + ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize") + << "cannot optimize for size and vectorize at the " + "same time. Enable vectorization of this loop " + "with '#pragma clang loop vectorize(enable)' " + "when compiling with -Os/-Oz"); + DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); + return None; + } + + return MaxVF; +} + +unsigned LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -6136,7 +6370,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements" " into one vector!"); - unsigned VF = MaxVectorSize; + unsigned MaxVF = MaxVectorSize; if (MaximizeBandwidth && !OptForSize) { // Collect all viable vectorization factors. SmallVector<unsigned, 8> VFs; @@ -6152,54 +6386,16 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); for (int i = RUs.size() - 1; i >= 0; --i) { if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { - VF = VFs[i]; + MaxVF = VFs[i]; break; } } } + return MaxVF; +} - // If we optimize the program for size, avoid creating the tail loop. - if (OptForSize) { - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - - // If we don't know the precise trip count, don't try to vectorize. - if (TC < 2) { - ORE->emit( - createMissedAnalysis("UnknownLoopCountComplexCFG") - << "unable to calculate the loop count due to complex control flow"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); - return Factor; - } - - // Find the maximum SIMD width that can fit within the trip count. - VF = TC % MaxVectorSize; - - if (VF == 0) - VF = MaxVectorSize; - else { - // If the trip count that we found modulo the vectorization factor is not - // zero then we require a tail. - ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize") - << "cannot optimize for size and vectorize at the " - "same time. Enable vectorization of this loop " - "with '#pragma clang loop vectorize(enable)' " - "when compiling with -Os/-Oz"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); - return Factor; - } - } - - int UserVF = Hints->getWidth(); - if (UserVF != 0) { - assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); - DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); - - Factor.Width = UserVF; - collectInstsToScalarize(UserVF); - return Factor; - } - +LoopVectorizationCostModel::VectorizationFactor +LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { float Cost = expectedCost(1).first; #ifndef NDEBUG const float ScalarCost = Cost; @@ -6209,12 +6405,12 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; // Ignore scalar width, because the user explicitly wants vectorization. 
- if (ForceVectorization && VF > 1) { + if (ForceVectorization && MaxVF > 1) { Width = 2; Cost = expectedCost(Width).first / (float)Width; } - for (unsigned i = 2; i <= VF; i *= 2) { + for (unsigned i = 2; i <= MaxVF; i *= 2) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. @@ -6238,8 +6434,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); - Factor.Width = Width; - Factor.Cost = Width * Cost; + VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; return Factor; } @@ -6277,9 +6472,16 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { T = ST->getValueOperand()->getType(); // Ignore loaded pointer types and stored pointer types that are not - // consecutive. However, we do want to take consecutive stores/loads of - // pointer vectors into account. - if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I)) + // vectorizable. + // + // FIXME: The check here attempts to predict whether a load or store will + // be vectorized. We only know this for certain after a VF has + // been selected. Here, we assume that if an access can be + // vectorized, it will be. We should also look at extending this + // optimization to non-pointer types. + // + if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && + !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I)) continue; MinWidth = std::min(MinWidth, @@ -6562,12 +6764,13 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); continue; } - + collectUniformsAndScalars(VFs[j]); // Count the number of live intervals. unsigned RegUsage = 0; for (auto Inst : OpenIntervals) { // Skip ignored values for VF > 1. - if (VecValuesToIgnore.count(Inst)) + if (VecValuesToIgnore.count(Inst) || + isScalarAfterVectorization(Inst, VFs[j])) continue; RegUsage += GetRegUsage(Inst->getType(), VFs[j]); } @@ -6628,6 +6831,9 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { ScalarCostsTy ScalarCosts; if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); + + // Remember that BB will remain after vectorization. + PredicatedBBsAfterVectorization.insert(BB); } } } @@ -6636,7 +6842,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, unsigned VF) { - assert(!Legal->isUniformAfterVectorization(PredInst) && + assert(!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"); // Initialize the discount to zero, meaning that the scalar version and the @@ -6657,7 +6863,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // already be scalar to avoid traversing chains that are unlikely to be // beneficial. if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || - Legal->isScalarAfterVectorization(I)) + isScalarAfterVectorization(I, VF)) return false; // If the instruction is scalar with predication, it will be analyzed @@ -6677,7 +6883,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // the lane zero values for uniforms rather than asserting. 
for (Use &U : I->operands()) if (auto *J = dyn_cast<Instruction>(U.get())) - if (Legal->isUniformAfterVectorization(J)) + if (isUniformAfterVectorization(J, VF)) return false; // Otherwise, we can scalarize the instruction. @@ -6690,7 +6896,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // and their return values are inserted into vectors. Thus, an extract would // still be required. auto needsExtract = [&](Instruction *I) -> bool { - return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I); + return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF); }; // Compute the expected cost discount from scalarizing the entire expression @@ -6717,8 +6923,8 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) { - ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true, - false, TTI); + ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), + true, false); ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); } @@ -6733,8 +6939,8 @@ int LoopVectorizationCostModel::computePredInstDiscount( if (canBeScalarized(J)) Worklist.push_back(J); else if (needsExtract(J)) - ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF), - false, true, TTI); + ScalarCost += TTI.getScalarizationOverhead( + ToVectorTy(J->getType(),VF), false, true); } // Scale the total scalar cost by block probability. @@ -6753,6 +6959,9 @@ LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy Cost; + // Collect Uniform and Scalar instructions after vectorization with VF. + collectUniformsAndScalars(VF); + // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. collectInstsToScalarize(VF); @@ -6832,11 +7041,141 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { Legal->hasStride(I->getOperand(1)); } +unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, + unsigned VF) { + Type *ValTy = getMemInstValueType(I); + auto SE = PSE.getSE(); + + unsigned Alignment = getMemInstAlignment(I); + unsigned AS = getMemInstAddressSpace(I); + Value *Ptr = getPointerOperand(I); + Type *PtrTy = ToVectorTy(Ptr->getType(), VF); + + // Figure out whether the access is strided and get the stride value + // if it's known in compile time + const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop); + + // Get the cost of the scalar memory instruction and address computation. + unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + + Cost += VF * + TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, + AS, I); + + // Get the overhead of the extractelement and insertelement instructions + // we might create due to scalarization. + Cost += getScalarizationOverhead(I, VF, TTI); + + // If we have a predicated store, it may not be executed for each vector + // lane. Scale the cost by the probability of executing the predicated + // block. 
+ if (Legal->isScalarWithPredication(I)) + Cost /= getReciprocalPredBlockProb(); + + return Cost; +} + +unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, + unsigned VF) { + Type *ValTy = getMemInstValueType(I); + Type *VectorTy = ToVectorTy(ValTy, VF); + unsigned Alignment = getMemInstAlignment(I); + Value *Ptr = getPointerOperand(I); + unsigned AS = getMemInstAddressSpace(I); + int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + + assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && + "Stride should be 1 or -1 for consecutive memory access"); + unsigned Cost = 0; + if (Legal->isMaskRequired(I)) + Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + else + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); + + bool Reverse = ConsecutiveStride < 0; + if (Reverse) + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; +} + +unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, + unsigned VF) { + LoadInst *LI = cast<LoadInst>(I); + Type *ValTy = LI->getType(); + Type *VectorTy = ToVectorTy(ValTy, VF); + unsigned Alignment = LI->getAlignment(); + unsigned AS = LI->getPointerAddressSpace(); + + return TTI.getAddressComputationCost(ValTy) + + TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); +} + +unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, + unsigned VF) { + Type *ValTy = getMemInstValueType(I); + Type *VectorTy = ToVectorTy(ValTy, VF); + unsigned Alignment = getMemInstAlignment(I); + Value *Ptr = getPointerOperand(I); + + return TTI.getAddressComputationCost(VectorTy) + + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(I), Alignment); +} + +unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, + unsigned VF) { + Type *ValTy = getMemInstValueType(I); + Type *VectorTy = ToVectorTy(ValTy, VF); + unsigned AS = getMemInstAddressSpace(I); + + auto Group = Legal->getInterleavedAccessGroup(I); + assert(Group && "Fail to get an interleaved access group."); + + unsigned InterleaveFactor = Group->getFactor(); + Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); + + // Holds the indices of existing members in an interleaved load group. + // An interleaved store group doesn't need this as it doesn't allow gaps. + SmallVector<unsigned, 4> Indices; + if (isa<LoadInst>(I)) { + for (unsigned i = 0; i < InterleaveFactor; i++) + if (Group->getMember(i)) + Indices.push_back(i); + } + + // Calculate the cost of the whole interleaved group. + unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy, + Group->getFactor(), Indices, + Group->getAlignment(), AS); + + if (Group->isReverse()) + Cost += Group->getNumMembers() * + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; +} + +unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, + unsigned VF) { + + // Calculate scalar cost only. Vectorization cost should be ready at this + // moment. 
+  if (VF == 1) {
+    Type *ValTy = getMemInstValueType(I);
+    unsigned Alignment = getMemInstAlignment(I);
+    unsigned AS = getMemInstAddressSpace(I);
+
+    return TTI.getAddressComputationCost(ValTy) +
+           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
+  }
+  return getWideningCost(I, VF);
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
-  if (Legal->isUniformAfterVectorization(I))
+  if (isUniformAfterVectorization(I, VF))
     VF = 1;
 
   if (VF > 1 && isProfitableToScalarize(I, VF))
@@ -6850,6 +7189,79 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   return VectorizationCostTy(C, TypeNotScalarized);
 }
 
+void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
+  if (VF == 1)
+    return;
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    // For each instruction in the old loop.
+    for (Instruction &I : *BB) {
+      Value *Ptr = getPointerOperand(&I);
+      if (!Ptr)
+        continue;
+
+      if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
+        // Scalar load + broadcast
+        unsigned Cost = getUniformMemOpCost(&I, VF);
+        setWideningDecision(&I, VF, CM_Scalarize, Cost);
+        continue;
+      }
+
+      // We assume that widening is the best solution when possible.
+      if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
+        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
+        setWideningDecision(&I, VF, CM_Widen, Cost);
+        continue;
+      }
+
+      // Choose between Interleaving, Gather/Scatter or Scalarization.
+      unsigned InterleaveCost = UINT_MAX;
+      unsigned NumAccesses = 1;
+      if (Legal->isAccessInterleaved(&I)) {
+        auto Group = Legal->getInterleavedAccessGroup(&I);
+        assert(Group && "Fail to get an interleaved access group.");
+
+        // Make one decision for the whole group.
+        if (getWideningDecision(&I, VF) != CM_Unknown)
+          continue;
+
+        NumAccesses = Group->getNumMembers();
+        InterleaveCost = getInterleaveGroupCost(&I, VF);
+      }
+
+      unsigned GatherScatterCost =
+          Legal->isLegalGatherOrScatter(&I)
+              ? getGatherScatterCost(&I, VF) * NumAccesses
+              : UINT_MAX;
+
+      unsigned ScalarizationCost =
+          getMemInstScalarizationCost(&I, VF) * NumAccesses;
+
+      // Choose the better solution for the current VF, write down this
+      // decision and use it during vectorization.
+      unsigned Cost;
+      InstWidening Decision;
+      if (InterleaveCost <= GatherScatterCost &&
+          InterleaveCost < ScalarizationCost) {
+        Decision = CM_Interleave;
+        Cost = InterleaveCost;
+      } else if (GatherScatterCost < ScalarizationCost) {
+        Decision = CM_GatherScatter;
+        Cost = GatherScatterCost;
+      } else {
+        Decision = CM_Scalarize;
+        Cost = ScalarizationCost;
+      }
+      // If the instruction belongs to an interleave group, the whole group
+      // receives the same decision. The whole group receives the cost, but
+      // the cost will actually be assigned to one instruction.
+      if (auto Group = Legal->getInterleavedAccessGroup(&I))
+        setWideningDecision(Group, VF, Decision, Cost);
+      else
+        setWideningDecision(&I, VF, Decision, Cost);
+    }
+  }
+}
+
 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                         unsigned VF,
                                                         Type *&VectorTy) {
@@ -6868,7 +7280,31 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     // instruction cost.
     return 0;
   case Instruction::Br: {
-    return TTI.getCFInstrCost(I->getOpcode());
+    // In cases of scalarized and predicated instructions, there will be VF
+    // predicated blocks in the vectorized loop.
Each branch around these + // blocks requires also an extract of its vector compare i1 element. + bool ScalarPredicatedBB = false; + BranchInst *BI = cast<BranchInst>(I); + if (VF > 1 && BI->isConditional() && + (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || + PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) + ScalarPredicatedBB = true; + + if (ScalarPredicatedBB) { + // Return cost for branches around scalarized and predicated blocks. + Type *Vec_i1Ty = + VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); + return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + + (TTI.getCFInstrCost(Instruction::Br) * VF)); + } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) + // The back-edge branch will remain, as will all scalar branches. + return TTI.getCFInstrCost(Instruction::Br); + else + // This branch will be eliminated by if-conversion. + return 0; + // Note: We currently assume zero cost for an unconditional branch inside + // a predicated block since it will become a fall-through, although we + // may decide in the future to call TTI for all branches. } case Instruction::PHI: { auto *Phi = cast<PHINode>(I); @@ -6969,7 +7405,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (!ScalarCond) CondTy = VectorType::get(CondTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); } case Instruction::ICmp: case Instruction::FCmp: { @@ -6978,130 +7414,12 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); } case Instruction::Store: case Instruction::Load: { - StoreInst *SI = dyn_cast<StoreInst>(I); - LoadInst *LI = dyn_cast<LoadInst>(I); - Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType()); - VectorTy = ToVectorTy(ValTy, VF); - - unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); - unsigned AS = - SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace(); - Value *Ptr = getPointerOperand(I); - // We add the cost of address computation here instead of with the gep - // instruction because only here we know whether the operation is - // scalarized. - if (VF == 1) - return TTI.getAddressComputationCost(VectorTy) + - TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); - - if (LI && Legal->isUniform(Ptr)) { - // Scalar load + broadcast - unsigned Cost = TTI.getAddressComputationCost(ValTy->getScalarType()); - Cost += TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment, AS); - return Cost + - TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, ValTy); - } - - // For an interleaved access, calculate the total cost of the whole - // interleave group. - if (Legal->isAccessInterleaved(I)) { - auto Group = Legal->getInterleavedAccessGroup(I); - assert(Group && "Fail to get an interleaved access group."); - - // Only calculate the cost once at the insert position. 
- if (Group->getInsertPos() != I) - return 0; - - unsigned InterleaveFactor = Group->getFactor(); - Type *WideVecTy = - VectorType::get(VectorTy->getVectorElementType(), - VectorTy->getVectorNumElements() * InterleaveFactor); - - // Holds the indices of existing members in an interleaved load group. - // An interleaved store group doesn't need this as it doesn't allow gaps. - SmallVector<unsigned, 4> Indices; - if (LI) { - for (unsigned i = 0; i < InterleaveFactor; i++) - if (Group->getMember(i)) - Indices.push_back(i); - } - - // Calculate the cost of the whole interleaved group. - unsigned Cost = TTI.getInterleavedMemoryOpCost( - I->getOpcode(), WideVecTy, Group->getFactor(), Indices, - Group->getAlignment(), AS); - - if (Group->isReverse()) - Cost += - Group->getNumMembers() * - TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); - - // FIXME: The interleaved load group with a huge gap could be even more - // expensive than scalar operations. Then we could ignore such group and - // use scalar operations instead. - return Cost; - } - - // Check if the memory instruction will be scalarized. - if (Legal->memoryInstructionMustBeScalarized(I, VF)) { - unsigned Cost = 0; - Type *PtrTy = ToVectorTy(Ptr->getType(), VF); - - // Figure out whether the access is strided and get the stride value - // if it's known in compile time - const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop); - - // Get the cost of the scalar memory instruction and address computation. - Cost += VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); - Cost += VF * - TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment, AS); - - // Get the overhead of the extractelement and insertelement instructions - // we might create due to scalarization. - Cost += getScalarizationOverhead(I, VF, TTI); - - // If we have a predicated store, it may not be executed for each vector - // lane. Scale the cost by the probability of executing the predicated - // block. - if (Legal->isScalarWithPredication(I)) - Cost /= getReciprocalPredBlockProb(); - - return Cost; - } - - // Determine if the pointer operand of the access is either consecutive or - // reverse consecutive. - int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); - bool Reverse = ConsecutiveStride < 0; - - // Determine if either a gather or scatter operation is legal. - bool UseGatherOrScatter = - !ConsecutiveStride && Legal->isLegalGatherOrScatter(I); - - unsigned Cost = TTI.getAddressComputationCost(VectorTy); - if (UseGatherOrScatter) { - assert(ConsecutiveStride == 0 && - "Gather/Scatter are not used for consecutive stride"); - return Cost + - TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, - Legal->isMaskRequired(I), Alignment); - } - // Wide load/stores. - if (Legal->isMaskRequired(I)) - Cost += - TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); - else - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); - - if (Reverse) - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); - return Cost; + VectorTy = ToVectorTy(getMemInstValueType(I), VF); + return getMemoryInstructionCost(I, VF); } case Instruction::ZExt: case Instruction::SExt: @@ -7115,12 +7433,14 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - // We optimize the truncation of induction variable. - // The cost of these is the same as the scalar operation. 
- if (I->getOpcode() == Instruction::Trunc && - Legal->isInductionVariable(I->getOperand(0))) - return TTI.getCastInstrCost(I->getOpcode(), I->getType(), - I->getOperand(0)->getType()); + // We optimize the truncation of induction variables having constant + // integer steps. The cost of these truncations is the same as the scalar + // operation. + if (isOptimizableIVTruncate(I, VF)) { + auto *Trunc = cast<TruncInst>(I); + return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), + Trunc->getSrcTy(), Trunc); + } Type *SrcScalarTy = I->getOperand(0)->getType(); Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF); @@ -7143,7 +7463,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } } - return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); + return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); } case Instruction::Call: { bool NeedToScalarize; @@ -7172,9 +7492,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) @@ -7206,81 +7524,34 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } - - // Insert values known to be scalar into VecValuesToIgnore. This is a - // conservative estimation of the values that will later be scalarized. - // - // FIXME: Even though an instruction is not scalar-after-vectoriztion, it may - // still be scalarized. For example, we may find an instruction to be - // more profitable for a given vectorization factor if it were to be - // scalarized. But at this point, we haven't yet computed the - // vectorization factor. - for (auto *BB : TheLoop->getBlocks()) - for (auto &I : *BB) - if (Legal->isScalarAfterVectorization(&I)) - VecValuesToIgnore.insert(&I); } -void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, - bool IfPredicateInstr) { - assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); - // Holds vector parameters or scalars, in case of uniform vals. - SmallVector<VectorParts, 4> Params; - - setDebugLocFromInst(Builder, Instr); - - // Does this instruction return a value ? - bool IsVoidRetTy = Instr->getType()->isVoidTy(); - - // Initialize a new scalar map entry. - ScalarParts Entry(UF); - - VectorParts Cond; - if (IfPredicateInstr) - Cond = createBlockInMask(Instr->getParent()); - - // For each vector unroll 'part': - for (unsigned Part = 0; Part < UF; ++Part) { - Entry[Part].resize(1); - // For each scalar that we create: - - // Start an "if (pred) a[i] = ..." block. - Value *Cmp = nullptr; - if (IfPredicateInstr) { - if (Cond[Part]->getType()->isVectorTy()) - Cond[Part] = - Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); - Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], - ConstantInt::get(Cond[Part]->getType(), 1)); - } - - Instruction *Cloned = Instr->clone(); - if (!IsVoidRetTy) - Cloned->setName(Instr->getName() + ".cloned"); - - // Replace the operands of the cloned instructions with their scalar - // equivalents in the new loop. 
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - auto *NewOp = getScalarValue(Instr->getOperand(op), Part, 0); - Cloned->setOperand(op, NewOp); - } +LoopVectorizationCostModel::VectorizationFactor +LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { - // Place the cloned scalar in the new loop. - Builder.Insert(Cloned); + // Width 1 means no vectorize, cost 0 means uncomputed cost. + const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U, + 0U}; + Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize); + if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize. + return NoVectorization; - // Add the cloned scalar to the scalar map entry. - Entry[Part][0] = Cloned; + if (UserVF) { + DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); + assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); + // Collect the instructions (and their associated costs) that will be more + // profitable to scalarize. + CM.selectUserVectorizationFactor(UserVF); + return {UserVF, 0}; + } - // If we just cloned a new assumption, add it the assumption cache. - if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) - if (II->getIntrinsicID() == Intrinsic::assume) - AC->registerAssumption(II); + unsigned MaxVF = MaybeMaxVF.getValue(); + assert(MaxVF != 0 && "MaxVF is zero."); + if (MaxVF == 1) + return NoVectorization; - // End if-block. - if (IfPredicateInstr) - PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); - } - VectorLoopValueMap.initScalar(Instr, Entry); + // Select the optimal vectorization factor. + return CM.selectVectorizationFactor(MaxVF); } void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { @@ -7414,11 +7685,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - // Use the cost model. - LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints); - CM.collectValuesToIgnore(); - // Check the function attributes to find out if this function should be // optimized for size. bool OptForSize = @@ -7464,9 +7730,20 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - // Select the optimal vectorization factor. - const LoopVectorizationCostModel::VectorizationFactor VF = - CM.selectVectorizationFactor(OptForSize); + // Use the cost model. + LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, + &Hints); + CM.collectValuesToIgnore(); + + // Use the planner for vectorization. + LoopVectorizationPlanner LVP(CM); + + // Get user vectorization factor. + unsigned UserVF = Hints.getWidth(); + + // Plan how to best vectorize, return the best VF and its cost. + LoopVectorizationCostModel::VectorizationFactor VF = + LVP.plan(OptForSize, UserVF); // Select the interleave count. unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); @@ -7522,10 +7799,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { const char *VAPassName = Hints.vectorizeAnalysisPassName(); if (!VectorizeLoop && !InterleaveLoop) { // Do not vectorize or interleaving the loop. 
-    ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
+    ORE->emit(OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                          L->getStartLoc(), L->getHeader())
              << VecDiagMsg.second);
-    ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
+    ORE->emit(OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                          L->getStartLoc(), L->getHeader())
              << IntDiagMsg.second);
     return false;
@@ -7621,6 +7898,16 @@ bool LoopVectorizePass::runImpl(
   if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
     return false;
 
+  bool Changed = false;
+
+  // The vectorizer requires loops to be in simplified form.
+  // Since simplification may add new inner loops, it has to run before the
+  // legality and profitability checks. This means running the loop vectorizer
+  // will simplify all loops, regardless of whether anything ends up being
+  // vectorized.
+  for (auto &L : *LI)
+    Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */);
+
   // Build up a worklist of inner-loops to vectorize. This is necessary as
   // the act of vectorizing or partially unrolling a loop creates new loops
   // and can invalidate iterators across the loops.
@@ -7632,9 +7919,15 @@ bool LoopVectorizePass::runImpl(
   LoopsAnalyzed += Worklist.size();
 
   // Now walk the identified inner loops.
-  bool Changed = false;
-  while (!Worklist.empty())
-    Changed |= processLoop(Worklist.pop_back_val());
+  while (!Worklist.empty()) {
+    Loop *L = Worklist.pop_back_val();
+
+    // For the inner loops we actually process, form LCSSA to simplify the
+    // transform.
+    Changed |= formLCSSARecursively(*L, *DT, LI, SE);
+
+    Changed |= processLoop(L);
+  }
 
   // Process each loop nest in the function.
   return Changed;
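Earlier in this patch, planning is split into computeMaxVF(), which bounds the width and refuses to vectorize under -Os/-Oz when a scalar tail would be needed, and selectVectorizationFactor(), which picks the width with the smallest cost per lane; LoopVectorizationPlanner::plan() ties the two together. The following is a minimal sketch of that flow under simplified assumptions; computeMaxVF, selectVF, TripCount, and the toy cost function are placeholders, not the pass's real interface, and the real model also honors a user-specified VF from the loop hints before searching.

    #include <cstdio>
    #include <functional>
    #include <optional>

    using CostFn = std::function<float(unsigned /*VF*/)>;

    // Mirrors the shape of computeMaxVF: when optimizing for size, refuse to
    // vectorize if a scalar tail would be required (trip count unknown, i.e.
    // reported as 0, or not a multiple of the feasible maximum width).
    std::optional<unsigned> computeMaxVF(bool OptForSize, unsigned TripCount,
                                         unsigned FeasibleMaxVF) {
      if (!OptForSize)
        return FeasibleMaxVF;
      if (TripCount < 2 || TripCount % FeasibleMaxVF != 0)
        return std::nullopt; // would require a tail loop
      return FeasibleMaxVF;
    }

    // Mirrors the shape of selectVectorizationFactor: try powers of two up to
    // MaxVF and keep the width whose per-lane cost is smallest.
    unsigned selectVF(unsigned MaxVF, const CostFn &ExpectedCost) {
      unsigned BestWidth = 1;
      float BestCost = ExpectedCost(1);
      for (unsigned VF = 2; VF <= MaxVF; VF *= 2) {
        float PerLane = ExpectedCost(VF) / static_cast<float>(VF);
        if (PerLane < BestCost) {
          BestCost = PerLane;
          BestWidth = VF;
        }
      }
      return BestWidth;
    }

    int main() {
      // Example: trip count 64, target allows VF up to 8, and the toy vector
      // cost grows sub-linearly with the width, so wider wins.
      auto Cost = [](unsigned VF) { return 10.0f + 2.0f * VF; };
      if (auto MaxVF = computeMaxVF(/*OptForSize=*/true, /*TripCount=*/64,
                                    /*FeasibleMaxVF=*/8))
        std::printf("chosen VF = %u\n", selectVF(*MaxVF, Cost));
      return 0;
    }

With the toy cost 10 + 2*VF the per-lane cost keeps decreasing, so the sketch prints "chosen VF = 8"; in the restructured pass the same decision additionally feeds the interleave-count selection and the per-VF widening decisions recorded by the cost model.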