Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 2906 |
1 file changed, 1913 insertions, 993 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index ee5733d20f4f..8cde0c4cd607 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -191,7 +191,7 @@ static cl::opt<bool> EnableIndVarRegisterHeur( cl::desc("Count the induction variable only once when interleaving")); static cl::opt<bool> EnableCondStoresVectorization( - "enable-cond-stores-vec", cl::init(false), cl::Hidden, + "enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization.")); static cl::opt<unsigned> MaxNestedScalarReductionIC( @@ -213,6 +213,32 @@ static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma")); +/// Create an analysis remark that explains why vectorization failed +/// +/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p +/// RemarkName is the identifier for the remark. If \p I is passed it is an +/// instruction that prevents vectorization. Otherwise \p TheLoop is used for +/// the location of the remark. \return the remark object that can be +/// streamed to. +static OptimizationRemarkAnalysis +createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, + Instruction *I = nullptr) { + Value *CodeRegion = TheLoop->getHeader(); + DebugLoc DL = TheLoop->getStartLoc(); + + if (I) { + CodeRegion = I->getParent(); + // If there is no debug location attached to the instruction, revert back to + // using the loop's. + if (I->getDebugLoc()) + DL = I->getDebugLoc(); + } + + OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); + R << "loop not vectorized: "; + return R; +} + namespace { // Forward declarations. @@ -221,70 +247,13 @@ class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizationRequirements; -// A traits type that is intended to be used in graph algorithms. The graph it -// models starts at the loop header, and traverses the BasicBlocks that are in -// the loop body, but not the loop header. Since the loop header is skipped, -// the back edges are excluded. -struct LoopBodyTraits { - using NodeRef = std::pair<const Loop *, BasicBlock *>; - - // This wraps a const Loop * into the iterator, so we know which edges to - // filter out. 
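For context on the new createMissedAnalysis helper above: the returned remark object is meant to be streamed into and then handed to an OptimizationRemarkEmitter. The fragment below is illustrative only (it assumes the surrounding LoopVectorize.cpp context, and the remark name and message text are example values), but it mirrors the emit/stream pattern used elsewhere in this patch.

// Illustrative fragment only: a failing legality check reports why
// vectorization was abandoned. The remark name and message are examples;
// LV_NAME, PSE, TheLoop and ORE come from the surrounding file.
if (!PSE.getSE()->hasLoopInvariantBackedgeTakenCount(TheLoop)) {
  ORE->emit(createMissedAnalysis(LV_NAME, "CantComputeNumberOfIterations",
                                 TheLoop)
            << "could not determine number of loop iterations");
  return false;
}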
- class WrappedSuccIterator - : public iterator_adaptor_base< - WrappedSuccIterator, succ_iterator, - typename std::iterator_traits<succ_iterator>::iterator_category, - NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> { - using BaseT = iterator_adaptor_base< - WrappedSuccIterator, succ_iterator, - typename std::iterator_traits<succ_iterator>::iterator_category, - NodeRef, std::ptrdiff_t, NodeRef *, NodeRef>; - - const Loop *L; - - public: - WrappedSuccIterator(succ_iterator Begin, const Loop *L) - : BaseT(Begin), L(L) {} - - NodeRef operator*() const { return {L, *I}; } - }; - - struct LoopBodyFilter { - bool operator()(NodeRef N) const { - const Loop *L = N.first; - return N.second != L->getHeader() && L->contains(N.second); - } - }; - - using ChildIteratorType = - filter_iterator<WrappedSuccIterator, LoopBodyFilter>; - - static NodeRef getEntryNode(const Loop &G) { return {&G, G.getHeader()}; } - - static ChildIteratorType child_begin(NodeRef Node) { - return make_filter_range(make_range<WrappedSuccIterator>( - {succ_begin(Node.second), Node.first}, - {succ_end(Node.second), Node.first}), - LoopBodyFilter{}) - .begin(); - } - - static ChildIteratorType child_end(NodeRef Node) { - return make_filter_range(make_range<WrappedSuccIterator>( - {succ_begin(Node.second), Node.first}, - {succ_end(Node.second), Node.first}), - LoopBodyFilter{}) - .end(); - } -}; - /// Returns true if the given loop body has a cycle, excluding the loop /// itself. static bool hasCyclesInLoopBody(const Loop &L) { if (!L.empty()) return true; - for (const auto SCC : + for (const auto &SCC : make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L), scc_iterator<Loop, LoopBodyTraits>::end(L))) { if (SCC.size() > 1) { @@ -346,6 +315,41 @@ static GetElementPtrInst *getGEPInstruction(Value *Ptr) { return nullptr; } +/// A helper function that returns the pointer operand of a load or store +/// instruction. +static Value *getPointerOperand(Value *I) { + if (auto *LI = dyn_cast<LoadInst>(I)) + return LI->getPointerOperand(); + if (auto *SI = dyn_cast<StoreInst>(I)) + return SI->getPointerOperand(); + return nullptr; +} + +/// A helper function that returns true if the given type is irregular. The +/// type is irregular if its allocated size doesn't equal the store size of an +/// element of the corresponding vector type at the given vectorization factor. +static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { + + // Determine if an array of VF elements of type Ty is "bitcast compatible" + // with a <VF x Ty> vector. + if (VF > 1) { + auto *VectorTy = VectorType::get(Ty, VF); + return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); + } + + // If the vectorization factor is one, we just check if an array of type Ty + // requires padding between elements. + return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); +} + +/// A helper function that returns the reciprocal of the block probability of +/// predicated blocks. If we return X, we are assuming the predicated block +/// will execute once for for every X iterations of the loop header. +/// +/// TODO: We should use actual block probability here, if available. Currently, +/// we always assume predicated blocks have a 50% chance of executing. +static unsigned getReciprocalPredBlockProb() { return 2; } + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). 
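To make the hasIrregularType check above concrete, here is a small standalone model of its arithmetic. The byte sizes are hard-coded stand-ins for DataLayout queries and assume a typical 64-bit target; the formula itself is the one used in the patch.

// Standalone model of hasIrregularType(): VF elements of Ty must be
// bitcast-compatible with a <VF x Ty> vector (sizes are illustrative).
#include <cassert>

struct TypeSizes {
  unsigned AllocSizeBytes;       // stand-in for DL.getTypeAllocSize(Ty)
  unsigned VectorStoreSizeBytes; // stand-in for DL.getTypeStoreSize(<VF x Ty>)
};

static bool hasIrregularType(const TypeSizes &S, unsigned VF) {
  return VF * S.AllocSizeBytes != S.VectorStoreSizeBytes;
}

int main() {
  // i64 with VF = 4: 4 * 8 bytes equals the 32-byte <4 x i64>  -> regular.
  assert(!hasIrregularType({8, 32}, 4));
  // i1 with VF = 4: 4 * 1 byte differs from the 1-byte <4 x i1> -> irregular.
  assert(hasIrregularType({1, 1}, 4));
  return 0;
}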
/// This class performs the widening of scalars into vectors, or multiple @@ -366,29 +370,21 @@ public: LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, - unsigned VecWidth, unsigned UnrollFactor) + OptimizationRemarkEmitter *ORE, unsigned VecWidth, + unsigned UnrollFactor, LoopVectorizationLegality *LVL, + LoopVectorizationCostModel *CM) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), - AC(AC), VF(VecWidth), UF(UnrollFactor), + AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Induction(nullptr), - OldInduction(nullptr), WidenMap(UnrollFactor), TripCount(nullptr), - VectorTripCount(nullptr), Legal(nullptr), AddedSafetyChecks(false) {} + OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth), + TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM), + AddedSafetyChecks(false) {} // Perform the actual loop widening (vectorization). - // MinimumBitWidths maps scalar integer values to the smallest bitwidth they - // can be validly truncated to. The cost model has assumed this truncation - // will happen when vectorizing. VecValuesToIgnore contains scalar values - // that the cost model has chosen to ignore because they will not be - // vectorized. - void vectorize(LoopVectorizationLegality *L, - const MapVector<Instruction *, uint64_t> &MinimumBitWidths, - SmallPtrSetImpl<const Value *> &VecValuesToIgnore) { - MinBWs = &MinimumBitWidths; - ValuesNotWidened = &VecValuesToIgnore; - Legal = L; + void vectorize() { // Create a new empty loop. Unlink the old loop and connect the new one. createEmptyLoop(); // Widen each instruction in the old loop to a new one in the new loop. - // Use the Legality module to find the induction and reduction variables. vectorizeLoop(); } @@ -400,11 +396,18 @@ public: protected: /// A small list of PHINodes. typedef SmallVector<PHINode *, 4> PhiVector; - /// When we unroll loops we have multiple vector values for each scalar. - /// This data structure holds the unrolled and vectorized values that - /// originated from one scalar instruction. + + /// A type for vectorized values in the new loop. Each value from the + /// original loop, when vectorized, is represented by UF vector values in the + /// new unrolled loop, where UF is the unroll factor. typedef SmallVector<Value *, 2> VectorParts; + /// A type for scalarized values in the new loop. Each value from the + /// original loop, when scalarized, is represented by UF x VF scalar values + /// in the new unrolled loop, where UF is the unroll factor and VF is the + /// vectorization factor. + typedef SmallVector<SmallVector<Value *, 4>, 2> ScalarParts; + // When we if-convert we need to create edge masks. We have to cache values // so that we don't end up with exponential recursion/IR. typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts> @@ -434,7 +437,20 @@ protected: /// See PR14725. void fixLCSSAPHIs(); - /// Shrinks vector element sizes based on information in "MinBWs". + /// Iteratively sink the scalarized operands of a predicated instruction into + /// the block that was created for it. + void sinkScalarOperands(Instruction *PredInst); + + /// Predicate conditional instructions that require predication on their + /// respective conditions. + void predicateInstructions(); + + /// Collect the instructions from the original loop that would be trivially + /// dead in the vectorized loop if generated. 
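The VectorParts/ScalarParts typedefs introduced above fix the shape of the vectorizer's bookkeeping: one value per unroll part when a value is vectorized, and one value per (part, lane) when it is scalarized. A minimal standalone sketch of that indexing follows, with strings standing in for llvm::Value pointers.

// Standalone sketch of the VectorParts / ScalarParts shapes
// (UF = unroll factor, VF = vectorization factor).
#include <iostream>
#include <string>
#include <vector>

using VectorParts = std::vector<std::string>;              // size UF
using ScalarParts = std::vector<std::vector<std::string>>; // UF x VF

int main() {
  const unsigned UF = 2, VF = 4;

  VectorParts Vec(UF);
  for (unsigned Part = 0; Part < UF; ++Part)
    Vec[Part] = "%wide.value." + std::to_string(Part);     // one vector per part

  ScalarParts Scal(UF, std::vector<std::string>(VF));
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      Scal[Part][Lane] = "%scalar." + std::to_string(Part * VF + Lane);

  std::cout << Vec.size() << " vector parts, "
            << Scal.size() * Scal[0].size() << " scalar values\n"; // 2 and 8
  return 0;
}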
+ void collectTriviallyDeadInstructions(); + + /// Shrinks vector element sizes to the smallest bitwidth they can be legally + /// represented as. void truncateToMinimalBitwidths(); /// A helper function that computes the predicate of the block BB, assuming @@ -451,19 +467,19 @@ protected: /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, VectorParts &Entry, unsigned UF, - unsigned VF, PhiVector *PV); + void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF, + PhiVector *PV); /// Insert the new loop to the loop hierarchy and pass manager /// and update the analysis passes. void updateAnalysis(); /// This instruction is un-vectorizable. Implement it as a sequence - /// of scalars. If \p IfPredicateStore is true we need to 'hide' each + /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each /// scalarized instruction behind an if block predicated on the control /// dependence of the instruction. virtual void scalarizeInstruction(Instruction *Instr, - bool IfPredicateStore = false); + bool IfPredicateInstr = false); /// Vectorize Load and Store instructions, virtual void vectorizeMemoryInstruction(Instruction *Instr); @@ -477,7 +493,10 @@ protected: /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) /// to each vector element of Val. The sequence starts at StartIndex. - virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step); + /// \p Opcode is relevant for FP induction variable. + virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, + Instruction::BinaryOps Opcode = + Instruction::BinaryOpsEnd); /// Compute scalar induction steps. \p ScalarIV is the scalar induction /// variable on which to base the steps, \p Step is the size of the step, and @@ -488,23 +507,39 @@ protected: /// Create a vector induction phi node based on an existing scalar one. This /// currently only works for integer induction variables with a constant - /// step. If \p TruncType is non-null, instead of widening the original IV, - /// we widen a version of the IV truncated to \p TruncType. + /// step. \p EntryVal is the value from the original loop that maps to the + /// vector phi node. If \p EntryVal is a truncate instruction, instead of + /// widening the original IV, we widen a version of the IV truncated to \p + /// EntryVal's type. void createVectorIntInductionPHI(const InductionDescriptor &II, - VectorParts &Entry, IntegerType *TruncType); + Instruction *EntryVal); /// Widen an integer induction variable \p IV. If \p Trunc is provided, the - /// induction variable will first be truncated to the corresponding type. The - /// widened values are placed in \p Entry. - void widenIntInduction(PHINode *IV, VectorParts &Entry, - TruncInst *Trunc = nullptr); - - /// When we go over instructions in the basic block we rely on previous - /// values within the current basic block or on loop invariant values. - /// When we widen (vectorize) values we place them in the map. If the values - /// are not within the map, they have to be loop invariant, so we simply - /// broadcast them into a vector. - VectorParts &getVectorValue(Value *V); + /// induction variable will first be truncated to the corresponding type. 
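The getStepVector contract described above is easiest to see with numbers: lane i of the result is Val[i] + (StartIdx + i) * Step. A standalone sketch for the integer case (the real code emits a splat, a vector multiply and a vector add rather than a loop):

// Standalone model of getStepVector() for integer inductions.
#include <array>
#include <cassert>
#include <cstddef>

template <std::size_t VF>
std::array<long, VF> getStepVector(std::array<long, VF> Val, int StartIdx,
                                   long Step) {
  for (std::size_t i = 0; i < VF; ++i)
    Val[i] += (StartIdx + static_cast<long>(i)) * Step;
  return Val;
}

int main() {
  // Broadcast IV value 10, second unroll part (StartIdx = VF * Part = 4),
  // step 2: the lanes become 18, 20, 22, 24.
  auto V = getStepVector<4>({10, 10, 10, 10}, /*StartIdx=*/4, /*Step=*/2);
  assert(V[0] == 18 && V[3] == 24);
  return 0;
}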
+ void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr); + + /// Returns true if an instruction \p I should be scalarized instead of + /// vectorized for the chosen vectorization factor. + bool shouldScalarizeInstruction(Instruction *I) const; + + /// Returns true if we should generate a scalar version of \p IV. + bool needsScalarInduction(Instruction *IV) const; + + /// Return a constant reference to the VectorParts corresponding to \p V from + /// the original loop. If the value has already been vectorized, the + /// corresponding vector entry in VectorLoopValueMap is returned. If, + /// however, the value has a scalar entry in VectorLoopValueMap, we construct + /// new vector values on-demand by inserting the scalar values into vectors + /// with an insertelement sequence. If the value has been neither vectorized + /// nor scalarized, it must be loop invariant, so we simply broadcast the + /// value into vectors. + const VectorParts &getVectorValue(Value *V); + + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll index \p Part and vector index \p Lane. If the value has + /// been vectorized but not scalarized, the necessary extractelement + /// instruction will be generated. + Value *getScalarValue(Value *V, unsigned Part, unsigned Lane); /// Try to vectorize the interleaved access group that \p Instr belongs to. void vectorizeInterleaveGroup(Instruction *Instr); @@ -547,44 +582,112 @@ protected: /// vector of instructions. void addMetadata(ArrayRef<Value *> To, Instruction *From); - /// This is a helper class that holds the vectorizer state. It maps scalar - /// instructions to vector instructions. When the code is 'unrolled' then - /// then a single scalar value is mapped to multiple vector parts. The parts - /// are stored in the VectorPart type. + /// This is a helper class for maintaining vectorization state. It's used for + /// mapping values from the original loop to their corresponding values in + /// the new loop. Two mappings are maintained: one for vectorized values and + /// one for scalarized values. Vectorized values are represented with UF + /// vector values in the new loop, and scalarized values are represented with + /// UF x VF scalar values in the new loop. UF and VF are the unroll and + /// vectorization factors, respectively. + /// + /// Entries can be added to either map with initVector and initScalar, which + /// initialize and return a constant reference to the new entry. If a + /// non-constant reference to a vector entry is required, getVector can be + /// used to retrieve a mutable entry. We currently directly modify the mapped + /// values during "fix-up" operations that occur once the first phase of + /// widening is complete. These operations include type truncation and the + /// second phase of recurrence widening. + /// + /// Otherwise, entries from either map should be accessed using the + /// getVectorValue or getScalarValue functions from InnerLoopVectorizer. + /// getVectorValue and getScalarValue coordinate to generate a vector or + /// scalar value on-demand if one is not yet available. When vectorizing a + /// loop, we visit the definition of an instruction before its uses. When + /// visiting the definition, we either vectorize or scalarize the + /// instruction, creating an entry for it in the corresponding map. (In some + /// cases, such as induction variables, we will create both vector and scalar + /// entries.) 
Then, as we encounter uses of the definition, we derive values + /// for each scalar or vector use unless such a value is already available. + /// For example, if we scalarize a definition and one of its uses is vector, + /// we build the required vector on-demand with an insertelement sequence + /// when visiting the use. Otherwise, if the use is scalar, we can use the + /// existing scalar definition. struct ValueMap { - /// C'tor. UnrollFactor controls the number of vectors ('parts') that - /// are mapped. - ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} - - /// \return True if 'Key' is saved in the Value Map. - bool has(Value *Key) const { return MapStorage.count(Key); } - - /// Initializes a new entry in the map. Sets all of the vector parts to the - /// save value in 'Val'. - /// \return A reference to a vector with splat values. - VectorParts &splat(Value *Key, Value *Val) { - VectorParts &Entry = MapStorage[Key]; - Entry.assign(UF, Val); - return Entry; + + /// Construct an empty map with the given unroll and vectorization factors. + ValueMap(unsigned UnrollFactor, unsigned VecWidth) + : UF(UnrollFactor), VF(VecWidth) { + // The unroll and vectorization factors are only used in asserts builds + // to verify map entries are sized appropriately. + (void)UF; + (void)VF; + } + + /// \return True if the map has a vector entry for \p Key. + bool hasVector(Value *Key) const { return VectorMapStorage.count(Key); } + + /// \return True if the map has a scalar entry for \p Key. + bool hasScalar(Value *Key) const { return ScalarMapStorage.count(Key); } + + /// \brief Map \p Key to the given VectorParts \p Entry, and return a + /// constant reference to the new vector map entry. The given key should + /// not already be in the map, and the given VectorParts should be + /// correctly sized for the current unroll factor. + const VectorParts &initVector(Value *Key, const VectorParts &Entry) { + assert(!hasVector(Key) && "Vector entry already initialized"); + assert(Entry.size() == UF && "VectorParts has wrong dimensions"); + VectorMapStorage[Key] = Entry; + return VectorMapStorage[Key]; + } + + /// \brief Map \p Key to the given ScalarParts \p Entry, and return a + /// constant reference to the new scalar map entry. The given key should + /// not already be in the map, and the given ScalarParts should be + /// correctly sized for the current unroll and vectorization factors. + const ScalarParts &initScalar(Value *Key, const ScalarParts &Entry) { + assert(!hasScalar(Key) && "Scalar entry already initialized"); + assert(Entry.size() == UF && + all_of(make_range(Entry.begin(), Entry.end()), + [&](const SmallVectorImpl<Value *> &Values) -> bool { + return Values.size() == VF; + }) && + "ScalarParts has wrong dimensions"); + ScalarMapStorage[Key] = Entry; + return ScalarMapStorage[Key]; } - ///\return A reference to the value that is stored at 'Key'. - VectorParts &get(Value *Key) { - VectorParts &Entry = MapStorage[Key]; - if (Entry.empty()) - Entry.resize(UF); - assert(Entry.size() == UF); - return Entry; + /// \return A reference to the vector map entry corresponding to \p Key. + /// The key should already be in the map. This function should only be used + /// when it's necessary to update values that have already been vectorized. + /// This is the case for "fix-up" operations including type truncation and + /// the second phase of recurrence vectorization. If a non-const reference + /// isn't required, getVectorValue should be used instead. 
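A standalone analogue of the init-once discipline the comment above describes (plain std::map keyed on an opaque pointer, with the same dimension assert): entries are created exactly once and later only read or fixed up. This is a toy model, not the LLVM class.

// Standalone analogue of ValueMap's vector side: initialize once with a
// correctly sized VectorParts, retrieve later only for fix-up operations.
#include <cassert>
#include <map>
#include <string>
#include <vector>

using VectorParts = std::vector<std::string>;

class ToyValueMap {
  unsigned UF;
  std::map<const void *, VectorParts> VectorMapStorage;

public:
  explicit ToyValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}

  bool hasVector(const void *Key) const { return VectorMapStorage.count(Key); }

  const VectorParts &initVector(const void *Key, const VectorParts &Entry) {
    assert(!hasVector(Key) && "Vector entry already initialized");
    assert(Entry.size() == UF && "VectorParts has wrong dimensions");
    return VectorMapStorage[Key] = Entry;
  }

  VectorParts &getVector(const void *Key) {
    assert(hasVector(Key) && "Vector entry not initialized");
    return VectorMapStorage.find(Key)->second;
  }
};

int main() {
  int ScalarDef = 0; // stands in for an Instruction* from the original loop
  ToyValueMap Map(/*UnrollFactor=*/2);
  Map.initVector(&ScalarDef, {"%v.0", "%v.1"});
  Map.getVector(&ScalarDef)[1] = "%v.1.trunc"; // fix-up, e.g. type truncation
  return 0;
}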
+ VectorParts &getVector(Value *Key) { + assert(hasVector(Key) && "Vector entry not initialized"); + return VectorMapStorage.find(Key)->second; } + /// Retrieve an entry from the vector or scalar maps. The preferred way to + /// access an existing mapped entry is with getVectorValue or + /// getScalarValue from InnerLoopVectorizer. Until those functions can be + /// moved inside ValueMap, we have to declare them as friends. + friend const VectorParts &InnerLoopVectorizer::getVectorValue(Value *V); + friend Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part, + unsigned Lane); + private: - /// The unroll factor. Each entry in the map stores this number of vector - /// elements. + /// The unroll factor. Each entry in the vector map contains UF vector + /// values. unsigned UF; - /// Map storage. We use std::map and not DenseMap because insertions to a - /// dense map invalidates its iterators. - std::map<Value *, VectorParts> MapStorage; + /// The vectorization factor. Each entry in the scalar map contains UF x VF + /// scalar values. + unsigned VF; + + /// The vector and scalar map storage. We use std::map and not DenseMap + /// because insertions to DenseMap invalidate its iterators. + std::map<Value *, VectorParts> VectorMapStorage; + std::map<Value *, ScalarParts> ScalarMapStorage; }; /// The original loop. @@ -605,6 +708,8 @@ protected: const TargetTransformInfo *TTI; /// Assumption Cache. AssumptionCache *AC; + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter *ORE; /// \brief LoopVersioning. It's only set up (non-null) if memchecks were /// used. @@ -646,41 +751,38 @@ protected: PHINode *Induction; /// The induction variable of the old basic block. PHINode *OldInduction; - /// Maps scalars to widened vectors. - ValueMap WidenMap; - - /// A map of induction variables from the original loop to their - /// corresponding VF * UF scalarized values in the vectorized loop. The - /// purpose of ScalarIVMap is similar to that of WidenMap. Whereas WidenMap - /// maps original loop values to their vector versions in the new loop, - /// ScalarIVMap maps induction variables from the original loop that are not - /// vectorized to their scalar equivalents in the vector loop. Maintaining a - /// separate map for scalarized induction variables allows us to avoid - /// unnecessary scalar-to-vector-to-scalar conversions. - DenseMap<Value *, SmallVector<Value *, 8>> ScalarIVMap; + + /// Maps values from the original loop to their corresponding values in the + /// vectorized loop. A key value can map to either vector values, scalar + /// values or both kinds of values, depending on whether the key was + /// vectorized and scalarized. + ValueMap VectorLoopValueMap; /// Store instructions that should be predicated, as a pair /// <StoreInst, Predicate> - SmallVector<std::pair<StoreInst *, Value *>, 4> PredicatedStores; + SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions; EdgeMaskCache MaskCache; /// Trip count of the original loop. Value *TripCount; /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) Value *VectorTripCount; - /// Map of scalar integer values to the smallest bitwidth they can be legally - /// represented as. The vector equivalents of these values should be truncated - /// to this type. - const MapVector<Instruction *, uint64_t> *MinBWs; - - /// A set of values that should not be widened. This is taken from - /// VecValuesToIgnore in the cost model. 
- SmallPtrSetImpl<const Value *> *ValuesNotWidened; - + /// The legality analysis. LoopVectorizationLegality *Legal; + /// The profitablity analysis. + LoopVectorizationCostModel *Cost; + // Record whether runtime checks are added. bool AddedSafetyChecks; + + // Holds instructions from the original loop whose counterparts in the + // vectorized loop would be trivially dead if generated. For example, + // original induction update instructions can become dead because we + // separately emit induction "steps" when generating code for the new loop. + // Similarly, we create a new latch condition when setting up the structure + // of the new loop, so the old one can become dead. + SmallPtrSet<Instruction *, 4> DeadInstructions; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -689,16 +791,20 @@ public: LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, - unsigned UnrollFactor) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, 1, - UnrollFactor) {} + OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, + LoopVectorizationLegality *LVL, + LoopVectorizationCostModel *CM) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, + UnrollFactor, LVL, CM) {} private: void scalarizeInstruction(Instruction *Instr, - bool IfPredicateStore = false) override; + bool IfPredicateInstr = false) override; void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; - Value *getStepVector(Value *Val, int StartIdx, Value *Step) override; + Value *getStepVector(Value *Val, int StartIdx, Value *Step, + Instruction::BinaryOps Opcode = + Instruction::BinaryOpsEnd) override; Value *reverseVector(Value *Vec) override; }; @@ -1149,12 +1255,13 @@ public: FK_Enabled = 1, ///< Forcing enabled. }; - LoopVectorizeHints(const Loop *L, bool DisableInterleaving) + LoopVectorizeHints(const Loop *L, bool DisableInterleaving, + OptimizationRemarkEmitter &ORE) : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), Interleave("interleave.count", DisableInterleaving, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), - PotentiallyUnsafe(false), TheLoop(L) { + PotentiallyUnsafe(false), TheLoop(L), ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); @@ -1176,17 +1283,13 @@ public: bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const { if (getForce() == LoopVectorizeHints::FK_Disabled) { DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); - emitOptimizationRemarkAnalysis(F->getContext(), - vectorizeAnalysisPassName(), *F, - L->getStartLoc(), emitRemark()); + emitRemarkWithHints(); return false; } if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) { DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); - emitOptimizationRemarkAnalysis(F->getContext(), - vectorizeAnalysisPassName(), *F, - L->getStartLoc(), emitRemark()); + emitRemarkWithHints(); return false; } @@ -1197,11 +1300,12 @@ public: // FIXME: Add interleave.disable metadata. This will allow // vectorize.disable to be used without disabling the pass and errors // to differentiate between disabled vectorization and a width of 1. 
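For readers unfamiliar with the remark interface used above: instead of formatting a plain string, the pass now streams message pieces and named values (ore::NV) into a remark object and hands it to the OptimizationRemarkEmitter, which also lets the values be serialized for tools that consume optimization records. The fragment below is illustrative only; it assumes the surrounding class context, and the remark name is made up, but the pattern mirrors emitRemarkWithHints() in this patch.

// Illustrative fragment: named values let downstream tooling pick out the
// numbers, while the message text renders the same as before.
using namespace ore;
OptimizationRemarkMissed R(LV_NAME, "WidthHint", TheLoop->getStartLoc(),
                           TheLoop->getHeader());
R << "loop not vectorized (Vector Width=" << NV("VectorWidth", Width.Value)
  << ")";
ORE.emit(R);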
- emitOptimizationRemarkAnalysis( - F->getContext(), vectorizeAnalysisPassName(), *F, L->getStartLoc(), - "loop not vectorized: vectorization and interleaving are explicitly " - "disabled, or vectorize width and interleave count are both set to " - "1"); + ORE.emit(OptimizationRemarkAnalysis(vectorizeAnalysisPassName(), + "AllDisabled", L->getStartLoc(), + L->getHeader()) + << "loop not vectorized: vectorization and interleaving are " + "explicitly disabled, or vectorize width and interleave " + "count are both set to 1"); return false; } @@ -1209,23 +1313,27 @@ public: } /// Dumps all the hint information. - std::string emitRemark() const { - VectorizationReport R; + void emitRemarkWithHints() const { + using namespace ore; if (Force.Value == LoopVectorizeHints::FK_Disabled) - R << "vectorization is explicitly disabled"; + ORE.emit(OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "loop not vectorized: vectorization is explicitly disabled"); else { - R << "use -Rpass-analysis=loop-vectorize for more info"; + OptimizationRemarkMissed R(LV_NAME, "MissedDetails", + TheLoop->getStartLoc(), TheLoop->getHeader()); + R << "loop not vectorized"; if (Force.Value == LoopVectorizeHints::FK_Enabled) { - R << " (Force=true"; + R << " (Force=" << NV("Force", true); if (Width.Value != 0) - R << ", Vector Width=" << Width.Value; + R << ", Vector Width=" << NV("VectorWidth", Width.Value); if (Interleave.Value != 0) - R << ", Interleave Count=" << Interleave.Value; + R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value); R << ")"; } + ORE.emit(R); } - - return R.str(); } unsigned getWidth() const { return Width.Value; } @@ -1241,7 +1349,7 @@ public: return LV_NAME; if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0) return LV_NAME; - return DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint; + return OptimizationRemarkAnalysis::AlwaysPrint; } bool allowReordering() const { @@ -1379,19 +1487,23 @@ private: /// The loop these hints belong to. const Loop *TheLoop; + + /// Interface to emit optimization remarks. 
+ OptimizationRemarkEmitter &ORE; }; -static void emitAnalysisDiag(const Function *TheFunction, const Loop *TheLoop, +static void emitAnalysisDiag(const Loop *TheLoop, const LoopVectorizeHints &Hints, + OptimizationRemarkEmitter &ORE, const LoopAccessReport &Message) { const char *Name = Hints.vectorizeAnalysisPassName(); - LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, Name); + LoopAccessReport::emitAnalysis(Message, TheLoop, Name, ORE); } static void emitMissedWarning(Function *F, Loop *L, - const LoopVectorizeHints &LH) { - emitOptimizationRemarkMissed(F->getContext(), LV_NAME, *F, L->getStartLoc(), - LH.emitRemark()); + const LoopVectorizeHints &LH, + OptimizationRemarkEmitter *ORE) { + LH.emitRemarkWithHints(); if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { if (LH.getWidth() != 1) @@ -1425,12 +1537,12 @@ public: TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F, const TargetTransformInfo *TTI, std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI, - LoopVectorizationRequirements *R, LoopVectorizeHints *H) - : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F), - TTI(TTI), DT(DT), GetLAA(GetLAA), LAI(nullptr), - InterleaveInfo(PSE, L, DT, LI), Induction(nullptr), - WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R), - Hints(H) {} + OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R, + LoopVectorizeHints *H) + : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), + GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI), + Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), + Requirements(R), Hints(H) {} /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. @@ -1490,9 +1602,12 @@ public: /// Returns true if the value V is uniform within the loop. bool isUniform(Value *V); - /// Returns true if this instruction will remain scalar after vectorization. + /// Returns true if \p I is known to be uniform after vectorization. bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); } + /// Returns true if \p I is known to be scalar after vectorization. + bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); } + /// Returns the information that we collected about runtime memory check. const RuntimePointerChecking *getRuntimePointerChecking() const { return LAI->getRuntimePointerChecking(); @@ -1545,6 +1660,17 @@ public: bool isLegalMaskedGather(Type *DataType) { return TTI->isLegalMaskedGather(DataType); } + /// Returns true if the target machine can represent \p V as a masked gather + /// or scatter operation. + bool isLegalGatherOrScatter(Value *V) { + auto *LI = dyn_cast<LoadInst>(V); + auto *SI = dyn_cast<StoreInst>(V); + if (!LI && !SI) + return false; + auto *Ptr = getPointerOperand(V); + auto *Ty = cast<PointerType>(Ptr->getType())->getElementType(); + return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty)); + } /// Returns true if vector representation of the instruction \p I /// requires mask. @@ -1553,6 +1679,21 @@ public: unsigned getNumLoads() const { return LAI->getNumLoads(); } unsigned getNumPredStores() const { return NumPredStores; } + /// Returns true if \p I is an instruction that will be scalarized with + /// predication. Such instructions include conditional stores and + /// instructions that may divide by zero. 
+ bool isScalarWithPredication(Instruction *I); + + /// Returns true if \p I is a memory instruction that has a consecutive or + /// consecutive-like pointer operand. Consecutive-like pointers are pointers + /// that are treated like consecutive pointers during vectorization. The + /// pointer operands of interleaved accesses are an example. + bool hasConsecutiveLikePtrOperand(Instruction *I); + + /// Returns true if \p I is a memory instruction that must be scalarized + /// during vectorization. + bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1); + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -1569,9 +1710,24 @@ private: /// transformation. bool canVectorizeWithIfConvert(); - /// Collect the variables that need to stay uniform after vectorization. + /// Collect the instructions that are uniform after vectorization. An + /// instruction is uniform if we represent it with a single scalar value in + /// the vectorized loop corresponding to each vector iteration. Examples of + /// uniform instructions include pointer operands of consecutive or + /// interleaved memory accesses. Note that although uniformity implies an + /// instruction will be scalar, the reverse is not true. In general, a + /// scalarized instruction will be represented by VF scalar values in the + /// vectorized loop, each corresponding to an iteration of the original + /// scalar loop. void collectLoopUniforms(); + /// Collect the instructions that are scalar after vectorization. An + /// instruction is scalar if it is known to be uniform or will be scalarized + /// during vectorization. Non-uniform scalarized instructions will be + /// represented by VF values in the vectorized loop, each corresponding to an + /// iteration of the original scalar loop. + void collectLoopScalars(); + /// Return true if all of the instructions in the block can be speculatively /// executed. \p SafePtrs is a list of addresses that are known to be legal /// and we know that we can read from them without segfault. @@ -1588,7 +1744,19 @@ private: /// VectorizationReport because the << operator of VectorizationReport returns /// LoopAccessReport. void emitAnalysis(const LoopAccessReport &Message) const { - emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); + emitAnalysisDiag(TheLoop, *Hints, *ORE, Message); + } + + /// Create an analysis remark that explains why vectorization failed + /// + /// \p RemarkName is the identifier for the remark. If \p I is passed it is + /// an instruction that prevents vectorization. Otherwise the loop is used + /// for the location of the remark. \return the remark object that can be + /// streamed to. + OptimizationRemarkAnalysis + createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const { + return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(), + RemarkName, TheLoop, I); } /// \brief If an access has a symbolic strides, this maps the pointer value to @@ -1613,8 +1781,6 @@ private: PredicatedScalarEvolution &PSE; /// Target Library Info. TargetLibraryInfo *TLI; - /// Parent function - Function *TheFunction; /// Target Transform Info const TargetTransformInfo *TTI; /// Dominator Tree. @@ -1624,6 +1790,8 @@ private: // And the loop-accesses info corresponding to this loop. This pointer is // null until canVectorizeMemory sets it up. const LoopAccessInfo *LAI; + /// Interface to emit optimization remarks. 
+ OptimizationRemarkEmitter *ORE; /// The interleave access information contains groups of interleaved accesses /// with the same stride and close to each other. @@ -1648,10 +1816,13 @@ private: /// Allowed outside users. This holds the induction and reduction /// vars which can be accessed from outside the loop. SmallPtrSet<Value *, 4> AllowedExit; - /// This set holds the variables which are known to be uniform after - /// vectorization. + + /// Holds the instructions known to be uniform after vectorization. SmallPtrSet<Instruction *, 4> Uniforms; + /// Holds the instructions known to be scalar after vectorization. + SmallPtrSet<Instruction *, 4> Scalars; + /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; @@ -1679,10 +1850,11 @@ public: LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, - AssumptionCache *AC, const Function *F, + AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints) : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), - AC(AC), TheFunction(F), Hints(Hints) {} + AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {} /// Information about vectorization costs struct VectorizationFactor { @@ -1732,6 +1904,29 @@ public: /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); + /// \returns The smallest bitwidth each instruction can be represented with. + /// The vector equivalents of these instructions should be truncated to this + /// type. + const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { + return MinBWs; + } + + /// \returns True if it is more profitable to scalarize instruction \p I for + /// vectorization factor \p VF. + bool isProfitableToScalarize(Instruction *I, unsigned VF) const { + auto Scalars = InstsToScalarize.find(VF); + assert(Scalars != InstsToScalarize.end() && + "VF not yet analyzed for scalarization profitability"); + return Scalars->second.count(I); + } + + /// \returns True if instruction \p I can be truncated to a smaller bitwidth + /// for vectorization factor \p VF. + bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const { + return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) && + !Legal->isScalarAfterVectorization(I); + } + private: /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -1760,20 +1955,44 @@ private: /// as a vector operation. bool isConsecutiveLoadOrStore(Instruction *I); - /// Report an analysis message to assist the user in diagnosing loops that are - /// not vectorized. These are handled as LoopAccessReport rather than - /// VectorizationReport because the << operator of VectorizationReport returns - /// LoopAccessReport. - void emitAnalysis(const LoopAccessReport &Message) const { - emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); + /// Create an analysis remark that explains why vectorization failed + /// + /// \p RemarkName is the identifier for the remark. \return the remark object + /// that can be streamed to. + OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) { + return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(), + RemarkName, TheLoop); } -public: /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated /// to this type. 
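The new InstsToScalarize bookkeeping introduced above is a map of maps: for each candidate VF the cost model records which instructions it has decided to scalarize and at what cost, and isProfitableToScalarize simply looks an instruction up under that VF. A standalone sketch of the lookup structure, with plain containers and strings in place of DenseMap and Instruction pointers:

// Standalone sketch of the per-VF scalarization decisions.
#include <cassert>
#include <string>
#include <unordered_map>

using ScalarCostsTy = std::unordered_map<std::string, unsigned>;

int main() {
  std::unordered_map<unsigned, ScalarCostsTy> InstsToScalarize;

  // Pretend collectInstsToScalarize(4) decided to scalarize a predicated
  // division with a total scalar cost of 6.
  InstsToScalarize[4]["%div"] = 6;

  auto isProfitableToScalarize = [&](const std::string &I, unsigned VF) {
    auto It = InstsToScalarize.find(VF);
    assert(It != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return It->second.count(I) != 0;
  };

  assert(isProfitableToScalarize("%div", 4));
  assert(!isProfitableToScalarize("%add", 4));
  return 0;
}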
MapVector<Instruction *, uint64_t> MinBWs; + /// A type representing the costs for instructions if they were to be + /// scalarized rather than vectorized. The entries are Instruction-Cost + /// pairs. + typedef DenseMap<Instruction *, unsigned> ScalarCostsTy; + + /// A map holding scalar costs for different vectorization factors. The + /// presence of a cost for an instruction in the mapping indicates that the + /// instruction will be scalarized when vectorizing with the associated + /// vectorization factor. The entries are VF-ScalarCostTy pairs. + DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; + + /// Returns the expected difference in cost from scalarizing the expression + /// feeding a predicated instruction \p PredInst. The instructions to + /// scalarize and their scalar costs are collected in \p ScalarCosts. A + /// non-negative return value implies the expression will be scalarized. + /// Currently, only single-use chains are considered for scalarization. + int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, + unsigned VF); + + /// Collects the instructions to scalarize for each predicated instruction in + /// the loop. + void collectInstsToScalarize(unsigned VF); + +public: /// The loop that we evaluate. Loop *TheLoop; /// Predicated scalar evolution analysis. @@ -1790,6 +2009,9 @@ public: DemandedBits *DB; /// Assumption cache. AssumptionCache *AC; + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter *ORE; + const Function *TheFunction; /// Loop Vectorize Hint. const LoopVectorizeHints *Hints; @@ -1813,8 +2035,8 @@ public: /// followed by a non-expert user. class LoopVectorizationRequirements { public: - LoopVectorizationRequirements() - : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr) {} + LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) + : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr), ORE(ORE) {} void addUnsafeAlgebraInst(Instruction *I) { // First unsafe algebra instruction. 
@@ -1825,13 +2047,15 @@ public: void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) { - const char *Name = Hints.vectorizeAnalysisPassName(); + const char *PassName = Hints.vectorizeAnalysisPassName(); bool Failed = false; if (UnsafeAlgebraInst && !Hints.allowReordering()) { - emitOptimizationRemarkAnalysisFPCommute( - F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(), - VectorizationReport() << "cannot prove it is safe to reorder " - "floating-point operations"); + ORE.emit( + OptimizationRemarkAnalysisFPCommute(PassName, "CantReorderFPOps", + UnsafeAlgebraInst->getDebugLoc(), + UnsafeAlgebraInst->getParent()) + << "loop not vectorized: cannot prove it is safe to reorder " + "floating-point operations"); Failed = true; } @@ -1842,10 +2066,11 @@ public: NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; if ((ThresholdReached && !Hints.allowReordering()) || PragmaThresholdReached) { - emitOptimizationRemarkAnalysisAliasing( - F->getContext(), Name, *F, L->getStartLoc(), - VectorizationReport() - << "cannot prove it is safe to reorder memory operations"); + ORE.emit(OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps", + L->getStartLoc(), + L->getHeader()) + << "loop not vectorized: cannot prove it is safe to reorder " + "memory operations"); DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); Failed = true; } @@ -1856,6 +2081,9 @@ public: private: unsigned NumRuntimePointerChecks; Instruction *UnsafeAlgebraInst; + + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter &ORE; }; static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { @@ -1897,12 +2125,13 @@ struct LoopVectorize : public FunctionPass { auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); + auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, - GetLAA); + GetLAA, *ORE); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -1917,6 +2146,7 @@ struct LoopVectorize : public FunctionPass { AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<LoopAccessLegacyAnalysis>(); AU.addRequired<DemandedBitsWrapperPass>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<BasicAAWrapperPass>(); @@ -1949,7 +2179,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { } void InnerLoopVectorizer::createVectorIntInductionPHI( - const InductionDescriptor &II, VectorParts &Entry, IntegerType *TruncType) { + const InductionDescriptor &II, Instruction *EntryVal) { Value *Start = II.getStartValue(); ConstantInt *Step = II.getConstIntStepValue(); assert(Step && "Can not widen an IV with a non-constant step"); @@ -1957,7 +2187,8 @@ void InnerLoopVectorizer::createVectorIntInductionPHI( // Construct the initial value of the vector IV in the vector loop preheader auto CurrIP = Builder.saveIP(); Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - if (TruncType) { + if (isa<TruncInst>(EntryVal)) { + auto *TruncType = cast<IntegerType>(EntryVal->getType()); Step = 
ConstantInt::getSigned(TruncType, Step->getSExtValue()); Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); } @@ -1972,18 +2203,45 @@ void InnerLoopVectorizer::createVectorIntInductionPHI( // factor. The last of those goes into the PHI. PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", &*LoopVectorBody->getFirstInsertionPt()); - Value *LastInduction = VecInd; + Instruction *LastInduction = VecInd; + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part] = LastInduction; - LastInduction = Builder.CreateAdd(LastInduction, SplatVF, "step.add"); + LastInduction = cast<Instruction>( + Builder.CreateAdd(LastInduction, SplatVF, "step.add")); } + VectorLoopValueMap.initVector(EntryVal, Entry); + if (isa<TruncInst>(EntryVal)) + addMetadata(Entry, EntryVal); + + // Move the last step to the end of the latch block. This ensures consistent + // placement of all induction updates. + auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); + auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); + auto *ICmp = cast<Instruction>(Br->getCondition()); + LastInduction->moveBefore(ICmp); + LastInduction->setName("vec.ind.next"); VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); - VecInd->addIncoming(LastInduction, LoopVectorBody); + VecInd->addIncoming(LastInduction, LoopVectorLatch); } -void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry, - TruncInst *Trunc) { +bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { + return Legal->isScalarAfterVectorization(I) || + Cost->isProfitableToScalarize(I, VF); +} + +bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { + if (shouldScalarizeInstruction(IV)) + return true; + auto isScalarInst = [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); + }; + return any_of(IV->users(), isScalarInst); +} + +void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) { auto II = Legal->getInductionVars()->find(IV); assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); @@ -1991,12 +2249,25 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry, auto ID = II->second; assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); - // If a truncate instruction was provided, get the smaller type. - auto *TruncType = Trunc ? cast<IntegerType>(Trunc->getType()) : nullptr; + // The scalar value to broadcast. This will be derived from the canonical + // induction variable. + Value *ScalarIV = nullptr; // The step of the induction. Value *Step = nullptr; + // The value from the original loop to which we are mapping the new induction + // variable. + Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; + + // True if we have vectorized the induction variable. + auto VectorizedIV = false; + + // Determine if we want a scalar version of the induction variable. This is + // true if the induction variable itself is not widened, or if it has at + // least one user in the loop that is not widened. + auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); + // If the induction variable has a constant integer step value, go ahead and // get it now. if (ID.getConstIntStepValue()) @@ -2006,40 +2277,50 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry, // create the phi node, we will splat the scalar induction variable in each // loop iteration. 
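As a concrete example of the widened induction variable built above: with VF = 4 and UF = 2 and a unit-step IV starting at zero, the vector phi starts at <0,1,2,3>, each further unroll part adds a splat of VF, and the last step.add (incremented VF * UF in total) is what feeds back through the latch. A standalone sketch of the lane values under those assumptions:

// Standalone model of the lanes produced for a unit-step integer IV with
// VF = 4, UF = 2: each part adds a splat of VF, the final sum feeds the phi.
#include <array>
#include <cassert>

int main() {
  const int VF = 4, UF = 2;
  std::array<int, 4> VecInd = {0, 1, 2, 3};    // initial value of %vec.ind

  for (int Iter = 0; Iter < 2; ++Iter) {       // two vector iterations
    std::array<int, 4> Part = VecInd;
    for (int P = 0; P < UF; ++P) {
      // Part P of this iteration starts at scalar iteration
      // Iter * VF * UF + P * VF.
      assert(Part[0] == Iter * VF * UF + P * VF);
      for (auto &Lane : Part)
        Lane += VF;                            // step.add: add a splat of VF
    }
    VecInd = Part;                             // value fed back via the phi
  }
  assert(VecInd[0] == 16 && VecInd[3] == 19);  // start of the third iteration
  return 0;
}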
if (VF > 1 && IV->getType() == Induction->getType() && Step && - !ValuesNotWidened->count(IV)) - return createVectorIntInductionPHI(ID, Entry, TruncType); - - // The scalar value to broadcast. This will be derived from the canonical - // induction variable. - Value *ScalarIV = nullptr; - - // Define the scalar induction variable and step values. If we were given a - // truncation type, truncate the canonical induction variable and constant - // step. Otherwise, derive these values from the induction descriptor. - if (TruncType) { - assert(Step && "Truncation requires constant integer step"); - auto StepInt = cast<ConstantInt>(Step)->getSExtValue(); - ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType); - Step = ConstantInt::getSigned(TruncType, StepInt); - } else { - ScalarIV = Induction; - auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); - if (IV != OldInduction) { - ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType()); - ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL); - ScalarIV->setName("offset.idx"); - } - if (!Step) { - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), - &*Builder.GetInsertPoint()); + !shouldScalarizeInstruction(EntryVal)) { + createVectorIntInductionPHI(ID, EntryVal); + VectorizedIV = true; + } + + // If we haven't yet vectorized the induction variable, or if we will create + // a scalar one, we need to define the scalar induction variable and step + // values. If we were given a truncation type, truncate the canonical + // induction variable and constant step. Otherwise, derive these values from + // the induction descriptor. + if (!VectorizedIV || NeedsScalarIV) { + if (Trunc) { + auto *TruncType = cast<IntegerType>(Trunc->getType()); + assert(Step && "Truncation requires constant integer step"); + auto StepInt = cast<ConstantInt>(Step)->getSExtValue(); + ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType); + Step = ConstantInt::getSigned(TruncType, StepInt); + } else { + ScalarIV = Induction; + auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + if (IV != OldInduction) { + ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType()); + ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL); + ScalarIV->setName("offset.idx"); + } + if (!Step) { + SCEVExpander Exp(*PSE.getSE(), DL, "induction"); + Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), + &*Builder.GetInsertPoint()); + } } } - // Splat the scalar induction variable, and build the necessary step vectors. - Value *Broadcasted = getBroadcastInstrs(ScalarIV); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); + // If we haven't yet vectorized the induction variable, splat the scalar + // induction variable, and build the necessary step vectors. + if (!VectorizedIV) { + Value *Broadcasted = getBroadcastInstrs(ScalarIV); + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); + VectorLoopValueMap.initVector(EntryVal, Entry); + if (Trunc) + addMetadata(Entry, Trunc); + } // If an induction variable is only used for counting loop iterations or // calculating addresses, it doesn't need to be widened. 
Create scalar steps @@ -2047,38 +2328,64 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry, // addition of the scalar steps will not increase the number of instructions // in the loop in the common case prior to InstCombine. We will be trading // one vector extract for each scalar step. - if (VF > 1 && ValuesNotWidened->count(IV)) { - auto *EntryVal = Trunc ? cast<Value>(Trunc) : IV; + if (NeedsScalarIV) buildScalarSteps(ScalarIV, Step, EntryVal); - } } -Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, - Value *Step) { +Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, + Instruction::BinaryOps BinOp) { + // Create and check the types. assert(Val->getType()->isVectorTy() && "Must be a vector"); - assert(Val->getType()->getScalarType()->isIntegerTy() && - "Elem must be an integer"); - assert(Step->getType() == Val->getType()->getScalarType() && - "Step has wrong type"); - // Create the types. - Type *ITy = Val->getType()->getScalarType(); - VectorType *Ty = cast<VectorType>(Val->getType()); - int VLen = Ty->getNumElements(); + int VLen = Val->getType()->getVectorNumElements(); + + Type *STy = Val->getType()->getScalarType(); + assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && + "Induction Step must be an integer or FP"); + assert(Step->getType() == STy && "Step has wrong type"); + SmallVector<Constant *, 8> Indices; + if (STy->isIntegerTy()) { + // Create a vector of consecutive numbers from zero to VF. + for (int i = 0; i < VLen; ++i) + Indices.push_back(ConstantInt::get(STy, StartIdx + i)); + + // Add the consecutive indices to the vector value. + Constant *Cv = ConstantVector::get(Indices); + assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); + Step = Builder.CreateVectorSplat(VLen, Step); + assert(Step->getType() == Val->getType() && "Invalid step vec"); + // FIXME: The newly created binary instructions should contain nsw/nuw flags, + // which can be found from the original scalar operations. + Step = Builder.CreateMul(Cv, Step); + return Builder.CreateAdd(Val, Step, "induction"); + } + + // Floating point induction. + assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && + "Binary Opcode should be specified for FP induction"); // Create a vector of consecutive numbers from zero to VF. for (int i = 0; i < VLen; ++i) - Indices.push_back(ConstantInt::get(ITy, StartIdx + i)); + Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); // Add the consecutive indices to the vector value. Constant *Cv = ConstantVector::get(Indices); - assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); + Step = Builder.CreateVectorSplat(VLen, Step); - assert(Step->getType() == Val->getType() && "Invalid step vec"); - // FIXME: The newly created binary instructions should contain nsw/nuw flags, - // which can be found from the original scalar operations. - Step = Builder.CreateMul(Cv, Step); - return Builder.CreateAdd(Val, Step, "induction"); + + // Floating point operations had to be 'fast' to enable the induction. 
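The floating-point branch of getStepVector above does the same thing with a chosen binary opcode: lane i of the result is Val[i] FAdd/FSub ((StartIdx + i) * Step). A standalone numeric sketch for an FAdd induction with start 1.0 and step 0.5 (fast-math flags do not affect these exact values):

// Standalone model of the FP branch of getStepVector().
#include <array>
#include <cassert>

int main() {
  const int VF = 4;
  std::array<double, VF> Val;
  Val.fill(1.0);                               // broadcast FP IV start value
  const double Step = 0.5;
  const int StartIdx = 0;

  for (int i = 0; i < VF; ++i)
    Val[i] = Val[i] + (StartIdx + i) * Step;   // FAdd induction

  assert(Val[0] == 1.0 && Val[3] == 2.5);      // <1.0, 1.5, 2.0, 2.5>
  return 0;
}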
+ FastMathFlags Flags; + Flags.setUnsafeAlgebra(); + + Value *MulOp = Builder.CreateFMul(Cv, Step); + if (isa<Instruction>(MulOp)) + // Have to check, MulOp may be a constant + cast<Instruction>(MulOp)->setFastMathFlags(Flags); + + Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); + if (isa<Instruction>(BOp)) + cast<Instruction>(BOp)->setFastMathFlags(Flags); + return BOp; } void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, @@ -2092,98 +2399,34 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() && "Val and Step should have the same integer type"); - // Compute the scalar steps and save the results in ScalarIVMap. - for (unsigned Part = 0; Part < UF; ++Part) - for (unsigned I = 0; I < VF; ++I) { - auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + I); + // Determine the number of scalars we need to generate for each unroll + // iteration. If EntryVal is uniform, we only need to generate the first + // lane. Otherwise, we generate all VF values. + unsigned Lanes = + Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ? 1 : VF; + + // Compute the scalar steps and save the results in VectorLoopValueMap. + ScalarParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part].resize(VF); + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane); auto *Mul = Builder.CreateMul(StartIdx, Step); auto *Add = Builder.CreateAdd(ScalarIV, Mul); - ScalarIVMap[EntryVal].push_back(Add); + Entry[Part][Lane] = Add; } + } + VectorLoopValueMap.initScalar(EntryVal, Entry); } int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { - assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); - auto *SE = PSE.getSE(); - // Make sure that the pointer does not point to structs. - if (Ptr->getType()->getPointerElementType()->isAggregateType()) - return 0; - - // If this value is a pointer induction variable, we know it is consecutive. - PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); - if (Phi && Inductions.count(Phi)) { - InductionDescriptor II = Inductions[Phi]; - return II.getConsecutiveDirection(); - } - - GetElementPtrInst *Gep = getGEPInstruction(Ptr); - if (!Gep) - return 0; - - unsigned NumOperands = Gep->getNumOperands(); - Value *GpPtr = Gep->getPointerOperand(); - // If this GEP value is a consecutive pointer induction variable and all of - // the indices are constant, then we know it is consecutive. - Phi = dyn_cast<PHINode>(GpPtr); - if (Phi && Inductions.count(Phi)) { - - // Make sure that the pointer does not point to structs. - PointerType *GepPtrType = cast<PointerType>(GpPtr->getType()); - if (GepPtrType->getElementType()->isAggregateType()) - return 0; - - // Make sure that all of the index operands are loop invariant. - for (unsigned i = 1; i < NumOperands; ++i) - if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) - return 0; - - InductionDescriptor II = Inductions[Phi]; - return II.getConsecutiveDirection(); - } - - unsigned InductionOperand = getGEPInductionOperand(Gep); - - // Check that all of the gep indices are uniform except for our induction - // operand. - for (unsigned i = 0; i != NumOperands; ++i) - if (i != InductionOperand && - !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) - return 0; - // We can emit wide load/stores only if the last non-zero index is the - // induction variable. 
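buildScalarSteps above fills the scalar map with ScalarIV + (VF * Part + Lane) * Step for every (Part, Lane), and generates only lane 0 when the value is uniform after vectorization. A standalone sketch with VF = 4, UF = 2 and a unit step:

// Standalone model of buildScalarSteps(): entry (Part, Lane) holds
// ScalarIV + (VF * Part + Lane) * Step; uniform values only need lane 0.
#include <cassert>
#include <vector>

int main() {
  const unsigned VF = 4, UF = 2;
  const long ScalarIV = 100, Step = 1;
  const bool IsUniform = false;

  std::vector<std::vector<long>> Entry(UF, std::vector<long>(VF));
  unsigned Lanes = IsUniform ? 1 : VF;
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < Lanes; ++Lane)
      Entry[Part][Lane] = ScalarIV + (VF * Part + Lane) * Step;

  assert(Entry[0][3] == 103 && Entry[1][0] == 104 && Entry[1][3] == 107);
  return 0;
}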
- const SCEV *Last = nullptr; - if (!getSymbolicStrides() || !getSymbolicStrides()->count(Gep)) - Last = PSE.getSCEV(Gep->getOperand(InductionOperand)); - else { - // Because of the multiplication by a stride we can have a s/zext cast. - // We are going to replace this stride by 1 so the cast is safe to ignore. - // - // %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - // %0 = trunc i64 %indvars.iv to i32 - // %mul = mul i32 %0, %Stride1 - // %idxprom = zext i32 %mul to i64 << Safe cast. - // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom - // - Last = replaceSymbolicStrideSCEV(PSE, *getSymbolicStrides(), - Gep->getOperand(InductionOperand), Gep); - if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last)) - Last = - (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend) - ? C->getOperand() - : Last; - } - if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { - const SCEV *Step = AR->getStepRecurrence(*SE); - - // The memory is consecutive because the last index is consecutive - // and all other indices are loop invariant. - if (Step->isOne()) - return 1; - if (Step->isAllOnesValue()) - return -1; - } + const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() : + ValueToValueMap(); + int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false); + if (Stride == 1 || Stride == -1) + return Stride; return 0; } @@ -2191,23 +2434,112 @@ bool LoopVectorizationLegality::isUniform(Value *V) { return LAI->isUniform(V); } -InnerLoopVectorizer::VectorParts & +const InnerLoopVectorizer::VectorParts & InnerLoopVectorizer::getVectorValue(Value *V) { assert(V != Induction && "The new induction variable should not be used."); assert(!V->getType()->isVectorTy() && "Can't widen a vector"); + assert(!V->getType()->isVoidTy() && "Type does not produce a value"); // If we have a stride that is replaced by one, do it here. if (Legal->hasStride(V)) V = ConstantInt::get(V->getType(), 1); // If we have this scalar in the map, return it. - if (WidenMap.has(V)) - return WidenMap.get(V); + if (VectorLoopValueMap.hasVector(V)) + return VectorLoopValueMap.VectorMapStorage[V]; + + // If the value has not been vectorized, check if it has been scalarized + // instead. If it has been scalarized, and we actually need the value in + // vector form, we will construct the vector values on demand. + if (VectorLoopValueMap.hasScalar(V)) { + + // Initialize a new vector map entry. + VectorParts Entry(UF); + + // If we've scalarized a value, that value should be an instruction. + auto *I = cast<Instruction>(V); + + // If we aren't vectorizing, we can just copy the scalar map values over to + // the vector map. + if (VF == 1) { + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = getScalarValue(V, Part, 0); + return VectorLoopValueMap.initVector(V, Entry); + } + + // Get the last scalar instruction we generated for V. If the value is + // known to be uniform after vectorization, this corresponds to lane zero + // of the last unroll iteration. Otherwise, the last instruction is the one + // we created for the last vector lane of the last unroll iteration. + unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1; + auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane)); + + // Set the insert point after the last scalarized instruction. This ensures + // the insertelement sequence will directly follow the scalar definitions. 
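    // For illustration (assuming UF = 1, VF = 4, and illustrative value
    // names), a value %x that was scalarized into %x0 .. %x3 is reassembled
    // on demand as something like
    //
    //   %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
    //   %v1 = insertelement <4 x i32> %v0, i32 %x1, i32 1
    //   %v2 = insertelement <4 x i32> %v1, i32 %x2, i32 2
    //   %v3 = insertelement <4 x i32> %v2, i32 %x3, i32 3
    //
    // whereas a value that is uniform after vectorization only needs a
    // broadcast of its lane-zero copy.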
+ auto OldIP = Builder.saveIP(); + auto NewIP = std::next(BasicBlock::iterator(LastInst)); + Builder.SetInsertPoint(&*NewIP); + + // However, if we are vectorizing, we need to construct the vector values. + // If the value is known to be uniform after vectorization, we can just + // broadcast the scalar value corresponding to lane zero for each unroll + // iteration. Otherwise, we construct the vector values using insertelement + // instructions. Since the resulting vectors are stored in + // VectorLoopValueMap, we will only generate the insertelements once. + for (unsigned Part = 0; Part < UF; ++Part) { + Value *VectorValue = nullptr; + if (Legal->isUniformAfterVectorization(I)) { + VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0)); + } else { + VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); + for (unsigned Lane = 0; Lane < VF; ++Lane) + VectorValue = Builder.CreateInsertElement( + VectorValue, getScalarValue(V, Part, Lane), + Builder.getInt32(Lane)); + } + Entry[Part] = VectorValue; + } + Builder.restoreIP(OldIP); + return VectorLoopValueMap.initVector(V, Entry); + } // If this scalar is unknown, assume that it is a constant or that it is // loop invariant. Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); - return WidenMap.splat(V, B); + return VectorLoopValueMap.initVector(V, VectorParts(UF, B)); +} + +Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part, + unsigned Lane) { + + // If the value is not an instruction contained in the loop, it should + // already be scalar. + if (OrigLoop->isLoopInvariant(V)) + return V; + + assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast<Instruction>(V)) + : true && "Uniform values only have lane zero"); + + // If the value from the original loop has not been vectorized, it is + // represented by UF x VF scalar values in the new loop. Return the requested + // scalar value. + if (VectorLoopValueMap.hasScalar(V)) + return VectorLoopValueMap.ScalarMapStorage[V][Part][Lane]; + + // If the value has not been scalarized, get its entry in VectorLoopValueMap + // for the given unroll part. If this entry is not a vector type (i.e., the + // vectorization factor is one), there is no need to generate an + // extractelement instruction. + auto *U = getVectorValue(V)[Part]; + if (!U->getType()->isVectorTy()) { + assert(VF == 1 && "Value not scalarized has non-vector type"); + return U; + } + + // Otherwise, the value from the original loop has been vectorized and is + // represented by UF vector values. Extract and return the requested scalar + // value from the appropriate vector lane. + return Builder.CreateExtractElement(U, Builder.getInt32(Lane)); } Value *InnerLoopVectorizer::reverseVector(Value *Vec) { @@ -2355,7 +2687,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); - Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + Value *Ptr = getPointerOperand(Instr); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); @@ -2365,15 +2697,20 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { // Prepare for the new pointers. 
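  // For illustration (assuming VF = 4 and an interleave group of factor 2,
  // e.g. the accesses A[2*i] and A[2*i+1]), the loads of the group members
  // become a single wide load followed by one strided shuffle per member,
  // roughly
  //
  //   %wide.vec = load <8 x i32>, <8 x i32>* %ptr
  //   %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef,
  //                                <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  //   %strided.vec1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef,
  //                                 <4 x i32> <i32 1, i32 3, i32 5, i32 7>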
setDebugLocFromInst(Builder, Ptr); - VectorParts &PtrParts = getVectorValue(Ptr); SmallVector<Value *, 2> NewPtrs; unsigned Index = Group->getIndex(Instr); + + // If the group is reverse, adjust the index to refer to the last vector lane + // instead of the first. We adjust the index from the first vector lane, + // rather than directly getting the pointer for lane VF - 1, because the + // pointer operand of the interleaved access is supposed to be uniform. For + // uniform instructions, we're only required to generate a value for the + // first vector lane in each unroll iteration. + if (Group->isReverse()) + Index += (VF - 1) * Group->getFactor(); + for (unsigned Part = 0; Part < UF; Part++) { - // Extract the pointer for current instruction from the pointer vector. A - // reverse access uses the pointer in the last lane. - Value *NewPtr = Builder.CreateExtractElement( - PtrParts[Part], - Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0)); + Value *NewPtr = getScalarValue(Ptr, Part, 0); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -2397,20 +2734,30 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { // Vectorize the interleaved load group. if (LI) { + + // For each unroll part, create a wide load for the group. + SmallVector<Value *, 2> NewLoads; for (unsigned Part = 0; Part < UF; Part++) { - Instruction *NewLoadInstr = Builder.CreateAlignedLoad( + auto *NewLoad = Builder.CreateAlignedLoad( NewPtrs[Part], Group->getAlignment(), "wide.vec"); + addMetadata(NewLoad, Instr); + NewLoads.push_back(NewLoad); + } - for (unsigned i = 0; i < InterleaveFactor; i++) { - Instruction *Member = Group->getMember(i); + // For each member in the group, shuffle out the appropriate data from the + // wide loads. + for (unsigned I = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); - // Skip the gaps in the group. - if (!Member) - continue; + // Skip the gaps in the group. + if (!Member) + continue; - Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF); + VectorParts Entry(UF); + Constant *StrideMask = getStridedMask(Builder, I, InterleaveFactor, VF); + for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( - NewLoadInstr, UndefVec, StrideMask, "strided.vec"); + NewLoads[Part], UndefVec, StrideMask, "strided.vec"); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { @@ -2418,12 +2765,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); } - VectorParts &Entry = WidenMap.get(Member); Entry[Part] = Group->isReverse() ? reverseVector(StridedVec) : StridedVec; } - - addMetadata(NewLoadInstr, Instr); + VectorLoopValueMap.initVector(Member, Entry); } return; } @@ -2479,7 +2824,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); Type *DataTy = VectorType::get(ScalarDataTy, VF); - Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + Value *Ptr = getPointerOperand(Instr); unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. 
@@ -2487,93 +2832,57 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { if (!Alignment) Alignment = DL.getABITypeAlignment(ScalarDataTy); unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); - uint64_t ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy); - uint64_t VectorElementSize = DL.getTypeStoreSize(DataTy) / VF; - - if (SI && Legal->blockNeedsPredication(SI->getParent()) && - !Legal->isMaskRequired(SI)) - return scalarizeInstruction(Instr, true); - if (ScalarAllocatedSize != VectorElementSize) - return scalarizeInstruction(Instr); + // Scalarize the memory instruction if necessary. + if (Legal->memoryInstructionMustBeScalarized(Instr, VF)) + return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr)); - // If the pointer is loop invariant scalarize the load. - if (LI && Legal->isUniform(Ptr)) - return scalarizeInstruction(Instr); - - // If the pointer is non-consecutive and gather/scatter is not supported - // scalarize the instruction. + // Determine if the pointer operand of the access is either consecutive or + // reverse consecutive. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; - bool CreateGatherScatter = - !ConsecutiveStride && ((LI && Legal->isLegalMaskedGather(ScalarDataTy)) || - (SI && Legal->isLegalMaskedScatter(ScalarDataTy))); - if (!ConsecutiveStride && !CreateGatherScatter) - return scalarizeInstruction(Instr); + // Determine if either a gather or scatter operation is legal. + bool CreateGatherScatter = + !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr); - Constant *Zero = Builder.getInt32(0); - VectorParts &Entry = WidenMap.get(Instr); VectorParts VectorGep; // Handle consecutive loads/stores. GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (ConsecutiveStride) { - if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { - setDebugLocFromInst(Builder, Gep); - Value *PtrOperand = Gep->getPointerOperand(); - Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; - FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); - - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - Gep2->setOperand(0, FirstBasePtr); - Gep2->setName("gep.indvar.base"); - Ptr = Builder.Insert(Gep2); - } else if (Gep) { - setDebugLocFromInst(Builder, Gep); - assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()), - OrigLoop) && - "Base ptr must be invariant"); - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. For example A[I+1]; + if (Gep) { unsigned NumOperands = Gep->getNumOperands(); - unsigned InductionOperand = getGEPInductionOperand(Gep); - // Create the new GEP with the new induction variable. +#ifndef NDEBUG + // The original GEP that identified as a consecutive memory access + // should have only one loop-variant operand. + unsigned NumOfLoopVariantOps = 0; + for (unsigned i = 0; i < NumOperands; ++i) + if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), + OrigLoop)) + NumOfLoopVariantOps++; + assert(NumOfLoopVariantOps == 1 && + "Consecutive GEP should have only one loop-variant operand"); +#endif GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - - for (unsigned i = 0; i < NumOperands; ++i) { - Value *GepOperand = Gep->getOperand(i); - Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand); - - // Update last index or loop invariant instruction anchored in loop. 
- if (i == InductionOperand || - (GepOperandInst && OrigLoop->contains(GepOperandInst))) { - assert((i == InductionOperand || - PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst), - OrigLoop)) && - "Must be last index or loop invariant"); - - VectorParts &GEPParts = getVectorValue(GepOperand); - - // If GepOperand is an induction variable, and there's a scalarized - // version of it available, use it. Otherwise, we will need to create - // an extractelement instruction. - Value *Index = ScalarIVMap.count(GepOperand) - ? ScalarIVMap[GepOperand][0] - : Builder.CreateExtractElement(GEPParts[0], Zero); - - Gep2->setOperand(i, Index); - Gep2->setName("gep.indvar.idx"); - } - } + Gep2->setName("gep.indvar"); + + // A new GEP is created for a 0-lane value of the first unroll iteration. + // The GEPs for the rest of the unroll iterations are computed below as an + // offset from this GEP. + for (unsigned i = 0; i < NumOperands; ++i) + // We can apply getScalarValue() for all GEP indices. It returns an + // original value for loop-invariant operand and 0-lane for consecutive + // operand. + Gep2->setOperand(i, getScalarValue(Gep->getOperand(i), + 0, /* First unroll iteration */ + 0 /* 0-lane of the vector */ )); + setDebugLocFromInst(Builder, Gep); Ptr = Builder.Insert(Gep2); + } else { // No GEP - // Use the induction element ptr. - assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); setDebugLocFromInst(Builder, Ptr); - VectorParts &PtrVal = getVectorValue(Ptr); - Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); + Ptr = getScalarValue(Ptr, 0, 0); } } else { // At this point we should vector version of GEP for Gather or Scatter @@ -2660,6 +2969,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Handle loads. assert(LI && "Must have a load instruction"); setDebugLocFromInst(Builder, LI); + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Instruction *NewLI; if (CreateGatherScatter) { @@ -2692,70 +3002,45 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { } addMetadata(NewLI, LI); } + VectorLoopValueMap.initVector(Instr, Entry); } void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, - bool IfPredicateStore) { + bool IfPredicateInstr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + DEBUG(dbgs() << "LV: Scalarizing" + << (IfPredicateInstr ? " and predicating:" : ":") << *Instr + << '\n'); // Holds vector parameters or scalars, in case of uniform vals. SmallVector<VectorParts, 4> Params; setDebugLocFromInst(Builder, Instr); - // Find all of the vectorized parameters. - for (Value *SrcOp : Instr->operands()) { - // If we are accessing the old induction variable, use the new one. - if (SrcOp == OldInduction) { - Params.push_back(getVectorValue(SrcOp)); - continue; - } - - // Try using previously calculated values. - auto *SrcInst = dyn_cast<Instruction>(SrcOp); - - // If the src is an instruction that appeared earlier in the basic block, - // then it should already be vectorized. - if (SrcInst && OrigLoop->contains(SrcInst)) { - assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); - // The parameter is a vector value from earlier. - Params.push_back(WidenMap.get(SrcInst)); - } else { - // The parameter is a scalar from outside the loop. Maybe even a constant. 
- VectorParts Scalars; - Scalars.append(UF, SrcOp); - Params.push_back(Scalars); - } - } - - assert(Params.size() == Instr->getNumOperands() && - "Invalid number of operands"); - // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Value *UndefVec = - IsVoidRetTy ? nullptr - : UndefValue::get(VectorType::get(Instr->getType(), VF)); - // Create a new entry in the WidenMap and initialize it to Undef or Null. - VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); + // Initialize a new scalar map entry. + ScalarParts Entry(UF); VectorParts Cond; - if (IfPredicateStore) { - assert(Instr->getParent()->getSinglePredecessor() && - "Only support single predecessor blocks"); - Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), - Instr->getParent()); - } + if (IfPredicateInstr) + Cond = createBlockInMask(Instr->getParent()); + + // Determine the number of scalars we need to generate for each unroll + // iteration. If the instruction is uniform, we only need to generate the + // first lane. Otherwise, we generate all VF values. + unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF; // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part].resize(VF); // For each scalar that we create: - for (unsigned Width = 0; Width < VF; ++Width) { + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { // Start if-block. Value *Cmp = nullptr; - if (IfPredicateStore) { - Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); + if (IfPredicateInstr) { + Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Lane)); Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); } @@ -2763,18 +3048,11 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); - // Replace the operands of the cloned instructions with extracted scalars. - for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - // If the operand is an induction variable, and there's a scalarized - // version of it available, use it. Otherwise, we will need to create - // an extractelement instruction if vectorizing. - auto *NewOp = Params[op][Part]; - auto *ScalarOp = Instr->getOperand(op); - if (ScalarIVMap.count(ScalarOp)) - NewOp = ScalarIVMap[ScalarOp][VF * Part + Width]; - else if (NewOp->getType()->isVectorTy()) - NewOp = Builder.CreateExtractElement(NewOp, Builder.getInt32(Width)); + // Replace the operands of the cloned instructions with their scalar + // equivalents in the new loop. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + auto *NewOp = getScalarValue(Instr->getOperand(op), Part, Lane); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -2782,22 +3060,20 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // Place the cloned scalar in the new loop. Builder.Insert(Cloned); + // Add the cloned scalar to the scalar map entry. + Entry[Part][Lane] = Cloned; + // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) if (II->getIntrinsicID() == Intrinsic::assume) AC->registerAssumption(II); - // If the original scalar returns a value we need to place it in a vector - // so that future users will be able to use it. - if (!IsVoidRetTy) - VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, - Builder.getInt32(Width)); // End if-block. 
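      // For illustration, assuming UF = 2 and VF = 4: a scalarized
      // instruction that is not uniform after vectorization is cloned eight
      // times and recorded as Entry[Part][Lane] for Part = 0..1 and
      // Lane = 0..3, while a uniform one is cloned only twice (lane zero of
      // each unroll part) and the remaining lanes are left unfilled.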
- if (IfPredicateStore) - PredicatedStores.push_back( - std::make_pair(cast<StoreInst>(Cloned), Cmp)); + if (IfPredicateInstr) + PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); } } + VectorLoopValueMap.initScalar(Instr, Entry); } PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, @@ -2811,10 +3087,12 @@ PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, Latch = Header; IRBuilder<> Builder(&*Header->getFirstInsertionPt()); - setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); + Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); + setDebugLocFromInst(Builder, OldInst); auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); Builder.SetInsertPoint(Latch->getTerminator()); + setDebugLocFromInst(Builder, OldInst); // Create i+1 and fill the PHINode. Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); @@ -3152,8 +3430,10 @@ void InnerLoopVectorizer::createEmptyLoop() { EndValue = CountRoundDown; } else { IRBuilder<> B(LoopBypassBlocks.back()->getTerminator()); - Value *CRD = B.CreateSExtOrTrunc(CountRoundDown, - II.getStep()->getType(), "cast.crd"); + Type *StepType = II.getStep()->getType(); + Instruction::CastOps CastOp = + CastInst::getCastOpcode(CountRoundDown, true, StepType, true); + Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); EndValue = II.transform(B, CRD, PSE.getSE(), DL); EndValue->setName("ind.end"); @@ -3201,7 +3481,7 @@ void InnerLoopVectorizer::createEmptyLoop() { if (MDNode *LID = OrigLoop->getLoopID()) Lp->setLoopID(LID); - LoopVectorizeHints Hints(Lp, true); + LoopVectorizeHints Hints(Lp, true, *ORE); Hints.setAlreadyVectorized(); } @@ -3324,8 +3604,9 @@ static Value *addFastMathFlag(Value *V) { return V; } -/// Estimate the overhead of scalarizing a value. Insert and Extract are set if -/// the result needs to be inserted and/or extracted from vectors. +/// \brief Estimate the overhead of scalarizing a value based on its type. +/// Insert and Extract are set if the result needs to be inserted and/or +/// extracted from vectors. static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, const TargetTransformInfo &TTI) { if (Ty->isVoidTy()) @@ -3335,15 +3616,46 @@ static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, unsigned Cost = 0; for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) { - if (Insert) - Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I); if (Extract) Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I); + if (Insert) + Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I); } return Cost; } +/// \brief Estimate the overhead of scalarizing an Instruction based on the +/// types of its operands and return value. +static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys, + Type *RetTy, + const TargetTransformInfo &TTI) { + unsigned ScalarizationCost = + getScalarizationOverhead(RetTy, true, false, TTI); + + for (Type *Ty : OpTys) + ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI); + + return ScalarizationCost; +} + +/// \brief Estimate the overhead of scalarizing an instruction. This is a +/// convenience wrapper for the type-based getScalarizationOverhead API. 
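/// For illustration, assuming VF = 4 and an instruction such as
/// %a = add i32 %x, %y, this is the cost of inserting the four scalar
/// results into a <4 x i32> vector plus the cost of extracting the four
/// lanes of each of the two <4 x i32> operand vectors.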
+static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, + const TargetTransformInfo &TTI) { + if (VF == 1) + return 0; + + Type *RetTy = ToVectorTy(I->getType(), VF); + + SmallVector<Type *, 4> OpTys; + unsigned OperandsNum = I->getNumOperands(); + for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd) + OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF)); + + return getScalarizationOverhead(OpTys, RetTy, TTI); +} + // Estimate cost of a call instruction CI if it were vectorized with factor VF. // Return the cost of the instruction, including scalarization overhead if it's // needed. The flag NeedToScalarize shows if the call needs to be scalarized - @@ -3374,10 +3686,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF, // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. - unsigned ScalarizationCost = - getScalarizationOverhead(RetTy, true, false, TTI); - for (Type *Ty : Tys) - ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI); + unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, TTI); unsigned Cost = ScalarCallCost * VF + ScalarizationCost; @@ -3434,8 +3743,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { // later and will remove any ext/trunc pairs. // SmallPtrSet<Value *, 4> Erased; - for (const auto &KV : *MinBWs) { - VectorParts &Parts = WidenMap.get(KV.first); + for (const auto &KV : Cost->getMinimalBitwidths()) { + // If the value wasn't vectorized, we must maintain the original scalar + // type. The absence of the value from VectorLoopValueMap indicates that it + // wasn't vectorized. + if (!VectorLoopValueMap.hasVector(KV.first)) + continue; + VectorParts &Parts = VectorLoopValueMap.getVector(KV.first); for (Value *&I : Parts) { if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) continue; @@ -3526,8 +3840,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { } // We'll have created a bunch of ZExts that are now parentless. Clean up. - for (const auto &KV : *MinBWs) { - VectorParts &Parts = WidenMap.get(KV.first); + for (const auto &KV : Cost->getMinimalBitwidths()) { + // If the value wasn't vectorized, we must maintain the original scalar + // type. The absence of the value from VectorLoopValueMap indicates that it + // wasn't vectorized. + if (!VectorLoopValueMap.hasVector(KV.first)) + continue; + VectorParts &Parts = VectorLoopValueMap.getVector(KV.first); for (Value *&I : Parts) { ZExtInst *Inst = dyn_cast<ZExtInst>(I); if (Inst && Inst->use_empty()) { @@ -3558,6 +3877,11 @@ void InnerLoopVectorizer::vectorizeLoop() { // are vectorized, so we can use them to construct the PHI. PhiVector PHIsToFix; + // Collect instructions from the original loop that will become trivially + // dead in the vectorized loop. We don't need to vectorize these + // instructions. + collectTriviallyDeadInstructions(); + // Scan the loop in a topological order to ensure that defs are vectorized // before users. LoopBlocksDFS DFS(OrigLoop); @@ -3605,7 +3929,7 @@ void InnerLoopVectorizer::vectorizeLoop() { Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); // This is the vector-clone of the value that leaves the loop. - VectorParts &VectorExit = getVectorValue(LoopExitInst); + const VectorParts &VectorExit = getVectorValue(LoopExitInst); Type *VecTy = VectorExit[0]->getType(); // Find the reduction identity variable. 
Zero for addition, or, xor, @@ -3644,10 +3968,10 @@ void InnerLoopVectorizer::vectorizeLoop() { // Reductions do not have to start at zero. They can start with // any loop invariant values. - VectorParts &VecRdxPhi = WidenMap.get(Phi); + const VectorParts &VecRdxPhi = getVectorValue(Phi); BasicBlock *Latch = OrigLoop->getLoopLatch(); Value *LoopVal = Phi->getIncomingValueForBlock(Latch); - VectorParts &Val = getVectorValue(LoopVal); + const VectorParts &Val = getVectorValue(LoopVal); for (unsigned part = 0; part < UF; ++part) { // Make sure to add the reduction stat value only to the // first unroll part. @@ -3664,7 +3988,7 @@ void InnerLoopVectorizer::vectorizeLoop() { // instructions. Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - VectorParts RdxParts = getVectorValue(LoopExitInst); + VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst); setDebugLocFromInst(Builder, LoopExitInst); // If the vector reduction can be performed in a smaller type, we truncate @@ -3797,17 +4121,8 @@ void InnerLoopVectorizer::vectorizeLoop() { // Make sure DomTree is updated. updateAnalysis(); - // Predicate any stores. - for (auto KV : PredicatedStores) { - BasicBlock::iterator I(KV.first); - auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI); - auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, - /*BranchWeights=*/nullptr, DT, LI); - I->moveBefore(T); - I->getParent()->setName("pred.store.if"); - BB->setName("pred.store.continue"); - } - DEBUG(DT->verifyDomTree()); + predicateInstructions(); + // Remove redundant induction instructions. cse(LoopVectorBody); } @@ -3882,7 +4197,7 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // We constructed a temporary phi node in the first phase of vectorization. // This phi node will eventually be deleted. - auto &PhiParts = getVectorValue(Phi); + VectorParts &PhiParts = VectorLoopValueMap.getVector(Phi); Builder.SetInsertPoint(cast<Instruction>(PhiParts[0])); // Create a phi node for the new recurrence. The current value will either be @@ -3974,10 +4289,217 @@ void InnerLoopVectorizer::fixLCSSAPHIs() { } } +void InnerLoopVectorizer::collectTriviallyDeadInstructions() { + BasicBlock *Latch = OrigLoop->getLoopLatch(); + + // We create new control-flow for the vectorized loop, so the original + // condition will be dead after vectorization if it's only used by the + // branch. + auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); + if (Cmp && Cmp->hasOneUse()) + DeadInstructions.insert(Cmp); + + // We create new "steps" for induction variable updates to which the original + // induction variables map. An original update instruction will be dead if + // all its users except the induction variable are dead. + for (auto &Induction : *Legal->getInductionVars()) { + PHINode *Ind = Induction.first; + auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); + if (all_of(IndUpdate->users(), [&](User *U) -> bool { + return U == Ind || DeadInstructions.count(cast<Instruction>(U)); + })) + DeadInstructions.insert(IndUpdate); + } +} + +void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { + + // The basic block and loop containing the predicated instruction. + auto *PredBB = PredInst->getParent(); + auto *VectorLoop = LI->getLoopFor(PredBB); + + // Initialize a worklist with the operands of the predicated instruction. 
+ SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); + + // Holds instructions that we need to analyze again. An instruction may be + // reanalyzed if we don't yet know if we can sink it or not. + SmallVector<Instruction *, 8> InstsToReanalyze; + + // Returns true if a given use occurs in the predicated block. Phi nodes use + // their operands in their corresponding predecessor blocks. + auto isBlockOfUsePredicated = [&](Use &U) -> bool { + auto *I = cast<Instruction>(U.getUser()); + BasicBlock *BB = I->getParent(); + if (auto *Phi = dyn_cast<PHINode>(I)) + BB = Phi->getIncomingBlock( + PHINode::getIncomingValueNumForOperand(U.getOperandNo())); + return BB == PredBB; + }; + + // Iteratively sink the scalarized operands of the predicated instruction + // into the block we created for it. When an instruction is sunk, it's + // operands are then added to the worklist. The algorithm ends after one pass + // through the worklist doesn't sink a single instruction. + bool Changed; + do { + + // Add the instructions that need to be reanalyzed to the worklist, and + // reset the changed indicator. + Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); + InstsToReanalyze.clear(); + Changed = false; + + while (!Worklist.empty()) { + auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); + + // We can't sink an instruction if it is a phi node, is already in the + // predicated block, is not in the loop, or may have side effects. + if (!I || isa<PHINode>(I) || I->getParent() == PredBB || + !VectorLoop->contains(I) || I->mayHaveSideEffects()) + continue; + + // It's legal to sink the instruction if all its uses occur in the + // predicated block. Otherwise, there's nothing to do yet, and we may + // need to reanalyze the instruction. + if (!all_of(I->uses(), isBlockOfUsePredicated)) { + InstsToReanalyze.push_back(I); + continue; + } + + // Move the instruction to the beginning of the predicated block, and add + // it's operands to the worklist. + I->moveBefore(&*PredBB->getFirstInsertionPt()); + Worklist.insert(I->op_begin(), I->op_end()); + + // The sinking may have enabled other instructions to be sunk, so we will + // need to iterate. + Changed = true; + } + } while (Changed); +} + +void InnerLoopVectorizer::predicateInstructions() { + + // For each instruction I marked for predication on value C, split I into its + // own basic block to form an if-then construct over C. Since I may be fed by + // an extractelement instruction or other scalar operand, we try to + // iteratively sink its scalar operands into the predicated block. If I feeds + // an insertelement instruction, we try to move this instruction into the + // predicated block as well. For non-void types, a phi node will be created + // for the resulting value (either vector or scalar). + // + // So for some predicated instruction, e.g. the conditional sdiv in: + // + // for.body: + // ... + // %add = add nsw i32 %mul, %0 + // %cmp5 = icmp sgt i32 %2, 7 + // br i1 %cmp5, label %if.then, label %if.end + // + // if.then: + // %div = sdiv i32 %0, %1 + // br label %if.end + // + // if.end: + // %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ] + // + // the sdiv at this point is scalarized and if-converted using a select. + // The inactive elements in the vector are not used, but the predicated + // instruction is still executed for all vector elements, essentially: + // + // vector.body: + // ... 
+ // %17 = add nsw <2 x i32> %16, %wide.load + // %29 = extractelement <2 x i32> %wide.load, i32 0 + // %30 = extractelement <2 x i32> %wide.load51, i32 0 + // %31 = sdiv i32 %29, %30 + // %32 = insertelement <2 x i32> undef, i32 %31, i32 0 + // %35 = extractelement <2 x i32> %wide.load, i32 1 + // %36 = extractelement <2 x i32> %wide.load51, i32 1 + // %37 = sdiv i32 %35, %36 + // %38 = insertelement <2 x i32> %32, i32 %37, i32 1 + // %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17 + // + // Predication will now re-introduce the original control flow to avoid false + // side-effects by the sdiv instructions on the inactive elements, yielding + // (after cleanup): + // + // vector.body: + // ... + // %5 = add nsw <2 x i32> %4, %wide.load + // %8 = icmp sgt <2 x i32> %wide.load52, <i32 7, i32 7> + // %9 = extractelement <2 x i1> %8, i32 0 + // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue + // + // pred.sdiv.if: + // %10 = extractelement <2 x i32> %wide.load, i32 0 + // %11 = extractelement <2 x i32> %wide.load51, i32 0 + // %12 = sdiv i32 %10, %11 + // %13 = insertelement <2 x i32> undef, i32 %12, i32 0 + // br label %pred.sdiv.continue + // + // pred.sdiv.continue: + // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ] + // %15 = extractelement <2 x i1> %8, i32 1 + // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55 + // + // pred.sdiv.if54: + // %16 = extractelement <2 x i32> %wide.load, i32 1 + // %17 = extractelement <2 x i32> %wide.load51, i32 1 + // %18 = sdiv i32 %16, %17 + // %19 = insertelement <2 x i32> %14, i32 %18, i32 1 + // br label %pred.sdiv.continue55 + // + // pred.sdiv.continue55: + // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ] + // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5 + + for (auto KV : PredicatedInstructions) { + BasicBlock::iterator I(KV.first); + BasicBlock *Head = I->getParent(); + auto *BB = SplitBlock(Head, &*std::next(I), DT, LI); + auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, + /*BranchWeights=*/nullptr, DT, LI); + I->moveBefore(T); + sinkScalarOperands(&*I); + + I->getParent()->setName(Twine("pred.") + I->getOpcodeName() + ".if"); + BB->setName(Twine("pred.") + I->getOpcodeName() + ".continue"); + + // If the instruction is non-void create a Phi node at reconvergence point. + if (!I->getType()->isVoidTy()) { + Value *IncomingTrue = nullptr; + Value *IncomingFalse = nullptr; + + if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) { + // If the predicated instruction is feeding an insert-element, move it + // into the Then block; Phi node will be created for the vector. + InsertElementInst *IEI = cast<InsertElementInst>(*I->user_begin()); + IEI->moveBefore(T); + IncomingTrue = IEI; // the new vector with the inserted element. + IncomingFalse = IEI->getOperand(0); // the unmodified vector + } else { + // Phi node will be created for the scalar predicated instruction. 
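        // For illustration, by analogy with the example above: if the sdiv
        // had no insert-element user, the reconvergence phi would instead be
        // scalar, roughly
        //   %14 = phi i32 [ undef, %vector.body ], [ %12, %pred.sdiv.if ]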
+ IncomingTrue = &*I; + IncomingFalse = UndefValue::get(I->getType()); + } + + BasicBlock *PostDom = I->getParent()->getSingleSuccessor(); + assert(PostDom && "Then block has multiple successors"); + PHINode *Phi = + PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front()); + IncomingTrue->replaceAllUsesWith(Phi); + Phi->addIncoming(IncomingFalse, Head); + Phi->addIncoming(IncomingTrue, I->getParent()); + } + } + + DEBUG(DT->verifyDomTree()); +} + InnerLoopVectorizer::VectorParts InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { - assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && - "Invalid edge"); + assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); // Look for cached value. std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); @@ -4033,12 +4555,12 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { return BlockMask; } -void InnerLoopVectorizer::widenPHIInstruction( - Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, - unsigned VF, PhiVector *PV) { +void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, + unsigned VF, PhiVector *PV) { PHINode *P = cast<PHINode>(PN); // Handle recurrences. if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { + VectorParts Entry(UF); for (unsigned part = 0; part < UF; ++part) { // This is phase one of vectorizing PHIs. Type *VecTy = @@ -4046,6 +4568,7 @@ void InnerLoopVectorizer::widenPHIInstruction( Entry[part] = PHINode::Create( VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); } + VectorLoopValueMap.initVector(P, Entry); PV->push_back(P); return; } @@ -4066,10 +4589,11 @@ void InnerLoopVectorizer::widenPHIInstruction( // SELECT(Mask3, In3, // SELECT(Mask2, In2, // ( ...))) + VectorParts Entry(UF); for (unsigned In = 0; In < NumIncoming; In++) { VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), P->getParent()); - VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); + const VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); for (unsigned part = 0; part < UF; ++part) { // We might have single edge PHIs (blocks) - use an identity @@ -4083,6 +4607,7 @@ void InnerLoopVectorizer::widenPHIInstruction( "predphi"); } } + VectorLoopValueMap.initVector(P, Entry); return; } @@ -4099,46 +4624,95 @@ void InnerLoopVectorizer::widenPHIInstruction( case InductionDescriptor::IK_NoInduction: llvm_unreachable("Unknown induction"); case InductionDescriptor::IK_IntInduction: - return widenIntInduction(P, Entry); - case InductionDescriptor::IK_PtrInduction: + return widenIntInduction(P); + case InductionDescriptor::IK_PtrInduction: { // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); // This is the normalized GEP that starts counting at zero. Value *PtrInd = Induction; PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); - // This is the vector of results. Notice that we don't generate - // vector geps because scalar geps result in better code. 
- for (unsigned part = 0; part < UF; ++part) { - if (VF == 1) { - int EltIndex = part; - Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); - Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); - Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL); - SclrGep->setName("next.gep"); - Entry[part] = SclrGep; - continue; - } - - Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); - for (unsigned int i = 0; i < VF; ++i) { - int EltIndex = i + part * VF; - Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); + // Determine the number of scalars we need to generate for each unroll + // iteration. If the instruction is uniform, we only need to generate the + // first lane. Otherwise, we generate all VF values. + unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF; + // These are the scalar results. Notice that we don't generate vector GEPs + // because scalar GEPs result in better code. + ScalarParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part].resize(VF); + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL); SclrGep->setName("next.gep"); - VecVal = Builder.CreateInsertElement(VecVal, SclrGep, - Builder.getInt32(i), "insert.gep"); + Entry[Part][Lane] = SclrGep; } - Entry[part] = VecVal; } + VectorLoopValueMap.initScalar(P, Entry); return; } + case InductionDescriptor::IK_FpInduction: { + assert(P->getType() == II.getStartValue()->getType() && + "Types must match"); + // Handle other induction variables that are now based on the + // canonical one. + assert(P != OldInduction && "Primary induction can be integer only"); + + Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType()); + V = II.transform(Builder, V, PSE.getSE(), DL); + V->setName("fp.offset.idx"); + + // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal + + Value *Broadcasted = getBroadcastInstrs(V); + // After broadcasting the induction variable we need to make the vector + // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc. + Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue(); + VectorParts Entry(UF); + for (unsigned part = 0; part < UF; ++part) + Entry[part] = getStepVector(Broadcasted, VF * part, StepVal, + II.getInductionOpcode()); + VectorLoopValueMap.initVector(P, Entry); + return; + } + } +} + +/// A helper function for checking whether an integer division-related +/// instruction may divide by zero (in which case it must be predicated if +/// executed conditionally in the scalar code). +/// TODO: It may be worthwhile to generalize and check isKnownNonZero(). +/// Non-zero divisors that are non compile-time constants will not be +/// converted into multiplication, so we will still end up scalarizing +/// the division, but can do so w/o predication. +static bool mayDivideByZero(Instruction &I) { + assert((I.getOpcode() == Instruction::UDiv || + I.getOpcode() == Instruction::SDiv || + I.getOpcode() == Instruction::URem || + I.getOpcode() == Instruction::SRem) && + "Unexpected instruction"); + Value *Divisor = I.getOperand(1); + auto *CInt = dyn_cast<ConstantInt>(Divisor); + return !CInt || CInt->isZero(); } void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. 
for (Instruction &I : *BB) { - VectorParts &Entry = WidenMap.get(&I); + + // If the instruction will become trivially dead when vectorized, we don't + // need to generate it. + if (DeadInstructions.count(&I)) + continue; + + // Scalarize instructions that should remain scalar after vectorization. + if (VF > 1 && + !(isa<BranchInst>(&I) || isa<PHINode>(&I) || + isa<DbgInfoIntrinsic>(&I)) && + shouldScalarizeInstruction(&I)) { + scalarizeInstruction(&I, Legal->isScalarWithPredication(&I)); + continue; + } switch (I.getOpcode()) { case Instruction::Br: @@ -4147,21 +4721,27 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { continue; case Instruction::PHI: { // Vectorize PHINodes. - widenPHIInstruction(&I, Entry, UF, VF, PV); + widenPHIInstruction(&I, UF, VF, PV); continue; } // End of PHI. + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + // Scalarize with predication if this instruction may divide by zero and + // block execution is conditional, otherwise fallthrough. + if (Legal->isScalarWithPredication(&I)) { + scalarizeInstruction(&I, true); + continue; + } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: @@ -4172,10 +4752,11 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Just widen binops. auto *BinOp = cast<BinaryOperator>(&I); setDebugLocFromInst(Builder, BinOp); - VectorParts &A = getVectorValue(BinOp->getOperand(0)); - VectorParts &B = getVectorValue(BinOp->getOperand(1)); + const VectorParts &A = getVectorValue(BinOp->getOperand(0)); + const VectorParts &B = getVectorValue(BinOp->getOperand(1)); // Use this vector value for all users of the original instruction. + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); @@ -4185,6 +4766,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = V; } + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, BinOp); break; } @@ -4201,20 +4783,19 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // loop. This means that we can't just use the original 'cond' value. // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. - VectorParts &Cond = getVectorValue(I.getOperand(0)); - VectorParts &Op0 = getVectorValue(I.getOperand(1)); - VectorParts &Op1 = getVectorValue(I.getOperand(2)); + const VectorParts &Cond = getVectorValue(I.getOperand(0)); + const VectorParts &Op0 = getVectorValue(I.getOperand(1)); + const VectorParts &Op1 = getVectorValue(I.getOperand(2)); - Value *ScalarCond = - (VF == 1) - ? Cond[0] - : Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); + auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0); + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part] = Builder.CreateSelect( InvariantCond ? 
ScalarCond : Cond[Part], Op0[Part], Op1[Part]); } + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, &I); break; } @@ -4225,8 +4806,9 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { bool FCmp = (I.getOpcode() == Instruction::FCmp); auto *Cmp = dyn_cast<CmpInst>(&I); setDebugLocFromInst(Builder, Cmp); - VectorParts &A = getVectorValue(Cmp->getOperand(0)); - VectorParts &B = getVectorValue(Cmp->getOperand(1)); + const VectorParts &A = getVectorValue(Cmp->getOperand(0)); + const VectorParts &B = getVectorValue(Cmp->getOperand(1)); + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Value *C = nullptr; if (FCmp) { @@ -4238,6 +4820,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = C; } + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, &I); break; } @@ -4268,8 +4851,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { auto ID = Legal->getInductionVars()->lookup(OldInduction); if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction && ID.getConstIntStepValue()) { - widenIntInduction(OldInduction, Entry, cast<TruncInst>(CI)); - addMetadata(Entry, &I); + widenIntInduction(OldInduction, cast<TruncInst>(CI)); break; } @@ -4277,9 +4859,11 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Type *DestTy = (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); - VectorParts &A = getVectorValue(CI->getOperand(0)); + const VectorParts &A = getVectorValue(CI->getOperand(0)); + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, &I); break; } @@ -4318,6 +4902,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { break; } + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector<Value *, 4> Args; for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { @@ -4325,7 +4910,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Some intrinsics have a scalar argument - don't replace it with a // vector. if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) { - VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); + const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); Arg = VectorArg[Part]; } Args.push_back(Arg); @@ -4363,6 +4948,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = V; } + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, &I); break; } @@ -4414,7 +5000,8 @@ static bool canIfConvertPHINodes(BasicBlock *BB) { bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) { - emitAnalysis(VectorizationReport() << "if-conversion is disabled"); + ORE->emit(createMissedAnalysis("IfConversionDisabled") + << "if-conversion is disabled"); return false; } @@ -4428,12 +5015,9 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (blockNeedsPredication(BB)) continue; - for (Instruction &I : *BB) { - if (auto *LI = dyn_cast<LoadInst>(&I)) - SafePointes.insert(LI->getPointerOperand()); - else if (auto *SI = dyn_cast<StoreInst>(&I)) - SafePointes.insert(SI->getPointerOperand()); - } + for (Instruction &I : *BB) + if (auto *Ptr = getPointerOperand(&I)) + SafePointes.insert(Ptr); } // Collect the blocks that need predication. 
@@ -4441,21 +5025,21 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { for (BasicBlock *BB : TheLoop->blocks()) { // We don't support switch statements inside loops. if (!isa<BranchInst>(BB->getTerminator())) { - emitAnalysis(VectorizationReport(BB->getTerminator()) - << "loop contains a switch statement"); + ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator()) + << "loop contains a switch statement"); return false; } // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB)) { if (!blockCanBePredicated(BB, SafePointes)) { - emitAnalysis(VectorizationReport(BB->getTerminator()) - << "control flow cannot be substituted for a select"); + ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator()) + << "control flow cannot be substituted for a select"); return false; } } else if (BB != Header && !canIfConvertPHINodes(BB)) { - emitAnalysis(VectorizationReport(BB->getTerminator()) - << "control flow cannot be substituted for a select"); + ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator()) + << "control flow cannot be substituted for a select"); return false; } } @@ -4468,8 +5052,8 @@ bool LoopVectorizationLegality::canVectorize() { // We must have a loop in canonical form. Loops with indirectbr in them cannot // be canonicalized. if (!TheLoop->getLoopPreheader()) { - emitAnalysis(VectorizationReport() - << "loop control flow is not understood by vectorizer"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); return false; } @@ -4478,21 +5062,22 @@ bool LoopVectorizationLegality::canVectorize() { // // We can only vectorize innermost loops. if (!TheLoop->empty()) { - emitAnalysis(VectorizationReport() << "loop is not the innermost loop"); + ORE->emit(createMissedAnalysis("NotInnermostLoop") + << "loop is not the innermost loop"); return false; } // We must have a single backedge. if (TheLoop->getNumBackEdges() != 1) { - emitAnalysis(VectorizationReport() - << "loop control flow is not understood by vectorizer"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); return false; } // We must have a single exiting block. if (!TheLoop->getExitingBlock()) { - emitAnalysis(VectorizationReport() - << "loop control flow is not understood by vectorizer"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); return false; } @@ -4500,8 +5085,8 @@ bool LoopVectorizationLegality::canVectorize() { // checked at the end of each iteration. With that we can assume that all // instructions in the loop are executed the same number of times. if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - emitAnalysis(VectorizationReport() - << "loop control flow is not understood by vectorizer"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); return false; } @@ -4519,8 +5104,8 @@ bool LoopVectorizationLegality::canVectorize() { // ScalarEvolution needs to be able to find the exit count. 
const SCEV *ExitCount = PSE.getBackedgeTakenCount(); if (ExitCount == PSE.getSE()->getCouldNotCompute()) { - emitAnalysis(VectorizationReport() - << "could not determine number of loop iterations"); + ORE->emit(createMissedAnalysis("CantComputeNumberOfIterations") + << "could not determine number of loop iterations"); DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } @@ -4537,9 +5122,6 @@ bool LoopVectorizationLegality::canVectorize() { return false; } - // Collect all of the variables that remain uniform after vectorization. - collectLoopUniforms(); - DEBUG(dbgs() << "LV: We can vectorize this loop" << (LAI->getRuntimePointerChecking()->Need ? " (with a runtime bound check)" @@ -4556,14 +5138,20 @@ bool LoopVectorizationLegality::canVectorize() { if (UseInterleaved) InterleaveInfo.analyzeInterleaving(*getSymbolicStrides()); + // Collect all instructions that are known to be uniform after vectorization. + collectLoopUniforms(); + + // Collect all instructions that are known to be scalar after vectorization. + collectLoopScalars(); + unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { - emitAnalysis(VectorizationReport() - << "Too many SCEV assumptions need to be made and checked " - << "at runtime"); + ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks") + << "Too many SCEV assumptions need to be made and checked " + << "at runtime"); DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); return false; } @@ -4621,10 +5209,12 @@ void LoopVectorizationLegality::addInductionPhi( const DataLayout &DL = Phi->getModule()->getDataLayout(); // Get the widest type. - if (!WidestIndTy) - WidestIndTy = convertPointerToIntegerType(DL, PhiTy); - else - WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); + if (!PhiTy->isFloatingPointTy()) { + if (!WidestIndTy) + WidestIndTy = convertPointerToIntegerType(DL, PhiTy); + else + WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); + } // Int inductions are special because we only allow one IV. if (ID.getKind() == InductionDescriptor::IK_IntInduction && @@ -4667,8 +5257,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Check that this PHI type is allowed. if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && !PhiTy->isPointerTy()) { - emitAnalysis(VectorizationReport(Phi) - << "loop control flow is not understood by vectorizer"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi) + << "loop control flow is not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; } @@ -4681,16 +5271,16 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // identified reduction value with an outside user. if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit)) continue; - emitAnalysis(VectorizationReport(Phi) - << "value could not be identified as " - "an induction or reduction variable"); + ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi) + << "value could not be identified as " + "an induction or reduction variable"); return false; } // We only allow if-converted PHIs with exactly two incoming values. 
if (Phi->getNumIncomingValues() != 2) { - emitAnalysis(VectorizationReport(Phi) - << "control flow not understood by vectorizer"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi) + << "control flow not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; } @@ -4705,8 +5295,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } InductionDescriptor ID; - if (InductionDescriptor::isInductionPHI(Phi, PSE, ID)) { + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) { addInductionPhi(Phi, ID, AllowedExit); + if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr) + Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst()); continue; } @@ -4717,14 +5309,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // As a last resort, coerce the PHI to a AddRec expression // and re-try classifying it a an induction PHI. - if (InductionDescriptor::isInductionPHI(Phi, PSE, ID, true)) { + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) { addInductionPhi(Phi, ID, AllowedExit); continue; } - emitAnalysis(VectorizationReport(Phi) - << "value that could not be identified as " - "reduction is used outside the loop"); + ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi) + << "value that could not be identified as " + "reduction is used outside the loop"); DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n"); return false; } // end of PHI handling @@ -4738,8 +5330,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { !isa<DbgInfoIntrinsic>(CI) && !(CI->getCalledFunction() && TLI && TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { - emitAnalysis(VectorizationReport(CI) - << "call instruction cannot be vectorized"); + ORE->emit(createMissedAnalysis("CantVectorizeCall", CI) + << "call instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); return false; } @@ -4750,8 +5342,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { getVectorIntrinsicIDForCall(CI, TLI), 1)) { auto *SE = PSE.getSE(); if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) { - emitAnalysis(VectorizationReport(CI) - << "intrinsic instruction cannot be vectorized"); + ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI) + << "intrinsic instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); return false; } @@ -4762,8 +5354,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if ((!VectorType::isValidElementType(I.getType()) && !I.getType()->isVoidTy()) || isa<ExtractElementInst>(I)) { - emitAnalysis(VectorizationReport(&I) - << "instruction return type cannot be vectorized"); + ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I) + << "instruction return type cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; } @@ -4772,8 +5364,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (auto *ST = dyn_cast<StoreInst>(&I)) { Type *T = ST->getValueOperand()->getType(); if (!VectorType::isValidElementType(T)) { - emitAnalysis(VectorizationReport(ST) - << "store instruction cannot be vectorized"); + ORE->emit(createMissedAnalysis("CantVectorizeStore", ST) + << "store instruction cannot be vectorized"); return false; } @@ -4791,8 +5383,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. 
if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) { - emitAnalysis(VectorizationReport(&I) - << "value cannot be used outside the loop"); + ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I) + << "value cannot be used outside the loop"); return false; } @@ -4802,8 +5394,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!Induction) { DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); if (Inductions.empty()) { - emitAnalysis(VectorizationReport() - << "loop induction variable could not be identified"); + ORE->emit(createMissedAnalysis("NoInductionVariable") + << "loop induction variable could not be identified"); return false; } } @@ -4817,12 +5409,132 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { return true; } +void LoopVectorizationLegality::collectLoopScalars() { + + // If an instruction is uniform after vectorization, it will remain scalar. + Scalars.insert(Uniforms.begin(), Uniforms.end()); + + // Collect the getelementptr instructions that will not be vectorized. A + // getelementptr instruction is only vectorized if it is used for a legal + // gather or scatter operation. + for (auto *BB : TheLoop->blocks()) + for (auto &I : *BB) { + if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { + Scalars.insert(GEP); + continue; + } + auto *Ptr = getPointerOperand(&I); + if (!Ptr) + continue; + auto *GEP = getGEPInstruction(Ptr); + if (GEP && isLegalGatherOrScatter(&I)) + Scalars.erase(GEP); + } + + // An induction variable will remain scalar if all users of the induction + // variable and induction variable update remain scalar. + auto *Latch = TheLoop->getLoopLatch(); + for (auto &Induction : *getInductionVars()) { + auto *Ind = Induction.first; + auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); + + // Determine if all users of the induction variable are scalar after + // vectorization. + auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I); + }); + if (!ScalarInd) + continue; + + // Determine if all users of the induction variable update instruction are + // scalar after vectorization. + auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + return I == Ind || !TheLoop->contains(I) || Scalars.count(I); + }); + if (!ScalarIndUpdate) + continue; + + // The induction variable and its update instruction will remain scalar. + Scalars.insert(Ind); + Scalars.insert(IndUpdate); + } +} + +bool LoopVectorizationLegality::hasConsecutiveLikePtrOperand(Instruction *I) { + if (isAccessInterleaved(I)) + return true; + if (auto *Ptr = getPointerOperand(I)) + return isConsecutivePtr(Ptr); + return false; +} + +bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) { + if (!blockNeedsPredication(I->getParent())) + return false; + switch(I->getOpcode()) { + default: + break; + case Instruction::Store: + return !isMaskRequired(I); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + return mayDivideByZero(*I); + } + return false; +} + +bool LoopVectorizationLegality::memoryInstructionMustBeScalarized( + Instruction *I, unsigned VF) { + + // If the memory instruction is in an interleaved group, it will be + // vectorized and its pointer will remain uniform. + if (isAccessInterleaved(I)) + return false; + + // Get and ensure we have a valid memory instruction. 
+  LoadInst *LI = dyn_cast<LoadInst>(I);
+  StoreInst *SI = dyn_cast<StoreInst>(I);
+  assert((LI || SI) && "Invalid memory instruction");
+
+  // If the pointer operand is uniform (loop invariant), the memory instruction
+  // will be scalarized.
+  auto *Ptr = getPointerOperand(I);
+  if (LI && isUniform(Ptr))
+    return true;
+
+  // If the pointer operand is non-consecutive and neither a gather nor a
+  // scatter operation is legal, the memory instruction will be scalarized.
+  if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I))
+    return true;
+
+  // If the instruction is a store located in a predicated block, it will be
+  // scalarized.
+  if (isScalarWithPredication(I))
+    return true;
+
+  // If the instruction's allocated size doesn't equal its type size, it
+  // requires padding and will be scalarized.
+  auto &DL = I->getModule()->getDataLayout();
+  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+  if (hasIrregularType(ScalarTy, DL, VF))
+    return true;
+
+  // Otherwise, the memory instruction should be vectorized if the rest of the
+  // loop is.
+  return false;
+}
+
 void LoopVectorizationLegality::collectLoopUniforms() {
   // We now know that the loop is vectorizable!
-  // Collect variables that will remain uniform after vectorization.
+  // Collect instructions inside the loop that will remain uniform after
+  // vectorization.
-  // If V is not an instruction inside the current loop, it is a Value
-  // outside of the scope which we are interesting in.
+  // Global values, params and instructions outside of current loop are out of
+  // scope.
   auto isOutOfScope = [&](Value *V) -> bool {
     Instruction *I = dyn_cast<Instruction>(V);
     return (!I || !TheLoop->contains(I));
@@ -4830,30 +5542,75 @@ void LoopVectorizationLegality::collectLoopUniforms() {
   SetVector<Instruction *> Worklist;
   BasicBlock *Latch = TheLoop->getLoopLatch();
-  // Start with the conditional branch.
-  if (!isOutOfScope(Latch->getTerminator()->getOperand(0))) {
-    Instruction *Cmp = cast<Instruction>(Latch->getTerminator()->getOperand(0));
+
+  // Start with the conditional branch. If the branch condition is an
+  // instruction contained in the loop that is only used by the branch, it is
+  // uniform.
+  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
     Worklist.insert(Cmp);
     DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
   }
-  // Also add all consecutive pointer values; these values will be uniform
-  // after vectorization (and subsequent cleanup).
-  for (auto *BB : TheLoop->blocks()) {
+  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
+  // are pointers that are treated like consecutive pointers during
+  // vectorization. The pointer operands of interleaved accesses are an
+  // example.
+  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
+
+  // Holds pointer operands of instructions that are possibly non-uniform.
+  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
+
+  // Iterate over the instructions in the loop, and collect all
+  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
+  // that a consecutive-like pointer operand will be scalarized, we collect it
+  // in PossibleNonUniformPtrs instead. We use two sets here because a single
+  // getelementptr instruction can be used by both vectorized and scalarized
+  // memory instructions.
For example, if a loop loads and stores from the same + // location, but the store is conditional, the store will be scalarized, and + // the getelementptr won't remain uniform. + for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { - if (I.getType()->isPointerTy() && isConsecutivePtr(&I)) { - Worklist.insert(&I); - DEBUG(dbgs() << "LV: Found uniform instruction: " << I << "\n"); - } + + // If there's no pointer operand, there's nothing to do. + auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I)); + if (!Ptr) + continue; + + // True if all users of Ptr are memory accesses that have Ptr as their + // pointer operand. + auto UsersAreMemAccesses = all_of(Ptr->users(), [&](User *U) -> bool { + return getPointerOperand(U) == Ptr; + }); + + // Ensure the memory instruction will not be scalarized, making its + // pointer operand non-uniform. If the pointer operand is used by some + // instruction other than a memory access, we're not going to check if + // that other instruction may be scalarized here. Thus, conservatively + // assume the pointer operand may be non-uniform. + if (!UsersAreMemAccesses || memoryInstructionMustBeScalarized(&I)) + PossibleNonUniformPtrs.insert(Ptr); + + // If the memory instruction will be vectorized and its pointer operand + // is consecutive-like, the pointer operand should remain uniform. + else if (hasConsecutiveLikePtrOperand(&I)) + ConsecutiveLikePtrs.insert(Ptr); + } + + // Add to the Worklist all consecutive and consecutive-like pointers that + // aren't also identified as possibly non-uniform. + for (auto *V : ConsecutiveLikePtrs) + if (!PossibleNonUniformPtrs.count(V)) { + DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); + Worklist.insert(V); } - } // Expand Worklist in topological order: whenever a new instruction // is added , its users should be either already inside Worklist, or // out of scope. It ensures a uniform instruction will only be used // by uniform instructions or out of scope instructions. unsigned idx = 0; - do { + while (idx != Worklist.size()) { Instruction *I = Worklist[idx++]; for (auto OV : I->operand_values()) { @@ -4867,32 +5624,49 @@ void LoopVectorizationLegality::collectLoopUniforms() { DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); } } - } while (idx != Worklist.size()); + } + + // Returns true if Ptr is the pointer operand of a memory access instruction + // I, and I is known to not require scalarization. + auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { + return getPointerOperand(I) == Ptr && !memoryInstructionMustBeScalarized(I); + }; // For an instruction to be added into Worklist above, all its users inside - // the current loop should be already added into Worklist. This condition - // cannot be true for phi instructions which is always in a dependence loop. - // Because any instruction in the dependence cycle always depends on others - // in the cycle to be added into Worklist first, the result is no ones in - // the cycle will be added into Worklist in the end. - // That is why we process PHI separately. 
- for (auto &Induction : *getInductionVars()) { - auto *PN = Induction.first; - auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch()); - if (all_of(PN->users(), - [&](User *U) -> bool { - return U == UpdateV || isOutOfScope(U) || - Worklist.count(cast<Instruction>(U)); - }) && - all_of(UpdateV->users(), [&](User *U) -> bool { - return U == PN || isOutOfScope(U) || - Worklist.count(cast<Instruction>(U)); - })) { - Worklist.insert(cast<Instruction>(PN)); - Worklist.insert(cast<Instruction>(UpdateV)); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *PN << "\n"); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *UpdateV << "\n"); - } + // the loop should also be in Worklist. However, this condition cannot be + // true for phi nodes that form a cyclic dependence. We must process phi + // nodes separately. An induction variable will remain uniform if all users + // of the induction variable and induction variable update remain uniform. + // The code below handles both pointer and non-pointer induction variables. + for (auto &Induction : Inductions) { + auto *Ind = Induction.first; + auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); + + // Determine if all users of the induction variable are uniform after + // vectorization. + auto UniformInd = all_of(Ind->users(), [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || + isVectorizedMemAccessUse(I, Ind); + }); + if (!UniformInd) + continue; + + // Determine if all users of the induction variable update instruction are + // uniform after vectorization. + auto UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || + isVectorizedMemAccessUse(I, IndUpdate); + }); + if (!UniformIndUpdate) + continue; + + // The induction variable and its update instruction will remain uniform. + Worklist.insert(Ind); + Worklist.insert(IndUpdate); + DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); + DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n"); } Uniforms.insert(Worklist.begin(), Worklist.end()); @@ -4901,16 +5675,18 @@ void LoopVectorizationLegality::collectLoopUniforms() { bool LoopVectorizationLegality::canVectorizeMemory() { LAI = &(*GetLAA)(*TheLoop); InterleaveInfo.setLAI(LAI); - auto &OptionalReport = LAI->getReport(); - if (OptionalReport) - emitAnalysis(VectorizationReport(*OptionalReport)); + const OptimizationRemarkAnalysis *LAR = LAI->getReport(); + if (LAR) { + OptimizationRemarkAnalysis VR(Hints->vectorizeAnalysisPassName(), + "loop not vectorized: ", *LAR); + ORE->emit(VR); + } if (!LAI->canVectorizeMemory()) return false; if (LAI->hasStoreToLoopInvariantAddress()) { - emitAnalysis( - VectorizationReport() - << "write to a loop invariant address could not be vectorized"); + ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress") + << "write to a loop invariant address could not be vectorized"); DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); return false; } @@ -4967,7 +5743,6 @@ bool LoopVectorizationLegality::blockCanBePredicated( } } - // We don't predicate stores at the moment. 
if (I.mayWriteToMemory()) { auto *SI = dyn_cast<StoreInst>(&I); // We only support predication of stores in basic blocks with one @@ -4992,17 +5767,6 @@ bool LoopVectorizationLegality::blockCanBePredicated( } if (I.mayThrow()) return false; - - // The instructions below can trap. - switch (I.getOpcode()) { - default: - continue; - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::URem: - case Instruction::SRem: - return false; - } } return true; @@ -5029,8 +5793,16 @@ void InterleavedAccessInfo::collectConstStrideAccesses( if (!LI && !SI) continue; - Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); - int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides); + Value *Ptr = getPointerOperand(&I); + // We don't check wrapping here because we don't know yet if Ptr will be + // part of a full group or a group with gaps. Checking wrapping for all + // pointers (even those that end up in groups with no gaps) will be overly + // conservative. For full groups, wrapping should be ok since if we would + // wrap around the address space we would do a memory access at nullptr + // even without the transformation. The wrapping checks are therefore + // deferred until after we've formed the interleaved groups. + int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false); const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); @@ -5234,20 +6006,66 @@ void InterleavedAccessInfo::analyzeInterleaving( if (Group->getNumMembers() != Group->getFactor()) releaseGroup(Group); - // If there is a non-reversed interleaved load group with gaps, we will need - // to execute at least one scalar epilogue iteration. This will ensure that - // we don't speculatively access memory out-of-bounds. Note that we only need - // to look for a member at index factor - 1, since every group must have a - // member at index zero. - for (InterleaveGroup *Group : LoadGroups) - if (!Group->getMember(Group->getFactor() - 1)) { + // Remove interleaved groups with gaps (currently only loads) whose memory + // accesses may wrap around. We have to revisit the getPtrStride analysis, + // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does + // not check wrapping (see documentation there). + // FORNOW we use Assume=false; + // TODO: Change to Assume=true but making sure we don't exceed the threshold + // of runtime SCEV assumptions checks (thereby potentially failing to + // vectorize altogether). + // Additional optional optimizations: + // TODO: If we are peeling the loop and we know that the first pointer doesn't + // wrap then we can deduce that all pointers in the group don't wrap. + // This means that we can forcefully peel the loop in order to only have to + // check the first pointer for no-wrap. When we'll change to use Assume=true + // we'll only need at most one runtime check per interleaved group. + // + for (InterleaveGroup *Group : LoadGroups) { + + // Case 1: A full group. Can Skip the checks; For full groups, if the wide + // load would wrap around the address space we would do a memory access at + // nullptr even without the transformation. + if (Group->getNumMembers() == Group->getFactor()) + continue; + + // Case 2: If first and last members of the group don't wrap this implies + // that all the pointers in the group don't wrap. 
+    // So we check only group member 0 (which is always guaranteed to exist),
+    // and group member Factor - 1; if the latter doesn't exist, we rely on
+    // peeling (if it is a non-reversed access -- see Case 3).
+    Value *FirstMemberPtr = getPointerOperand(Group->getMember(0));
+    if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
+                      /*ShouldCheckWrap=*/true)) {
+      DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
+                      "first group member potentially pointer-wrapping.\n");
+      releaseGroup(Group);
+      continue;
+    }
+    Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
+    if (LastMember) {
+      Value *LastMemberPtr = getPointerOperand(LastMember);
+      if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
+                        /*ShouldCheckWrap=*/true)) {
+        DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
+                        "last group member potentially pointer-wrapping.\n");
+        releaseGroup(Group);
+      }
+    }
+    else {
+      // Case 3: A non-reversed interleaved load group with gaps: We need
+      // to execute at least one scalar epilogue iteration. This will ensure
+      // we don't speculatively access memory out-of-bounds. We only need
+      // to look for a member at index factor - 1, since every group must have
+      // a member at index zero.
       if (Group->isReverse()) {
         releaseGroup(Group);
-      } else {
-        DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
-        RequiresScalarEpilogue = true;
+        continue;
       }
+      DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
+      RequiresScalarEpilogue = true;
     }
+  }
 }
 
 LoopVectorizationCostModel::VectorizationFactor
@@ -5255,28 +6073,22 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
   // Width 1 means no vectorize
   VectorizationFactor Factor = {1U, 0U};
   if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
-    emitAnalysis(
-        VectorizationReport()
-        << "runtime pointer checks needed. Enable vectorization of this "
-           "loop with '#pragma clang loop vectorize(enable)' when "
-           "compiling with -Os/-Oz");
+    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
+              << "runtime pointer checks needed. Enable vectorization of this "
+                 "loop with '#pragma clang loop vectorize(enable)' when "
+                 "compiling with -Os/-Oz");
     DEBUG(dbgs()
           << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
     return Factor;
   }
 
   if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
-    emitAnalysis(
-        VectorizationReport()
-        << "store that is conditionally executed prevents vectorization");
+    ORE->emit(createMissedAnalysis("ConditionalStore")
+              << "store that is conditionally executed prevents vectorization");
     DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
     return Factor;
   }
 
-  // Find the trip count.
-  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
-  DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
-
   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
   unsigned SmallestType, WidestType;
   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -5334,10 +6146,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
 
   // If we optimize the program for size, avoid creating the tail loop.
   if (OptForSize) {
-    // If we are unable to calculate the trip count then don't try to vectorize.
+    unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+    DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
+
+    // If we don't know the precise trip count, don't try to vectorize.
if (TC < 2) { - emitAnalysis( - VectorizationReport() + ORE->emit( + createMissedAnalysis("UnknownLoopCountComplexCFG") << "unable to calculate the loop count due to complex control flow"); DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return Factor; @@ -5351,11 +6166,11 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { else { // If the trip count that we found modulo the vectorization factor is not // zero then we require a tail. - emitAnalysis(VectorizationReport() - << "cannot optimize for size and vectorize at the " - "same time. Enable vectorization of this loop " - "with '#pragma clang loop vectorize(enable)' " - "when compiling with -Os/-Oz"); + ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize") + << "cannot optimize for size and vectorize at the " + "same time. Enable vectorization of this loop " + "with '#pragma clang loop vectorize(enable)' " + "when compiling with -Os/-Oz"); DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return Factor; } @@ -5367,6 +6182,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); Factor.Width = UserVF; + collectInstsToScalarize(UserVF); return Factor; } @@ -5712,15 +6528,16 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { for (unsigned int i = 0; i < Index; ++i) { Instruction *I = IdxToInstr[i]; - // Ignore instructions that are never used within the loop. - if (!Ends.count(I)) - continue; // Remove all of the instructions that end at this location. InstrList &List = TransposeEnds[i]; for (Instruction *ToRemove : List) OpenIntervals.erase(ToRemove); + // Ignore instructions that are never used within the loop. + if (!Ends.count(I)) + continue; + // Skip ignored values. if (ValuesToIgnore.count(I)) continue; @@ -5772,10 +6589,160 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { return RUs; } +void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { + + // If we aren't vectorizing the loop, or if we've already collected the + // instructions to scalarize, there's nothing to do. Collection may already + // have occurred if we have a user-selected VF and are now computing the + // expected cost for interleaving. + if (VF < 2 || InstsToScalarize.count(VF)) + return; + + // Initialize a mapping for VF in InstsToScalalarize. If we find that it's + // not profitable to scalarize any instructions, the presence of VF in the + // map will indicate that we've analyzed it already. + ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; + + // Find all the instructions that are scalar with predication in the loop and + // determine if it would be better to not if-convert the blocks they are in. + // If so, we also record the instructions to scalarize. 
+ for (BasicBlock *BB : TheLoop->blocks()) { + if (!Legal->blockNeedsPredication(BB)) + continue; + for (Instruction &I : *BB) + if (Legal->isScalarWithPredication(&I)) { + ScalarCostsTy ScalarCosts; + if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0) + ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); + } + } +} + +int LoopVectorizationCostModel::computePredInstDiscount( + Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, + unsigned VF) { + + assert(!Legal->isUniformAfterVectorization(PredInst) && + "Instruction marked uniform-after-vectorization will be predicated"); + + // Initialize the discount to zero, meaning that the scalar version and the + // vector version cost the same. + int Discount = 0; + + // Holds instructions to analyze. The instructions we visit are mapped in + // ScalarCosts. Those instructions are the ones that would be scalarized if + // we find that the scalar version costs less. + SmallVector<Instruction *, 8> Worklist; + + // Returns true if the given instruction can be scalarized. + auto canBeScalarized = [&](Instruction *I) -> bool { + + // We only attempt to scalarize instructions forming a single-use chain + // from the original predicated block that would otherwise be vectorized. + // Although not strictly necessary, we give up on instructions we know will + // already be scalar to avoid traversing chains that are unlikely to be + // beneficial. + if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || + Legal->isScalarAfterVectorization(I)) + return false; + + // If the instruction is scalar with predication, it will be analyzed + // separately. We ignore it within the context of PredInst. + if (Legal->isScalarWithPredication(I)) + return false; + + // If any of the instruction's operands are uniform after vectorization, + // the instruction cannot be scalarized. This prevents, for example, a + // masked load from being scalarized. + // + // We assume we will only emit a value for lane zero of an instruction + // marked uniform after vectorization, rather than VF identical values. + // Thus, if we scalarize an instruction that uses a uniform, we would + // create uses of values corresponding to the lanes we aren't emitting code + // for. This behavior can be changed by allowing getScalarValue to clone + // the lane zero values for uniforms rather than asserting. + for (Use &U : I->operands()) + if (auto *J = dyn_cast<Instruction>(U.get())) + if (Legal->isUniformAfterVectorization(J)) + return false; + + // Otherwise, we can scalarize the instruction. + return true; + }; + + // Returns true if an operand that cannot be scalarized must be extracted + // from a vector. We will account for this scalarization overhead below. Note + // that the non-void predicated instructions are placed in their own blocks, + // and their return values are inserted into vectors. Thus, an extract would + // still be required. + auto needsExtract = [&](Instruction *I) -> bool { + return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I); + }; + + // Compute the expected cost discount from scalarizing the entire expression + // feeding the predicated instruction. We currently only consider expressions + // that are single-use instruction chains. + Worklist.push_back(PredInst); + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + + // If we've already analyzed the instruction, there's nothing to do. + if (ScalarCosts.count(I)) + continue; + + // Compute the cost of the vector instruction. 
Note that this cost already + // includes the scalarization overhead of the predicated instruction. + unsigned VectorCost = getInstructionCost(I, VF).first; + + // Compute the cost of the scalarized instruction. This cost is the cost of + // the instruction as if it wasn't if-converted and instead remained in the + // predicated block. We will scale this cost by block probability after + // computing the scalarization overhead. + unsigned ScalarCost = VF * getInstructionCost(I, 1).first; + + // Compute the scalarization overhead of needed insertelement instructions + // and phi nodes. + if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) { + ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true, + false, TTI); + ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); + } + + // Compute the scalarization overhead of needed extractelement + // instructions. For each of the instruction's operands, if the operand can + // be scalarized, add it to the worklist; otherwise, account for the + // overhead. + for (Use &U : I->operands()) + if (auto *J = dyn_cast<Instruction>(U.get())) { + assert(VectorType::isValidElementType(J->getType()) && + "Instruction has non-scalar type"); + if (canBeScalarized(J)) + Worklist.push_back(J); + else if (needsExtract(J)) + ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF), + false, true, TTI); + } + + // Scale the total scalar cost by block probability. + ScalarCost /= getReciprocalPredBlockProb(); + + // Compute the discount. A non-negative discount means the vector version + // of the instruction costs more, and scalarizing would be beneficial. + Discount += VectorCost - ScalarCost; + ScalarCosts[I] = ScalarCost; + } + + return Discount; +} + LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy Cost; + // Collect the instructions (and their associated costs) that will be more + // profitable to scalarize. + collectInstsToScalarize(VF); + // For each block. for (BasicBlock *BB : TheLoop->blocks()) { VectorizationCostTy BlockCost; @@ -5802,11 +6769,14 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { << VF << " For instruction: " << I << '\n'); } - // We assume that if-converted blocks have a 50% chance of being executed. - // When the code is scalar then some of the blocks are avoided due to CF. - // When the code is vectorized we execute all code paths. + // If we are vectorizing a predicated block, it will have been + // if-converted. This means that the block's instructions (aside from + // stores and instructions that may divide by zero) will now be + // unconditionally executed. For the scalar case, we may not always execute + // the predicated block. Thus, scale the block's cost by the probability of + // executing it. if (VF == 1 && Legal->blockNeedsPredication(BB)) - BlockCost.first /= 2; + BlockCost.first /= getReciprocalPredBlockProb(); Cost.first += BlockCost.first; Cost.second |= BlockCost.second; @@ -5815,19 +6785,6 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { return Cost; } -/// \brief Check if the load/store instruction \p I may be translated into -/// gather/scatter during vectorization. -/// -/// Pointer \p Ptr specifies address in memory for the given scalar memory -/// instruction. We need it to retrieve data type. -/// Using gather/scatter is possible when it is supported by target. 
-static bool isGatherOrScatterLegal(Instruction *I, Value *Ptr, - LoopVectorizationLegality *Legal) { - auto *DataTy = cast<PointerType>(Ptr->getType())->getElementType(); - return (isa<LoadInst>(I) && Legal->isLegalMaskedGather(DataTy)) || - (isa<StoreInst>(I) && Legal->isLegalMaskedScatter(DataTy)); -} - /// \brief Check whether the address computation for a non-consecutive memory /// access looks like an unlikely candidate for being merged into the indexing /// mode. @@ -5893,6 +6850,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { if (Legal->isUniformAfterVectorization(I)) VF = 1; + if (VF > 1 && isProfitableToScalarize(I, VF)) + return VectorizationCostTy(InstsToScalarize[VF][I], false); + Type *VectorTy; unsigned C = getInstructionCost(I, VF, VectorTy); @@ -5905,7 +6865,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy) { Type *RetTy = I->getType(); - if (VF > 1 && MinBWs.count(I)) + if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); VectorTy = ToVectorTy(RetTy, VF); auto SE = PSE.getSE(); @@ -5932,17 +6892,42 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // TODO: IF-converted IFs become selects. return 0; } + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + // If we have a predicated instruction, it may not be executed for each + // vector lane. Get the scalarization cost and scale this amount by the + // probability of executing the predicated block. If the instruction is not + // predicated, we fall through to the next case. + if (VF > 1 && Legal->isScalarWithPredication(I)) { + unsigned Cost = 0; + + // These instructions have a non-void type, so account for the phi nodes + // that we will create. This cost is likely to be zero. The phi node + // cost, if any, should be scaled by the block probability because it + // models a copy at the end of each predicated block. + Cost += VF * TTI.getCFInstrCost(Instruction::PHI); + + // The cost of the non-predicated instruction. + Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); + + // The cost of insertelement and extractelement instructions needed for + // scalarization. + Cost += getScalarizationOverhead(I, VF, TTI); + + // Scale the cost by the probability of executing the predicated blocks. + // This assumes the predicated block for each vector lane is equally + // likely. + return Cost / getReciprocalPredBlockProb(); + } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: @@ -5965,7 +6950,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, TargetTransformInfo::OP_None; Value *Op2 = I->getOperand(1); - // Check for a splat of a constant or for a non uniform vector of constants. + // Check for a splat or for a non uniform vector of constants. 
if (isa<ConstantInt>(Op2)) { ConstantInt *CInt = cast<ConstantInt>(Op2); if (CInt && CInt->getValue().isPowerOf2()) @@ -5980,6 +6965,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Op2VP = TargetTransformInfo::OP_PowerOf2; Op2VK = TargetTransformInfo::OK_UniformConstantValue; } + } else if (Legal->isUniform(Op2)) { + Op2VK = TargetTransformInfo::OK_UniformValue; } return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, @@ -5999,9 +6986,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); - auto It = MinBWs.find(Op0AsInstruction); - if (VF > 1 && It != MinBWs.end()) - ValTy = IntegerType::get(ValTy->getContext(), It->second); + if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) + ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); } @@ -6015,7 +7001,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); unsigned AS = SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace(); - Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand(); + Value *Ptr = getPointerOperand(I); // We add the cost of address computation here instead of with the gep // instruction because only here we know whether the operation is // scalarized. @@ -6072,41 +7058,43 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, return Cost; } - // Scalarized loads/stores. - int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); - bool UseGatherOrScatter = - (ConsecutiveStride == 0) && isGatherOrScatterLegal(I, Ptr, Legal); + // Check if the memory instruction will be scalarized. + if (Legal->memoryInstructionMustBeScalarized(I, VF)) { + unsigned Cost = 0; + Type *PtrTy = ToVectorTy(Ptr->getType(), VF); - bool Reverse = ConsecutiveStride < 0; - const DataLayout &DL = I->getModule()->getDataLayout(); - uint64_t ScalarAllocatedSize = DL.getTypeAllocSize(ValTy); - uint64_t VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF; - if ((!ConsecutiveStride && !UseGatherOrScatter) || - ScalarAllocatedSize != VectorElementSize) { + // True if the memory instruction's address computation is complex. bool IsComplexComputation = isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); - unsigned Cost = 0; - // The cost of extracting from the value vector and pointer vector. - Type *PtrTy = ToVectorTy(Ptr->getType(), VF); - for (unsigned i = 0; i < VF; ++i) { - // The cost of extracting the pointer operand. - Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); - // In case of STORE, the cost of ExtractElement from the vector. - // In case of LOAD, the cost of InsertElement into the returned - // vector. - Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement - : Instruction::InsertElement, - VectorTy, i); - } - // The cost of the scalar loads/stores. + // Get the cost of the scalar memory instruction and address computation. Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, AS); + + // Get the overhead of the extractelement and insertelement instructions + // we might create due to scalarization. 
+ Cost += getScalarizationOverhead(I, VF, TTI); + + // If we have a predicated store, it may not be executed for each vector + // lane. Scale the cost by the probability of executing the predicated + // block. + if (Legal->isScalarWithPredication(I)) + Cost /= getReciprocalPredBlockProb(); + return Cost; } + // Determine if the pointer operand of the access is either consecutive or + // reverse consecutive. + int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + bool Reverse = ConsecutiveStride < 0; + + // Determine if either a gather or scatter operation is legal. + bool UseGatherOrScatter = + !ConsecutiveStride && Legal->isLegalGatherOrScatter(I); + unsigned Cost = TTI.getAddressComputationCost(VectorTy); if (UseGatherOrScatter) { assert(ConsecutiveStride == 0 && @@ -6147,7 +7135,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Type *SrcScalarTy = I->getOperand(0)->getType(); Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF); - if (VF > 1 && MinBWs.count(I)) { + if (canTruncateToMinimalBitwidth(I, VF)) { // This cast is going to be shrunk. This may remove the cast or it might // turn it into slightly different cast. For example, if MinBW == 16, // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". @@ -6176,28 +7164,11 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI)); return CallCost; } - default: { - // We are scalarizing the instruction. Return the cost of the scalar - // instruction, plus the cost of insert and extract into vector - // elements, times the vector width. - unsigned Cost = 0; - - if (!RetTy->isVoidTy() && VF != 1) { - unsigned InsCost = - TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy); - unsigned ExtCost = - TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy); - - // The cost of inserting the results plus extracting each one of the - // operands. - Cost += VF * (InsCost + ExtCost * I->getNumOperands()); - } - + default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); - return Cost; - } + return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + + getScalarizationOverhead(I, VF, TTI); } // end of switch. } @@ -6217,6 +7188,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { @@ -6226,14 +7198,11 @@ Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { } bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { - // Check for a store. - if (auto *ST = dyn_cast<StoreInst>(Inst)) - return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0; - - // Check for a load. - if (auto *LI = dyn_cast<LoadInst>(Inst)) - return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0; + // Check if the pointer operand of a load or store instruction is + // consecutive. 
+ if (auto *Ptr = getPointerOperand(Inst)) + return Legal->isConsecutivePtr(Ptr); return false; } @@ -6249,123 +7218,46 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } - // Ignore induction phis that are only used in either GetElementPtr or ICmp - // instruction to exit loop. Induction variables usually have large types and - // can have big impact when estimating register usage. - // This is for when VF > 1. - for (auto &Induction : *Legal->getInductionVars()) { - auto *PN = Induction.first; - auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch()); - - // Check that the PHI is only used by the induction increment (UpdateV) or - // by GEPs. Then check that UpdateV is only used by a compare instruction, - // the loop header PHI, or by GEPs. - // FIXME: Need precise def-use analysis to determine if this instruction - // variable will be vectorized. - if (all_of(PN->users(), - [&](const User *U) -> bool { - return U == UpdateV || isa<GetElementPtrInst>(U); - }) && - all_of(UpdateV->users(), [&](const User *U) -> bool { - return U == PN || isa<ICmpInst>(U) || isa<GetElementPtrInst>(U); - })) { - VecValuesToIgnore.insert(PN); - VecValuesToIgnore.insert(UpdateV); - } - } - - // Ignore instructions that will not be vectorized. - // This is for when VF > 1. - for (BasicBlock *BB : TheLoop->blocks()) { - for (auto &Inst : *BB) { - switch (Inst.getOpcode()) - case Instruction::GetElementPtr: { - // Ignore GEP if its last operand is an induction variable so that it is - // a consecutive load/store and won't be vectorized as scatter/gather - // pattern. - - GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst); - unsigned NumOperands = Gep->getNumOperands(); - unsigned InductionOperand = getGEPInductionOperand(Gep); - bool GepToIgnore = true; - - // Check that all of the gep indices are uniform except for the - // induction operand. - for (unsigned i = 0; i != NumOperands; ++i) { - if (i != InductionOperand && - !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), - TheLoop)) { - GepToIgnore = false; - break; - } - } - - if (GepToIgnore) - VecValuesToIgnore.insert(&Inst); - break; - } - } - } + // Insert values known to be scalar into VecValuesToIgnore. This is a + // conservative estimation of the values that will later be scalarized. + // + // FIXME: Even though an instruction is not scalar-after-vectoriztion, it may + // still be scalarized. For example, we may find an instruction to be + // more profitable for a given vectorization factor if it were to be + // scalarized. But at this point, we haven't yet computed the + // vectorization factor. + for (auto *BB : TheLoop->getBlocks()) + for (auto &I : *BB) + if (Legal->isScalarAfterVectorization(&I)) + VecValuesToIgnore.insert(&I); } void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, - bool IfPredicateStore) { + bool IfPredicateInstr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. SmallVector<VectorParts, 4> Params; setDebugLocFromInst(Builder, Instr); - // Find all of the vectorized parameters. - for (Value *SrcOp : Instr->operands()) { - // If we are accessing the old induction variable, use the new one. - if (SrcOp == OldInduction) { - Params.push_back(getVectorValue(SrcOp)); - continue; - } - - // Try using previously calculated values. 
- Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); - - // If the src is an instruction that appeared earlier in the basic block - // then it should already be vectorized. - if (SrcInst && OrigLoop->contains(SrcInst)) { - assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); - // The parameter is a vector value from earlier. - Params.push_back(WidenMap.get(SrcInst)); - } else { - // The parameter is a scalar from outside the loop. Maybe even a constant. - VectorParts Scalars; - Scalars.append(UF, SrcOp); - Params.push_back(Scalars); - } - } - - assert(Params.size() == Instr->getNumOperands() && - "Invalid number of operands"); - // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Value *UndefVec = IsVoidRetTy ? nullptr : UndefValue::get(Instr->getType()); - // Create a new entry in the WidenMap and initialize it to Undef or Null. - VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); + // Initialize a new scalar map entry. + ScalarParts Entry(UF); VectorParts Cond; - if (IfPredicateStore) { - assert(Instr->getParent()->getSinglePredecessor() && - "Only support single predecessor blocks"); - Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), - Instr->getParent()); - } + if (IfPredicateInstr) + Cond = createBlockInMask(Instr->getParent()); // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part].resize(1); // For each scalar that we create: // Start an "if (pred) a[i] = ..." block. Value *Cmp = nullptr; - if (IfPredicateStore) { + if (IfPredicateInstr) { if (Cond[Part]->getType()->isVectorTy()) Cond[Part] = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); @@ -6376,47 +7268,57 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); - // Replace the operands of the cloned instructions with extracted scalars. + + // Replace the operands of the cloned instructions with their scalar + // equivalents in the new loop. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - Value *Op = Params[op][Part]; - Cloned->setOperand(op, Op); + auto *NewOp = getScalarValue(Instr->getOperand(op), Part, 0); + Cloned->setOperand(op, NewOp); } // Place the cloned scalar in the new loop. Builder.Insert(Cloned); + // Add the cloned scalar to the scalar map entry. + Entry[Part][0] = Cloned; + // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) if (II->getIntrinsicID() == Intrinsic::assume) AC->registerAssumption(II); - // If the original scalar returns a value we need to place it in a vector - // so that future users will be able to use it. - if (!IsVoidRetTy) - VecResults[Part] = Cloned; - // End if-block. 
- if (IfPredicateStore) - PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), Cmp)); + if (IfPredicateInstr) + PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); } + VectorLoopValueMap.initScalar(Instr, Entry); } void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { auto *SI = dyn_cast<StoreInst>(Instr); - bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent())); + bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent())); - return scalarizeInstruction(Instr, IfPredicateStore); + return scalarizeInstruction(Instr, IfPredicateInstr); } Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } -Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) { +Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, + Instruction::BinaryOps BinOp) { // When unrolling and the VF is 1, we only need to add a simple scalar. - Type *ITy = Val->getType(); - assert(!ITy->isVectorTy() && "Val must be a scalar"); - Constant *C = ConstantInt::get(ITy, StartIdx); + Type *Ty = Val->getType(); + assert(!Ty->isVectorTy() && "Val must be a scalar"); + + if (Ty->isFloatingPointTy()) { + Constant *C = ConstantFP::get(Ty, (double)StartIdx); + + // Floating point operations had to be 'fast' to enable the unrolling. + Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); + return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); + } + Constant *C = ConstantInt::get(Ty, StartIdx); return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); } @@ -6465,7 +7367,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"); - LoopVectorizeHints Hints(L, DisableUnrolling); + LoopVectorizeHints Hints(L, DisableUnrolling, *ORE); DEBUG(dbgs() << "LV: Loop hints:" << " force=" @@ -6483,8 +7385,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Looking at the diagnostic output is the only way to determine if a loop // was vectorized (other than looking at the IR or machine code), so it // is important to generate an optimization remark for each loop. Most of - // these messages are generated by emitOptimizationRemarkAnalysis. Remarks - // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are + // these messages are generated as OptimizationRemarkAnalysis. Remarks + // generated as OptimizationRemark and OptimizationRemarkMissed are // less verbose reporting vectorized loops and unvectorized loops that may // benefit from vectorization, respectively. @@ -6495,17 +7397,18 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the loop for a trip count threshold: // do not vectorize loops with a tiny trip count. - const unsigned TC = SE->getSmallConstantTripCount(L); - if (TC > 0u && TC < TinyTripCountVectorThreshold) { + const unsigned MaxTC = SE->getSmallConstantMaxTripCount(L); + if (MaxTC > 0u && MaxTC < TinyTripCountVectorThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" << "This loop is not worth vectorizing."); if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { DEBUG(dbgs() << "\n"); - emitAnalysisDiag(F, L, Hints, VectorizationReport() - << "vectorization is not beneficial " - "and is not explicitly forced"); + ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(), + "NotBeneficial", L) + << "vectorization is not beneficial " + "and is not explicitly forced"); return false; } } @@ -6513,17 +7416,17 @@ bool LoopVectorizePass::processLoop(Loop *L) { PredicatedScalarEvolution PSE(*SE, *L); // Check if it is legal to vectorize the loop. - LoopVectorizationRequirements Requirements; - LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, + LoopVectorizationRequirements Requirements(*ORE); + LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE, &Requirements, &Hints); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); - emitMissedWarning(F, L, Hints); + emitMissedWarning(F, L, Hints, ORE); return false; } // Use the cost model. - LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F, + LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, &Hints); CM.collectValuesToIgnore(); @@ -6551,11 +7454,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" "attribute is used.\n"); - emitAnalysisDiag( - F, L, Hints, - VectorizationReport() - << "loop not vectorized due to NoImplicitFloat attribute"); - emitMissedWarning(F, L, Hints); + ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(), + "NoImplicitFloat", L) + << "loop not vectorized due to NoImplicitFloat attribute"); + emitMissedWarning(F, L, Hints, ORE); return false; } @@ -6566,10 +7468,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (Hints.isPotentiallyUnsafe() && TTI->isFPVectorizationPotentiallyUnsafe()) { DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); - emitAnalysisDiag(F, L, Hints, - VectorizationReport() - << "loop not vectorized due to unsafe FP support."); - emitMissedWarning(F, L, Hints); + ORE->emit( + createMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) + << "loop not vectorized due to unsafe FP support."); + emitMissedWarning(F, L, Hints, ORE); return false; } @@ -6584,38 +7486,43 @@ bool LoopVectorizePass::processLoop(Loop *L) { unsigned UserIC = Hints.getInterleave(); // Identify the diagnostic messages that should be produced. - std::string VecDiagMsg, IntDiagMsg; + std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; - if (Requirements.doesNotMeet(F, L, Hints)) { DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " "requirements.\n"); - emitMissedWarning(F, L, Hints); + emitMissedWarning(F, L, Hints, ORE); return false; } if (VF.Width == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); - VecDiagMsg = - "the cost-model indicates that vectorization is not beneficial"; + VecDiagMsg = std::make_pair( + "VectorizationNotBeneficial", + "the cost-model indicates that vectorization is not beneficial"); VectorizeLoop = false; } if (IC == 1 && UserIC <= 1) { // Tell the user interleaving is not beneficial. 
DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); - IntDiagMsg = - "the cost-model indicates that interleaving is not beneficial"; + IntDiagMsg = std::make_pair( + "InterleavingNotBeneficial", + "the cost-model indicates that interleaving is not beneficial"); InterleaveLoop = false; - if (UserIC == 1) - IntDiagMsg += + if (UserIC == 1) { + IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; + IntDiagMsg.second += " and is explicitly disabled or interleave count is set to 1"; + } } else if (IC > 1 && UserIC == 1) { // Tell the user interleaving is beneficial, but it explicitly disabled. DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); - IntDiagMsg = "the cost-model indicates that interleaving is beneficial " - "but is explicitly disabled or interleave count is set to 1"; + IntDiagMsg = std::make_pair( + "InterleavingBeneficialButDisabled", + "the cost-model indicates that interleaving is beneficial " + "but is explicitly disabled or interleave count is set to 1"); InterleaveLoop = false; } @@ -6626,40 +7533,48 @@ bool LoopVectorizePass::processLoop(Loop *L) { const char *VAPassName = Hints.vectorizeAnalysisPassName(); if (!VectorizeLoop && !InterleaveLoop) { // Do not vectorize or interleaving the loop. - emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F, - L->getStartLoc(), VecDiagMsg); - emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F, - L->getStartLoc(), IntDiagMsg); + ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, + L->getStartLoc(), L->getHeader()) + << VecDiagMsg.second); + ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, + L->getStartLoc(), L->getHeader()) + << IntDiagMsg.second); return false; } else if (!VectorizeLoop && InterleaveLoop) { DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); - emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F, - L->getStartLoc(), VecDiagMsg); + ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, + L->getStartLoc(), L->getHeader()) + << VecDiagMsg.second); } else if (VectorizeLoop && !InterleaveLoop) { DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " << DebugLocStr << '\n'); - emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F, - L->getStartLoc(), IntDiagMsg); + ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, + L->getStartLoc(), L->getHeader()) + << IntDiagMsg.second); } else if (VectorizeLoop && InterleaveLoop) { DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " << DebugLocStr << '\n'); DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } + using namespace ore; if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. - InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, IC); - Unroller.vectorize(&LVL, CM.MinBWs, CM.VecValuesToIgnore); - - emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), - Twine("interleaved loop (interleaved count: ") + - Twine(IC) + ")"); + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, + &CM); + Unroller.vectorize(); + + ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), + L->getHeader()) + << "interleaved loop (interleaved count: " + << NV("InterleaveCount", IC) << ")"); } else { // If we decided that it is *legal* to vectorize the loop, then do it. 
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, VF.Width, IC); - LB.vectorize(&LVL, CM.MinBWs, CM.VecValuesToIgnore); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, + &LVL, &CM); + LB.vectorize(); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there are @@ -6669,10 +7584,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { AddRuntimeUnrollDisableMetaData(L); // Report the vectorization decision. - emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), - Twine("vectorized loop (vectorization width: ") + - Twine(VF.Width) + ", interleaved count: " + - Twine(IC) + ")"); + ORE->emit(OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), + L->getHeader()) + << "vectorized loop (vectorization width: " + << NV("VectorizationFactor", VF.Width) + << ", interleaved count: " << NV("InterleaveCount", IC) << ")"); } // Mark the loop as already vectorized to avoid vectorizing again. @@ -6686,7 +7602,8 @@ bool LoopVectorizePass::runImpl( Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, - std::function<const LoopAccessInfo &(Loop &)> &GetLAA_) { + std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, + OptimizationRemarkEmitter &ORE_) { SE = &SE_; LI = &LI_; @@ -6698,6 +7615,7 @@ bool LoopVectorizePass::runImpl( AC = &AC_; GetLAA = &GetLAA_; DB = &DB_; + ORE = &ORE_; // Compute some weights outside of the loop over the loops. Compute this // using a BranchProbability to re-use its scaling math. @@ -6746,13 +7664,15 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &AA = AM.getResult<AAManager>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); auto &DB = AM.getResult<DemandedBitsAnalysis>(F); + auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { return LAM.getResult<LoopAccessAnalysis>(L); }; - bool Changed = runImpl(F, SE, LI, TTI, DT, BFI, TLI, DB, AA, AC, GetLAA); + bool Changed = + runImpl(F, SE, LI, TTI, DT, BFI, TLI, DB, AA, AC, GetLAA, ORE); if (!Changed) return PreservedAnalyses::all(); PreservedAnalyses PA; |
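The new UDiv/SDiv/URem/SRem case in getInstructionCost() prices a predicated division as VF scalar copies plus phi and insert/extract overhead, then divides the total by getReciprocalPredBlockProb() (currently 2, i.e. an assumed 50% chance that the predicated block runs on any given header iteration). Below is a minimal standalone sketch of that arithmetic; the cost constants and the predicatedDivCost helper are hypothetical stand-ins for the TTI queries, not the LLVM API itself.

#include <cstdio>

// Hypothetical per-operation costs standing in for TTI queries; the real
// values are target dependent.
static const unsigned PhiCost = 0;        // stand-in for the PHI cost
static const unsigned ScalarDivCost = 20; // stand-in for a scalar sdiv cost
static const unsigned InsExtCost = 1;     // per insert/extractelement

// Mirrors getReciprocalPredBlockProb(): a predicated block is assumed to
// execute once every two iterations of the loop header.
static unsigned getReciprocalPredBlockProb() { return 2; }

// Cost of a predicated sdiv at vectorization factor VF, following the shape
// of the new scalar-with-predication case in the cost model.
static unsigned predicatedDivCost(unsigned VF) {
  unsigned Cost = 0;
  Cost += VF * PhiCost;       // phis merging each scalarized lane's result
  Cost += VF * ScalarDivCost; // VF scalar divisions
  Cost += VF * (2 /*operands extracted*/ + 1 /*result inserted*/) * InsExtCost;
  return Cost / getReciprocalPredBlockProb(); // scale by block probability
}

int main() {
  for (unsigned VF : {2u, 4u, 8u})
    std::printf("VF=%u -> cost %u\n", VF, predicatedDivCost(VF));
}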
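computePredInstDiscount() walks the single-use chain feeding a predicated instruction and accumulates, per instruction, the difference between its vector cost and its probability-scaled scalar cost; a non-negative total means scalarizing the whole chain is at least as cheap as keeping it vectorized, and the per-instruction scalar costs are then recorded in InstsToScalarize. The sketch below shows only that accumulation under made-up costs; ChainInst and computeDiscount are illustrative names, and the insert/extract and phi overheads the real routine adds are folded into the numbers here.

#include <cstdio>
#include <vector>

// One instruction in a single-use chain feeding a predicated instruction.
// The costs are hypothetical stand-ins for the cost-model queries.
struct ChainInst {
  const char *Name;
  unsigned VectorCost; // cost if the instruction stays vectorized
  unsigned ScalarCost; // cost of VF scalar copies, before probability scaling
};

static unsigned getReciprocalPredBlockProb() { return 2; }

// Accumulate the difference between the vector and probability-scaled scalar
// costs over the chain. A non-negative discount favors scalarization.
static int computeDiscount(const std::vector<ChainInst> &Chain) {
  int Discount = 0;
  for (const ChainInst &I : Chain) {
    unsigned ScaledScalar = I.ScalarCost / getReciprocalPredBlockProb();
    Discount += (int)I.VectorCost - (int)ScaledScalar;
  }
  return Discount;
}

int main() {
  // A predicated udiv fed by a single-use add, VF = 4 (numbers are made up).
  std::vector<ChainInst> Chain = {
      {"udiv (predicated)", /*VectorCost=*/44, /*ScalarCost=*/80},
      {"add feeding it", /*VectorCost=*/2, /*ScalarCost=*/4},
  };
  int D = computeDiscount(Chain);
  std::printf("discount = %d -> %s\n", D,
              D >= 0 ? "scalarize the chain" : "keep it vectorized");
}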
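hasIrregularType() is what forces memoryInstructionMustBeScalarized() to give up on element types that carry padding: VF scalars laid out in memory must occupy exactly the bytes a <VF x Ty> store would write, otherwise the wide access would read or clobber the padding. A small sketch of the VF > 1 comparison with hypothetical DataLayout numbers follows; the x86_fp80 figures are only illustrative of a padded type, not taken from a specific target description.

#include <cstdio>

// Hypothetical layout facts standing in for DataLayout queries.
struct TypeLayout {
  const char *Name;
  unsigned AllocSizeBytes;       // allocation size of one element (with padding)
  unsigned VectorStoreSizeBytes; // store size of the <VF x Ty> vector
};

// Mirrors the VF > 1 branch of hasIrregularType(): an array of VF elements
// must occupy exactly as many bytes as the corresponding vector stores,
// otherwise the access is scalarized.
static bool hasIrregularType(const TypeLayout &TL, unsigned VF) {
  return VF * TL.AllocSizeBytes != TL.VectorStoreSizeBytes;
}

int main() {
  const unsigned VF = 4;
  // i32: 4 bytes each, <4 x i32> stores 16 bytes -> regular.
  // A padded 80-bit float (illustrative): 16 bytes each once padded, but a
  // vector of four would store only 40 bytes -> irregular, so scalarize.
  TypeLayout Types[] = {{"i32", 4, 16}, {"x86_fp80", 16, 40}};
  for (const TypeLayout &TL : Types)
    std::printf("%s: %s\n", TL.Name,
                hasIrregularType(TL, VF) ? "irregular (scalarize)" : "regular");
}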
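InnerLoopUnroller::getStepVector() now also handles floating-point inductions: for unroll part StartIdx it emits Val BinOp (StartIdx * Step) under fast-math flags rather than the integer add used before. The scalar sketch below shows the values this produces when interleaving with VF = 1; it assumes an fadd induction, and stepValue is an illustrative helper, not the pass's code.

#include <cstdio>

// For unroll part StartIdx of a floating-point induction, the unroller
// computes Val BinOp (StartIdx * Step); here we assume the BinOp is fadd.
static double stepValue(double Val, int StartIdx, double Step) {
  return Val + (double)StartIdx * Step; // fmul then fadd in the emitted IR
}

int main() {
  double Start = 1.5, Step = 0.5; // hypothetical induction: x += 0.5
  for (int Part = 0; Part < 4; ++Part) // UF = 4 unrolled copies
    std::printf("part %d -> %g\n", Part, stepValue(Start, Part, Step));
}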