Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 1687
1 file changed, 787 insertions, 900 deletions
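
One of the main functional additions in this diff is support for vectorizing masked interleaved memory accesses: vectorizeInterleaveGroup() now takes an optional block mask, a new -enable-masked-interleaved-mem-accesses option and TTI.enableMaskedInterleavedAccessVectorization() gate the feature, and tail folding by masking is introduced alongside it. Purely as an illustrative sketch that is not part of the patch (the function and parameter names below are hypothetical), this is the kind of predicated, strided access pattern the new code path targets:

    // Hypothetical example input, not taken from the patch: the load of A[2*i]
    // has stride 2, so it forms an interleave group of factor 2 with a gap
    // (A[2*i+1] is never read), and it sits in a block that needs predication.
    // With this patch, such a group can be vectorized with a wide masked load
    // (Builder.CreateMaskedLoad) plus shuffles, the condition mask being
    // replicated across the interleave factor, when the target enables masked
    // interleaved accesses or -enable-masked-interleaved-mem-accesses is set.
    int sumSelectedEvenElements(const int *A, const bool *Cond, int N) {
      int Sum = 0;
      for (int i = 0; i < N; ++i)
        if (Cond[i])        // predicated block -> mask for the condition
          Sum += A[2 * i];  // strided access -> interleave group with a gap
      return Sum;
    }
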
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 859d0c92ca5a..c45dee590b84 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -58,6 +58,7 @@ #include "LoopVectorizationPlanner.h" #include "VPRecipeBuilder.h" #include "VPlanHCFGBuilder.h" +#include "VPlanHCFGTransforms.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -151,6 +152,16 @@ using namespace llvm; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME +/// @{ +/// Metadata attribute names +static const char *const LLVMLoopVectorizeFollowupAll = + "llvm.loop.vectorize.followup_all"; +static const char *const LLVMLoopVectorizeFollowupVectorized = + "llvm.loop.vectorize.followup_vectorized"; +static const char *const LLVMLoopVectorizeFollowupEpilogue = + "llvm.loop.vectorize.followup_epilogue"; +/// @} + STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); @@ -171,11 +182,11 @@ static cl::opt<bool> EnableInterleavedMemAccesses( "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop")); -/// Maximum factor for an interleaved memory access. -static cl::opt<unsigned> MaxInterleaveGroupFactor( - "max-interleave-group-factor", cl::Hidden, - cl::desc("Maximum factor for an interleaved access group (default = 8)"), - cl::init(8)); +/// An interleave-group may need masking if it resides in a block that needs +/// predication, or in order to mask away gaps. +static cl::opt<bool> EnableMaskedInterleavedMemAccesses( + "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, + cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); /// We don't interleave loops with a known constant trip count below this /// number. @@ -240,7 +251,7 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC( cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop.")); -static cl::opt<bool> EnableVPlanNativePath( +cl::opt<bool> EnableVPlanNativePath( "enable-vplan-native-path", cl::init(false), cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization.")); @@ -265,10 +276,6 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) { return VectorType::get(Scalar, VF); } -// FIXME: The following helper functions have multiple implementations -// in the project. They can be effectively organized in a common Load/Store -// utilities unit. - /// A helper function that returns the type of loaded or stored value. static Type *getMemInstValueType(Value *I) { assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && @@ -278,25 +285,6 @@ static Type *getMemInstValueType(Value *I) { return cast<StoreInst>(I)->getValueOperand()->getType(); } -/// A helper function that returns the alignment of load or store instruction. -static unsigned getMemInstAlignment(Value *I) { - assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && - "Expected Load or Store instruction"); - if (auto *LI = dyn_cast<LoadInst>(I)) - return LI->getAlignment(); - return cast<StoreInst>(I)->getAlignment(); -} - -/// A helper function that returns the address space of the pointer operand of -/// load or store instruction. 
-static unsigned getMemInstAddressSpace(Value *I) { - assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && - "Expected Load or Store instruction"); - if (auto *LI = dyn_cast<LoadInst>(I)) - return LI->getPointerAddressSpace(); - return cast<StoreInst>(I)->getPointerAddressSpace(); -} - /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type at the given vectorization factor. @@ -436,8 +424,10 @@ public: /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); - /// Try to vectorize the interleaved access group that \p Instr belongs to. - void vectorizeInterleaveGroup(Instruction *Instr); + /// Try to vectorize the interleaved access group that \p Instr belongs to, + /// optionally masking the vector operations if \p BlockInMask is non-null. + void vectorizeInterleaveGroup(Instruction *Instr, + VectorParts *BlockInMask = nullptr); /// Vectorize Load and Store instructions, optionally masking the vector /// operations if \p BlockInMask is non-null. @@ -448,6 +438,9 @@ public: /// the instruction. void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); + /// Fix the non-induction PHIs in the OrigPHIsToFix vector. + void fixNonInductionPHIs(void); + protected: friend class LoopVectorizationPlanner; @@ -584,6 +577,16 @@ protected: /// Emit bypass checks to check any memory assumptions we may have made. void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); + /// Compute the transformed value of Index at offset StartValue using step + /// StepValue. + /// For integer induction, returns StartValue + Index * StepValue. + /// For pointer induction, returns StartValue[Index * StepValue]. + /// FIXME: The newly created binary instructions should contain nsw/nuw + /// flags, which can be found from the original scalar operations. + Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, + const DataLayout &DL, + const InductionDescriptor &ID) const; + /// Add additional metadata to \p To that was not present on \p Orig. /// /// Currently this is used to add the noalias annotations based on the @@ -705,6 +708,10 @@ protected: // Holds the end values for each induction variable. We save the end values // so we can later fix-up the external users of the induction variables. DenseMap<PHINode *, Value *> IVEndValues; + + // Vector of original scalar PHIs whose corresponding widened PHIs need to be + // fixed up at the end of vector code generation. 
+ SmallVector<PHINode *, 8> OrigPHIsToFix; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -752,8 +759,15 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { const DILocation *DIL = Inst->getDebugLoc(); if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && - !isa<DbgInfoIntrinsic>(Inst)) - B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF)); + !isa<DbgInfoIntrinsic>(Inst)) { + auto NewDIL = DIL->cloneWithDuplicationFactor(UF * VF); + if (NewDIL) + B.SetCurrentDebugLocation(NewDIL.getValue()); + else + LLVM_DEBUG(dbgs() + << "Failed to create new discriminator: " + << DIL->getFilename() << " Line: " << DIL->getLine()); + } else B.SetCurrentDebugLocation(DIL); } else @@ -801,367 +815,6 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, namespace llvm { -/// The group of interleaved loads/stores sharing the same stride and -/// close to each other. -/// -/// Each member in this group has an index starting from 0, and the largest -/// index should be less than interleaved factor, which is equal to the absolute -/// value of the access's stride. -/// -/// E.g. An interleaved load group of factor 4: -/// for (unsigned i = 0; i < 1024; i+=4) { -/// a = A[i]; // Member of index 0 -/// b = A[i+1]; // Member of index 1 -/// d = A[i+3]; // Member of index 3 -/// ... -/// } -/// -/// An interleaved store group of factor 4: -/// for (unsigned i = 0; i < 1024; i+=4) { -/// ... -/// A[i] = a; // Member of index 0 -/// A[i+1] = b; // Member of index 1 -/// A[i+2] = c; // Member of index 2 -/// A[i+3] = d; // Member of index 3 -/// } -/// -/// Note: the interleaved load group could have gaps (missing members), but -/// the interleaved store group doesn't allow gaps. -class InterleaveGroup { -public: - InterleaveGroup(Instruction *Instr, int Stride, unsigned Align) - : Align(Align), InsertPos(Instr) { - assert(Align && "The alignment should be non-zero"); - - Factor = std::abs(Stride); - assert(Factor > 1 && "Invalid interleave factor"); - - Reverse = Stride < 0; - Members[0] = Instr; - } - - bool isReverse() const { return Reverse; } - unsigned getFactor() const { return Factor; } - unsigned getAlignment() const { return Align; } - unsigned getNumMembers() const { return Members.size(); } - - /// Try to insert a new member \p Instr with index \p Index and - /// alignment \p NewAlign. The index is related to the leader and it could be - /// negative if it is the new leader. - /// - /// \returns false if the instruction doesn't belong to the group. - bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) { - assert(NewAlign && "The new member's alignment should be non-zero"); - - int Key = Index + SmallestKey; - - // Skip if there is already a member with the same index. - if (Members.count(Key)) - return false; - - if (Key > LargestKey) { - // The largest index is always less than the interleave factor. - if (Index >= static_cast<int>(Factor)) - return false; - - LargestKey = Key; - } else if (Key < SmallestKey) { - // The largest index is always less than the interleave factor. - if (LargestKey - Key >= static_cast<int>(Factor)) - return false; - - SmallestKey = Key; - } - - // It's always safe to select the minimum alignment. - Align = std::min(Align, NewAlign); - Members[Key] = Instr; - return true; - } - - /// Get the member with the given index \p Index - /// - /// \returns nullptr if contains no such member. 
- Instruction *getMember(unsigned Index) const { - int Key = SmallestKey + Index; - if (!Members.count(Key)) - return nullptr; - - return Members.find(Key)->second; - } - - /// Get the index for the given member. Unlike the key in the member - /// map, the index starts from 0. - unsigned getIndex(Instruction *Instr) const { - for (auto I : Members) - if (I.second == Instr) - return I.first - SmallestKey; - - llvm_unreachable("InterleaveGroup contains no such member"); - } - - Instruction *getInsertPos() const { return InsertPos; } - void setInsertPos(Instruction *Inst) { InsertPos = Inst; } - - /// Add metadata (e.g. alias info) from the instructions in this group to \p - /// NewInst. - /// - /// FIXME: this function currently does not add noalias metadata a'la - /// addNewMedata. To do that we need to compute the intersection of the - /// noalias info from all members. - void addMetadata(Instruction *NewInst) const { - SmallVector<Value *, 4> VL; - std::transform(Members.begin(), Members.end(), std::back_inserter(VL), - [](std::pair<int, Instruction *> p) { return p.second; }); - propagateMetadata(NewInst, VL); - } - -private: - unsigned Factor; // Interleave Factor. - bool Reverse; - unsigned Align; - DenseMap<int, Instruction *> Members; - int SmallestKey = 0; - int LargestKey = 0; - - // To avoid breaking dependences, vectorized instructions of an interleave - // group should be inserted at either the first load or the last store in - // program order. - // - // E.g. %even = load i32 // Insert Position - // %add = add i32 %even // Use of %even - // %odd = load i32 - // - // store i32 %even - // %odd = add i32 // Def of %odd - // store i32 %odd // Insert Position - Instruction *InsertPos; -}; -} // end namespace llvm - -namespace { - -/// Drive the analysis of interleaved memory accesses in the loop. -/// -/// Use this class to analyze interleaved accesses only when we can vectorize -/// a loop. Otherwise it's meaningless to do analysis as the vectorization -/// on interleaved accesses is unsafe. -/// -/// The analysis collects interleave groups and records the relationships -/// between the member and the group in a map. -class InterleavedAccessInfo { -public: - InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, - DominatorTree *DT, LoopInfo *LI, - const LoopAccessInfo *LAI) - : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {} - - ~InterleavedAccessInfo() { - SmallPtrSet<InterleaveGroup *, 4> DelSet; - // Avoid releasing a pointer twice. - for (auto &I : InterleaveGroupMap) - DelSet.insert(I.second); - for (auto *Ptr : DelSet) - delete Ptr; - } - - /// Analyze the interleaved accesses and collect them in interleave - /// groups. Substitute symbolic strides using \p Strides. - void analyzeInterleaving(); - - /// Check if \p Instr belongs to any interleave group. - bool isInterleaved(Instruction *Instr) const { - return InterleaveGroupMap.count(Instr); - } - - /// Get the interleave group that \p Instr belongs to. - /// - /// \returns nullptr if doesn't have such group. - InterleaveGroup *getInterleaveGroup(Instruction *Instr) const { - if (InterleaveGroupMap.count(Instr)) - return InterleaveGroupMap.find(Instr)->second; - return nullptr; - } - - /// Returns true if an interleaved group that may access memory - /// out-of-bounds requires a scalar epilogue iteration for correctness. - bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; } - -private: - /// A wrapper around ScalarEvolution, used to add runtime SCEV checks. 
- /// Simplifies SCEV expressions in the context of existing SCEV assumptions. - /// The interleaved access analysis can also add new predicates (for example - /// by versioning strides of pointers). - PredicatedScalarEvolution &PSE; - - Loop *TheLoop; - DominatorTree *DT; - LoopInfo *LI; - const LoopAccessInfo *LAI; - - /// True if the loop may contain non-reversed interleaved groups with - /// out-of-bounds accesses. We ensure we don't speculatively access memory - /// out-of-bounds by executing at least one scalar epilogue iteration. - bool RequiresScalarEpilogue = false; - - /// Holds the relationships between the members and the interleave group. - DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap; - - /// Holds dependences among the memory accesses in the loop. It maps a source - /// access to a set of dependent sink accesses. - DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences; - - /// The descriptor for a strided memory access. - struct StrideDescriptor { - StrideDescriptor() = default; - StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size, - unsigned Align) - : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {} - - // The access's stride. It is negative for a reverse access. - int64_t Stride = 0; - - // The scalar expression of this access. - const SCEV *Scev = nullptr; - - // The size of the memory object. - uint64_t Size = 0; - - // The alignment of this access. - unsigned Align = 0; - }; - - /// A type for holding instructions and their stride descriptors. - using StrideEntry = std::pair<Instruction *, StrideDescriptor>; - - /// Create a new interleave group with the given instruction \p Instr, - /// stride \p Stride and alignment \p Align. - /// - /// \returns the newly created interleave group. - InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride, - unsigned Align) { - assert(!InterleaveGroupMap.count(Instr) && - "Already in an interleaved access group"); - InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align); - return InterleaveGroupMap[Instr]; - } - - /// Release the group and remove all the relationships. - void releaseGroup(InterleaveGroup *Group) { - for (unsigned i = 0; i < Group->getFactor(); i++) - if (Instruction *Member = Group->getMember(i)) - InterleaveGroupMap.erase(Member); - - delete Group; - } - - /// Collect all the accesses with a constant stride in program order. - void collectConstStrideAccesses( - MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo, - const ValueToValueMap &Strides); - - /// Returns true if \p Stride is allowed in an interleaved group. - static bool isStrided(int Stride) { - unsigned Factor = std::abs(Stride); - return Factor >= 2 && Factor <= MaxInterleaveGroupFactor; - } - - /// Returns true if \p BB is a predicated block. - bool isPredicated(BasicBlock *BB) const { - return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); - } - - /// Returns true if LoopAccessInfo can be used for dependence queries. - bool areDependencesValid() const { - return LAI && LAI->getDepChecker().getDependences(); - } - - /// Returns true if memory accesses \p A and \p B can be reordered, if - /// necessary, when constructing interleaved groups. - /// - /// \p A must precede \p B in program order. We return false if reordering is - /// not necessary or is prevented because \p A and \p B may be dependent. 
- bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A, - StrideEntry *B) const { - // Code motion for interleaved accesses can potentially hoist strided loads - // and sink strided stores. The code below checks the legality of the - // following two conditions: - // - // 1. Potentially moving a strided load (B) before any store (A) that - // precedes B, or - // - // 2. Potentially moving a strided store (A) after any load or store (B) - // that A precedes. - // - // It's legal to reorder A and B if we know there isn't a dependence from A - // to B. Note that this determination is conservative since some - // dependences could potentially be reordered safely. - - // A is potentially the source of a dependence. - auto *Src = A->first; - auto SrcDes = A->second; - - // B is potentially the sink of a dependence. - auto *Sink = B->first; - auto SinkDes = B->second; - - // Code motion for interleaved accesses can't violate WAR dependences. - // Thus, reordering is legal if the source isn't a write. - if (!Src->mayWriteToMemory()) - return true; - - // At least one of the accesses must be strided. - if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride)) - return true; - - // If dependence information is not available from LoopAccessInfo, - // conservatively assume the instructions can't be reordered. - if (!areDependencesValid()) - return false; - - // If we know there is a dependence from source to sink, assume the - // instructions can't be reordered. Otherwise, reordering is legal. - return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink); - } - - /// Collect the dependences from LoopAccessInfo. - /// - /// We process the dependences once during the interleaved access analysis to - /// enable constant-time dependence queries. - void collectDependences() { - if (!areDependencesValid()) - return; - auto *Deps = LAI->getDepChecker().getDependences(); - for (auto Dep : *Deps) - Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI)); - } -}; - -} // end anonymous namespace - -static void emitMissedWarning(Function *F, Loop *L, - const LoopVectorizeHints &LH, - OptimizationRemarkEmitter *ORE) { - LH.emitRemarkWithHints(); - - if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { - if (LH.getWidth() != 1) - ORE->emit(DiagnosticInfoOptimizationFailure( - DEBUG_TYPE, "FailedRequestedVectorization", - L->getStartLoc(), L->getHeader()) - << "loop not vectorized: " - << "failed explicitly specified loop vectorization"); - else if (LH.getInterleave() != 1) - ORE->emit(DiagnosticInfoOptimizationFailure( - DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(), - L->getHeader()) - << "loop not interleaved: " - << "failed explicitly specified loop interleaving"); - } -} - -namespace llvm { - /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. /// In many cases vectorization is not profitable. This can happen because of @@ -1247,34 +900,55 @@ public: /// vectorization factor \p VF. bool isProfitableToScalarize(Instruction *I, unsigned VF) const { assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); + + // Cost model is not run in the VPlan-native path - return conservative + // result until this changes. 
+ if (EnableVPlanNativePath) + return false; + auto Scalars = InstsToScalarize.find(VF); assert(Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); - return Scalars->second.count(I); + return Scalars->second.find(I) != Scalars->second.end(); } /// Returns true if \p I is known to be uniform after vectorization. bool isUniformAfterVectorization(Instruction *I, unsigned VF) const { if (VF == 1) return true; - assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity"); + + // Cost model is not run in the VPlan-native path - return conservative + // result until this changes. + if (EnableVPlanNativePath) + return false; + auto UniformsPerVF = Uniforms.find(VF); - return UniformsPerVF->second.count(I); + assert(UniformsPerVF != Uniforms.end() && + "VF not yet analyzed for uniformity"); + return UniformsPerVF->second.find(I) != UniformsPerVF->second.end(); } /// Returns true if \p I is known to be scalar after vectorization. bool isScalarAfterVectorization(Instruction *I, unsigned VF) const { if (VF == 1) return true; - assert(Scalars.count(VF) && "Scalar values are not calculated for VF"); + + // Cost model is not run in the VPlan-native path - return conservative + // result until this changes. + if (EnableVPlanNativePath) + return false; + auto ScalarsPerVF = Scalars.find(VF); - return ScalarsPerVF->second.count(I); + assert(ScalarsPerVF != Scalars.end() && + "Scalar values are not calculated for VF"); + return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end(); } /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const { - return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) && + return VF > 1 && MinBWs.find(I) != MinBWs.end() && + !isProfitableToScalarize(I, VF) && !isScalarAfterVectorization(I, VF); } @@ -1298,7 +972,7 @@ public: /// Save vectorization decision \p W and \p Cost taken by the cost model for /// interleaving group \p Grp and vector width \p VF. - void setWideningDecision(const InterleaveGroup *Grp, unsigned VF, + void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF, InstWidening W, unsigned Cost) { assert(VF >= 2 && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. @@ -1318,6 +992,12 @@ public: /// through the cost modeling. InstWidening getWideningDecision(Instruction *I, unsigned VF) { assert(VF >= 2 && "Expected VF >=2"); + + // Cost model is not run in the VPlan-native path - return conservative + // result until this changes. + if (EnableVPlanNativePath) + return CM_GatherScatter; + std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); auto Itr = WideningDecisions.find(InstOnVF); if (Itr == WideningDecisions.end()) @@ -1330,7 +1010,8 @@ public: unsigned getWideningCost(Instruction *I, unsigned VF) { assert(VF >= 2 && "Expected VF >=2"); std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); - assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated"); + assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && + "The cost is not calculated"); return WideningDecisions[InstOnVF].second; } @@ -1369,7 +1050,7 @@ public: /// that may be vectorized as interleave, gather-scatter or scalarized. void collectUniformsAndScalars(unsigned VF) { // Do the analysis once. 
- if (VF == 1 || Uniforms.count(VF)) + if (VF == 1 || Uniforms.find(VF) != Uniforms.end()) return; setCostBasedWideningDecision(VF); collectLoopUniforms(VF); @@ -1414,26 +1095,58 @@ public: /// Returns true if \p I is an instruction that will be scalarized with /// predication. Such instructions include conditional stores and /// instructions that may divide by zero. - bool isScalarWithPredication(Instruction *I); + /// If a non-zero VF has been calculated, we check if I will be scalarized + /// predication for that VF. + bool isScalarWithPredication(Instruction *I, unsigned VF = 1); + + // Returns true if \p I is an instruction that will be predicated either + // through scalar predication or masked load/store or masked gather/scatter. + // Superset of instructions that return true for isScalarWithPredication. + bool isPredicatedInst(Instruction *I) { + if (!blockNeedsPredication(I->getParent())) + return false; + // Loads and stores that need some form of masked operation are predicated + // instructions. + if (isa<LoadInst>(I) || isa<StoreInst>(I)) + return Legal->isMaskRequired(I); + return isScalarWithPredication(I); + } /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + /// Returns true if \p I is a memory instruction in an interleaved-group + /// of memory accesses that can be vectorized with wide vector loads/stores + /// and shuffles. + bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1); + /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { return InterleaveInfo.isInterleaved(Instr); } /// Get the interleaved access group that \p Instr belongs to. - const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { + const InterleaveGroup<Instruction> * + getInterleavedAccessGroup(Instruction *Instr) { return InterleaveInfo.getInterleaveGroup(Instr); } /// Returns true if an interleaved group requires a scalar iteration - /// to handle accesses with gaps. + /// to handle accesses with gaps, and there is nothing preventing us from + /// creating a scalar epilogue. bool requiresScalarEpilogue() const { - return InterleaveInfo.requiresScalarEpilogue(); + return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue(); + } + + /// Returns true if a scalar epilogue is not allowed due to optsize. + bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; } + + /// Returns true if all loop blocks should be masked to fold tail loop. + bool foldTailByMasking() const { return FoldTailByMasking; } + + bool blockNeedsPredication(BasicBlock *BB) { + return foldTailByMasking() || Legal->blockNeedsPredication(BB); } private: @@ -1482,8 +1195,10 @@ private: /// memory access. unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF); - /// The cost calculation for Load instruction \p I with uniform pointer - - /// scalar load + broadcast. + /// The cost calculation for Load/Store instruction \p I with uniform pointer - + /// Load: scalar load + broadcast. + /// Store: scalar store + (loop invariant value stored? 0 : extract of last + /// element) unsigned getUniformMemOpCost(Instruction *I, unsigned VF); /// Returns whether the instruction is a load or store and will be a emitted @@ -1517,6 +1232,18 @@ private: /// vectorization as a predicated block. 
SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; + /// Records whether it is allowed to have the original scalar loop execute at + /// least once. This may be needed as a fallback loop in case runtime + /// aliasing/dependence checks fail, or to handle the tail/remainder + /// iterations when the trip count is unknown or doesn't divide by the VF, + /// or as a peel-loop to handle gaps in interleave-groups. + /// Under optsize and when the trip count is very small we don't allow any + /// iterations to execute in the scalar loop. + bool IsScalarEpilogueAllowed = true; + + /// All blocks of loop are to be masked to fold tail of scalar iterations. + bool FoldTailByMasking = false; + /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated @@ -1639,14 +1366,15 @@ static bool isExplicitVecOuterLoop(Loop *OuterLp, return false; Function *Fn = OuterLp->getHeader()->getParent(); - if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) { + if (!Hints.allowVectorization(Fn, OuterLp, + true /*VectorizeOnlyWhenForced*/)) { LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); return false; } if (!Hints.getWidth()) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n"); - emitMissedWarning(Fn, OuterLp, Hints, ORE); + Hints.emitRemarkWithHints(); return false; } @@ -1654,7 +1382,7 @@ static bool isExplicitVecOuterLoop(Loop *OuterLp, // TODO: Interleave support is future work. LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " "outer loops.\n"); - emitMissedWarning(Fn, OuterLp, Hints, ORE); + Hints.emitRemarkWithHints(); return false; } @@ -1695,10 +1423,11 @@ struct LoopVectorize : public FunctionPass { LoopVectorizePass Impl; - explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true) + explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, + bool VectorizeOnlyWhenForced = false) : FunctionPass(ID) { - Impl.DisableUnrolling = NoUnrolling; - Impl.AlwaysVectorize = AlwaysVectorize; + Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; + Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); } @@ -1737,8 +1466,16 @@ struct LoopVectorize : public FunctionPass { AU.addRequired<LoopAccessLegacyAnalysis>(); AU.addRequired<DemandedBitsWrapperPass>(); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); + + // We currently do not preserve loopinfo/dominator analyses with outer loop + // vectorization. Until this is addressed, mark these analyses as preserved + // only for non-VPlan-native path. + // TODO: Preserve Loop and Dominator analyses for VPlan-native path. + if (!EnableVPlanNativePath) { + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + } + AU.addPreserved<BasicAAWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } @@ -1950,7 +1687,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { ? 
Builder.CreateSExtOrTrunc(Induction, IV->getType()) : Builder.CreateCast(Instruction::SIToFP, Induction, IV->getType()); - ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL); + ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); ScalarIV->setName("offset.idx"); } if (Trunc) { @@ -2089,8 +1826,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { assert(!V->getType()->isVectorTy() && "Can't widen a vector"); assert(!V->getType()->isVoidTy() && "Type does not produce a value"); - // If we have a stride that is replaced by one, do it here. - if (Legal->hasStride(V)) + // If we have a stride that is replaced by one, do it here. Defer this for + // the VPlan-native path until we start running Legal checks in that path. + if (!EnableVPlanNativePath && Legal->hasStride(V)) V = ConstantInt::get(V->getType(), 1); // If we have a vector mapped to this value, return it. @@ -2214,6 +1952,17 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { "reverse"); } +// Return whether we allow using masked interleave-groups (for dealing with +// strided loads/stores that reside in predicated blocks, or for dealing +// with gaps). +static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { + // If an override option has been passed in for interleaved accesses, use it. + if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) + return EnableMaskedInterleavedMemAccesses; + + return TTI.enableMaskedInterleavedAccessVectorization(); +} + // Try to vectorize the interleave group that \p Instr belongs to. // // E.g. Translate following interleaved load group (factor = 3): @@ -2242,8 +1991,10 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B -void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { - const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr); +void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, + VectorParts *BlockInMask) { + const InterleaveGroup<Instruction> *Group = + Cost->getInterleavedAccessGroup(Instr); assert(Group && "Fail to get an interleaved access group."); // Skip if current instruction is not the insert position. @@ -2257,13 +2008,22 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); - Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr)); + Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr)); // Prepare for the new pointers. setDebugLocFromInst(Builder, Ptr); SmallVector<Value *, 2> NewPtrs; unsigned Index = Group->getIndex(Instr); + VectorParts Mask; + bool IsMaskForCondRequired = BlockInMask; + if (IsMaskForCondRequired) { + Mask = *BlockInMask; + // TODO: extend the masked interleaved-group support to reversed access. + assert(!Group->isReverse() && "Reversed masked interleave-group " + "not supported."); + } + // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. 
We adjust the index from the first vector lane, // rather than directly getting the pointer for lane VF - 1, because the @@ -2302,13 +2062,39 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { setDebugLocFromInst(Builder, Instr); Value *UndefVec = UndefValue::get(VecTy); + Value *MaskForGaps = nullptr; + if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { + MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); + assert(MaskForGaps && "Mask for Gaps is required but it is null"); + } + // Vectorize the interleaved load group. if (isa<LoadInst>(Instr)) { // For each unroll part, create a wide load for the group. SmallVector<Value *, 2> NewLoads; for (unsigned Part = 0; Part < UF; Part++) { - auto *NewLoad = Builder.CreateAlignedLoad( - NewPtrs[Part], Group->getAlignment(), "wide.vec"); + Instruction *NewLoad; + if (IsMaskForCondRequired || MaskForGaps) { + assert(useMaskedInterleavedAccesses(*TTI) && + "masked interleaved groups are not allowed."); + Value *GroupMask = MaskForGaps; + if (IsMaskForCondRequired) { + auto *Undefs = UndefValue::get(Mask[Part]->getType()); + auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); + Value *ShuffledMask = Builder.CreateShuffleVector( + Mask[Part], Undefs, RepMask, "interleaved.mask"); + GroupMask = MaskForGaps + ? Builder.CreateBinOp(Instruction::And, ShuffledMask, + MaskForGaps) + : ShuffledMask; + } + NewLoad = + Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), + GroupMask, UndefVec, "wide.masked.vec"); + } + else + NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part], + Group->getAlignment(), "wide.vec"); Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); } @@ -2375,8 +2161,18 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, "interleaved.vec"); - Instruction *NewStoreInstr = - Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment()); + Instruction *NewStoreInstr; + if (IsMaskForCondRequired) { + auto *Undefs = UndefValue::get(Mask[Part]->getType()); + auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); + Value *ShuffledMask = Builder.CreateShuffleVector( + Mask[Part], Undefs, RepMask, "interleaved.mask"); + NewStoreInstr = Builder.CreateMaskedStore( + IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); + } + else + NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], + Group->getAlignment()); Group->addMetadata(NewStoreInstr); } @@ -2400,13 +2196,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Type *ScalarDataTy = getMemInstValueType(Instr); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = getLoadStorePointerOperand(Instr); - unsigned Alignment = getMemInstAlignment(Instr); + unsigned Alignment = getLoadStoreAlignment(Instr); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. const DataLayout &DL = Instr->getModule()->getDataLayout(); if (!Alignment) Alignment = DL.getABITypeAlignment(ScalarDataTy); - unsigned AddressSpace = getMemInstAddressSpace(Instr); + unsigned AddressSpace = getLoadStoreAddressSpace(Instr); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. 
@@ -2594,6 +2390,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { if (TripCount) return TripCount; + assert(L && "Create Trip Count for null loop."); IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); // Find the loop boundaries. ScalarEvolution *SE = PSE.getSE(); @@ -2602,6 +2399,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { "Invalid loop count"); Type *IdxTy = Legal->getWidestInductionType(); + assert(IdxTy && "No type for induction"); // The exit count might have the type of i64 while the phi is i32. This can // happen if we have an induction variable that is sign extended before the @@ -2642,12 +2440,26 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { Value *TC = getOrCreateTripCount(L); IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + Type *Ty = TC->getType(); + Constant *Step = ConstantInt::get(Ty, VF * UF); + + // If the tail is to be folded by masking, round the number of iterations N + // up to a multiple of Step instead of rounding down. This is done by first + // adding Step-1 and then rounding down. Note that it's ok if this addition + // overflows: the vector induction variable will eventually wrap to zero given + // that it starts at zero and its Step is a power of two; the loop will then + // exit, with the last early-exit vector comparison also producing all-true. + if (Cost->foldTailByMasking()) { + assert(isPowerOf2_32(VF * UF) && + "VF*UF must be a power of 2 when folding tail by masking"); + TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); + } + // Now we need to generate the expression for the part of the loop that the // vectorized body will execute. This is equal to N - (N % Step) if scalar // iterations are not required for correctness, or N - Step, otherwise. Step // is equal to the vectorization factor (number of SIMD elements) times the // unroll factor (number of SIMD instructions). - Constant *Step = ConstantInt::get(TC->getType(), VF * UF); Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); // If there is a non-reversed interleaved group that may speculatively access @@ -2710,8 +2522,13 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, // of zero. In this case we will also jump to the scalar loop. auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; - Value *CheckMinIters = Builder.CreateICmp( - P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); + + // If tail is to be folded, vector loop takes care of all iterations. + Value *CheckMinIters = Builder.getFalse(); + if (!Cost->foldTailByMasking()) + CheckMinIters = Builder.CreateICmp( + P, Count, ConstantInt::get(Count->getType(), VF * UF), + "min.iters.check"); BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); // Update dominator tree immediately if the generated block is a @@ -2740,6 +2557,8 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { if (C->isZero()) return; + assert(!Cost->foldTailByMasking() && + "Cannot SCEV check stride or overflow when folding tail"); // Create a new block containing the stride check. BB->setName("vector.scevcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); @@ -2756,6 +2575,10 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { } void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { + // VPlan-native path does not do any analysis for runtime checks currently. 
+ if (EnableVPlanNativePath) + return; + BasicBlock *BB = L->getLoopPreheader(); // Generate the code that checks in runtime if arrays overlap. We put the @@ -2768,6 +2591,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { if (!MemRuntimeCheck) return; + assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail"); // Create a new block containing the memory check. BB->setName("vector.memcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); @@ -2789,6 +2613,94 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { LVer->prepareNoAliasMetadata(); } +Value *InnerLoopVectorizer::emitTransformedIndex( + IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, + const InductionDescriptor &ID) const { + + SCEVExpander Exp(*SE, DL, "induction"); + auto Step = ID.getStep(); + auto StartValue = ID.getStartValue(); + assert(Index->getType() == Step->getType() && + "Index type does not match StepValue type"); + + // Note: the IR at this point is broken. We cannot use SE to create any new + // SCEV and then expand it, hoping that SCEV's simplification will give us + // a more optimal code. Unfortunately, attempt of doing so on invalid IR may + // lead to various SCEV crashes. So all we can do is to use builder and rely + // on InstCombine for future simplifications. Here we handle some trivial + // cases only. + auto CreateAdd = [&B](Value *X, Value *Y) { + assert(X->getType() == Y->getType() && "Types don't match!"); + if (auto *CX = dyn_cast<ConstantInt>(X)) + if (CX->isZero()) + return Y; + if (auto *CY = dyn_cast<ConstantInt>(Y)) + if (CY->isZero()) + return X; + return B.CreateAdd(X, Y); + }; + + auto CreateMul = [&B](Value *X, Value *Y) { + assert(X->getType() == Y->getType() && "Types don't match!"); + if (auto *CX = dyn_cast<ConstantInt>(X)) + if (CX->isOne()) + return Y; + if (auto *CY = dyn_cast<ConstantInt>(Y)) + if (CY->isOne()) + return X; + return B.CreateMul(X, Y); + }; + + switch (ID.getKind()) { + case InductionDescriptor::IK_IntInduction: { + assert(Index->getType() == StartValue->getType() && + "Index type does not match StartValue type"); + if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) + return B.CreateSub(StartValue, Index); + auto *Offset = CreateMul( + Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); + return CreateAdd(StartValue, Offset); + } + case InductionDescriptor::IK_PtrInduction: { + assert(isa<SCEVConstant>(Step) && + "Expected constant step for pointer induction"); + return B.CreateGEP( + nullptr, StartValue, + CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), + &*B.GetInsertPoint()))); + } + case InductionDescriptor::IK_FpInduction: { + assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); + auto InductionBinOp = ID.getInductionBinOp(); + assert(InductionBinOp && + (InductionBinOp->getOpcode() == Instruction::FAdd || + InductionBinOp->getOpcode() == Instruction::FSub) && + "Original bin op should be defined for FP induction"); + + Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); + + // Floating point operations had to be 'fast' to enable the induction. + FastMathFlags Flags; + Flags.setFast(); + + Value *MulExp = B.CreateFMul(StepValue, Index); + if (isa<Instruction>(MulExp)) + // We have to check, the MulExp may be a constant. 
+ cast<Instruction>(MulExp)->setFastMathFlags(Flags); + + Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, + "induction"); + if (isa<Instruction>(BOp)) + cast<Instruction>(BOp)->setFastMathFlags(Flags); + + return BOp; + } + case InductionDescriptor::IK_NoInduction: + return nullptr; + } + llvm_unreachable("invalid enum"); +} + BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { /* In this function we generate a new loop. The new loop will contain @@ -2825,6 +2737,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { BasicBlock *OldBasicBlock = OrigLoop->getHeader(); BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); BasicBlock *ExitBlock = OrigLoop->getExitBlock(); + MDNode *OrigLoopID = OrigLoop->getLoopID(); assert(VectorPH && "Invalid loop structure"); assert(ExitBlock && "Must have an exit block"); @@ -2927,7 +2840,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { CastInst::getCastOpcode(CountRoundDown, true, StepType, true); Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); - EndValue = II.transform(B, CRD, PSE.getSE(), DL); + EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); } @@ -2948,9 +2861,12 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. - Value *CmpN = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, - CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); + // If tail is to be folded, we know we don't need to run the remainder. + Value *CmpN = Builder.getTrue(); + if (!Cost->foldTailByMasking()) + CmpN = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); ReplaceInstWithInst(MiddleBlock->getTerminator(), BranchInst::Create(ExitBlock, ScalarPH, CmpN)); @@ -2965,6 +2881,17 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { LoopVectorBody = VecBody; LoopScalarBody = OldBasicBlock; + Optional<MDNode *> VectorizedLoopID = + makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, + LLVMLoopVectorizeFollowupVectorized}); + if (VectorizedLoopID.hasValue()) { + Lp->setLoopID(VectorizedLoopID.getValue()); + + // Do not setAlreadyVectorized if loop attributes have been defined + // explicitly. + return LoopVectorPreHeader; + } + // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). if (MDNode *LID = OrigLoop->getLoopID()) @@ -3023,7 +2950,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, II.getStep()->getType()) : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); CMO->setName("cast.cmo"); - Value *Escape = II.transform(B, CMO, PSE.getSE(), DL); + Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); Escape->setName("ind.escape"); MissingVals[UI] = Escape; } @@ -3109,6 +3036,10 @@ static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead(RetTy, true, false); + // Some targets keep addresses scalar. 
+ if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) + return Cost; + if (CallInst *CI = dyn_cast<CallInst>(I)) { SmallVector<const Value *, 4> Operands(CI->arg_operands()); Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); @@ -3212,7 +3143,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { continue; for (unsigned Part = 0; Part < UF; ++Part) { Value *I = getOrCreateVectorValue(KV.first, Part); - if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) + if (Erased.find(I) != Erased.end() || I->use_empty() || + !isa<Instruction>(I)) continue; Type *OriginalTy = I->getType(); Type *ScalarTruncatedTy = @@ -3330,6 +3262,13 @@ void InnerLoopVectorizer::fixVectorizedLoop() { if (VF > 1) truncateToMinimalBitwidths(); + // Fix widened non-induction PHIs by setting up the PHI operands. + if (OrigPHIsToFix.size()) { + assert(EnableVPlanNativePath && + "Unexpected non-induction PHIs for fixup in non VPlan-native path"); + fixNonInductionPHIs(); + } + // At this point every instruction in the original loop is widened to a // vector form. Now we need to fix the recurrences in the loop. These PHI // nodes are currently empty because we did not want to introduce cycles. @@ -3666,8 +3605,8 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx")); else - ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp( - Builder, MinMaxKind, ReducedPartRdx, RdxPart); + ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, + RdxPart); } if (VF > 1) { @@ -3720,9 +3659,20 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { void InnerLoopVectorizer::fixLCSSAPHIs() { for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { if (LCSSAPhi.getNumIncomingValues() == 1) { - assert(OrigLoop->isLoopInvariant(LCSSAPhi.getIncomingValue(0)) && - "Incoming value isn't loop invariant"); - LCSSAPhi.addIncoming(LCSSAPhi.getIncomingValue(0), LoopMiddleBlock); + auto *IncomingValue = LCSSAPhi.getIncomingValue(0); + // Non-instruction incoming values will have only one value. + unsigned LastLane = 0; + if (isa<Instruction>(IncomingValue)) + LastLane = Cost->isUniformAfterVectorization( + cast<Instruction>(IncomingValue), VF) + ? 0 + : VF - 1; + // Can be a loop invariant incoming value or the last scalar value to be + // extracted from the vectorized loop. + Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); + Value *lastIncomingValue = + getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); + LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); } } } @@ -3791,12 +3741,62 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { } while (Changed); } +void InnerLoopVectorizer::fixNonInductionPHIs() { + for (PHINode *OrigPhi : OrigPHIsToFix) { + PHINode *NewPhi = + cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); + unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); + + SmallVector<BasicBlock *, 2> ScalarBBPredecessors( + predecessors(OrigPhi->getParent())); + SmallVector<BasicBlock *, 2> VectorBBPredecessors( + predecessors(NewPhi->getParent())); + assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && + "Scalar and Vector BB should have the same number of predecessors"); + + // The insertion point in Builder may be invalidated by the time we get + // here. Force the Builder insertion point to something valid so that we do + // not run into issues during insertion point restore in + // getOrCreateVectorValue calls below. 
+ Builder.SetInsertPoint(NewPhi); + + // The predecessor order is preserved and we can rely on mapping between + // scalar and vector block predecessors. + for (unsigned i = 0; i < NumIncomingValues; ++i) { + BasicBlock *NewPredBB = VectorBBPredecessors[i]; + + // When looking up the new scalar/vector values to fix up, use incoming + // values from original phi. + Value *ScIncV = + OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); + + // Scalar incoming value may need a broadcast + Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); + NewPhi->addIncoming(NewIncV, NewPredBB); + } + } +} + void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF) { + PHINode *P = cast<PHINode>(PN); + if (EnableVPlanNativePath) { + // Currently we enter here in the VPlan-native path for non-induction + // PHIs where all control flow is uniform. We simply widen these PHIs. + // Create a vector phi with no operands - the vector phi operands will be + // set at the end of vector code generation. + Type *VecTy = + (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); + Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); + VectorLoopValueMap.setVectorValue(P, 0, VecPhi); + OrigPHIsToFix.push_back(P); + + return; + } + assert(PN->getParent() == OrigLoop->getHeader() && "Non-header phis should have been handled elsewhere"); - PHINode *P = cast<PHINode>(PN); // In order to support recurrences we need to be able to vectorize Phi nodes. // Phi nodes have cycles, so we need to vectorize them in two stages. This is // stage #1: We create a new vector PHI node with no incoming edges. We'll use @@ -3846,7 +3846,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, for (unsigned Lane = 0; Lane < Lanes; ++Lane) { Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); - Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL); + Value *SclrGep = + emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); SclrGep->setName("next.gep"); VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); } @@ -4151,6 +4152,10 @@ void InnerLoopVectorizer::updateAnalysis() { // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); + // DT is not kept up-to-date for outer loop vectorization + if (EnableVPlanNativePath) + return; + // Update the dominator tree information. assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && "Entry does not dominate exit."); @@ -4167,7 +4172,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. - assert(VF >= 2 && !Scalars.count(VF) && + assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && "This function should not be visited twice for the same VF"); SmallSetVector<Instruction *, 8> Worklist; @@ -4253,7 +4258,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { } } for (auto *I : ScalarPtrs) - if (!PossibleNonScalarPtrs.count(I)) { + if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); Worklist.insert(I); } @@ -4279,8 +4284,9 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // Insert the forced scalars. 
// FIXME: Currently widenPHIInstruction() often creates a dead vector // induction variable when the PHI user is scalarized. - if (ForcedScalars.count(VF)) - for (auto *I : ForcedScalars.find(VF)->second) + auto ForcedScalar = ForcedScalars.find(VF); + if (ForcedScalar != ForcedScalars.end()) + for (auto *I : ForcedScalar->second) Worklist.insert(I); // Expand the worklist by looking through any bitcasts and getelementptr @@ -4348,8 +4354,8 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) { - if (!Legal->blockNeedsPredication(I->getParent())) +bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { + if (!blockNeedsPredication(I->getParent())) return false; switch(I->getOpcode()) { default: @@ -4360,6 +4366,14 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) { return false; auto *Ptr = getLoadStorePointerOperand(I); auto *Ty = getMemInstValueType(I); + // We have already decided how to vectorize this instruction, get that + // result. + if (VF > 1) { + InstWidening WideningDecision = getWideningDecision(I, VF); + assert(WideningDecision != CM_Unknown && + "Widening decision should be ready at this moment"); + return WideningDecision == CM_Scalarize; + } return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty)) : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty)); @@ -4373,6 +4387,35 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) { return false; } +bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, + unsigned VF) { + assert(isAccessInterleaved(I) && "Expecting interleaved access."); + assert(getWideningDecision(I, VF) == CM_Unknown && + "Decision should not be set yet."); + auto *Group = getInterleavedAccessGroup(I); + assert(Group && "Must have a group."); + + // Check if masking is required. + // A Group may need masking for one of two reasons: it resides in a block that + // needs predication, or it was decided to use masking to deal with gaps. + bool PredicatedAccessRequiresMasking = + Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); + bool AccessWithGapsRequiresMasking = + Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) + return true; + + // If masked interleaving is required, we expect that the user/target had + // enabled it, because otherwise it either wouldn't have been created or + // it should have been invalidated by the CostModel. + assert(useMaskedInterleavedAccesses(TTI) && + "Masked interleave-groups for predicated accesses are not enabled."); + + auto *Ty = getMemInstValueType(I); + return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty) + : TTI.isLegalMaskedStore(Ty); +} + bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, unsigned VF) { // Get and ensure we have a valid memory instruction. @@ -4407,7 +4450,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // already does this check. Collecting Uniforms for VF=1 does not make any // sense. - assert(VF >= 2 && !Uniforms.count(VF) && + assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. 
If we'll not find any uniform value, we'll @@ -4494,26 +4537,33 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // Add to the Worklist all consecutive and consecutive-like pointers that // aren't also identified as possibly non-uniform. for (auto *V : ConsecutiveLikePtrs) - if (!PossibleNonUniformPtrs.count(V)) { + if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) { LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); Worklist.insert(V); } // Expand Worklist in topological order: whenever a new instruction - // is added , its users should be either already inside Worklist, or - // out of scope. It ensures a uniform instruction will only be used - // by uniform instructions or out of scope instructions. + // is added , its users should be already inside Worklist. It ensures + // a uniform instruction will only be used by uniform instructions. unsigned idx = 0; while (idx != Worklist.size()) { Instruction *I = Worklist[idx++]; for (auto OV : I->operand_values()) { + // isOutOfScope operands cannot be uniform instructions. if (isOutOfScope(OV)) continue; + // First order recurrence Phi's should typically be considered + // non-uniform. + auto *OP = dyn_cast<PHINode>(OV); + if (OP && Legal->isFirstOrderRecurrence(OP)) + continue; + // If all the users of the operand are uniform, then add the + // operand into the uniform worklist. auto *OI = cast<Instruction>(OV); if (llvm::all_of(OI->users(), [&](User *U) -> bool { auto *J = cast<Instruction>(U); - return !TheLoop->contains(J) || Worklist.count(J) || + return Worklist.count(J) || (OI == getLoadStorePointerOperand(J) && isUniformDecision(J, VF)); })) { @@ -4571,318 +4621,6 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } -void InterleavedAccessInfo::collectConstStrideAccesses( - MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo, - const ValueToValueMap &Strides) { - auto &DL = TheLoop->getHeader()->getModule()->getDataLayout(); - - // Since it's desired that the load/store instructions be maintained in - // "program order" for the interleaved access analysis, we have to visit the - // blocks in the loop in reverse postorder (i.e., in a topological order). - // Such an ordering will ensure that any load/store that may be executed - // before a second load/store will precede the second load/store in - // AccessStrideInfo. - LoopBlocksDFS DFS(TheLoop); - DFS.perform(LI); - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) - for (auto &I : *BB) { - auto *LI = dyn_cast<LoadInst>(&I); - auto *SI = dyn_cast<StoreInst>(&I); - if (!LI && !SI) - continue; - - Value *Ptr = getLoadStorePointerOperand(&I); - // We don't check wrapping here because we don't know yet if Ptr will be - // part of a full group or a group with gaps. Checking wrapping for all - // pointers (even those that end up in groups with no gaps) will be overly - // conservative. For full groups, wrapping should be ok since if we would - // wrap around the address space we would do a memory access at nullptr - // even without the transformation. The wrapping checks are therefore - // deferred until after we've formed the interleaved groups. 
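// The expansion step above only promotes an operand to "uniform" once every
// in-loop user of it is already known uniform. A simplified sketch of that
// worklist expansion; instruction ids and the helper name are hypothetical,
// and the pointer-operand and out-of-scope special cases are left out:
#include <cstddef>
#include <set>
#include <vector>

std::vector<int> expandUniforms(const std::vector<int> &Seeds,
                                const std::vector<std::vector<int>> &Operands,
                                const std::vector<std::vector<int>> &Users) {
  std::set<int> InList(Seeds.begin(), Seeds.end());
  std::vector<int> Worklist(Seeds.begin(), Seeds.end());
  for (std::size_t Idx = 0; Idx != Worklist.size(); ++Idx) {
    for (int Op : Operands[Worklist[Idx]]) {
      if (InList.count(Op))
        continue;
      bool AllUsersUniform = true;
      for (int U : Users[Op])
        AllUsersUniform = AllUsersUniform && InList.count(U) != 0;
      if (AllUsersUniform) {
        InList.insert(Op);
        Worklist.push_back(Op); // operand is uniform: keep expanding from it
      }
    }
  }
  return Worklist;
}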
- int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, - /*Assume=*/true, /*ShouldCheckWrap=*/false); - - const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); - PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); - uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()); - - // An alignment of 0 means target ABI alignment. - unsigned Align = getMemInstAlignment(&I); - if (!Align) - Align = DL.getABITypeAlignment(PtrTy->getElementType()); - - AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, Align); - } -} - -// Analyze interleaved accesses and collect them into interleaved load and -// store groups. -// -// When generating code for an interleaved load group, we effectively hoist all -// loads in the group to the location of the first load in program order. When -// generating code for an interleaved store group, we sink all stores to the -// location of the last store. This code motion can change the order of load -// and store instructions and may break dependences. -// -// The code generation strategy mentioned above ensures that we won't violate -// any write-after-read (WAR) dependences. -// -// E.g., for the WAR dependence: a = A[i]; // (1) -// A[i] = b; // (2) -// -// The store group of (2) is always inserted at or below (2), and the load -// group of (1) is always inserted at or above (1). Thus, the instructions will -// never be reordered. All other dependences are checked to ensure the -// correctness of the instruction reordering. -// -// The algorithm visits all memory accesses in the loop in bottom-up program -// order. Program order is established by traversing the blocks in the loop in -// reverse postorder when collecting the accesses. -// -// We visit the memory accesses in bottom-up order because it can simplify the -// construction of store groups in the presence of write-after-write (WAW) -// dependences. -// -// E.g., for the WAW dependence: A[i] = a; // (1) -// A[i] = b; // (2) -// A[i + 1] = c; // (3) -// -// We will first create a store group with (3) and (2). (1) can't be added to -// this group because it and (2) are dependent. However, (1) can be grouped -// with other accesses that may precede it in program order. Note that a -// bottom-up order does not imply that WAW dependences should not be checked. -void InterleavedAccessInfo::analyzeInterleaving() { - LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n"); - const ValueToValueMap &Strides = LAI->getSymbolicStrides(); - - // Holds all accesses with a constant stride. - MapVector<Instruction *, StrideDescriptor> AccessStrideInfo; - collectConstStrideAccesses(AccessStrideInfo, Strides); - - if (AccessStrideInfo.empty()) - return; - - // Collect the dependences in the loop. - collectDependences(); - - // Holds all interleaved store groups temporarily. - SmallSetVector<InterleaveGroup *, 4> StoreGroups; - // Holds all interleaved load groups temporarily. - SmallSetVector<InterleaveGroup *, 4> LoadGroups; - - // Search in bottom-up program order for pairs of accesses (A and B) that can - // form interleaved load or store groups. In the algorithm below, access A - // precedes access B in program order. We initialize a group for B in the - // outer loop of the algorithm, and then in the inner loop, we attempt to - // insert each A into B's group if: - // - // 1. A and B have the same stride, - // 2. A and B have the same memory object size, and - // 3. A belongs in B's group according to its distance from B. 
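// Rules 1-3 above reduce to a small predicate on the two stride descriptors
// and the distance between the accesses. A sketch with hypothetical names;
// the dependence and predication checks are handled separately by the pass:
#include <cstdint>

struct AccessDesc {
  int64_t Stride; // stride in multiples of the access size
  uint64_t Size;  // store size of the accessed type, in bytes
};

bool mayBelongToSameGroup(const AccessDesc &A, const AccessDesc &B,
                          int64_t DistanceAToB) {
  if (A.Stride != B.Stride)                                // rule 1
    return false;
  if (A.Size != B.Size)                                    // rule 2
    return false;
  return DistanceAToB % static_cast<int64_t>(B.Size) == 0; // rule 3
}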
- // - // Special care is taken to ensure group formation will not break any - // dependences. - for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend(); - BI != E; ++BI) { - Instruction *B = BI->first; - StrideDescriptor DesB = BI->second; - - // Initialize a group for B if it has an allowable stride. Even if we don't - // create a group for B, we continue with the bottom-up algorithm to ensure - // we don't break any of B's dependences. - InterleaveGroup *Group = nullptr; - if (isStrided(DesB.Stride)) { - Group = getInterleaveGroup(B); - if (!Group) { - LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B - << '\n'); - Group = createInterleaveGroup(B, DesB.Stride, DesB.Align); - } - if (B->mayWriteToMemory()) - StoreGroups.insert(Group); - else - LoadGroups.insert(Group); - } - - for (auto AI = std::next(BI); AI != E; ++AI) { - Instruction *A = AI->first; - StrideDescriptor DesA = AI->second; - - // Our code motion strategy implies that we can't have dependences - // between accesses in an interleaved group and other accesses located - // between the first and last member of the group. Note that this also - // means that a group can't have more than one member at a given offset. - // The accesses in a group can have dependences with other accesses, but - // we must ensure we don't extend the boundaries of the group such that - // we encompass those dependent accesses. - // - // For example, assume we have the sequence of accesses shown below in a - // stride-2 loop: - // - // (1, 2) is a group | A[i] = a; // (1) - // | A[i-1] = b; // (2) | - // A[i-3] = c; // (3) - // A[i] = d; // (4) | (2, 4) is not a group - // - // Because accesses (2) and (3) are dependent, we can group (2) with (1) - // but not with (4). If we did, the dependent access (3) would be within - // the boundaries of the (2, 4) group. - if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) { - // If a dependence exists and A is already in a group, we know that A - // must be a store since A precedes B and WAR dependences are allowed. - // Thus, A would be sunk below B. We release A's group to prevent this - // illegal code motion. A will then be free to form another group with - // instructions that precede it. - if (isInterleaved(A)) { - InterleaveGroup *StoreGroup = getInterleaveGroup(A); - StoreGroups.remove(StoreGroup); - releaseGroup(StoreGroup); - } - - // If a dependence exists and A is not already in a group (or it was - // and we just released it), B might be hoisted above A (if B is a - // load) or another store might be sunk below A (if B is a store). In - // either case, we can't add additional instructions to B's group. B - // will only form a group with instructions that it precedes. - break; - } - - // At this point, we've checked for illegal code motion. If either A or B - // isn't strided, there's nothing left to do. - if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride)) - continue; - - // Ignore A if it's already in a group or isn't the same kind of memory - // operation as B. - // Note that mayReadFromMemory() isn't mutually exclusive to mayWriteToMemory - // in the case of atomic loads. We shouldn't see those here, canVectorizeMemory() - // should have returned false - except for the case we asked for optimization - // remarks. - if (isInterleaved(A) || (A->mayReadFromMemory() != B->mayReadFromMemory()) - || (A->mayWriteToMemory() != B->mayWriteToMemory())) - continue; - - // Check rules 1 and 2. Ignore A if its stride or size is different from - // that of B. 
- if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size) - continue; - - // Ignore A if the memory object of A and B don't belong to the same - // address space - if (getMemInstAddressSpace(A) != getMemInstAddressSpace(B)) - continue; - - // Calculate the distance from A to B. - const SCEVConstant *DistToB = dyn_cast<SCEVConstant>( - PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev)); - if (!DistToB) - continue; - int64_t DistanceToB = DistToB->getAPInt().getSExtValue(); - - // Check rule 3. Ignore A if its distance to B is not a multiple of the - // size. - if (DistanceToB % static_cast<int64_t>(DesB.Size)) - continue; - - // Ignore A if either A or B is in a predicated block. Although we - // currently prevent group formation for predicated accesses, we may be - // able to relax this limitation in the future once we handle more - // complicated blocks. - if (isPredicated(A->getParent()) || isPredicated(B->getParent())) - continue; - - // The index of A is the index of B plus A's distance to B in multiples - // of the size. - int IndexA = - Group->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size); - - // Try to insert A into B's group. - if (Group->insertMember(A, IndexA, DesA.Align)) { - LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << '\n' - << " into the interleave group with" << *B - << '\n'); - InterleaveGroupMap[A] = Group; - - // Set the first load in program order as the insert position. - if (A->mayReadFromMemory()) - Group->setInsertPos(A); - } - } // Iteration over A accesses. - } // Iteration over B accesses. - - // Remove interleaved store groups with gaps. - for (InterleaveGroup *Group : StoreGroups) - if (Group->getNumMembers() != Group->getFactor()) { - LLVM_DEBUG( - dbgs() << "LV: Invalidate candidate interleaved store group due " - "to gaps.\n"); - releaseGroup(Group); - } - // Remove interleaved groups with gaps (currently only loads) whose memory - // accesses may wrap around. We have to revisit the getPtrStride analysis, - // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does - // not check wrapping (see documentation there). - // FORNOW we use Assume=false; - // TODO: Change to Assume=true but making sure we don't exceed the threshold - // of runtime SCEV assumptions checks (thereby potentially failing to - // vectorize altogether). - // Additional optional optimizations: - // TODO: If we are peeling the loop and we know that the first pointer doesn't - // wrap then we can deduce that all pointers in the group don't wrap. - // This means that we can forcefully peel the loop in order to only have to - // check the first pointer for no-wrap. When we'll change to use Assume=true - // we'll only need at most one runtime check per interleaved group. - for (InterleaveGroup *Group : LoadGroups) { - // Case 1: A full group. Can Skip the checks; For full groups, if the wide - // load would wrap around the address space we would do a memory access at - // nullptr even without the transformation. - if (Group->getNumMembers() == Group->getFactor()) - continue; - - // Case 2: If first and last members of the group don't wrap this implies - // that all the pointers in the group don't wrap. - // So we check only group member 0 (which is always guaranteed to exist), - // and group member Factor - 1; If the latter doesn't exist we rely on - // peeling (if it is a non-reveresed accsess -- see Case 3). 
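// A's slot in B's group is derived from the distance between their start
// addresses: index(A) = index(B) + distance / size. A toy group model
// (hypothetical and much simpler than llvm::InterleaveGroup) that also
// rejects a second member at the same offset or one outside the factor:
#include <algorithm>
#include <cstdint>
#include <map>

struct ToyGroup {
  int Factor;                 // interleave factor
  std::map<int, int> Members; // slot index -> member id

  bool insertMember(int Member, int Index) {
    if (Members.count(Index))
      return false; // at most one member per offset
    int Lo = Members.empty() ? Index : std::min(Index, Members.begin()->first);
    int Hi = Members.empty() ? Index : std::max(Index, Members.rbegin()->first);
    if (Hi - Lo >= Factor)
      return false; // would not fit within the interleave factor
    Members[Index] = Member;
    return true;
  }
};

int indexOfA(int IndexB, int64_t DistanceAToB, uint64_t Size) {
  return IndexB + static_cast<int>(DistanceAToB / static_cast<int64_t>(Size));
}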
- Value *FirstMemberPtr = getLoadStorePointerOperand(Group->getMember(0)); - if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false, - /*ShouldCheckWrap=*/true)) { - LLVM_DEBUG( - dbgs() << "LV: Invalidate candidate interleaved group due to " - "first group member potentially pointer-wrapping.\n"); - releaseGroup(Group); - continue; - } - Instruction *LastMember = Group->getMember(Group->getFactor() - 1); - if (LastMember) { - Value *LastMemberPtr = getLoadStorePointerOperand(LastMember); - if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false, - /*ShouldCheckWrap=*/true)) { - LLVM_DEBUG( - dbgs() << "LV: Invalidate candidate interleaved group due to " - "last group member potentially pointer-wrapping.\n"); - releaseGroup(Group); - } - } else { - // Case 3: A non-reversed interleaved load group with gaps: We need - // to execute at least one scalar epilogue iteration. This will ensure - // we don't speculatively access memory out-of-bounds. We only need - // to look for a member at index factor - 1, since every group must have - // a member at index zero. - if (Group->isReverse()) { - LLVM_DEBUG( - dbgs() << "LV: Invalidate candidate interleaved group due to " - "a reverse access with gaps.\n"); - releaseGroup(Group); - continue; - } - LLVM_DEBUG( - dbgs() << "LV: Interleaved group requires epilogue iteration.\n"); - RequiresScalarEpilogue = true; - } - } -} - Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically @@ -4912,39 +4650,78 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { return None; } + if (!PSE.getUnionPredicate().getPredicates().empty()) { + ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") + << "runtime SCEV checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz"); + LLVM_DEBUG( + dbgs() + << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n"); + return None; + } + + // FIXME: Avoid specializing for stride==1 instead of bailing out. + if (!Legal->getLAI()->getSymbolicStrides().empty()) { + ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") + << "runtime stride == 1 checks needed. Enable vectorization of " + "this loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz"); + LLVM_DEBUG( + dbgs() + << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n"); + return None; + } + // If we optimize the program for size, avoid creating the tail loop. LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - // If we don't know the precise trip count, don't try to vectorize. - if (TC < 2) { - ORE->emit( - createMissedAnalysis("UnknownLoopCountComplexCFG") - << "unable to calculate the loop count due to complex control flow"); - LLVM_DEBUG( - dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); + if (TC == 1) { + ORE->emit(createMissedAnalysis("SingleIterationLoop") + << "loop trip count is one, irrelevant for vectorization"); + LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n"); return None; } + // Record that scalar epilogue is not allowed. + LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + + IsScalarEpilogueAllowed = !OptForSize; + + // We don't create an epilogue when optimizing for size. 
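// When can an interleave group be widened? Mirroring the new
// interleavedAccessCanBeWidened above: a group needs a mask either because
// its block is predicated or because its gaps must be masked once a scalar
// epilogue is disallowed; whenever a mask is needed, the target must support
// the corresponding masked memory operation. Illustrative sketch only, with
// hypothetical boolean inputs rather than the pass's own queries:
bool groupCanBeWidened(bool BlockNeedsPredication, bool MaskRequired,
                       bool GroupRequiresScalarEpilogue,
                       bool ScalarEpilogueAllowed, bool IsLoad,
                       bool TargetHasMaskedLoad, bool TargetHasMaskedStore) {
  bool PredicatedAccessRequiresMasking = BlockNeedsPredication && MaskRequired;
  bool AccessWithGapsRequiresMasking =
      GroupRequiresScalarEpilogue && !ScalarEpilogueAllowed;
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true; // no mask needed: the group can always be widened
  return IsLoad ? TargetHasMaskedLoad : TargetHasMaskedStore;
}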
+ // Invalidate interleave groups that require an epilogue if we can't mask + // the interleave-group. + if (!useMaskedInterleavedAccesses(TTI)) + InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); + unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC); - if (TC % MaxVF != 0) { - // If the trip count that we found modulo the vectorization factor is not - // zero then we require a tail. - // FIXME: look for a smaller MaxVF that does divide TC rather than give up. - // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a - // smaller MaxVF that does not require a scalar epilog. - - ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize") - << "cannot optimize for size and vectorize at the " - "same time. Enable vectorization of this loop " - "with '#pragma clang loop vectorize(enable)' " - "when compiling with -Os/-Oz"); - LLVM_DEBUG( - dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); + if (TC > 0 && TC % MaxVF == 0) { + LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); + return MaxVF; + } + + // If we don't know the precise trip count, or if the trip count that we + // found modulo the vectorization factor is not zero, try to fold the tail + // by masking. + // FIXME: look for a smaller MaxVF that does divide TC rather than masking. + if (Legal->canFoldTailByMasking()) { + FoldTailByMasking = true; + return MaxVF; + } + + if (TC == 0) { + ORE->emit( + createMissedAnalysis("UnknownLoopCountComplexCFG") + << "unable to calculate the loop count due to complex control flow"); return None; } - return MaxVF; + ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize") + << "cannot optimize for size and vectorize at the same time. " + "Enable vectorization of this loop with '#pragma clang loop " + "vectorize(enable)' when compiling with -Os/-Oz"); + return None; } unsigned @@ -5080,11 +4857,11 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { // For each block. for (BasicBlock *BB : TheLoop->blocks()) { // For each instruction in the loop. - for (Instruction &I : *BB) { + for (Instruction &I : BB->instructionsWithoutDebug()) { Type *T = I.getType(); // Skip ignored values. - if (ValuesToIgnore.count(&I)) + if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) continue; // Only examine Loads, Stores and PHINodes. @@ -5182,6 +4959,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // fit without causing spills. All of this is rounded down if necessary to be // a power of two. We want power of two interleave count to simplify any // addressing operations or alignment considerations. + // We also want power of two interleave counts to ensure that the induction + // variable of the vector loop wraps to zero, when tail is folded by masking; + // this currently happens when OptForSize, in which case IC is set to 1 above. unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers); @@ -5307,7 +5087,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { using IntervalMap = DenseMap<Instruction *, unsigned>; // Maps instruction to its index. - DenseMap<unsigned, Instruction *> IdxToInstr; + SmallVector<Instruction *, 64> IdxToInstr; // Marks the end of each interval. IntervalMap EndPoint; // Saves the list of instruction indices that are used in the loop. @@ -5316,10 +5096,9 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { // defined outside the loop, such as arguments and constants. 
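// The rewritten tail handling in computeMaxVF above boils down to three
// outcomes when optimizing for size: no tail remains, the tail can be folded
// by masking, or vectorization is abandoned. A sketch with hypothetical
// names (TC == 0 stands for an unknown trip count):
#include <optional>

struct MaxVFResult {
  unsigned MaxVF;
  bool FoldTailByMasking;
};

std::optional<MaxVFResult> chooseMaxVFForSize(unsigned TC, unsigned MaxVF,
                                              bool CanFoldTailByMasking) {
  if (TC > 0 && TC % MaxVF == 0)
    return MaxVFResult{MaxVF, false}; // no tail remains for this VF
  if (CanFoldTailByMasking)
    return MaxVFResult{MaxVF, true};  // fold the tail with a mask
  return std::nullopt;                // would need a scalar epilogue: abort
}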
SmallPtrSet<Value *, 8> LoopInvariants; - unsigned Index = 0; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { - for (Instruction &I : *BB) { - IdxToInstr[Index++] = &I; + for (Instruction &I : BB->instructionsWithoutDebug()) { + IdxToInstr.push_back(&I); // Save the end location of each USE. for (Value *U : I.operands()) { @@ -5336,7 +5115,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { } // Overwrite previous end points. - EndPoint[Instr] = Index; + EndPoint[Instr] = IdxToInstr.size(); Ends.insert(Instr); } } @@ -5373,7 +5152,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { return std::max<unsigned>(1, VF * TypeSize / WidestRegister); }; - for (unsigned int i = 0; i < Index; ++i) { + for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { Instruction *I = IdxToInstr[i]; // Remove all of the instructions that end at this location. @@ -5382,11 +5161,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { OpenIntervals.erase(ToRemove); // Ignore instructions that are never used within the loop. - if (!Ends.count(I)) + if (Ends.find(I) == Ends.end()) continue; // Skip ignored values. - if (ValuesToIgnore.count(I)) + if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) continue; // For each VF find the maximum usage of registers. @@ -5400,7 +5179,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { unsigned RegUsage = 0; for (auto Inst : OpenIntervals) { // Skip ignored values for VF > 1. - if (VecValuesToIgnore.count(Inst) || + if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() || isScalarAfterVectorization(Inst, VFs[j])) continue; RegUsage += GetRegUsage(Inst->getType(), VFs[j]); @@ -5446,8 +5225,7 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ // from moving "masked load/store" check from legality to cost model. // Masked Load/Gather emulation was previously never allowed. // Limited number of Masked Store/Scatter emulation was allowed. - assert(isScalarWithPredication(I) && - "Expecting a scalar emulated instruction"); + assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); return isa<LoadInst>(I) || (isa<StoreInst>(I) && NumPredStores > NumberOfStoresToPredicate); @@ -5458,7 +5236,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. - if (VF < 2 || InstsToScalarize.count(VF)) + if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's @@ -5470,7 +5248,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { // determine if it would be better to not if-convert the blocks they are in. // If so, we also record the instructions to scalarize. for (BasicBlock *BB : TheLoop->blocks()) { - if (!Legal->blockNeedsPredication(BB)) + if (!blockNeedsPredication(BB)) continue; for (Instruction &I : *BB) if (isScalarWithPredication(&I)) { @@ -5553,7 +5331,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( Instruction *I = Worklist.pop_back_val(); // If we've already analyzed the instruction, there's nothing to do. - if (ScalarCosts.count(I)) + if (ScalarCosts.find(I) != ScalarCosts.end()) continue; // Compute the cost of the vector instruction. 
Note that this cost already @@ -5612,8 +5390,8 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { // For each instruction in the old loop. for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. - if (ValuesToIgnore.count(&I) || - (VF > 1 && VecValuesToIgnore.count(&I))) + if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || + (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) continue; VectorizationCostTy C = getInstructionCost(&I, VF); @@ -5635,7 +5413,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { // unconditionally executed. For the scalar case, we may not always execute // the predicated block. Thus, scale the block's cost by the probability of // executing it. - if (VF == 1 && Legal->blockNeedsPredication(BB)) + if (VF == 1 && blockNeedsPredication(BB)) BlockCost.first /= getReciprocalPredBlockProb(); Cost.first += BlockCost.first; @@ -5682,11 +5460,12 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, unsigned VF) { + assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); - unsigned Alignment = getMemInstAlignment(I); - unsigned AS = getMemInstAddressSpace(I); + unsigned Alignment = getLoadStoreAlignment(I); + unsigned AS = getLoadStoreAddressSpace(I); Value *Ptr = getLoadStorePointerOperand(I); Type *PtrTy = ToVectorTy(Ptr->getType(), VF); @@ -5697,9 +5476,11 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // Get the cost of the scalar memory instruction and address computation. unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + // Don't pass *I here, since it is scalar but will actually be part of a + // vectorized loop where the user of it is a vectorized instruction. Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, - AS, I); + AS); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. @@ -5708,7 +5489,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // If we have a predicated store, it may not be executed for each vector // lane. Scale the cost by the probability of executing the predicated // block. 
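// Shape of the scalarization cost computed above: VF copies of the address
// computation and scalar memory op, plus insert/extract overhead, scaled
// down when the access sits in a predicated block. A hypothetical helper;
// the divisor assumes the pass's default reciprocal block probability of 2
// (i.e. a 50% chance that the predicated block executes):
unsigned scalarizedMemOpCost(unsigned VF, unsigned AddrCost,
                             unsigned MemOpCost, unsigned InsertExtractCost,
                             bool Predicated) {
  unsigned Cost = VF * (AddrCost + MemOpCost) + InsertExtractCost;
  if (Predicated)
    Cost /= 2; // scale by the probability of executing the predicated block
  return Cost;
}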
- if (isScalarWithPredication(I)) { + if (isPredicatedInst(I)) { Cost /= getReciprocalPredBlockProb(); if (useEmulatedMaskMemRefHack(I)) @@ -5724,9 +5505,9 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getMemInstAlignment(I); + unsigned Alignment = getLoadStoreAlignment(I); Value *Ptr = getLoadStorePointerOperand(I); - unsigned AS = getMemInstAddressSpace(I); + unsigned AS = getLoadStoreAddressSpace(I); int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && @@ -5745,22 +5526,30 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, unsigned VF) { - LoadInst *LI = cast<LoadInst>(I); - Type *ValTy = LI->getType(); + Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = LI->getAlignment(); - unsigned AS = LI->getPointerAddressSpace(); + unsigned Alignment = getLoadStoreAlignment(I); + unsigned AS = getLoadStoreAddressSpace(I); + if (isa<LoadInst>(I)) { + return TTI.getAddressComputationCost(ValTy) + + TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); + } + StoreInst *SI = cast<StoreInst>(I); + bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + - TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + + (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost( + Instruction::ExtractElement, + VectorTy, VF - 1)); } unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getMemInstAlignment(I); + unsigned Alignment = getLoadStoreAlignment(I); Value *Ptr = getLoadStorePointerOperand(I); return TTI.getAddressComputationCost(VectorTy) + @@ -5772,7 +5561,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned AS = getMemInstAddressSpace(I); + unsigned AS = getLoadStoreAddressSpace(I); auto Group = getInterleavedAccessGroup(I); assert(Group && "Fail to get an interleaved access group."); @@ -5790,13 +5579,19 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, } // Calculate the cost of the whole interleaved group. - unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy, - Group->getFactor(), Indices, - Group->getAlignment(), AS); - - if (Group->isReverse()) + bool UseMaskForGaps = + Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + unsigned Cost = TTI.getInterleavedMemoryOpCost( + I->getOpcode(), WideVecTy, Group->getFactor(), Indices, + Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); + + if (Group->isReverse()) { + // TODO: Add support for reversed masked interleaved access. 
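// Shape of getUniformMemOpCost after the change above: a uniform-address
// load becomes "scalar load + broadcast", while a uniform-address store
// becomes "scalar store + extract of the last lane" unless the stored value
// is loop-invariant. A hypothetical helper; the real costs come from
// TargetTransformInfo:
unsigned uniformMemOpCost(bool IsLoad, unsigned AddrCost, unsigned MemOpCost,
                          unsigned BroadcastCost, unsigned ExtractCost,
                          bool StoreValueIsLoopInvariant) {
  if (IsLoad)
    return AddrCost + MemOpCost + BroadcastCost;
  return AddrCost + MemOpCost +
         (StoreValueIsLoopInvariant ? 0u : ExtractCost);
}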
+ assert(!Legal->isMaskRequired(I) && + "Reverse masked interleaved access not supported."); Cost += Group->getNumMembers() * TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + } return Cost; } @@ -5806,8 +5601,8 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, // moment. if (VF == 1) { Type *ValTy = getMemInstValueType(I); - unsigned Alignment = getMemInstAlignment(I); - unsigned AS = getMemInstAddressSpace(I); + unsigned Alignment = getLoadStoreAlignment(I); + unsigned AS = getLoadStoreAddressSpace(I); return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); @@ -5826,9 +5621,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { return VectorizationCostTy(InstsToScalarize[VF][I], false); // Forced scalars do not have any scalarization overhead. - if (VF > 1 && ForcedScalars.count(VF) && - ForcedScalars.find(VF)->second.count(I)) - return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); + auto ForcedScalar = ForcedScalars.find(VF); + if (VF > 1 && ForcedScalar != ForcedScalars.end()) { + auto InstSet = ForcedScalar->second; + if (InstSet.find(I) != InstSet.end()) + return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); + } Type *VectorTy; unsigned C = getInstructionCost(I, VF, VectorTy); @@ -5849,10 +5647,22 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { if (!Ptr) continue; + // TODO: We should generate better code and update the cost model for + // predicated uniform stores. Today they are treated as any other + // predicated store (see added test cases in + // invariant-store-vectorization.ll). if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) NumPredStores++; - if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) { - // Scalar load + broadcast + + if (Legal->isUniform(Ptr) && + // Conditional loads and stores should be scalarized and predicated. + // isScalarWithPredication cannot be used here since masked + // gather/scatters are not considered scalar with predication. + !Legal->blockNeedsPredication(I.getParent())) { + // TODO: Avoid replicating loads and stores instead of + // relying on instcombine to remove them. + // Load: Scalar load + broadcast + // Store: Scalar store + isLoopInvariantStoreValue ? 
0 : extract unsigned Cost = getUniformMemOpCost(&I, VF); setWideningDecision(&I, VF, CM_Scalarize, Cost); continue; @@ -5883,7 +5693,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { continue; NumAccesses = Group->getNumMembers(); - InterleaveCost = getInterleaveGroupCost(&I, VF); + if (interleavedAccessCanBeWidened(&I, VF)) + InterleaveCost = getInterleaveGroupCost(&I, VF); } unsigned GatherScatterCost = @@ -6001,8 +5812,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, bool ScalarPredicatedBB = false; BranchInst *BI = cast<BranchInst>(I); if (VF > 1 && BI->isConditional() && - (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || - PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) + (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != + PredicatedBBsAfterVectorization.end() || + PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != + PredicatedBBsAfterVectorization.end())) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { @@ -6025,9 +5838,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, auto *Phi = cast<PHINode>(I); // First-order recurrences are replaced by vector shuffles inside the loop. + // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - VectorTy, VF - 1, VectorTy); + VectorTy, VF - 1, VectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi @@ -6089,38 +5903,18 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, return 0; // Certain instructions can be cheaper to vectorize if they have a constant // second vector operand. One example of this are shifts on x86. - TargetTransformInfo::OperandValueKind Op1VK = - TargetTransformInfo::OK_AnyValue; - TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_AnyValue; - TargetTransformInfo::OperandValueProperties Op1VP = - TargetTransformInfo::OP_None; - TargetTransformInfo::OperandValueProperties Op2VP = - TargetTransformInfo::OP_None; Value *Op2 = I->getOperand(1); - - // Check for a splat or for a non uniform vector of constants. - if (isa<ConstantInt>(Op2)) { - ConstantInt *CInt = cast<ConstantInt>(Op2); - if (CInt && CInt->getValue().isPowerOf2()) - Op2VP = TargetTransformInfo::OP_PowerOf2; - Op2VK = TargetTransformInfo::OK_UniformConstantValue; - } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) { - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; - Constant *SplatValue = cast<Constant>(Op2)->getSplatValue(); - if (SplatValue) { - ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue); - if (CInt && CInt->getValue().isPowerOf2()) - Op2VP = TargetTransformInfo::OP_PowerOf2; - Op2VK = TargetTransformInfo::OK_UniformConstantValue; - } - } else if (Legal->isUniform(Op2)) { + TargetTransformInfo::OperandValueProperties Op2VP; + TargetTransformInfo::OperandValueKind Op2VK = + TTI.getOperandInfo(Op2, Op2VP); + if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) Op2VK = TargetTransformInfo::OK_UniformValue; - } + SmallVector<const Value *, 4> Operands(I->operand_values()); unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; - return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + return N * TTI.getArithmeticInstrCost( + I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, + Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands); } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); @@ -6237,8 +6031,9 @@ INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { -Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { - return new LoopVectorize(NoUnrolling, AlwaysVectorize); +Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, + bool VectorizeOnlyWhenForced) { + return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); } } // end namespace llvm @@ -6316,6 +6111,16 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize. return NoVectorization; + // Invalidate interleave groups if all blocks of loop will be predicated. + if (CM.blockNeedsPredication(OrigLoop->getHeader()) && + !useMaskedInterleavedAccesses(*TTI)) { + LLVM_DEBUG( + dbgs() + << "LV: Invalidate all interleaved groups due to fold-tail by masking " + "which requires masked-interleaved support.\n"); + CM.InterleaveInfo.reset(); + } + if (UserVF) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); @@ -6372,6 +6177,7 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, DT, ILV.Builder, ILV.VectorLoopValueMap, &ILV, CallbackILV}; State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); + State.TripCount = ILV.getOrCreateTripCount(nullptr); //===------------------------------------------------===// // @@ -6408,7 +6214,8 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions( PHINode *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { - return U == Ind || DeadInstructions.count(cast<Instruction>(U)); + return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != + DeadInstructions.end(); })) DeadInstructions.insert(IndUpdate); @@ -6551,9 +6358,17 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { // load/store/gather/scatter. Initialize BlockMask to no-mask. VPValue *BlockMask = nullptr; - // Loop incoming mask is all-one. - if (OrigLoop->getHeader() == BB) + if (OrigLoop->getHeader() == BB) { + if (!CM.blockNeedsPredication(BB)) + return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. + + // Introduce the early-exit compare IV <= BTC to form header block mask. + // This is used instead of IV < TC because TC may wrap, unlike BTC. + VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); + VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); + BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); return BlockMaskCache[BB] = BlockMask; + } // This is the block mask. We OR all incoming edges. 
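// Why the new header mask above compares IV <= BTC (backedge-taken count)
// rather than IV < TC: if the loop runs for 2^N iterations of an N-bit IV,
// TC wraps to 0 while BTC = TC - 1 does not. A scalar model of the per-lane
// header mask (hypothetical helper, not the VPlan code):
#include <cstdint>
#include <vector>

std::vector<bool> headerMask(uint64_t IV, uint64_t BTC, unsigned VF) {
  std::vector<bool> Lanes(VF);
  for (unsigned L = 0; L < VF; ++L)
    Lanes[L] = (IV + L) <= BTC; // lane stays active while its index is in range
  return Lanes;
}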
for (auto *Predecessor : predecessors(BB)) { @@ -6573,8 +6388,9 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { } VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, - VFRange &Range) { - const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I); + VFRange &Range, + VPlanPtr &Plan) { + const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I); if (!IG) return nullptr; @@ -6595,7 +6411,11 @@ VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, assert(I == IG->getInsertPos() && "Generating a recipe for an adjunct member of an interleave group"); - return new VPInterleaveRecipe(IG); + VPValue *Mask = nullptr; + if (Legal->isMaskRequired(I)) + Mask = createBlockInMask(I->getParent(), Plan); + + return new VPInterleaveRecipe(IG, Mask); } VPWidenMemoryInstructionRecipe * @@ -6688,7 +6508,11 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range) { - if (CM.isScalarWithPredication(I)) + + bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( + [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); + + if (IsPredicated) return false; auto IsVectorizableOpcode = [](unsigned Opcode) { @@ -6795,7 +6619,9 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); - bool IsPredicated = CM.isScalarWithPredication(I); + bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( + [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); + auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); // Find if I uses a predicated instruction. If so, it will use its scalar @@ -6857,7 +6683,7 @@ bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPRecipeBase *Recipe = nullptr; // Check if Instr should belong to an interleave memory recipe, or already // does. In the latter case Instr is irrelevant. - if ((Recipe = tryToInterleaveMemory(Instr, Range))) { + if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { VPBB->appendRecipe(Recipe); return true; } @@ -6908,6 +6734,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, NeedDef.insert(Branch->getCondition()); } + // If the tail is to be folded by masking, the primary induction variable + // needs to be represented in VPlan for it to model early-exit masking. + if (CM.foldTailByMasking()) + NeedDef.insert(Legal->getPrimaryInduction()); + // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. For // example, original induction update instructions can become dead because we @@ -6969,18 +6800,21 @@ LoopVectorizationPlanner::buildVPlanWithVPRecipes( // First filter out irrelevant instructions, to ensure no recipes are // built for them. - if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) + if (isa<BranchInst>(Instr) || + DeadInstructions.find(Instr) != DeadInstructions.end()) continue; // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct // member of the IG, do not construct any Recipe for it. 
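// The per-VF predication queries above funnel through
// LoopVectorizationPlanner::getDecisionAndClampRange. A sketch of that idea
// with hypothetical toy types: take the decision at Range.Start and shrink
// the range so every VF in it agrees, letting one recipe serve a whole
// sub-range of VFs.
#include <functional>

struct ToyVFRange {
  unsigned Start; // inclusive, a power of two
  unsigned End;   // exclusive
};

bool decideAndClamp(const std::function<bool(unsigned)> &Predicate,
                    ToyVFRange &Range) {
  bool Decision = Predicate(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != Decision) {
      Range.End = VF; // decisions diverge here: clamp the range
      break;
    }
  return Decision;
}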
- const InterleaveGroup *IG = CM.getInterleavedAccessGroup(Instr); + const InterleaveGroup<Instruction> *IG = + CM.getInterleavedAccessGroup(Instr); if (IG && Instr != IG->getInsertPos() && Range.Start >= 2 && // Query is illegal for VF == 1 CM.getWideningDecision(Instr, Range.Start) == LoopVectorizationCostModel::CM_Interleave) { - if (SinkAfterInverse.count(Instr)) - Ingredients.push_back(SinkAfterInverse.find(Instr)->second); + auto SinkCandidate = SinkAfterInverse.find(Instr); + if (SinkCandidate != SinkAfterInverse.end()) + Ingredients.push_back(SinkCandidate->second); continue; } @@ -7063,6 +6897,13 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range) { VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); HCFGBuilder.buildHierarchicalCFG(); + SmallPtrSet<Instruction *, 1> DeadInstructions; + VPlanHCFGTransforms::VPInstructionsToVPRecipes( + Plan, Legal->getInductionVars(), DeadInstructions); + + for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) + Plan->addVF(VF); + return Plan; } @@ -7075,6 +6916,10 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); + if (User) { + O << ", "; + User->getOperand(0)->printAsOperand(O); + } O << "\\l\""; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) @@ -7137,7 +6982,15 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); + if (!User) + return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); + + // Last (and currently only) operand is a mask. + InnerLoopVectorizer::VectorParts MaskValues(State.UF); + VPValue *Mask = User->getOperand(User->getNumOperands() - 1); + for (unsigned Part = 0; Part < State.UF; ++Part) + MaskValues[Part] = State.get(Mask, Part); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); } void VPReplicateRecipe::execute(VPTransformState &State) { @@ -7264,11 +7117,26 @@ static bool processLoopInVPlanNativePath( Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); // Plan how to best vectorize, return the best VF and its cost. - LVP.planInVPlanNativePath(OptForSize, UserVF); + VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF); - // Returning false. We are currently not generating vector code in the VPlan - // native path. - return false; + // If we are stress testing VPlan builds, do not attempt to generate vector + // code. + if (VPlanBuildStressTest) + return false; + + LVP.setBestPlan(VF.Width, 1); + + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, UserVF, 1, LVL, + &CM); + LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" + << L->getHeader()->getParent()->getName() << "\"\n"); + LVP.executePlan(LB, DT); + + // Mark the loop as already vectorized to avoid vectorizing again. 
+ Hints.setAlreadyVectorized(); + + LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); + return true; } bool LoopVectorizePass::processLoop(Loop *L) { @@ -7283,7 +7151,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"); - LoopVectorizeHints Hints(L, DisableUnrolling, *ORE); + LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); LLVM_DEBUG( dbgs() << "LV: Loop hints:" @@ -7307,7 +7175,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // less verbose reporting vectorized loops and unvectorized loops that may // benefit from vectorization, respectively. - if (!Hints.allowVectorization(F, L, AlwaysVectorize)) { + if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); return false; } @@ -7320,7 +7188,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { &Requirements, &Hints, DB, AC); if (!LVL.canVectorize(EnableVPlanNativePath)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); - emitMissedWarning(F, L, Hints, ORE); + Hints.emitRemarkWithHints(); return false; } @@ -7393,7 +7261,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "NoImplicitFloat", L) << "loop not vectorized due to NoImplicitFloat attribute"); - emitMissedWarning(F, L, Hints, ORE); + Hints.emitRemarkWithHints(); return false; } @@ -7408,7 +7276,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { ORE->emit( createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) << "loop not vectorized due to unsafe FP support."); - emitMissedWarning(F, L, Hints, ORE); + Hints.emitRemarkWithHints(); return false; } @@ -7421,7 +7289,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Analyze interleaved memory accesses. if (UseInterleaved) { - IAI.analyzeInterleaving(); + IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); } // Use the cost model. @@ -7450,7 +7318,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (Requirements.doesNotMeet(F, L, Hints)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " "requirements.\n"); - emitMissedWarning(F, L, Hints, ORE); + Hints.emitRemarkWithHints(); return false; } @@ -7527,6 +7395,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { LVP.setBestPlan(VF.Width, IC); using namespace ore; + bool DisableRuntimeUnroll = false; + MDNode *OrigLoopID = L->getLoopID(); if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0"); @@ -7553,7 +7423,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // no runtime checks about strides and memory. A scalar loop that is // rarely used is not worth unrolling. if (!LB.areSafetyChecksAdded()) - AddRuntimeUnrollDisableMetaData(L); + DisableRuntimeUnroll = true; // Report the vectorization decision. ORE->emit([&]() { @@ -7565,8 +7435,18 @@ bool LoopVectorizePass::processLoop(Loop *L) { }); } - // Mark the loop as already vectorized to avoid vectorizing again. - Hints.setAlreadyVectorized(); + Optional<MDNode *> RemainderLoopID = + makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, + LLVMLoopVectorizeFollowupEpilogue}); + if (RemainderLoopID.hasValue()) { + L->setLoopID(RemainderLoopID.getValue()); + } else { + if (DisableRuntimeUnroll) + AddRuntimeUnrollDisableMetaData(L); + + // Mark the loop as already vectorized to avoid vectorizing again. 
+ Hints.setAlreadyVectorized(); + } LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); return true; @@ -7659,8 +7539,15 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, if (!Changed) return PreservedAnalyses::all(); PreservedAnalyses PA; - PA.preserve<LoopAnalysis>(); - PA.preserve<DominatorTreeAnalysis>(); + + // We currently do not preserve loopinfo/dominator analyses with outer loop + // vectorization. Until this is addressed, mark these analyses as preserved + // only for non-VPlan-native path. + // TODO: Preserve Loop and Dominator analyses for VPlan-native path. + if (!EnableVPlanNativePath) { + PA.preserve<LoopAnalysis>(); + PA.preserve<DominatorTreeAnalysis>(); + } PA.preserve<BasicAA>(); PA.preserve<GlobalsAA>(); return PA; |