| author | Dimitry Andric <dim@FreeBSD.org> | 2021-02-16 20:13:02 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2021-02-16 20:13:02 +0000 |
| commit | b60736ec1405bb0a8dd40989f67ef4c93da068ab (patch) | |
| tree | 5c43fbb7c9fc45f0f87e0e6795a86267dbd12f9d | /llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |
| parent | cfca06d7963fa0909f90483b42a6d7d194d01e08 (diff) | |
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3587 |
1 file changed, 2598 insertions(+), 989 deletions(-)
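
The patch below reworks the loop vectorizer's tail-folding and epilogue handling: among other things it replaces the old `prefer-predicate-over-epilog` flag with `prefer-predicate-over-epilogue`, whose values are `scalar-epilogue`, `predicate-else-scalar-epilogue`, and `predicate-dont-vectorize`. As a rough orientation before reading the diff, here is a minimal, self-contained C++ sketch of the fallback policy those three values describe. The enum, `canFoldTail`, and `decide` are illustrative names only, not LLVM's actual API.

```cpp
#include <cstdio>

enum class PreferPredicate {
  ScalarEpilogue,              // don't tail-fold; emit a scalar epilogue loop
  PredicateElseScalarEpilogue, // try tail-folding, else fall back to a scalar epilogue
  PredicateOrDontVectorize     // try tail-folding, else don't vectorize at all
};

// Stand-in for the legality/cost checks the vectorizer runs before folding
// the loop tail into a predicated vector body (hypothetical, for illustration).
static bool canFoldTail(bool loopSupportsMasking) { return loopSupportsMasking; }

static const char *decide(PreferPredicate Pref, bool loopSupportsMasking) {
  if (Pref == PreferPredicate::ScalarEpilogue)
    return "vectorize with a scalar epilogue";
  if (canFoldTail(loopSupportsMasking))
    return "vectorize with a folded (predicated) tail, no epilogue";
  return Pref == PreferPredicate::PredicateElseScalarEpilogue
             ? "tail-folding failed: fall back to a scalar epilogue"
             : "tail-folding failed: do not vectorize";
}

int main() {
  std::printf("%s\n", decide(PreferPredicate::PredicateOrDontVectorize, false));
  std::printf("%s\n", decide(PreferPredicate::PredicateElseScalarEpilogue, true));
}
```

With `predicate-dont-vectorize`, a loop whose tail cannot be folded is simply left scalar, which matches the option descriptions added in the diff.
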
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 35af8e425778..ea0d7673edf6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -130,6 +130,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/InstructionCost.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -157,18 +158,37 @@ using namespace llvm; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME +#ifndef NDEBUG +const char VerboseDebug[] = DEBUG_TYPE "-verbose"; +#endif + /// @{ /// Metadata attribute names -static const char *const LLVMLoopVectorizeFollowupAll = - "llvm.loop.vectorize.followup_all"; -static const char *const LLVMLoopVectorizeFollowupVectorized = +const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; +const char LLVMLoopVectorizeFollowupVectorized[] = "llvm.loop.vectorize.followup_vectorized"; -static const char *const LLVMLoopVectorizeFollowupEpilogue = +const char LLVMLoopVectorizeFollowupEpilogue[] = "llvm.loop.vectorize.followup_epilogue"; /// @} STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); +STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); + +static cl::opt<bool> EnableEpilogueVectorization( + "enable-epilogue-vectorization", cl::init(true), cl::Hidden, + cl::desc("Enable vectorization of epilogue loops.")); + +static cl::opt<unsigned> EpilogueVectorizationForceVF( + "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, + cl::desc("When epilogue vectorization is enabled, and a value greater than " + "1 is specified, forces the given VF for all applicable epilogue " + "loops.")); + +static cl::opt<unsigned> EpilogueVectorizationMinVF( + "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, + cl::desc("Only loops with vectorization factor equal to or larger than " + "the specified value are considered for epilogue vectorization.")); /// Loops with a known constant trip count below this number are vectorized only /// if no scalar iteration overheads are incurred. @@ -178,13 +198,36 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); -// Indicates that an epilogue is undesired, predication is preferred. -// This means that the vectorizer will try to fold the loop-tail (epilogue) -// into the loop and predicate the loop body accordingly. -static cl::opt<bool> PreferPredicateOverEpilog( - "prefer-predicate-over-epilog", cl::init(false), cl::Hidden, - cl::desc("Indicate that an epilogue is undesired, predication should be " - "used instead.")); +// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, +// that predication is preferred, and this lists all options. I.e., the +// vectorizer will try to fold the tail-loop (epilogue) into the vector body +// and predicate the instructions accordingly. 
If tail-folding fails, there are +// different fallback strategies depending on these values: +namespace PreferPredicateTy { + enum Option { + ScalarEpilogue = 0, + PredicateElseScalarEpilogue, + PredicateOrDontVectorize + }; +} // namespace PreferPredicateTy + +static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( + "prefer-predicate-over-epilogue", + cl::init(PreferPredicateTy::ScalarEpilogue), + cl::Hidden, + cl::desc("Tail-folding and predication preferences over creating a scalar " + "epilogue loop."), + cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, + "scalar-epilogue", + "Don't tail-predicate loops, create scalar epilogue"), + clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, + "predicate-else-scalar-epilogue", + "prefer tail-folding, create scalar epilogue if tail " + "folding fails."), + clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, + "predicate-dont-vectorize", + "prefers tail-folding, don't attempt vectorization if " + "tail-folding fails."))); static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, @@ -196,7 +239,7 @@ static cl::opt<bool> EnableInterleavedMemAccesses( cl::desc("Enable vectorization on interleaved memory accesses in a loop")); /// An interleave-group may need masking if it resides in a block that needs -/// predication, or in order to mask away gaps. +/// predication, or in order to mask away gaps. static cl::opt<bool> EnableMaskedInterleavedMemAccesses( "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); @@ -230,6 +273,12 @@ static cl::opt<unsigned> ForceTargetInstructionCost( "an instruction to a single constant value. Mostly " "useful for getting consistent testing.")); +static cl::opt<bool> ForceTargetSupportsScalableVectors( + "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, + cl::desc( + "Pretend that scalable vectors are supported, even if the target does " + "not support them. This flag should only be used for testing.")); + static cl::opt<unsigned> SmallLoopCost( "small-loop-cost", cl::init(20), cl::Hidden, cl::desc( @@ -247,6 +296,12 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave( cl::desc( "Enable runtime interleaving until load/store ports are saturated")); +/// Interleave small loops with scalar reductions. +static cl::opt<bool> InterleaveSmallLoopScalarReduction( + "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, + cl::desc("Enable interleaving for loops with small iteration counts that " + "contain scalar reductions to expose ILP.")); + /// The number of stores in a loop that are allowed to need predication. 
static cl::opt<unsigned> NumberOfStoresToPredicate( "vectorize-num-stores-pred", cl::init(1), cl::Hidden, @@ -265,6 +320,17 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC( cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop.")); +static cl::opt<bool> + PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), + cl::Hidden, + cl::desc("Prefer in-loop vector reductions, " + "overriding the targets preference.")); + +static cl::opt<bool> PreferPredicatedReductionSelect( + "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, + cl::desc( + "Prefer predicating a reduction operation over an after loop select.")); + cl::opt<bool> EnableVPlanNativePath( "enable-vplan-native-path", cl::init(false), cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " @@ -307,12 +373,14 @@ static Type *getMemInstValueType(Value *I) { /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type at the given vectorization factor. -static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { +static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { // Determine if an array of VF elements of type Ty is "bitcast compatible" // with a <VF x Ty> vector. - if (VF > 1) { - auto *VectorTy = FixedVectorType::get(Ty, VF); - return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); + if (VF.isVector()) { + auto *VectorTy = VectorType::get(Ty, VF); + return TypeSize::get(VF.getKnownMinValue() * + DL.getTypeAllocSize(Ty).getFixedValue(), + VF.isScalable()) != DL.getTypeStoreSize(VectorTy); } // If the vectorization factor is one, we just check if an array of type Ty @@ -393,29 +461,42 @@ public: LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, unsigned VecWidth, + OptimizationRemarkEmitter *ORE, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, - LoopVectorizationCostModel *CM) + LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), - VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {} + VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), + BFI(BFI), PSI(PSI) { + // Query this against the original loop and save it here because the profile + // of the original loop header may change as the transformation happens. + OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( + OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); + } + virtual ~InnerLoopVectorizer() = default; - /// Create a new empty loop. Unlink the old loop and connect the new one. - /// Return the pre-header block of the new loop. - BasicBlock *createVectorizedLoopSkeleton(); + /// Create a new empty loop that will contain vectorized instructions later + /// on, while the old loop will be used as the scalar remainder. Control flow + /// is generated around the vectorized (and scalar epilogue) loops consisting + /// of various checks and bypasses. Return the pre-header block of the new + /// loop. + /// In the case of epilogue vectorization, this function is overriden to + /// handle the more complex control flow around the loops. 
+ virtual BasicBlock *createVectorizedLoopSkeleton(); /// Widen a single instruction within the innermost loop. - void widenInstruction(Instruction &I, VPUser &Operands, + void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, VPTransformState &State); /// Widen a single call instruction within the innermost loop. - void widenCallInstruction(CallInst &I, VPUser &ArgOperands, + void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, VPTransformState &State); /// Widen a single select instruction within the innermost loop. - void widenSelectInstruction(SelectInst &I, VPUser &Operands, + void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, bool InvariantCond, VPTransformState &State); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. @@ -431,14 +512,15 @@ public: /// Vectorize a single GetElementPtrInst based on information gathered and /// decisions taken during planning. - void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF, - unsigned VF, bool IsPtrLoopInvariant, + void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, + unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); + void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, + Value *StartV, unsigned UF, ElementCount VF); /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane @@ -452,7 +534,8 @@ public: /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); + void widenIntOrFpInduction(PHINode *IV, Value *Start, + TruncInst *Trunc = nullptr); /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a /// vector or scalar value on-demand if one is not yet available. When @@ -477,6 +560,10 @@ public: /// value into a vector. Value *getOrCreateVectorValue(Value *V, unsigned Part); + void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) { + VectorLoopValueMap.setVectorValue(Scalar, Part, Vector); + } + /// Return a value in the new loop corresponding to \p V from the original /// loop at unroll and vector indices \p Instance. If the value has been /// vectorized but not scalarized, the necessary extractelement instruction @@ -491,7 +578,9 @@ public: /// BlockInMask is non-null. Use \p State to translate given VPValues to IR /// values in the vectorized loop. void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, + ArrayRef<VPValue *> VPDefs, VPTransformState &State, VPValue *Addr, + ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask = nullptr); /// Vectorize Load and Store instructions with the base address given in \p @@ -499,8 +588,8 @@ public: /// non-null. Use \p State to translate given VPValues to IR values in the /// vectorized loop. 
void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, - VPValue *Addr, VPValue *StoredValue, - VPValue *BlockInMask); + VPValue *Def, VPValue *Addr, + VPValue *StoredValue, VPValue *BlockInMask); /// Set the debug location in the builder using the debug location in /// the instruction. @@ -544,10 +633,11 @@ protected: /// Clear NSW/NUW flags from reduction instructions if necessary. void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); - /// The Loop exit block may have single value PHI nodes with some - /// incoming value. While vectorizing we only handled real values - /// that were defined inside the loop and we should have one value for - /// each predecessor of its parent basic block. See PR14725. + /// Fixup the LCSSA phi nodes in the unique exit block. This simply + /// means we need to add the appropriate incoming value from the middle + /// block as exiting edges from the scalar epilogue loop (if present) are + /// already in place, and we exit the vector loop exclusively to the middle + /// block. void fixLCSSAPHIs(); /// Iteratively sink the scalarized operands of a predicated instruction into @@ -586,7 +676,8 @@ protected: /// truncate instruction, instead of widening the original IV, we widen a /// version of the IV truncated to \p EntryVal's type. void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, - Value *Step, Instruction *EntryVal); + Value *Step, Value *Start, + Instruction *EntryVal); /// Returns true if an instruction \p I should be scalarized instead of /// vectorized for the chosen vectorization factor. @@ -654,6 +745,28 @@ protected: const DataLayout &DL, const InductionDescriptor &ID) const; + /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, + /// vector loop preheader, middle block and scalar preheader. Also + /// allocate a loop object for the new vector loop and return it. + Loop *createVectorLoopSkeleton(StringRef Prefix); + + /// Create new phi nodes for the induction variables to resume iteration count + /// in the scalar epilogue, from where the vectorized loop left off (given by + /// \p VectorTripCount). + /// In cases where the loop skeleton is more complicated (eg. epilogue + /// vectorization) and the resume values can come from an additional bypass + /// block, the \p AdditionalBypass pair provides information about the bypass + /// block and the end value on the edge from bypass to this loop. + void createInductionResumeValues( + Loop *L, Value *VectorTripCount, + std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); + + /// Complete the loop skeleton by adding debug MDs, creating appropriate + /// conditional branches in the middle block, preparing the builder and + /// running the verifier. Take in the vector loop \p L as argument, and return + /// the preheader of the completed vector loop. + BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); + /// Add additional metadata to \p To that was not present on \p Orig. /// /// Currently this is used to add the noalias annotations based on the @@ -672,6 +785,11 @@ protected: /// vector of instructions. void addMetadata(ArrayRef<Value *> To, Instruction *From); + /// Allow subclasses to override and print debug traces before/after vplan + /// execution, when trace information is requested. + virtual void printDebugTracesAtStart(){}; + virtual void printDebugTracesAtEnd(){}; + /// The original loop. Loop *OrigLoop; @@ -710,7 +828,7 @@ protected: /// The vectorization SIMD factor to use. 
Each vector will have this many /// vector elements. - unsigned VF; + ElementCount VF; /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. @@ -730,7 +848,8 @@ protected: /// Middle Block between the vector and the scalar. BasicBlock *LoopMiddleBlock; - /// The ExitBlock of the scalar loop. + /// The (unique) ExitBlock of the scalar loop. Note that + /// there can be multiple exiting edges reaching this block. BasicBlock *LoopExitBlock; /// The vector loop body. @@ -779,6 +898,14 @@ protected: // Vector of original scalar PHIs whose corresponding widened PHIs need to be // fixed up at the end of vector code generation. SmallVector<PHINode *, 8> OrigPHIsToFix; + + /// BFI and PSI are used to check for profile guided size optimizations. + BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; + + // Whether this loop should be optimized for size based on profile guided size + // optimizatios. + bool OptForSizeBasedOnProfile; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -789,9 +916,11 @@ public: const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, - LoopVectorizationCostModel *CM) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, - UnrollFactor, LVL, CM) {} + LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + ElementCount::getFixed(1), UnrollFactor, LVL, CM, + BFI, PSI) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -801,6 +930,128 @@ private: Value *reverseVector(Value *Vec) override; }; +/// Encapsulate information regarding vectorization of a loop and its epilogue. +/// This information is meant to be updated and used across two stages of +/// epilogue vectorization. +struct EpilogueLoopVectorizationInfo { + ElementCount MainLoopVF = ElementCount::getFixed(0); + unsigned MainLoopUF = 0; + ElementCount EpilogueVF = ElementCount::getFixed(0); + unsigned EpilogueUF = 0; + BasicBlock *MainLoopIterationCountCheck = nullptr; + BasicBlock *EpilogueIterationCountCheck = nullptr; + BasicBlock *SCEVSafetyCheck = nullptr; + BasicBlock *MemSafetyCheck = nullptr; + Value *TripCount = nullptr; + Value *VectorTripCount = nullptr; + + EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, + unsigned EUF) + : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), + EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { + assert(EUF == 1 && + "A high UF for the epilogue loop is likely not beneficial."); + } +}; + +/// An extension of the inner loop vectorizer that creates a skeleton for a +/// vectorized loop that has its epilogue (residual) also vectorized. +/// The idea is to run the vplan on a given loop twice, firstly to setup the +/// skeleton and vectorize the main loop, and secondly to complete the skeleton +/// from the first step and vectorize the epilogue. This is achieved by +/// deriving two concrete strategy classes from this base class and invoking +/// them in succession from the loop vectorizer planner. 
+class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { +public: + InnerLoopAndEpilogueVectorizer( + Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI), + EPI(EPI) {} + + // Override this function to handle the more complex control flow around the + // three loops. + BasicBlock *createVectorizedLoopSkeleton() final override { + return createEpilogueVectorizedLoopSkeleton(); + } + + /// The interface for creating a vectorized skeleton using one of two + /// different strategies, each corresponding to one execution of the vplan + /// as described above. + virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; + + /// Holds and updates state information required to vectorize the main loop + /// and its epilogue in two separate passes. This setup helps us avoid + /// regenerating and recomputing runtime safety checks. It also helps us to + /// shorten the iteration-count-check path length for the cases where the + /// iteration count of the loop is so small that the main vector loop is + /// completely skipped. + EpilogueLoopVectorizationInfo &EPI; +}; + +/// A specialized derived class of inner loop vectorizer that performs +/// vectorization of *main* loops in the process of vectorizing loops and their +/// epilogues. +class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { +public: + EpilogueVectorizerMainLoop( + Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI, LVL, CM, BFI, PSI) {} + /// Implements the interface for creating a vectorized skeleton using the + /// *main loop* strategy (ie the first pass of vplan execution). + BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + +protected: + /// Emits an iteration count bypass check once for the main loop (when \p + /// ForEpilogue is false) and once for the epilogue loop (when \p + /// ForEpilogue is true). + BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, + bool ForEpilogue); + void printDebugTracesAtStart() override; + void printDebugTracesAtEnd() override; +}; + +// A specialized derived class of inner loop vectorizer that performs +// vectorization of *epilogue* loops in the process of vectorizing loops and +// their epilogues. 
+class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { +public: + EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, + LoopInfo *LI, DominatorTree *DT, + const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, + EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, + llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI, LVL, CM, BFI, PSI) {} + /// Implements the interface for creating a vectorized skeleton using the + /// *epilogue loop* strategy (ie the second pass of vplan execution). + BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + +protected: + /// Emits an iteration count bypass check after the main vector loop has + /// finished to see if there are any iterations left to execute by either + /// the vector epilogue or the scalar epilogue. + BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, + BasicBlock *Bypass, + BasicBlock *Insert); + void printDebugTracesAtStart() override; + void printDebugTracesAtEnd() override; +}; } // end namespace llvm /// Look for a meaningful debug location on the instruction or it's @@ -827,7 +1078,9 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) const DILocation *DIL = Inst->getDebugLoc(); if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && !isa<DbgInfoIntrinsic>(Inst)) { - auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto NewDIL = + DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); if (NewDIL) B.SetCurrentDebugLocation(NewDIL.getValue()); else @@ -881,6 +1134,15 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, return R; } +/// Return a value for Step multiplied by VF. +static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { + assert(isa<ConstantInt>(Step) && "Expected an integer step"); + Constant *StepVal = ConstantInt::get( + Step->getType(), + cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); + return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; +} + namespace llvm { void reportVectorizationFailure(const StringRef DebugMsg, @@ -952,7 +1214,10 @@ enum ScalarEpilogueLowering { CM_ScalarEpilogueNotAllowedLowTripLoop, // Loop hint predicate indicating an epilogue is undesired. - CM_ScalarEpilogueNotNeededUsePredicate + CM_ScalarEpilogueNotNeededUsePredicate, + + // Directive indicating we must either tail fold or not vectorize + CM_ScalarEpilogueNotAllowedUsePredicate }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -979,7 +1244,7 @@ public: /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC); + Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); /// \return True if runtime checks are required for vectorization, and false /// otherwise. @@ -989,10 +1254,13 @@ public: /// This method checks every power of two up to MaxVF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. 
- VectorizationFactor selectVectorizationFactor(unsigned MaxVF); + VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); + VectorizationFactor + selectEpilogueVectorizationFactor(const ElementCount MaxVF, + const LoopVectorizationPlanner &LVP); /// Setup cost-based decisions for user vectorization factor. - void selectUserVectorizationFactor(unsigned UserVF) { + void selectUserVectorizationFactor(ElementCount UserVF) { collectUniformsAndScalars(UserVF); collectInstsToScalarize(UserVF); } @@ -1006,7 +1274,7 @@ public: /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); + unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1015,7 +1283,7 @@ public: /// the lists of loop-uniform and loop-scalar instructions. /// The calculated cost is saved with widening decision in order to /// avoid redundant calculations. - void setCostBasedWideningDecision(unsigned VF); + void setCostBasedWideningDecision(ElementCount VF); /// A struct that represents some properties of the register usage /// of a loop. @@ -1030,11 +1298,16 @@ public: /// \return Returns information about the register usages of the loop for the /// given vectorization factors. - SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs); + SmallVector<RegisterUsage, 8> + calculateRegisterUsage(ArrayRef<ElementCount> VFs); /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); + /// Split reductions into those that happen in the loop, and those that happen + /// outside. In loop reductions are collected into InLoopReductionChains. + void collectInLoopReductions(); + /// \returns The smallest bitwidth each instruction can be represented with. /// The vector equivalents of these instructions should be truncated to this /// type. @@ -1044,8 +1317,9 @@ public: /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. - bool isProfitableToScalarize(Instruction *I, unsigned VF) const { - assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); + bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { + assert(VF.isVector() && + "Profitable to scalarize relevant only for VF > 1."); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. @@ -1059,8 +1333,8 @@ public: } /// Returns true if \p I is known to be uniform after vectorization. - bool isUniformAfterVectorization(Instruction *I, unsigned VF) const { - if (VF == 1) + bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { + if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1075,8 +1349,8 @@ public: } /// Returns true if \p I is known to be scalar after vectorization. 
- bool isScalarAfterVectorization(Instruction *I, unsigned VF) const { - if (VF == 1) + bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { + if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1092,8 +1366,8 @@ public: /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. - bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const { - return VF > 1 && MinBWs.find(I) != MinBWs.end() && + bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { + return VF.isVector() && MinBWs.find(I) != MinBWs.end() && !isProfitableToScalarize(I, VF) && !isScalarAfterVectorization(I, VF); } @@ -1110,17 +1384,18 @@ public: /// Save vectorization decision \p W and \p Cost taken by the cost model for /// instruction \p I and vector width \p VF. - void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, - unsigned Cost) { - assert(VF >= 2 && "Expected VF >=2"); + void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, + InstructionCost Cost) { + assert(VF.isVector() && "Expected VF >=2"); WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); } /// Save vectorization decision \p W and \p Cost taken by the cost model for /// interleaving group \p Grp and vector width \p VF. - void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF, - InstWidening W, unsigned Cost) { - assert(VF >= 2 && "Expected VF >=2"); + void setWideningDecision(const InterleaveGroup<Instruction> *Grp, + ElementCount VF, InstWidening W, + InstructionCost Cost) { + assert(VF.isVector() && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. /// But the cost will be assigned to one instruction only. for (unsigned i = 0; i < Grp->getFactor(); ++i) { @@ -1136,15 +1411,14 @@ public: /// Return the cost model decision for the given instruction \p I and vector /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. - InstWidening getWideningDecision(Instruction *I, unsigned VF) { - assert(VF >= 2 && "Expected VF >=2"); - + InstWidening getWideningDecision(Instruction *I, ElementCount VF) { + assert(VF.isVector() && "Expected VF to be a vector VF"); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. if (EnableVPlanNativePath) return CM_GatherScatter; - std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); + std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); auto Itr = WideningDecisions.find(InstOnVF); if (Itr == WideningDecisions.end()) return CM_Unknown; @@ -1153,9 +1427,9 @@ public: /// Return the vectorization cost for the given instruction \p I and vector /// width \p VF. - unsigned getWideningCost(Instruction *I, unsigned VF) { - assert(VF >= 2 && "Expected VF >=2"); - std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); + InstructionCost getWideningCost(Instruction *I, ElementCount VF) { + assert(VF.isVector() && "Expected VF >=2"); + std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && "The cost is not calculated"); return WideningDecisions[InstOnVF].second; @@ -1164,7 +1438,7 @@ public: /// Return True if instruction \p I is an optimizable truncate whose operand /// is an induction variable. 
Such a truncate will be removed by adding a new /// induction variable with the destination type. - bool isOptimizableIVTruncate(Instruction *I, unsigned VF) { + bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { // If the instruction is not a truncate, return false. auto *Trunc = dyn_cast<TruncInst>(I); if (!Trunc) @@ -1189,14 +1463,14 @@ public: /// Collects the instructions to scalarize for each predicated instruction in /// the loop. - void collectInstsToScalarize(unsigned VF); + void collectInstsToScalarize(ElementCount VF); /// Collect Uniform and Scalar values for the given \p VF. /// The sets depend on CM decision for Load/Store instructions /// that may be vectorized as interleave, gather-scatter or scalarized. - void collectUniformsAndScalars(unsigned VF) { + void collectUniformsAndScalars(ElementCount VF) { // Do the analysis once. - if (VF == 1 || Uniforms.find(VF) != Uniforms.end()) + if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) return; setCostBasedWideningDecision(VF); collectLoopUniforms(VF); @@ -1247,7 +1521,8 @@ public: /// instructions that may divide by zero. /// If a non-zero VF has been calculated, we check if I will be scalarized /// predication for that VF. - bool isScalarWithPredication(Instruction *I, unsigned VF = 1); + bool isScalarWithPredication(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. @@ -1264,12 +1539,16 @@ public: /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. - bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + bool + memoryInstructionCanBeWidened(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); /// Returns true if \p I is a memory instruction in an interleaved-group /// of memory accesses that can be vectorized with wide vector loads/stores /// and shuffles. - bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1); + bool + interleavedAccessCanBeWidened(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { @@ -1282,11 +1561,16 @@ public: return InterleaveInfo.getInterleaveGroup(Instr); } - /// Returns true if an interleaved group requires a scalar iteration - /// to handle accesses with gaps, and there is nothing preventing us from - /// creating a scalar epilogue. + /// Returns true if we're required to use a scalar epilogue for at least + /// the final iteration of the original loop. bool requiresScalarEpilogue() const { - return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue(); + if (!isScalarEpilogueAllowed()) + return false; + // If we might exit from anywhere but the latch, must run the exiting + // iteration in scalar form. + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) + return true; + return InterleaveInfo.requiresScalarEpilogue(); } /// Returns true if a scalar epilogue is not allowed due to optsize or a @@ -1302,17 +1586,34 @@ public: return foldTailByMasking() || Legal->blockNeedsPredication(BB); } + /// A SmallMapVector to store the InLoop reduction op chains, mapping phi + /// nodes to the chain of instructions representing the reductions. Uses a + /// MapVector to ensure deterministic iteration order. 
+ using ReductionChainMap = + SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; + + /// Return the chain of instructions representing an inloop reduction. + const ReductionChainMap &getInLoopReductionChains() const { + return InLoopReductionChains; + } + + /// Returns true if the Phi is part of an inloop reduction. + bool isInLoopReduction(PHINode *Phi) const { + return InLoopReductionChains.count(Phi); + } + /// Estimate cost of an intrinsic call instruction CI if it were vectorized /// with factor VF. Return the cost of the instruction, including /// scalarization overhead if it's needed. - unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF); + InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF); /// Estimate cost of a call instruction CI if it were vectorized with factor /// VF. Return the cost of the instruction, including scalarization overhead /// if it's needed. The flag NeedToScalarize shows if the call needs to be /// scalarized - /// i.e. either vector version isn't available, or is too expensive. - unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); + InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, + bool &NeedToScalarize); /// Invalidates decisions already taken by the cost model. void invalidateCostModelingDecisions() { @@ -1327,7 +1628,8 @@ private: /// \return An upper bound for the vectorization factor, a power-of-2 larger /// than zero. One is returned if vectorization should best be avoided due /// to cost. - unsigned computeFeasibleMaxVF(unsigned ConstTripCount); + ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -1336,47 +1638,54 @@ private: /// is /// false, then all operations will be scalarized (i.e. no vectorization has /// actually taken place). - using VectorizationCostTy = std::pair<unsigned, bool>; + using VectorizationCostTy = std::pair<InstructionCost, bool>; /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by /// the factor width. - VectorizationCostTy expectedCost(unsigned VF); + VectorizationCostTy expectedCost(ElementCount VF); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. - VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF); + VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. - unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy); + InstructionCost getInstructionCost(Instruction *I, ElementCount VF, + Type *&VectorTy); + + /// Return the cost of instructions in an inloop reduction pattern, if I is + /// part of that pattern. + InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, + Type *VectorTy, + TTI::TargetCostKind CostKind); /// Calculate vectorization cost of memory instruction \p I. - unsigned getMemoryInstructionCost(Instruction *I, unsigned VF); + InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); /// The cost computation for scalarized memory instruction. 
- unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF); + InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); /// The cost computation for interleaving group of memory instructions. - unsigned getInterleaveGroupCost(Instruction *I, unsigned VF); + InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); /// The cost computation for Gather/Scatter instruction. - unsigned getGatherScatterCost(Instruction *I, unsigned VF); + InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); /// The cost computation for widening instruction \p I with consecutive /// memory access. - unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF); + InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); /// The cost calculation for Load/Store instruction \p I with uniform pointer - /// Load: scalar load + broadcast. /// Store: scalar store + (loop invariant value stored? 0 : extract of last /// element) - unsigned getUniformMemOpCost(Instruction *I, unsigned VF); + InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. - unsigned getScalarizationOverhead(Instruction *I, unsigned VF); + InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF); /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. @@ -1394,7 +1703,7 @@ private: /// A type representing the costs for instructions if they were to be /// scalarized rather than vectorized. The entries are Instruction-Cost /// pairs. - using ScalarCostsTy = DenseMap<Instruction *, unsigned>; + using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; /// A set containing all BasicBlocks that are known to present after /// vectorization as a predicated block. @@ -1416,19 +1725,30 @@ private: /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated /// vectorization factor. The entries are VF-ScalarCostTy pairs. - DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; + DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; /// Holds the instructions known to be uniform after vectorization. /// The data is collected per VF. - DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms; + DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; /// Holds the instructions known to be scalar after vectorization. /// The data is collected per VF. - DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; + DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; /// Holds the instructions (address computations) that are forced to be /// scalarized. - DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; + DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; + + /// PHINodes of the reductions that should be expanded in-loop along with + /// their associated chains of reduction operations, in program order from top + /// (PHI) to bottom + ReductionChainMap InLoopReductionChains; + + /// A Map of inloop reduction operations and their immediate chain operand. + /// FIXME: This can be removed once reductions can be costed correctly in + /// vplan. This was added to allow quick lookup to the inloop operations, + /// without having to loop through InLoopReductionChains. 
+ DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; /// Returns the expected difference in cost from scalarizing the expression /// feeding a predicated instruction \p PredInst. The instructions to @@ -1436,7 +1756,7 @@ private: /// non-negative return value implies the expression will be scalarized. /// Currently, only single-use chains are considered for scalarization. int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, - unsigned VF); + ElementCount VF); /// Collect the instructions that are uniform after vectorization. An /// instruction is uniform if we represent it with a single scalar value in @@ -1447,27 +1767,28 @@ private: /// scalarized instruction will be represented by VF scalar values in the /// vectorized loop, each corresponding to an iteration of the original /// scalar loop. - void collectLoopUniforms(unsigned VF); + void collectLoopUniforms(ElementCount VF); /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized /// during vectorization. Non-uniform scalarized instructions will be /// represented by VF values in the vectorized loop, each corresponding to an /// iteration of the original scalar loop. - void collectLoopScalars(unsigned VF); + void collectLoopScalars(ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. - using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, - std::pair<InstWidening, unsigned>>; + using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, + std::pair<InstWidening, InstructionCost>>; DecisionList WideningDecisions; /// Returns true if \p V is expected to be vectorized and it needs to be /// extracted. - bool needsExtract(Value *V, unsigned VF) const { + bool needsExtract(Value *V, ElementCount VF) const { Instruction *I = dyn_cast<Instruction>(V); - if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) + if (VF.isScalar() || !I || !TheLoop->contains(I) || + TheLoop->isLoopInvariant(I)) return false; // Assume we can vectorize V (and hence we need extraction) if the @@ -1482,11 +1803,21 @@ private: /// Returns a range containing only operands needing to be extracted. SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, - unsigned VF) { + ElementCount VF) { return SmallVector<Value *, 4>(make_filter_range( Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } + /// Determines if we have the infrastructure to vectorize loop \p L and its + /// epilogue, assuming the main loop is vectorized by \p VF. + bool isCandidateForEpilogueVectorization(const Loop &L, + const ElementCount VF) const; + + /// Returns true if epilogue vectorization is considered profitable, and + /// false otherwise. + /// \p VF is the vectorization factor chosen for the original loop. + bool isEpilogueVectorizationProfitable(const ElementCount VF) const; + public: /// The loop that we evaluate. Loop *TheLoop; @@ -1529,6 +1860,9 @@ public: /// Values to ignore in the cost model when VF > 1. SmallPtrSet<const Value *, 16> VecValuesToIgnore; + + /// Profitable vector factors. + SmallVector<VectorizationFactor, 8> ProfitableVFs; }; } // end namespace llvm @@ -1549,7 +1883,7 @@ public: // representation for pragma 'omp simd' is introduced. 
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE) { - assert(!OuterLp->empty() && "This is not an outer loop"); + assert(!OuterLp->isInnermost() && "This is not an outer loop"); LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); // Only outer loops with an explicit vectorization hint are supported. @@ -1582,7 +1916,7 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI, // now, only collect outer loops that have explicit vectorization hints. If we // are stress testing the VPlan H-CFG construction, we collect the outermost // loop of every loop nest. - if (L.empty() || VPlanBuildStressTest || + if (L.isInnermost() || VPlanBuildStressTest || (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { LoopBlocksRPO RPOT(&L); RPOT.perform(LI); @@ -1696,10 +2030,10 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { } void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( - const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { + const InductionDescriptor &II, Value *Step, Value *Start, + Instruction *EntryVal) { assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); - Value *Start = II.getStartValue(); // Construct the initial value of the vector IV in the vector loop preheader auto CurrIP = Builder.saveIP(); @@ -1729,7 +2063,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // Multiply the vectorization factor by the step using integer or // floating-point arithmetic as appropriate. - Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); + Value *ConstVF = + getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); // Create a vector splat to use in the induction update. @@ -1737,10 +2072,10 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // FIXME: If the step is non-constant, we create the vector splat with // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. - Value *SplatVF = - isa<Constant>(Mul) - ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) - : Builder.CreateVectorSplat(VF, Mul); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + Value *SplatVF = isa<Constant>(Mul) + ? 
ConstantVector::getSplat(VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(VF, Mul); Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -1816,7 +2151,8 @@ void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); } -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { +void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, + TruncInst *Trunc) { assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); @@ -1874,8 +2210,10 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *EntryPart = - getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); + getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, + ID.getInductionOpcode()); VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); if (Trunc) addMetadata(EntryPart, Trunc); @@ -1885,7 +2223,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { // Now do the actual transformations, and start with creating the step value. Value *Step = CreateStepValue(ID.getStep()); - if (VF <= 1) { + if (VF.isZero() || VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); CreateSplatIV(ScalarIV, Step); return; @@ -1896,7 +2234,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { // least one user in the loop that is not widened. auto NeedsScalarIV = needsScalarInduction(EntryVal); if (!NeedsScalarIV) { - createVectorIntOrFpInductionPHI(ID, Step, EntryVal); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); return; } @@ -1904,7 +2242,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { // create the phi node, we will splat the scalar induction variable in each // loop iteration. if (!shouldScalarizeInstruction(EntryVal)) { - createVectorIntOrFpInductionPHI(ID, Step, EntryVal); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); Value *ScalarIV = CreateScalarIV(Step); // Create scalar steps that can be used by instructions we will later // scalarize. Note that the addition of the scalar steps will not increase @@ -1926,7 +2264,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, Instruction::BinaryOps BinOp) { // Create and check the types. - auto *ValVTy = cast<VectorType>(Val->getType()); + auto *ValVTy = cast<FixedVectorType>(Val->getType()); int VLen = ValVTy->getNumElements(); Type *STy = Val->getType()->getScalarType(); @@ -1983,8 +2321,7 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. - assert(VF > 1 && "VF should be greater than one"); - + assert(VF.isVector() && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. 
Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); assert(ScalarIVTy == Step->getType() && @@ -2006,12 +2343,27 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 - : VF; + Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) + ? 1 + : VF.getKnownMinValue(); + assert((!VF.isScalable() || Lanes == 1) && + "Should never scalarize a scalable vector"); // Compute the scalar steps and save the results in VectorLoopValueMap. for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); + auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), + ScalarIVTy->getScalarSizeInBits()); + Value *StartIdx = + createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); + if (ScalarIVTy->isFloatingPointTy()) + StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); + StartIdx = addFastMathFlag(Builder.CreateBinOp( + AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); + // The step returned by `createStepForVF` is a runtime-evaluated value + // when VF is scalable. Otherwise, it should be folded into a Constant. + assert((VF.isScalable() || isa<Constant>(StartIdx)) && + "Expected StartIdx to be folded to a constant when VF is not " + "scalable"); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); @@ -2045,7 +2397,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // If we aren't vectorizing, we can just copy the scalar map values over to // the vector map. - if (VF == 1) { + if (VF.isScalar()) { VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); return ScalarValue; } @@ -2054,7 +2406,11 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // is known to be uniform after vectorization, this corresponds to lane zero // of the Part unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. - unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; + unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) + ? 0 + : VF.getKnownMinValue() - 1; + assert((!VF.isScalable() || LastLane == 0) && + "Scalable vectorization can't lead to any scalarized values."); auto *LastInst = cast<Instruction>( VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); @@ -2075,10 +2431,11 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { VectorValue = getBroadcastInstrs(ScalarValue); VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { - // Initialize packing with insertelements to start from undef. - Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF)); - VectorLoopValueMap.setVectorValue(V, Part, Undef); - for (unsigned Lane = 0; Lane < VF; ++Lane) + // Initialize packing with insertelements to start from poison. 
+ assert(!VF.isScalable() && "VF is assumed to be non scalable."); + Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF)); + VectorLoopValueMap.setVectorValue(V, Part, Poison); + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) packScalarIntoVectorValue(V, {Part, Lane}); VectorValue = VectorLoopValueMap.getVectorValue(V, Part); } @@ -2117,7 +2474,7 @@ InnerLoopVectorizer::getOrCreateScalarValue(Value *V, // extractelement instruction. auto *U = getOrCreateVectorValue(V, Instance.Part); if (!U->getType()->isVectorTy()) { - assert(VF == 1 && "Value not scalarized has non-vector type"); + assert(VF.isScalar() && "Value not scalarized has non-vector type"); return U; } @@ -2142,12 +2499,12 @@ void InnerLoopVectorizer::packScalarIntoVectorValue( Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); + assert(!VF.isScalable() && "Cannot reverse scalable vectors"); SmallVector<int, 8> ShuffleMask; - for (unsigned i = 0; i < VF; ++i) - ShuffleMask.push_back(VF - i - 1); + for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) + ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); - return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), - ShuffleMask, "reverse"); + return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); } // Return whether we allow using masked interleave-groups (for dealing with @@ -2172,9 +2529,9 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // } // To: // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B -// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements -// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements -// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements +// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements +// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements +// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements // // Or translate following interleaved store group (factor = 3): // for (i = 0; i < N; i+=3) { @@ -2185,20 +2542,22 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // } // To: // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> -// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void InnerLoopVectorizer::vectorizeInterleaveGroup( - const InterleaveGroup<Instruction> *Group, VPTransformState &State, - VPValue *Addr, VPValue *BlockInMask) { + const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, + VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, + VPValue *BlockInMask) { Instruction *Instr = Group->getInsertPos(); const DataLayout &DL = Instr->getModule()->getDataLayout(); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); - auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); // Prepare for the new pointers. 
SmallVector<Value *, 2> AddrParts; @@ -2214,8 +2573,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // pointer operand of the interleaved access is supposed to be uniform. For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. + assert(!VF.isScalable() && + "scalable vector reverse operation is not implemented"); if (Group->isReverse()) - Index += (VF - 1) * Group->getFactor(); + Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, {Part, 0}); @@ -2246,11 +2607,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } setDebugLocFromInst(Builder, Instr); - Value *UndefVec = UndefValue::get(VecTy); + Value *PoisonVec = PoisonValue::get(VecTy); Value *MaskForGaps = nullptr; if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { - MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); } @@ -2266,10 +2628,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *GroupMask = MaskForGaps; if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); - auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, Undefs, - createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); + BlockInMaskPart, + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), + "interleaved.mask"); GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) @@ -2277,7 +2640,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } NewLoad = Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), - GroupMask, UndefVec, "wide.masked.vec"); + GroupMask, PoisonVec, "wide.masked.vec"); } else NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], @@ -2288,6 +2651,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // For each member in the group, shuffle out the appropriate data from the // wide loads. + unsigned J = 0; for (unsigned I = 0; I < InterleaveFactor; ++I) { Instruction *Member = Group->getMember(I); @@ -2295,28 +2659,33 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (!Member) continue; - auto StrideMask = createStrideMask(I, InterleaveFactor, VF); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto StrideMask = + createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( - NewLoads[Part], UndefVec, StrideMask, "strided.vec"); + NewLoads[Part], StrideMask, "strided.vec"); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { - VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } if (Group->isReverse()) StridedVec = reverseVector(StridedVec); - VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); + State.set(VPDefs[J], Member, StridedVec, Part); } + ++J; } return; } // The sub vector type for current instruction. 
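// [Editor's note - illustrative sketch, not part of this commit] When a
// masked interleaved load is emitted above, the per-iteration block mask is
// widened by repeating each lane InterleaveFactor times ("interleaved.mask").
// A standalone model of that replication, with assumed factor and VF:
#include <cstdio>
#include <vector>

int main() {
  const unsigned Factor = 3, VF = 4;
  std::vector<int> Mask;
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    for (unsigned J = 0; J < Factor; ++J)
      Mask.push_back(int(Lane)); // 0 0 0 1 1 1 2 2 2 3 3 3
  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}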
- auto *SubVT = FixedVectorType::get(ScalarTy, VF); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + auto *SubVT = VectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. for (unsigned Part = 0; Part < UF; Part++) { @@ -2324,11 +2693,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( SmallVector<Value *, 4> StoredVecs; for (unsigned i = 0; i < InterleaveFactor; i++) { // Interleaved store group doesn't allow a gap, so each index has a member - Instruction *Member = Group->getMember(i); - assert(Member && "Fail to get a member from an interleaved store group"); + assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); + + Value *StoredVec = State.get(StoredValues[i], Part); - Value *StoredVec = getOrCreateVectorValue( - cast<StoreInst>(Member)->getValueOperand(), Part); if (Group->isReverse()) StoredVec = reverseVector(StoredVec); @@ -2344,16 +2712,17 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *WideVec = concatenateVectors(Builder, StoredVecs); // Interleave the elements in the wide vector. + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *IVec = Builder.CreateShuffleVector( - WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), + WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), "interleaved.vec"); Instruction *NewStoreInstr; if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); - auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), + BlockInMaskPart, + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); @@ -2366,11 +2735,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } } -void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, - VPTransformState &State, - VPValue *Addr, - VPValue *StoredValue, - VPValue *BlockInMask) { +void InnerLoopVectorizer::vectorizeMemoryInstruction( + Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, + VPValue *StoredValue, VPValue *BlockInMask) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); @@ -2387,7 +2754,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, "CM decision is not to widen the memory instruction"); Type *ScalarDataTy = getMemInstValueType(Instr); - auto *DataTy = FixedVectorType::get(ScalarDataTy, VF); + + auto *DataTy = VectorType::get(ScalarDataTy, VF); const Align Alignment = getLoadStoreAlignment(Instr); // Determine if the pointer operand of the access is either consecutive or @@ -2419,19 +2787,23 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, InBounds = gep->isInBounds(); if (Reverse) { + assert(!VF.isScalable() && + "Reversing vectors is not yet supported for scalable vectors."); + // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. 
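// [Editor's note - illustrative sketch, not part of this commit] For the
// store path above, the concatenated member vectors are rearranged with an
// interleave mask so the lanes come out as R0,G0,B0,R1,G1,B1,... A standalone
// model of that mask (VF = 4 and factor = 3 assumed):
#include <cstdio>
#include <vector>

int main() {
  const unsigned VF = 4, Factor = 3;
  std::vector<int> Mask;
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    for (unsigned Member = 0; Member < Factor; ++Member)
      Mask.push_back(int(Member * VF + Lane)); // 0 4 8 1 5 9 2 6 10 3 7 11
  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}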
- PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); + PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( + ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); PartPtr->setIsInBounds(InBounds); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); + PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( + ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { + Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); + Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); PartPtr->setIsInBounds(InBounds); } @@ -2486,7 +2858,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) NewLI = Builder.CreateMaskedLoad( - VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), + VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), "wide.masked.load"); else NewLI = @@ -2497,7 +2869,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (Reverse) NewLI = reverseVector(NewLI); } - VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); + + State.set(Def, Instr, NewLI, Part); } } @@ -2507,6 +2880,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, VPTransformState &State) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for + // the first lane and part. + if (isa<NoAliasScopeDeclInst>(Instr)) + if (Instance.Lane != 0 || Instance.Part != 0) + return; + setDebugLocFromInst(Builder, Instr); // Does this instruction return a value ? @@ -2519,7 +2898,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { - auto *NewOp = State.get(User.getOperand(op), Instance); + auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); + auto InputInstance = Instance; + if (!Operand || !OrigLoop->contains(Operand) || + (Cost->isUniformAfterVectorization(Operand, State.VF))) + InputInstance.Lane = 0; + auto *NewOp = State.get(User.getOperand(op), InputInstance); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -2527,7 +2911,9 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, // Place the cloned scalar in the new loop. Builder.Insert(Cloned); - // Add the cloned scalar to the scalar map entry. + // TODO: Set result for VPValue of VPReciplicateRecipe. This requires + // representing scalar values in VPTransformState. Add the cloned scalar to + // the scalar map entry. VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); // If we just cloned a new assumption, add it the assumption cache. @@ -2564,7 +2950,7 @@ PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, Induction->addIncoming(Next, Latch); // Create the compare. 
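// [Editor's note - illustrative sketch, not part of this commit] The two GEPs
// above implement reversed consecutive accesses: part Part starts at offset
// -Part * VF from the scalar pointer and then steps back a further VF - 1
// elements so the wide access covers the right range. Standalone model of the
// element offsets touched by each part (VF and UF assumed):
#include <cstdio>

int main() {
  const int VF = 4, UF = 2;
  for (int Part = 0; Part < UF; ++Part) {
    int Lowest = -Part * VF + (1 - VF); // lowest-addressed element of the part
    std::printf("part %d accesses elements [%d, %d]\n", Part, Lowest,
                Lowest + VF - 1);
  }
  return 0;
}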
Value *ICmp = Builder.CreateICmpEQ(Next, End); - Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); + Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); // Now we have two terminators. Remove the old one from the block. Latch->getTerminator()->eraseFromParent(); @@ -2581,7 +2967,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { // Find the loop boundaries. ScalarEvolution *SE = PSE.getSE(); const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); - assert(BackedgeTakenCount != SE->getCouldNotCompute() && + assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count"); Type *IdxTy = Legal->getWidestInductionType(); @@ -2627,7 +3013,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Type *Ty = TC->getType(); - Constant *Step = ConstantInt::get(Ty, VF * UF); + // This is where we can make the step a runtime constant. + Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); // If the tail is to be folded by masking, round the number of iterations N // up to a multiple of Step instead of rounding down. This is done by first @@ -2636,9 +3023,12 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // that it starts at zero and its Step is a power of two; the loop will then // exit, with the last early-exit vector comparison also producing all-true. if (Cost->foldTailByMasking()) { - assert(isPowerOf2_32(VF * UF) && + assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); - TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); + assert(!VF.isScalable() && + "Tail folding not yet supported for scalable vectors"); + TC = Builder.CreateAdd( + TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); } // Now we need to generate the expression for the part of the loop that the @@ -2648,14 +3038,18 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // unroll factor (number of SIMD instructions). Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); - // If there is a non-reversed interleaved group that may speculatively access - // memory out-of-bounds, we need to ensure that there will be at least one - // iteration of the scalar epilogue loop. Thus, if the step evenly divides + // There are two cases where we need to ensure (at least) the last iteration + // runs in the scalar remainder loop. Thus, if the step evenly divides // the trip count, we set the remainder to be equal to the step. If the step // does not evenly divide the trip count, no adjustment is necessary since // there will already be scalar iterations. Note that the minimum iterations - // check ensures that N >= Step. - if (VF > 1 && Cost->requiresScalarEpilogue()) { + // check ensures that N >= Step. The cases are: + // 1) If there is a non-reversed interleaved group that may speculatively + // access memory out-of-bounds. + // 2) If any instruction may follow a conditionally taken exit. That is, if + // the loop contains multiple exiting blocks, or a single exiting block + // which is not the latch. 
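// [Editor's note - illustrative sketch, not part of this commit] The code
// above derives the vector trip count: the step is VF * UF, the remainder
// N % Step is left to the scalar loop, and when the tail is folded N is first
// rounded up to a multiple of the step. Standalone model with assumed values:
#include <cstdio>

int main() {
  const unsigned VF = 4, UF = 2, Step = VF * UF;
  const unsigned TC = 21;          // scalar trip count
  const bool FoldTail = false;

  unsigned N = FoldTail ? TC + Step - 1 : TC; // n.rnd.up when folding the tail
  unsigned R = N % Step;                      // n.mod.vf
  // (When a scalar epilogue is required and R == 0, R is bumped up to Step.)
  unsigned VectorTripCount = N - R;
  std::printf("TC %u -> vector trip count %u, scalar remainder %u\n", TC,
              VectorTripCount, R);
  return 0;
}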
+ if (VF.isVector() && Cost->requiresScalarEpilogue()) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -2668,17 +3062,18 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL) { // Verify that V is a vector type with same number of elements as DstVTy. - unsigned VF = DstVTy->getNumElements(); - VectorType *SrcVecTy = cast<VectorType>(V->getType()); + auto *DstFVTy = cast<FixedVectorType>(DstVTy); + unsigned VF = DstFVTy->getNumElements(); + auto *SrcVecTy = cast<FixedVectorType>(V->getType()); assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); Type *SrcElemTy = SrcVecTy->getElementType(); - Type *DstElemTy = DstVTy->getElementType(); + Type *DstElemTy = DstFVTy->getElementType(); assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && "Vector elements must have same size"); // Do a direct cast if element types are castable. if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { - return Builder.CreateBitOrPointerCast(V, DstVTy); + return Builder.CreateBitOrPointerCast(V, DstFVTy); } // V cannot be directly casted to desired vector type. // May happen when V is a floating point vector but DstVTy is a vector of @@ -2692,7 +3087,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); auto *VecIntTy = FixedVectorType::get(IntTy, VF); Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); - return Builder.CreateBitOrPointerCast(CastVal, DstVTy); + return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, @@ -2713,11 +3108,11 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, // If tail is to be folded, vector loop takes care of all iterations. Value *CheckMinIters = Builder.getFalse(); - if (!Cost->foldTailByMasking()) - CheckMinIters = Builder.CreateICmp( - P, Count, ConstantInt::get(Count->getType(), VF * UF), - "min.iters.check"); - + if (!Cost->foldTailByMasking()) { + Value *Step = + createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); + CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); + } // Create new preheader for vector loop. 
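// [Editor's note - illustrative sketch, not part of this commit] The
// "min.iters.check" built above bypasses the vector loop when the trip count
// cannot cover even one step of VF * UF iterations (no check is emitted when
// the tail is folded by masking). Standalone model with assumed values; the
// real code picks an ULT or ULE predicate depending on whether a scalar
// epilogue iteration must be kept, plain '<' is used here for illustration:
#include <cstdio>

int main() {
  const unsigned VF = 4, UF = 2;
  const unsigned Count = 5;
  bool BypassToScalarLoop = Count < VF * UF;
  std::printf("count %u -> %s\n", Count,
              BypassToScalarLoop ? "branch to scalar loop"
                                 : "enter vector pre-header");
  return 0;
}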
LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, @@ -2754,7 +3149,9 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { if (C->isZero()) return; - assert(!SCEVCheckBlock->getParent()->hasOptSize() && + assert(!(SCEVCheckBlock->getParent()->hasOptSize() || + (OptForSizeBasedOnProfile && + Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && "Cannot SCEV check stride or overflow when optimizing for size"); SCEVCheckBlock->setName("vector.scevcheck"); @@ -2792,15 +3189,8 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); if (!RtPtrChecking.Need) return; - Instruction *FirstCheckInst; - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, - RtPtrChecking.getChecks(), RtPtrChecking.getSE()); - assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " - "claimed checks are required"); - if (MemCheckBlock->getParent()->hasOptSize()) { + if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced " "to vectorize."); @@ -2820,22 +3210,33 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, "vector.ph"); + auto *CondBranch = cast<BranchInst>( + Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); + ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); + LoopBypassBlocks.push_back(MemCheckBlock); + AddedSafetyChecks = true; + // Update dominator only if this is first RT check. if (LoopBypassBlocks.empty()) { DT->changeImmediateDominator(Bypass, MemCheckBlock); DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); } - ReplaceInstWithInst( - MemCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); - LoopBypassBlocks.push_back(MemCheckBlock); - AddedSafetyChecks = true; + Instruction *FirstCheckInst; + Instruction *MemRuntimeCheck; + std::tie(FirstCheckInst, MemRuntimeCheck) = + addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, + RtPtrChecking.getChecks(), RtPtrChecking.getSE()); + assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " + "claimed checks are required"); + CondBranch->setCondition(MemRuntimeCheck); // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. - LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, - PSE.getSE()); + LVer = std::make_unique<LoopVersioning>( + *Legal->getLAI(), + Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, + DT, PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -2939,74 +3340,35 @@ Value *InnerLoopVectorizer::emitTransformedIndex( llvm_unreachable("invalid enum"); } -BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { - /* - In this function we generate a new loop. The new loop will contain - the vectorized instructions while the old loop will continue to run the - scalar remainder. - - [ ] <-- loop iteration number check. - / | - / v - | [ ] <-- vector loop bypass (may consist of multiple blocks). - | / | - | / v - || [ ] <-- vector pre header. - |/ | - | v - | [ ] \ - | [ ]_| <-- vector loop. 
- | | - | v - | -[ ] <--- middle-block. - | / | - | / v - -|- >[ ] <--- new preheader. - | | - | v - | [ ] \ - | [ ]_| <-- old scalar loop to handle remainder. - \ | - \ v - >[ ] <-- exit block. - ... - */ - - MDNode *OrigLoopID = OrigLoop->getLoopID(); - - // Some loops have a single integer induction variable, while other loops - // don't. One example is c++ iterators that often have multiple pointer - // induction variables. In the code below we also support a case where we - // don't have a single induction variable. - // - // We try to obtain an induction variable from the original loop as hard - // as possible. However if we don't find one that: - // - is an integer - // - counts from zero, stepping by one - // - is the size of the widest induction variable type - // then we create a new one. - OldInduction = Legal->getPrimaryInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - - // Split the single block loop into the two loop structure described above. +Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopScalarBody = OrigLoop->getHeader(); LoopVectorPreHeader = OrigLoop->getLoopPreheader(); - LoopExitBlock = OrigLoop->getExitBlock(); + LoopExitBlock = OrigLoop->getUniqueExitBlock(); assert(LoopExitBlock && "Must have an exit block"); assert(LoopVectorPreHeader && "Invalid loop structure"); LoopMiddleBlock = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - LI, nullptr, "middle.block"); + LI, nullptr, Twine(Prefix) + "middle.block"); LoopScalarPreHeader = SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, - nullptr, "scalar.ph"); + nullptr, Twine(Prefix) + "scalar.ph"); + + // Set up branch from middle block to the exit and scalar preheader blocks. + // completeLoopSkeleton will update the condition to use an iteration check, + // if required to decide whether to execute the remainder. + BranchInst *BrInst = + BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); + auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); + BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); + ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); + // We intentionally don't let SplitBlock to update LoopInfo since // LoopVectorBody should belong to another loop than LoopVectorPreHeader. // LoopVectorBody is explicitly added to the correct place few lines later. LoopVectorBody = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - nullptr, nullptr, "vector.body"); + nullptr, nullptr, Twine(Prefix) + "vector.body"); // Update dominator for loop exit. DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); @@ -3023,37 +3385,16 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { LI->addTopLevelLoop(Lp); } Lp->addBasicBlockToLoop(LoopVectorBody, *LI); + return Lp; +} - // Find the loop boundaries. - Value *Count = getOrCreateTripCount(Lp); - - Value *StartIdx = ConstantInt::get(IdxTy, 0); - - // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. This check also covers the case where the - // backedge-taken count is uint##_max: adding one to it will overflow leading - // to an incorrect trip count of zero. In this (rare) case we will also jump - // to the scalar loop. - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); - - // Generate the code to check any assumptions that we've made for SCEV - // expressions. 
- emitSCEVChecks(Lp, LoopScalarPreHeader); - - // Generate the code that checks in runtime if arrays overlap. We put the - // checks into a separate block to make the more common case of few elements - // faster. - emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - - // Generate the induction variable. - // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - Constant *Step = ConstantInt::get(IdxTy, VF * UF); - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); - +void InnerLoopVectorizer::createInductionResumeValues( + Loop *L, Value *VectorTripCount, + std::pair<BasicBlock *, Value *> AdditionalBypass) { + assert(VectorTripCount && L && "Expected valid arguments"); + assert(((AdditionalBypass.first && AdditionalBypass.second) || + (!AdditionalBypass.first && !AdditionalBypass.second)) && + "Inconsistent information about additional bypass."); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3061,10 +3402,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // iteration in the vectorized loop. // If we come from a bypass edge then we need to start from the original // start value. - - // This variable saves the new starting index for the scalar loop. It is used - // to test if there are any tail iterations left once the vector loop has - // completed. for (auto &InductionEntry : Legal->getInductionVars()) { PHINode *OrigPhi = InductionEntry.first; InductionDescriptor II = InductionEntry.second; @@ -3076,20 +3413,32 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // Copy original phi DL over to the new one. BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; + Value *EndValueFromAdditionalBypass = AdditionalBypass.second; if (OrigPhi == OldInduction) { // We know what the end value is. - EndValue = CountRoundDown; + EndValue = VectorTripCount; } else { - IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); + IRBuilder<> B(L->getLoopPreheader()->getTerminator()); Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = - CastInst::getCastOpcode(CountRoundDown, true, StepType, true); - Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); + CastInst::getCastOpcode(VectorTripCount, true, StepType, true); + Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); - } + // Compute the end value for the additional bypass (if applicable). + if (AdditionalBypass.first) { + B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); + CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, + StepType, true); + CRD = + B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); + EndValueFromAdditionalBypass = + emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + EndValueFromAdditionalBypass->setName("ind.end"); + } + } // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. 
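// [Editor's note - illustrative sketch, not part of this commit] The resume
// value computed above ("ind.end") is the value each induction would reach
// after VectorTripCount scalar iterations, so the scalar remainder loop can
// pick up exactly where the vector loop stopped. Standalone model for a
// simple strided integer induction (all numbers assumed):
#include <cstdio>

int main() {
  const int Start = 7, Step = 3;         // induction start value and stride
  const unsigned VectorTripCount = 24;   // iterations done by the vector loop
  int EndValue = Start + int(VectorTripCount) * Step; // "ind.end"
  std::printf("scalar loop resumes with induction value %d\n", EndValue);
  return 0;
}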
BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); @@ -3099,42 +3448,44 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // value. for (BasicBlock *BB : LoopBypassBlocks) BCResumeVal->addIncoming(II.getStartValue(), BB); + + if (AdditionalBypass.first) + BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, + EndValueFromAdditionalBypass); + OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } +} + +BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, + MDNode *OrigLoopID) { + assert(L && "Expected valid loop."); - // We need the OrigLoop (scalar loop part) latch terminator to help - // produce correct debug info for the middle block BB instructions. - // The legality check stage guarantees that the loop will have a single - // latch. - assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && - "Scalar loop latch terminator isn't a branch"); - BranchInst *ScalarLatchBr = - cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); + // The trip counts should be cached by now. + Value *Count = getOrCreateTripCount(L); + Value *VectorTripCount = getOrCreateVectorTripCount(L); + + auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. // If tail is to be folded, we know we don't need to run the remainder. - Value *CmpN = Builder.getTrue(); if (!Cost->foldTailByMasking()) { - CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, - CountRoundDown, "cmp.n", - LoopMiddleBlock->getTerminator()); + Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + Count, VectorTripCount, "cmp.n", + LoopMiddleBlock->getTerminator()); - // Here we use the same DebugLoc as the scalar loop latch branch instead + // Here we use the same DebugLoc as the scalar loop latch terminator instead // of the corresponding compare because they may have ended up with // different line numbers and we want to avoid awkward line stepping while // debugging. Eg. if the compare has got a line number inside the loop. - cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); + CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); + cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); } - BranchInst *BrInst = - BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); - BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); - ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); - // Get ready to start creating new instructions into the vectorized body. - assert(LoopVectorPreHeader == Lp->getLoopPreheader() && + assert(LoopVectorPreHeader == L->getLoopPreheader() && "Inconsistent vector loop preheader"); Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); @@ -3142,7 +3493,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized}); if (VectorizedLoopID.hasValue()) { - Lp->setLoopID(VectorizedLoopID.getValue()); + L->setLoopID(VectorizedLoopID.getValue()); // Do not setAlreadyVectorized if loop attributes have been defined // explicitly. @@ -3152,9 +3503,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). 
if (MDNode *LID = OrigLoop->getLoopID()) - Lp->setLoopID(LID); + L->setLoopID(LID); - LoopVectorizeHints Hints(Lp, true, *ORE); + LoopVectorizeHints Hints(L, true, *ORE); Hints.setAlreadyVectorized(); #ifdef EXPENSIVE_CHECKS @@ -3165,6 +3516,91 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { return LoopVectorPreHeader; } +BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { + /* + In this function we generate a new loop. The new loop will contain + the vectorized instructions while the old loop will continue to run the + scalar remainder. + + [ ] <-- loop iteration number check. + / | + / v + | [ ] <-- vector loop bypass (may consist of multiple blocks). + | / | + | / v + || [ ] <-- vector pre header. + |/ | + | v + | [ ] \ + | [ ]_| <-- vector loop. + | | + | v + | -[ ] <--- middle-block. + | / | + | / v + -|- >[ ] <--- new preheader. + | | + | v + | [ ] \ + | [ ]_| <-- old scalar loop to handle remainder. + \ | + \ v + >[ ] <-- exit block. + ... + */ + + // Get the metadata of the original loop before it gets modified. + MDNode *OrigLoopID = OrigLoop->getLoopID(); + + // Create an empty vector loop, and prepare basic blocks for the runtime + // checks. + Loop *Lp = createVectorLoopSkeleton(""); + + // Now, compare the new count to zero. If it is zero skip the vector loop and + // jump to the scalar loop. This check also covers the case where the + // backedge-taken count is uint##_max: adding one to it will overflow leading + // to an incorrect trip count of zero. In this (rare) case we will also jump + // to the scalar loop. + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); + + // Generate the code to check any assumptions that we've made for SCEV + // expressions. + emitSCEVChecks(Lp, LoopScalarPreHeader); + + // Generate the code that checks in runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); + + // Some loops have a single integer induction variable, while other loops + // don't. One example is c++ iterators that often have multiple pointer + // induction variables. In the code below we also support a case where we + // don't have a single induction variable. + // + // We try to obtain an induction variable from the original loop as hard + // as possible. However if we don't find one that: + // - is an integer + // - counts from zero, stepping by one + // - is the size of the widest induction variable type + // then we create a new one. + OldInduction = Legal->getPrimaryInduction(); + Type *IdxTy = Legal->getWidestInductionType(); + Value *StartIdx = ConstantInt::get(IdxTy, 0); + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); + Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + + // Emit phis for the new starting index of the scalar loop. + createInductionResumeValues(Lp, CountRoundDown); + + return completeLoopSkeleton(Lp, OrigLoopID); +} + // Fix up external users of the induction variable. At this point, we are // in LCSSA form, with all external PHIs that use the IV having one input value, // coming from the remainder loop. 
We need those PHIs to also have a correct @@ -3178,7 +3614,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, // value (the value that feeds into the phi from the loop latch). // We allow both, but they, obviously, have different values. - assert(OrigLoop->getExitBlock() && "Expected a single exit block"); + assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); DenseMap<Value *, Value *> MissingVals; @@ -3284,9 +3720,10 @@ static void cse(BasicBlock *BB) { } } -unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, - unsigned VF, - bool &NeedToScalarize) { +InstructionCost +LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, + bool &NeedToScalarize) { + assert(!VF.isScalable() && "scalable vectors not yet supported."); Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector<Type *, 4> Tys, ScalarTys; @@ -3297,9 +3734,9 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // to be vectors, so we need to extract individual elements from there, // execute VF scalar calls, and then gather the result into the vector return // value. - unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, - TTI::TCK_RecipThroughput); - if (VF == 1) + InstructionCost ScalarCallCost = + TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); + if (VF.isScalar()) return ScalarCallCost; // Compute corresponding vector type for return value and arguments. @@ -3309,31 +3746,33 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. - unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); + InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); - unsigned Cost = ScalarCallCost * VF + ScalarizationCost; + InstructionCost Cost = + ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. NeedToScalarize = true; - VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); + VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); if (!TLI || CI->isNoBuiltin() || !VecFunc) return Cost; // If the corresponding vector cost is cheaper, return its cost. 
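// [Editor's note - illustrative sketch, not part of this commit] The cost
// comparison above weighs scalarizing a call (VF scalar calls plus the
// extract/insert overhead) against issuing one vector library call.
// Standalone model of that decision with made-up cost numbers:
#include <cstdio>

int main() {
  const unsigned VF = 4;
  const unsigned ScalarCallCost = 10;   // one scalar call
  const unsigned ScalarizationCost = 6; // unpack arguments / pack the result
  const unsigned VectorCallCost = 28;   // one call to a vector routine

  unsigned ScalarizedCost = ScalarCallCost * VF + ScalarizationCost;
  bool NeedToScalarize = VectorCallCost >= ScalarizedCost;
  std::printf("scalarized %u vs vector %u -> %s\n", ScalarizedCost,
              VectorCallCost,
              NeedToScalarize ? "scalarize the call" : "use the vector call");
  return 0;
}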
- unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, - TTI::TCK_RecipThroughput); + InstructionCost VectorCallCost = + TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); if (VectorCallCost < Cost) { NeedToScalarize = false; - return VectorCallCost; + Cost = VectorCallCost; } return Cost; } -unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, - unsigned VF) { +InstructionCost +LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, + ElementCount VF) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); @@ -3373,7 +3812,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), KV.second); auto *TruncatedTy = FixedVectorType::get( - ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements()); + ScalarTruncatedTy, + cast<FixedVectorType>(OriginalTy)->getNumElements()); if (TruncatedTy == OriginalTy) continue; @@ -3423,13 +3863,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { break; } } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { - auto Elements0 = - cast<VectorType>(SI->getOperand(0)->getType())->getNumElements(); + auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) + ->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( SI->getOperand(0), FixedVectorType::get(ScalarTruncatedTy, Elements0)); - auto Elements1 = - cast<VectorType>(SI->getOperand(1)->getType())->getNumElements(); + auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) + ->getNumElements(); auto *O1 = B.CreateZExtOrTrunc( SI->getOperand(1), FixedVectorType::get(ScalarTruncatedTy, Elements1)); @@ -3439,16 +3879,16 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { // Don't do anything with the operands, just extend the result. continue; } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { - auto Elements = - cast<VectorType>(IE->getOperand(0)->getType())->getNumElements(); + auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) + ->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( IE->getOperand(0), FixedVectorType::get(ScalarTruncatedTy, Elements)); auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { - auto Elements = - cast<VectorType>(EE->getOperand(0)->getType())->getNumElements(); + auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) + ->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( EE->getOperand(0), FixedVectorType::get(ScalarTruncatedTy, Elements)); @@ -3490,7 +3930,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { void InnerLoopVectorizer::fixVectorizedLoop() { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. - if (VF > 1) + if (VF.isVector()) truncateToMinimalBitwidths(); // Fix widened non-induction PHIs by setting up the PHI operands. @@ -3531,9 +3971,13 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // profile is not inherently precise anyway. Note also possible bypass of // vector code caused by legality checks is ignored, assigning all the weight // to the vector loop, optimistically. 
- setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), - LI->getLoopFor(LoopVectorBody), - LI->getLoopFor(LoopScalarBody), VF * UF); + // + // For scalable vectorization we can't know at compile time how many iterations + // of the loop are handled in one vector iteration, so instead assume a pessimistic + // vscale of '1'. + setProfileInfoAfterUnrolling( + LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), + LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs() { @@ -3612,11 +4056,12 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Create a vector from the initial value. auto *VectorInit = ScalarInit; - if (VF > 1) { + if (VF.isVector()) { Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); VectorInit = Builder.CreateInsertElement( - UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)), - VectorInit, Builder.getInt32(VF - 1), "vector.recur.init"); + PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, + Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); } // We constructed a temporary phi node in the first phase of vectorization. @@ -3657,10 +4102,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. - SmallVector<int, 8> ShuffleMask(VF); - ShuffleMask[0] = VF - 1; - for (unsigned I = 1; I < VF; ++I) - ShuffleMask[I] = I + VF - 1; + assert(!VF.isScalable()); + SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); + ShuffleMask[0] = VF.getKnownMinValue() - 1; + for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) + ShuffleMask[I] = I + VF.getKnownMinValue() - 1; // The vector from which to take the initial value for the current iteration // (actual or unrolled). Initially, this is the vector phi node. @@ -3670,9 +4116,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { for (unsigned Part = 0; Part < UF; ++Part) { Value *PreviousPart = getOrCreateVectorValue(Previous, Part); Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); - auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, - ShuffleMask) - : Incoming; + auto *Shuffle = + VF.isVector() + ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) + : Incoming; PhiPart->replaceAllUsesWith(Shuffle); cast<Instruction>(PhiPart)->eraseFromParent(); VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); @@ -3685,10 +4132,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Extract the last vector element in the middle block. This will be the // initial value for the recurrence when jumping to the scalar loop. auto *ExtractForScalar = Incoming; - if (VF > 1) { + if (VF.isVector()) { Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); ExtractForScalar = Builder.CreateExtractElement( - ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); + ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), + "vector.recur.extract"); } // Extract the second last element in the middle block if the // Phi is used outside the loop. 
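// [Editor's note - illustrative sketch, not part of this commit] The shuffle
// mask built above splices the last lane of the previous iteration's vector
// in front of the first VF - 1 lanes of the current one, which is how the
// first-order recurrence value is threaded across vector iterations.
// Standalone model (VF assumed to be 4, giving the mask <3, 4, 5, 6>):
#include <cstdio>
#include <vector>

int main() {
  const unsigned VF = 4;
  std::vector<int> Mask(VF);
  Mask[0] = int(VF - 1);          // last lane of the first (previous) vector
  for (unsigned I = 1; I < VF; ++I)
    Mask[I] = int(I + VF - 1);    // lanes 0..VF-2 of the second vector
  for (int M : Mask)
    std::printf("%d ", M);        // 3 4 5 6
  std::printf("\n");
  return 0;
}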
We need to extract the phi itself @@ -3696,9 +4144,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // will be the value when jumping to the exit block from the LoopMiddleBlock, // when the scalar loop is not run at all. Value *ExtractForPhiUsedOutsideLoop = nullptr; - if (VF > 1) + if (VF.isVector()) ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( - Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); + Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), + "vector.recur.extract.for.phi"); // When loop is unrolled without vectorizing, initialize // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of // `Incoming`. This is analogous to the vectorized case above: extracting the @@ -3722,69 +4171,31 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // vector recurrence we extracted in the middle block. Since the loop is in // LCSSA form, we just need to find all the phi nodes for the original scalar // recurrence in the exit block, and then add an edge for the middle block. - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { - if (LCSSAPhi.getIncomingValue(0) == Phi) { + // Note that LCSSA does not imply single entry when the original scalar loop + // had multiple exiting edges (as we always run the last iteration in the + // scalar epilogue); in that case, the exiting path through middle will be + // dynamically dead and the value picked for the phi doesn't matter. + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) + if (any_of(LCSSAPhi.incoming_values(), + [Phi](Value *V) { return V == Phi; })) LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); - } - } } void InnerLoopVectorizer::fixReduction(PHINode *Phi) { - Constant *Zero = Builder.getInt32(0); - // Get it's reduction variable descriptor. assert(Legal->isReductionVariable(Phi) && "Unable to find the reduction variable"); RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; - RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); + RecurKind RK = RdxDesc.getRecurrenceKind(); TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = - RdxDesc.getMinMaxRecurrenceKind(); setDebugLocFromInst(Builder, ReductionStartValue); - - // We need to generate a reduction vector from the incoming scalar. - // To do so, we need to generate the 'identity' vector and override - // one of the elements with the incoming scalar reduction. We need - // to do it in the vector-loop preheader. - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); // This is the vector-clone of the value that leaves the loop. Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); - // Find the reduction identity variable. Zero for addition, or, xor, - // one for multiplication, -1 for And. - Value *Identity; - Value *VectorStart; - if (RK == RecurrenceDescriptor::RK_IntegerMinMax || - RK == RecurrenceDescriptor::RK_FloatMinMax) { - // MinMax reduction have the start value as their identify. 
- if (VF == 1) { - VectorStart = Identity = ReductionStartValue; - } else { - VectorStart = Identity = - Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); - } - } else { - // Handle other reduction kinds: - Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( - RK, VecTy->getScalarType()); - if (VF == 1) { - Identity = Iden; - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - VectorStart = ReductionStartValue; - } else { - Identity = ConstantVector::getSplat({VF, false}, Iden); - - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - VectorStart = - Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); - } - } - // Wrap flags are in general invalid after vectorization, clear them. clearReductionWrapFlags(RdxDesc); @@ -3798,10 +4209,6 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); Value *Val = getOrCreateVectorValue(LoopVal, Part); - // Make sure to add the reduction start value only to the - // first unroll part. - Value *StartVal = (Part == 0) ? VectorStart : Identity; - cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); cast<PHINode>(VecRdxPhi) ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); } @@ -3816,8 +4223,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // If tail is folded by masking, the vector value to leave the loop should be // a Select choosing between the vectorized LoopExitInst and vectorized Phi, - // instead of the former. - if (Cost->foldTailByMasking()) { + // instead of the former. For an inloop reduction the reduction will already + // be predicated, and does not need to be handled here. + if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecLoopExitInst = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); @@ -3831,14 +4239,31 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } assert(Sel && "Reduction exit feeds no select"); VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); + + // If the target can create a predicated operator for the reduction at no + // extra cost in the loop (for example a predicated vadd), it can be + // cheaper for the select to remain in the loop than be sunk out of it, + // and so use the select value for the phi instead of the old + // LoopExitValue. + RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; + if (PreferPredicatedReductionSelect || + TTI->preferPredicatedReductionSelect( + RdxDesc.getOpcode(), Phi->getType(), + TargetTransformInfo::ReductionFlags())) { + auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); + VecRdxPhi->setIncomingValueForBlock( + LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); + } } } // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. 
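// [Editor's note - illustrative sketch, not part of this commit] The block
// removed above (its logic now runs when the reduction phi is widened) seeds
// every lane but the first with the reduction's identity value, so folding
// the extra lanes in is a no-op. Standalone summary of the common identities,
// with an assumed start value for the min/max case:
#include <cstdio>

int main() {
  const int AddIdentity = 0;   // add, or, xor
  const int MulIdentity = 1;   // mul
  const int AndIdentity = ~0;  // and: all ones
  const int StartValue = 42;   // min/max reductions reuse the start value
  std::printf("add %d, mul %d, and %#x, min/max %d\n", AddIdentity,
              MulIdentity, unsigned(AndIdentity), StartValue);
  return 0;
}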
- if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { - Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF); + if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { + assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); Builder.SetInsertPoint( LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); VectorParts RdxParts(UF); @@ -3865,7 +4290,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // Reduce all of the unrolled parts into a single vector. Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); - unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); + unsigned Op = RecurrenceDescriptor::getOpcode(RK); // The middle block terminator has already been assigned a DebugLoc here (the // OrigLoop's single latch terminator). We want the whole middle block to @@ -3884,14 +4309,14 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { ReducedPartRdx, "bin.rdx"), RdxDesc.getFastMathFlags()); else - ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, - RdxPart); + ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); } - if (VF > 1) { - bool NoNaN = Legal->hasFunNoNaNAttr(); + // Create the reduction after the loop. Note that inloop reductions create the + // target reduction in the loop using a Reduction recipe. + if (VF.isVector() && !IsInLoopReductionPhi) { ReducedPartRdx = - createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); + createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (Phi->getType() != RdxDesc.getRecurrenceType()) @@ -3911,21 +4336,17 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. - // We know that the loop is in LCSSA form. We need to update the - // PHI nodes in the exit blocks. - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { - // All PHINodes need to have a single entry edge, or two if - // we already fixed them. - assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); - // We found a reduction value exit-PHI. Update it with the - // incoming bypass edge. - if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) + // We know that the loop is in LCSSA form. We need to update the PHI nodes + // in the exit blocks. See comment on analogous loop in + // fixFirstOrderRecurrence for a more complete explaination of the logic. + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) + if (any_of(LCSSAPhi.incoming_values(), + [LoopExitInst](Value *V) { return V == LoopExitInst; })) LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); - } // end of the LCSSA phi scan. - // Fix the scalar loop reduction variable with the incoming reduction sum - // from the vector body and from the backedge value. + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. 
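// [Editor's note - illustrative sketch, not part of this commit] As in the
// loop above, the UF partial result vectors are first combined element-wise
// with the reduction opcode ("bin.rdx") and only the final vector is reduced
// horizontally to a scalar. Standalone model for an integer add reduction
// with assumed UF, VF, and data:
#include <cstdio>

int main() {
  const unsigned UF = 2, VF = 4;
  const int Parts[UF][VF] = {{1, 2, 3, 4}, {10, 20, 30, 40}};

  int Combined[VF];
  for (unsigned Lane = 0; Lane < VF; ++Lane) {
    Combined[Lane] = Parts[0][Lane];
    for (unsigned Part = 1; Part < UF; ++Part)
      Combined[Lane] += Parts[Part][Lane]; // element-wise across parts
  }

  int Result = 0;
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    Result += Combined[Lane];              // horizontal reduction
  std::printf("reduced value = %d\n", Result); // 110
  return 0;
}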
int IncomingEdgeBlockIdx = Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); @@ -3937,9 +4358,8 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { void InnerLoopVectorizer::clearReductionWrapFlags( RecurrenceDescriptor &RdxDesc) { - RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); - if (RK != RecurrenceDescriptor::RK_IntegerAdd && - RK != RecurrenceDescriptor::RK_IntegerMult) + RecurKind RK = RdxDesc.getRecurrenceKind(); + if (RK != RecurKind::Add && RK != RecurKind::Mul) return; Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); @@ -3968,22 +4388,27 @@ void InnerLoopVectorizer::clearReductionWrapFlags( void InnerLoopVectorizer::fixLCSSAPHIs() { for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { - if (LCSSAPhi.getNumIncomingValues() == 1) { - auto *IncomingValue = LCSSAPhi.getIncomingValue(0); - // Non-instruction incoming values will have only one value. - unsigned LastLane = 0; - if (isa<Instruction>(IncomingValue)) - LastLane = Cost->isUniformAfterVectorization( - cast<Instruction>(IncomingValue), VF) - ? 0 - : VF - 1; - // Can be a loop invariant incoming value or the last scalar value to be - // extracted from the vectorized loop. - Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - Value *lastIncomingValue = - getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); - LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); - } + if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) + // Some phis were already hand updated by the reduction and recurrence + // code above, leave them alone. + continue; + + auto *IncomingValue = LCSSAPhi.getIncomingValue(0); + // Non-instruction incoming values will have only one value. + unsigned LastLane = 0; + if (isa<Instruction>(IncomingValue)) + LastLane = Cost->isUniformAfterVectorization( + cast<Instruction>(IncomingValue), VF) + ? 0 + : VF.getKnownMinValue() - 1; + assert((!VF.isScalable() || LastLane == 0) && + "scalable vectors dont support non-uniform scalars yet"); + // Can be a loop invariant incoming value or the last scalar value to be + // extracted from the vectorized loop. + Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); + Value *lastIncomingValue = + getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); + LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); } } @@ -4087,9 +4512,9 @@ void InnerLoopVectorizer::fixNonInductionPHIs() { } } -void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, - unsigned UF, unsigned VF, - bool IsPtrLoopInvariant, +void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, + VPUser &Operands, unsigned UF, + ElementCount VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State) { // Construct a vector GEP by widening the operands of the scalar GEP as @@ -4098,7 +4523,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, // is vector-typed. Thus, to keep the representation compact, we only use // vector-typed operands for loop-varying values. - if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { // If we are vectorizing, but the GEP has only loop-invariant operands, // the GEP we build (by only using vector-typed operands for // loop-varying values) would be a scalar pointer. 
Thus, to ensure we @@ -4114,7 +4539,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, auto *Clone = Builder.Insert(GEP->clone()); for (unsigned Part = 0; Part < UF; ++Part) { Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); + State.set(VPDef, GEP, EntryPart, Part); addMetadata(EntryPart, GEP); } } else { @@ -4149,16 +4574,19 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, Indices) : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); - assert((VF == 1 || NewGEP->getType()->isVectorTy()) && + assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && "NewGEP is not a pointer vector"); - VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); + State.set(VPDef, GEP, NewGEP, Part); addMetadata(NewGEP, GEP); } } } -void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, - unsigned VF) { +void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, + RecurrenceDescriptor *RdxDesc, + Value *StartV, unsigned UF, + ElementCount VF) { + assert(!VF.isScalable() && "scalable vectors not yet supported."); PHINode *P = cast<PHINode>(PN); if (EnableVPlanNativePath) { // Currently we enter here in the VPlan-native path for non-induction @@ -4166,7 +4594,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Create a vector phi with no operands - the vector phi operands will be // set at the end of vector code generation. Type *VecTy = - (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF); + (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF); Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); VectorLoopValueMap.setVectorValue(P, 0, VecPhi); OrigPHIsToFix.push_back(P); @@ -4181,18 +4609,60 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Phi nodes have cycles, so we need to vectorize them in two stages. This is // stage #1: We create a new vector PHI node with no incoming edges. We'll use // this value when we vectorize all of the instructions that use the PHI. - if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { + if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { + Value *Iden = nullptr; + bool ScalarPHI = + (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); + Type *VecTy = + ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); + + if (RdxDesc) { + assert(Legal->isReductionVariable(P) && StartV && + "RdxDesc should only be set for reduction variables; in that case " + "a StartV is also required"); + RecurKind RK = RdxDesc->getRecurrenceKind(); + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { + // MinMax reduction have the start value as their identify. 
+ if (ScalarPHI) { + Iden = StartV; + } else { + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident"); + } + } else { + Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( + RK, VecTy->getScalarType()); + Iden = IdenC; + + if (!ScalarPHI) { + Iden = ConstantVector::getSplat(VF, IdenC); + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + Constant *Zero = Builder.getInt32(0); + StartV = Builder.CreateInsertElement(Iden, StartV, Zero); + } + } + } + for (unsigned Part = 0; Part < UF; ++Part) { // This is phase one of vectorizing PHIs. - Type *VecTy = - (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF); Value *EntryPart = PHINode::Create( VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); VectorLoopValueMap.setVectorValue(P, Part, EntryPart); + if (StartV) { + // Make sure to add the reduction start value only to the + // first unroll part. + Value *StartVal = (Part == 0) ? StartV : Iden; + cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); + } } return; } + assert(!Legal->isReductionVariable(P) && + "reductions should be handled above"); + setDebugLocFromInst(Builder, P); // This PHINode must be an induction variable. @@ -4213,26 +4683,74 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, case InductionDescriptor::IK_PtrInduction: { // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); - // This is the normalized GEP that starts counting at zero. - Value *PtrInd = Induction; - PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); - // Determine the number of scalars we need to generate for each unroll - // iteration. If the instruction is uniform, we only need to generate the - // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; - // These are the scalar results. Notice that we don't generate vector GEPs - // because scalar GEPs result in better code. - for (unsigned Part = 0; Part < UF; ++Part) { - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); - Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); - Value *SclrGep = - emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); - SclrGep->setName("next.gep"); - VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); + + if (Cost->isScalarAfterVectorization(P, VF)) { + // This is the normalized GEP that starts counting at zero. + Value *PtrInd = + Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); + // Determine the number of scalars we need to generate for each unroll + // iteration. If the instruction is uniform, we only need to generate the + // first lane. Otherwise, we generate all VF values. + unsigned Lanes = + Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.getKnownMinValue(); + for (unsigned Part = 0; Part < UF; ++Part) { + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + Constant *Idx = ConstantInt::get(PtrInd->getType(), + Lane + Part * VF.getKnownMinValue()); + Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); + Value *SclrGep = + emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); + SclrGep->setName("next.gep"); + VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); + } } + return; + } + assert(isa<SCEVConstant>(II.getStep()) && + "Induction step not a SCEV constant!"); + Type *PhiType = II.getStep()->getType(); + + // Build a pointer phi + Value *ScalarStartValue = II.getStartValue(); + Type *ScStValueType = ScalarStartValue->getType(); + PHINode *NewPointerPhi = + PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); + NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); + + // A pointer induction, performed by using a gep + BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); + Instruction *InductionLoc = LoopLatch->getTerminator(); + const SCEV *ScalarStep = II.getStep(); + SCEVExpander Exp(*PSE.getSE(), DL, "induction"); + Value *ScalarStepValue = + Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); + Value *InductionGEP = GetElementPtrInst::Create( + ScStValueType->getPointerElementType(), NewPointerPhi, + Builder.CreateMul( + ScalarStepValue, + ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), + "ptr.ind", InductionLoc); + NewPointerPhi->addIncoming(InductionGEP, LoopLatch); + + // Create UF many actual address geps that use the pointer + // phi as base and a vectorized version of the step value + // (<step*0, ..., step*N>) as offset. + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<Constant *, 8> Indices; + // Create a vector of consecutive numbers from zero to VF. + for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) + Indices.push_back( + ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); + Constant *StartOffset = ConstantVector::get(Indices); + + Value *GEP = Builder.CreateGEP( + ScStValueType->getPointerElementType(), NewPointerPhi, + Builder.CreateMul( + StartOffset, + Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), + "vector.gep")); + VectorLoopValueMap.setVectorValue(P, Part, GEP); } - return; } } } @@ -4255,7 +4773,8 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, +void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, + VPUser &User, VPTransformState &State) { switch (I.getOpcode()) { case Instruction::Call: @@ -4297,7 +4816,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, VecOp->copyIRFlags(&I); // Use this vector value for all users of the original instruction. - VectorLoopValueMap.setVectorValue(&I, Part, V); + State.set(Def, &I, V, Part); addMetadata(V, &I); } @@ -4321,7 +4840,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, } else { C = Builder.CreateICmp(Cmp->getPredicate(), A, B); } - VectorLoopValueMap.setVectorValue(&I, Part, C); + State.set(Def, &I, C, Part); addMetadata(C, &I); } @@ -4345,12 +4864,12 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, /// Vectorize casts. Type *DestTy = - (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF); + (VF.isScalar()) ? 
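The call-widening code here chooses between a target intrinsic and a vectorized library routine purely by comparing their modelled costs; the intrinsic wins only when an intrinsic ID exists and is no more expensive than the call. A minimal sketch of the shape of that decision (standalone C++; the cost numbers are made up and do not reflect LLVM's cost model):

#include <cstdio>
#include <optional>

// Illustrative decision: prefer the vector intrinsic when one exists and
// its modelled cost does not exceed the vector library-call cost.
// Falling back to scalarization is assumed to have been ruled out earlier.
const char *chooseWidening(std::optional<int> IntrinsicID,
                           unsigned IntrinsicCost, unsigned CallCost) {
  bool UseVectorIntrinsic =
      IntrinsicID.has_value() && IntrinsicCost <= CallCost;
  return UseVectorIntrinsic ? "vector intrinsic" : "vector library call";
}

int main() {
  std::printf("%s\n", chooseWidening(std::optional<int>(1), 3, 5)); // intrinsic
  std::printf("%s\n", chooseWidening(std::nullopt, 0, 5));          // lib call
}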
CI->getType() : VectorType::get(CI->getType(), VF); for (unsigned Part = 0; Part < UF; ++Part) { Value *A = State.get(User.getOperand(0), Part); Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); - VectorLoopValueMap.setVectorValue(&I, Part, Cast); + State.set(Def, &I, Cast, Part); addMetadata(Cast, &I); } break; @@ -4362,7 +4881,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, } // end of switch. } -void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, +void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, + VPUser &ArgOperands, VPTransformState &State) { assert(!isa<DbgInfoIntrinsic>(I) && "DbgInfoIntrinsic should have been dropped during VPlan construction"); @@ -4373,7 +4893,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, SmallVector<Type *, 4> Tys; for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); @@ -4381,11 +4901,13 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? bool NeedToScalarize = false; - unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); - bool UseVectorIntrinsic = - ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; + InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; + bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; assert((UseVectorIntrinsic || !NeedToScalarize) && "Instruction should be scalarized elsewhere."); + assert(IntrinsicCost.isValid() && CallCost.isValid() && + "Cannot have invalid costs while widening"); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector<Value *, 4> Args; @@ -4404,15 +4926,15 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, if (UseVectorIntrinsic) { // Use vector version of the intrinsic. Type *TysForDecl[] = {CI->getType()}; - if (VF > 1) - TysForDecl[0] = - FixedVectorType::get(CI->getType()->getScalarType(), VF); + if (VF.isVector()) { + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); + } VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); assert(VectorF && "Can't retrieve vector intrinsic."); } else { // Use vector version of the function call. 
- const VFShape Shape = - VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); + const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); #ifndef NDEBUG assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && "Can't create vector function."); @@ -4426,12 +4948,12 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, if (isa<FPMathOperator>(V)) V->copyFastMathFlags(CI); - VectorLoopValueMap.setVectorValue(&I, Part, V); + State.set(Def, &I, V, Part); addMetadata(V, &I); } } -void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, +void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, bool InvariantCond, VPTransformState &State) { @@ -4450,16 +4972,16 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, Value *Op0 = State.get(Operands.getOperand(1), Part); Value *Op1 = State.get(Operands.getOperand(2), Part); Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); - VectorLoopValueMap.setVectorValue(&I, Part, Sel); + State.set(VPDef, &I, Sel, Part); addMetadata(Sel, &I); } } -void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { +void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. - assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && + assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && "This function should not be visited twice for the same VF"); SmallSetVector<Instruction *, 8> Worklist; @@ -4468,6 +4990,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // accesses that will remain scalar. SmallSetVector<Instruction *, 8> ScalarPtrs; SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; + auto *Latch = TheLoop->getLoopLatch(); // A helper that returns true if the use of Ptr by MemAccess will be scalar. // The pointer operands of loads and stores will be scalar as long as the @@ -4493,11 +5016,33 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { !TheLoop->isLoopInvariant(V); }; - // A helper that evaluates a memory access's use of a pointer. If the use - // will be a scalar use, and the pointer is only used by memory accesses, we - // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in - // PossibleNonScalarPtrs. + auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { + if (!isa<PHINode>(Ptr) || + !Legal->getInductionVars().count(cast<PHINode>(Ptr))) + return false; + auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; + if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) + return false; + return isScalarUse(MemAccess, Ptr); + }; + + // A helper that evaluates a memory access's use of a pointer. If the + // pointer is actually the pointer induction of a loop, it is being + // inserted into Worklist. If the use will be a scalar use, and the + // pointer is only used by memory accesses, we place the pointer in + // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
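As described in the comment above and implemented in the evaluatePtrUse helper that follows, each address is sorted three ways: a scalar pointer induction goes straight onto the scalars worklist together with its update, a pointer whose use here is scalar and whose users are all memory accesses goes into ScalarPtrs, and anything else lands in PossibleNonScalarPtrs because a single GEP can feed both widened and scalarized accesses. A minimal sketch of that classification, with the predicates abstracted into flags (illustrative only):

#include <cstdio>

enum class PtrClass { ScalarWorklist, ScalarPtr, PossiblyNonScalar };

// Illustrative: mirror the order of checks in evaluatePtrUse. The three
// booleans stand in for isScalarPtrInduction, a scalar use by this access,
// and "every user is a memory access using this value as its address".
PtrClass classifyPointer(bool IsScalarPtrInduction, bool IsScalarUseHere,
                         bool AllUsersAreMemAccesses) {
  if (IsScalarPtrInduction)
    return PtrClass::ScalarWorklist;   // pointer IV plus its update
  if (IsScalarUseHere && AllUsersAreMemAccesses)
    return PtrClass::ScalarPtr;        // candidate scalar address
  return PtrClass::PossiblyNonScalar;  // conservatively non-scalar
}

int main() {
  std::printf("%d %d %d\n", (int)classifyPointer(true, true, true),
              (int)classifyPointer(false, true, true),
              (int)classifyPointer(false, true, false));
}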
auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { + if (isScalarPtrInduction(MemAccess, Ptr)) { + Worklist.insert(cast<Instruction>(Ptr)); + Instruction *Update = cast<Instruction>( + cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); + Worklist.insert(Update); + LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr + << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update + << "\n"); + return; + } // We only care about bitcast and getelementptr instructions contained in // the loop. if (!isLoopVaryingBitCastOrGEP(Ptr)) @@ -4521,10 +5066,9 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { }; // We seed the scalars analysis with three classes of instructions: (1) - // instructions marked uniform-after-vectorization, (2) bitcast and - // getelementptr instructions used by memory accesses requiring a scalar use, - // and (3) pointer induction variables and their update instructions (we - // currently only scalarize these). + // instructions marked uniform-after-vectorization and (2) bitcast, + // getelementptr and (pointer) phi instructions used by memory accesses + // requiring a scalar use. // // (1) Add to the worklist all instructions that have been identified as // uniform-after-vectorization. @@ -4550,24 +5094,6 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { Worklist.insert(I); } - // (3) Add to the worklist all pointer induction variables and their update - // instructions. - // - // TODO: Once we are able to vectorize pointer induction variables we should - // no longer insert them into the worklist here. - auto *Latch = TheLoop->getLoopLatch(); - for (auto &Induction : Legal->getInductionVars()) { - auto *Ind = Induction.first; - auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); - if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) - continue; - Worklist.insert(Ind); - Worklist.insert(IndUpdate); - LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); - LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate - << "\n"); - } - // Insert the forced scalars. // FIXME: Currently widenPHIInstruction() often creates a dead vector // induction variable when the PHI user is scalarized. @@ -4603,14 +5129,6 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { auto *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); - // We already considered pointer induction variables, so there's no reason - // to look at their users again. - // - // TODO: Once we are able to vectorize pointer induction variables we - // should no longer skip over them here. - if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) - continue; - // If tail-folding is applied, the primary induction variable will be used // to feed a vector compare. 
if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) @@ -4646,7 +5164,8 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { +bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, + ElementCount VF) { if (!blockNeedsPredication(I->getParent())) return false; switch(I->getOpcode()) { @@ -4660,7 +5179,7 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne auto *Ty = getMemInstValueType(I); // We have already decided how to vectorize this instruction, get that // result. - if (VF > 1) { + if (VF.isVector()) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -4681,8 +5200,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne return false; } -bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, - unsigned VF) { +bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( + Instruction *I, ElementCount VF) { assert(isAccessInterleaved(I) && "Expecting interleaved access."); assert(getWideningDecision(I, VF) == CM_Unknown && "Decision should not be set yet."); @@ -4718,8 +5237,8 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, : TTI.isLegalMaskedStore(Ty, Alignment); } -bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, - unsigned VF) { +bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( + Instruction *I, ElementCount VF) { // Get and ensure we have a valid memory instruction. LoadInst *LI = dyn_cast<LoadInst>(I); StoreInst *SI = dyn_cast<StoreInst>(I); @@ -4746,13 +5265,13 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, return true; } -void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { +void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which // already does this check. Collecting Uniforms for VF=1 does not make any // sense. - assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && + assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. If we'll not find any uniform value, we'll @@ -4778,6 +5297,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // replicating region where only a single instance out of VF should be formed. // TODO: optimize such seldom cases if found important, see PR40816. auto addToWorklistIfAllowed = [&](Instruction *I) -> void { + if (isOutOfScope(I)) { + LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " + << *I << "\n"); + return; + } if (isScalarWithPredication(I, VF)) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); @@ -4794,65 +5318,71 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) addToWorklistIfAllowed(Cmp); - // Holds consecutive and consecutive-like pointers. Consecutive-like pointers - // are pointers that are treated like consecutive pointers during - // vectorization. The pointer operands of interleaved accesses are an - // example. 
- SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; - - // Holds pointer operands of instructions that are possibly non-uniform. - SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; - - auto isUniformDecision = [&](Instruction *I, unsigned VF) { + auto isUniformDecision = [&](Instruction *I, ElementCount VF) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); + // A uniform memory op is itself uniform. We exclude uniform stores + // here as they demand the last lane, not the first one. + if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { + assert(WideningDecision == CM_Scalarize); + return true; + } + return (WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse || WideningDecision == CM_Interleave); }; - // Iterate over the instructions in the loop, and collect all - // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible - // that a consecutive-like pointer operand will be scalarized, we collect it - // in PossibleNonUniformPtrs instead. We use two sets here because a single - // getelementptr instruction can be used by both vectorized and scalarized - // memory instructions. For example, if a loop loads and stores from the same - // location, but the store is conditional, the store will be scalarized, and - // the getelementptr won't remain uniform. + + + // Returns true if Ptr is the pointer operand of a memory access instruction + // I, and I is known to not require scalarization. + auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { + return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); + }; + + // Holds a list of values which are known to have at least one uniform use. + // Note that there may be other uses which aren't uniform. A "uniform use" + // here is something which only demands lane 0 of the unrolled iterations; + // it does not imply that all lanes produce the same value (e.g. this is not + // the usual meaning of uniform) + SmallPtrSet<Value *, 8> HasUniformUse; + + // Scan the loop for instructions which are either a) known to have only + // lane 0 demanded or b) are uses which demand only lane 0 of their operand. for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { // If there's no pointer operand, there's nothing to do. - auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); + auto *Ptr = getLoadStorePointerOperand(&I); if (!Ptr) continue; - // True if all users of Ptr are memory accesses that have Ptr as their - // pointer operand. - auto UsersAreMemAccesses = - llvm::all_of(Ptr->users(), [&](User *U) -> bool { - return getLoadStorePointerOperand(U) == Ptr; - }); - - // Ensure the memory instruction will not be scalarized or used by - // gather/scatter, making its pointer operand non-uniform. If the pointer - // operand is used by any instruction other than a memory access, we - // conservatively assume the pointer operand may be non-uniform. - if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) - PossibleNonUniformPtrs.insert(Ptr); - - // If the memory instruction will be vectorized and its pointer operand - // is consecutive-like, or interleaving - the pointer operand should - // remain uniform. - else - ConsecutiveLikePtrs.insert(Ptr); + // A uniform memory op is itself uniform. We exclude uniform stores + // here as they demand the last lane, not the first one. 
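The rewritten uniforms analysis below reasons about which lanes are demanded: a value feeding only vectorized memory addresses needs just lane 0, so its operand chain may stay scalar, whereas a uniform store still demands the last lane and is excluded. A small sketch of the propagation step that follows (a value is kept "uniform" only if every user demands lane 0 alone), using a toy use graph rather than LLVM IR:

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Illustrative: a value is added to the uniform worklist only when every
// one of its users is a vectorized memory access that uses it purely as
// the address (i.e. demands lane 0 only).
bool allUsesDemandLaneZeroOnly(
    const std::string &V,
    const std::map<std::string, std::vector<std::string>> &Users,
    const std::map<std::string, bool> &UserIsLaneZeroOnlyUse) {
  for (const std::string &U : Users.at(V))
    if (!UserIsLaneZeroOnlyUse.at(U))
      return false;
  return true;
}

int main() {
  std::map<std::string, std::vector<std::string>> Users = {
      {"gep", {"load", "store.cond"}}};
  std::map<std::string, bool> LaneZeroOnly = {{"load", true},
                                              {"store.cond", false}};
  std::printf("gep uniform? %s\n",
              allUsesDemandLaneZeroOnly("gep", Users, LaneZeroOnly) ? "yes"
                                                                    : "no");
}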
+ if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) + addToWorklistIfAllowed(&I); + + if (isUniformDecision(&I, VF)) { + assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); + HasUniformUse.insert(Ptr); + } } - // Add to the Worklist all consecutive and consecutive-like pointers that - // aren't also identified as possibly non-uniform. - for (auto *V : ConsecutiveLikePtrs) - if (!PossibleNonUniformPtrs.count(V)) - addToWorklistIfAllowed(V); + // Add to the worklist any operands which have *only* uniform (e.g. lane 0 + // demanding) users. Since loops are assumed to be in LCSSA form, this + // disallows uses outside the loop as well. + for (auto *V : HasUniformUse) { + if (isOutOfScope(V)) + continue; + auto *I = cast<Instruction>(V); + auto UsersAreMemAccesses = + llvm::all_of(I->users(), [&](User *U) -> bool { + return isVectorizedMemAccessUse(cast<Instruction>(U), V); + }); + if (UsersAreMemAccesses) + addToWorklistIfAllowed(I); + } // Expand Worklist in topological order: whenever a new instruction // is added , its users should be already inside Worklist. It ensures @@ -4875,20 +5405,12 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { auto *OI = cast<Instruction>(OV); if (llvm::all_of(OI->users(), [&](User *U) -> bool { auto *J = cast<Instruction>(U); - return Worklist.count(J) || - (OI == getLoadStorePointerOperand(J) && - isUniformDecision(J, VF)); + return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); })) addToWorklistIfAllowed(OI); } } - // Returns true if Ptr is the pointer operand of a memory access instruction - // I, and I is known to not require scalarization. - auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { - return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); - }; - // For an instruction to be added into Worklist above, all its users inside // the loop should also be in Worklist. However, this condition cannot be // true for phi nodes that form a cyclic dependence. We must process phi @@ -4961,8 +5483,8 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { return false; } -Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, - unsigned UserIC) { +Optional<ElementCount> +LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically // uniform if the target can skip. @@ -4982,9 +5504,13 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, return None; } + ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); + switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return UserVF ? UserVF : computeFeasibleMaxVF(TC); + return MaxVF; + case CM_ScalarEpilogueNotAllowedUsePredicate: + LLVM_FALLTHROUGH; case CM_ScalarEpilogueNotNeededUsePredicate: LLVM_DEBUG( dbgs() << "LV: vector predicate hint/switch found.\n" @@ -5005,9 +5531,26 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, // for size. if (runtimeChecksRequired()) return None; + break; } + // The only loops we can vectorize without a scalar epilogue, are loops with + // a bottom-test and a single exiting block. We'd have to handle the fact + // that not every instruction executes on the last iteration. This will + // require a lane mask which varies through the vector loop body. 
(TODO) + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + // If there was a tail-folding hint/switch, but we can't fold the tail by + // masking, fallback to a vectorization with a scalar epilogue. + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " + "scalar epilogue instead.\n"); + ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + return MaxVF; + } + return None; + } + // Now try the tail folding // Invalidate interleave groups that require an epilogue if we can't mask @@ -5020,10 +5563,21 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC); - assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2"); - unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; - if (TC > 0 && TC % MaxVFtimesIC == 0) { + assert(!MaxVF.isScalable() && + "Scalable vectors do not yet support tail folding"); + assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && + "MaxVF must be a power of 2"); + unsigned MaxVFtimesIC = + UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); + // Avoid tail folding if the trip count is known to be a multiple of any VF we + // chose. + ScalarEvolution *SE = PSE.getSE(); + const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + const SCEV *Rem = SE->getURemExpr( + ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); + if (Rem->isZero()) { // Accept MaxVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); return MaxVF; @@ -5038,6 +5592,20 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, return MaxVF; } + // If there was a tail-folding hint/switch, but we can't fold the tail by + // masking, fallback to a vectorization with a scalar epilogue. + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " + "scalar epilogue instead.\n"); + ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + return MaxVF; + } + + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { + LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); + return None; + } + if (TC == 0) { reportVectorizationFailure( "Unable to calculate the loop count due to complex control flow", @@ -5055,8 +5623,33 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, return None; } -unsigned -LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { +ElementCount +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF) { + bool IgnoreScalableUserVF = UserVF.isScalable() && + !TTI.supportsScalableVectors() && + !ForceTargetSupportsScalableVectors; + if (IgnoreScalableUserVF) { + LLVM_DEBUG( + dbgs() << "LV: Ignoring VF=" << UserVF + << " because target does not support scalable vectors.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "Ignoring VF=" << ore::NV("UserVF", UserVF) + << " because target does not support scalable vectors."; + }); + } + + // Beyond this point two scenarios are handled. 
If UserVF isn't specified + // then a suitable VF is chosen. If UserVF is specified and there are + // dependencies, check if it's legal. However, if a UserVF is specified and + // there are no dependencies, then there's nothing to do. + if (UserVF.isNonZero() && !IgnoreScalableUserVF && + Legal->isSafeForAnyVectorWidth()) + return UserVF; + MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5066,9 +5659,62 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from // the memory accesses that is most restrictive (involved in the smallest // dependence distance). - unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); + unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); + + // If the user vectorization factor is legally unsafe, clamp it to a safe + // value. Otherwise, return as is. + if (UserVF.isNonZero() && !IgnoreScalableUserVF) { + unsigned MaxSafeElements = + PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); + ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); + + if (UserVF.isScalable()) { + Optional<unsigned> MaxVScale = TTI.getMaxVScale(); + + // Scale VF by vscale before checking if it's safe. + MaxSafeVF = ElementCount::getScalable( + MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); + + if (MaxSafeVF.isZero()) { + // The dependence distance is too small to use scalable vectors, + // fallback on fixed. + LLVM_DEBUG( + dbgs() + << "LV: Max legal vector width too small, scalable vectorization " + "unfeasible. Using fixed-width vectorization instead.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "Max legal vector width too small, scalable vectorization " + << "unfeasible. Using fixed-width vectorization instead."; + }); + return computeFeasibleMaxVF( + ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); + } + } + + LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); + + if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) + return UserVF; + + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe, clamping to max safe VF=" << MaxSafeVF + << ".\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is unsafe, clamping to maximum safe vectorization factor " + << ore::NV("VectorizationFactor", MaxSafeVF); + }); + return MaxSafeVF; + } - WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); + WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. 
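The clamping logic above can be summarised arithmetically: the widest safe element count is the largest power of two not exceeding MaxSafeVectorWidthInBits divided by the widest type width, and a scalable request is additionally divided by the maximum vscale before the comparison. A worked standalone sketch (the bit widths are example inputs, not derived from any real target):

#include <cstdio>

// Largest power of two <= X (for X > 0); conceptually like PowerOf2Floor.
unsigned powerOf2Floor(unsigned X) {
  unsigned P = 1;
  while (P * 2 <= X)
    P *= 2;
  return P;
}

int main() {
  unsigned MaxSafeVectorWidthInBits = 384; // from the dependence analysis
  unsigned WidestTypeBits = 32;            // widest loaded/stored type
  unsigned MaxSafeElements =
      powerOf2Floor(MaxSafeVectorWidthInBits / WidestTypeBits); // 8

  unsigned UserVF = 16; // fixed-width request
  unsigned ClampedVF = UserVF <= MaxSafeElements ? UserVF : MaxSafeElements;
  std::printf("fixed: user VF %u clamped to %u\n", UserVF, ClampedVF);

  unsigned MaxVScale = 4; // hypothetical target maximum vscale
  unsigned MaxSafeScalableVF = MaxSafeElements / MaxVScale; // 2
  std::printf("scalable: max safe VF = vscale x %u\n", MaxSafeScalableVF);
}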
@@ -5079,12 +5725,13 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister << " bits.\n"); - assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" - " into one vector!"); + assert(MaxVectorSize <= WidestRegister && + "Did not expect to pack so many elements" + " into one vector!"); if (MaxVectorSize == 0) { LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); MaxVectorSize = 1; - return MaxVectorSize; + return ElementCount::getFixed(MaxVectorSize); } else if (ConstTripCount && ConstTripCount < MaxVectorSize && isPowerOf2_32(ConstTripCount)) { // We need to clamp the VF to be the ConstTripCount. There is no point in @@ -5092,7 +5739,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " << ConstTripCount << "\n"); MaxVectorSize = ConstTripCount; - return MaxVectorSize; + return ElementCount::getFixed(MaxVectorSize); } unsigned MaxVF = MaxVectorSize; @@ -5100,10 +5747,10 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { (MaximizeBandwidth && isScalarEpilogueAllowed())) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). - SmallVector<unsigned, 8> VFs; + SmallVector<ElementCount, 8> VFs; unsigned NewMaxVectorSize = WidestRegister / SmallestType; for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) - VFs.push_back(VS); + VFs.push_back(ElementCount::getFixed(VS)); // For each VF calculate its register usage. auto RUs = calculateRegisterUsage(VFs); @@ -5118,7 +5765,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { Selected = false; } if (Selected) { - MaxVF = VFs[i]; + MaxVF = VFs[i].getKnownMinValue(); break; } } @@ -5130,30 +5777,39 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { } } } - return MaxVF; + return ElementCount::getFixed(MaxVF); } VectorizationFactor -LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { - float Cost = expectedCost(1).first; - const float ScalarCost = Cost; +LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { + // FIXME: This can be fixed for scalable vectors later, because at this stage + // the LoopVectorizer will only consider vectorizing a loop with scalable + // vectors when the loop has a hint to enable vectorization for a given VF. + assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); + + InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; + LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); + assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); + unsigned Width = 1; - LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); + const float ScalarCost = *ExpectedCost.getValue(); + float Cost = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; - if (ForceVectorization && MaxVF > 1) { + if (ForceVectorization && MaxVF.isVector()) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost // evaluation. 
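The selection loop that follows divides each candidate's modelled loop cost by its width and keeps the smallest per-lane cost; forcing vectorization simply seeds the running best with the maximum float so that some vector VF always beats the scalar baseline. A compact sketch of that comparison over invented costs:

#include <cstdio>
#include <limits>
#include <utility>
#include <vector>

int main() {
  // (VF, modelled loop cost) pairs; the numbers are invented.
  std::vector<std::pair<unsigned, float>> Candidates = {
      {1, 40.0f}, {2, 24.0f}, {4, 16.0f}, {8, 18.0f}};

  bool ForceVectorization = false;
  unsigned Width = 1;
  float Best = ForceVectorization ? std::numeric_limits<float>::max()
                                  : Candidates[0].second; // scalar baseline
  for (auto &C : Candidates) {
    if (C.first == 1)
      continue;
    float PerLane = C.second / (float)C.first; // cost divided by width
    if (PerLane < Best) {
      Best = PerLane;
      Width = C.first;
    }
  }
  std::printf("selected VF = %u (per-lane cost %.2f)\n", Width, Best);
}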
Cost = std::numeric_limits<float>::max(); } - for (unsigned i = 2; i <= MaxVF; i *= 2) { + for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. - VectorizationCostTy C = expectedCost(i); - float VectorCost = C.first / (float)i; + VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); + assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); + float VectorCost = *C.first.getValue() / (float)i; LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); if (!C.second && !ForceVectorization) { @@ -5162,6 +5818,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { << " because it will not generate any vector instructions.\n"); continue; } + + // If profitable add it to ProfitableVF list. + if (VectorCost < ScalarCost) { + ProfitableVFs.push_back(VectorizationFactor( + {ElementCount::getFixed(i), (unsigned)VectorCost})); + } + if (VectorCost < Cost) { Cost = VectorCost; Width = i; @@ -5180,10 +5843,131 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); - VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; + VectorizationFactor Factor = {ElementCount::getFixed(Width), + (unsigned)(Width * Cost)}; return Factor; } +bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( + const Loop &L, ElementCount VF) const { + // Cross iteration phis such as reductions need special handling and are + // currently unsupported. + if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { + return Legal->isFirstOrderRecurrence(&Phi) || + Legal->isReductionVariable(&Phi); + })) + return false; + + // Phis with uses outside of the loop require special handling and are + // currently unsupported. + for (auto &Entry : Legal->getInductionVars()) { + // Look for uses of the value of the induction at the last iteration. + Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); + for (User *U : PostInc->users()) + if (!L.contains(cast<Instruction>(U))) + return false; + // Look for uses of penultimate value of the induction. + for (User *U : Entry.first->users()) + if (!L.contains(cast<Instruction>(U))) + return false; + } + + // Induction variables that are widened require special handling that is + // currently not supported. + if (any_of(Legal->getInductionVars(), [&](auto &Entry) { + return !(this->isScalarAfterVectorization(Entry.first, VF) || + this->isProfitableToScalarize(Entry.first, VF)); + })) + return false; + + return true; +} + +bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( + const ElementCount VF) const { + // FIXME: We need a much better cost-model to take different parameters such + // as register pressure, code size increase and cost of extra branches into + // account. For now we apply a very crude heuristic and only consider loops + // with vectorization factors larger than a certain value. + // We also consider epilogue vectorization unprofitable for targets that don't + // consider interleaving beneficial (eg. MVE). 
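As the comment notes, the heuristic added below is deliberately crude: epilogue vectorization is treated as profitable only when the target reports a maximum interleave factor above 1 and the main-loop VF reaches the -epilogue-vectorization-minimum-VF threshold (16 by default). A minimal sketch of that predicate:

#include <cstdio>

// Illustrative restatement of the profitability check; the threshold
// mirrors the default of -epilogue-vectorization-minimum-VF.
bool epilogueProfitable(unsigned TargetMaxInterleave, unsigned MainLoopVF,
                        unsigned MinVFThreshold = 16) {
  if (TargetMaxInterleave <= 1) // e.g. targets that dislike interleaving
    return false;
  return MainLoopVF >= MinVFThreshold;
}

int main() {
  std::printf("%d %d %d\n", epilogueProfitable(4, 16), // 1
              epilogueProfitable(4, 8),                // 0
              epilogueProfitable(1, 32));              // 0
}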
+ if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) + return false; + if (VF.getFixedValue() >= EpilogueVectorizationMinVF) + return true; + return false; +} + +VectorizationFactor +LoopVectorizationCostModel::selectEpilogueVectorizationFactor( + const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { + VectorizationFactor Result = VectorizationFactor::Disabled(); + if (!EnableEpilogueVectorization) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); + return Result; + } + + if (!isScalarEpilogueAllowed()) { + LLVM_DEBUG( + dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " + "allowed.\n";); + return Result; + } + + // FIXME: This can be fixed for scalable vectors later, because at this stage + // the LoopVectorizer will only consider vectorizing a loop with scalable + // vectors when the loop has a hint to enable vectorization for a given VF. + if (MainLoopVF.isScalable()) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " + "yet supported.\n"); + return Result; + } + + // Not really a cost consideration, but check for unsupported cases here to + // simplify the logic. + if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { + LLVM_DEBUG( + dbgs() << "LEV: Unable to vectorize epilogue because the loop is " + "not a supported candidate.\n";); + return Result; + } + + if (EpilogueVectorizationForceVF > 1) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); + if (LVP.hasPlanWithVFs( + {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) + return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; + else { + LLVM_DEBUG( + dbgs() + << "LEV: Epilogue vectorization forced factor is not viable.\n";); + return Result; + } + } + + if (TheLoop->getHeader()->getParent()->hasOptSize() || + TheLoop->getHeader()->getParent()->hasMinSize()) { + LLVM_DEBUG( + dbgs() + << "LEV: Epilogue vectorization skipped due to opt for size.\n";); + return Result; + } + + if (!isEpilogueVectorizationProfitable(MainLoopVF)) + return Result; + + for (auto &NextVF : ProfitableVFs) + if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && + (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && + LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) + Result = NextVF; + + if (Result != VectorizationFactor::Disabled()) + LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " + << Result.Width.getFixedValue() << "\n";); + return Result; +} + std::pair<unsigned, unsigned> LoopVectorizationCostModel::getSmallestAndWidestTypes() { unsigned MinWidth = -1U; @@ -5210,6 +5994,11 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { if (!Legal->isReductionVariable(PN)) continue; RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; + if (PreferInLoopReductions || + TTI.preferInLoopReduction(RdxDesc.getOpcode(), + RdxDesc.getRecurrenceType(), + TargetTransformInfo::ReductionFlags())) + continue; T = RdxDesc.getRecurrenceType(); } @@ -5240,7 +6029,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { return {MinWidth, MaxWidth}; } -unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. 
@@ -5263,10 +6052,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, if (Legal->getMaxSafeDepDistBytes() != -1U) return 1; - // Do not interleave loops with a relatively small known or estimated trip - // count. auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); - if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) + const bool HasReductions = !Legal->getReductionVars().empty(); + // Do not interleave loops with a relatively small known or estimated trip + // count. But we will interleave when InterleaveSmallLoopScalarReduction is + // enabled, and the code has scalar reductions(HasReductions && VF = 1), + // because with the above conditions interleaving can expose ILP and break + // cross iteration dependences for reductions. + if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && + !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) return 1; RegisterUsage R = calculateRegisterUsage({VF})[0]; @@ -5294,7 +6088,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters << " registers of " << TTI.getRegisterClassName(pair.first) << " register class\n"); - if (VF == 1) { + if (VF.isScalar()) { if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) TargetNumRegisters = ForceTargetNumScalarRegs; } else { @@ -5318,10 +6112,11 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, } // Clamp the interleave ranges to reasonable counts. - unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); + unsigned MaxInterleaveCount = + TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); // Check if the user has overridden the max. - if (VF == 1) { + if (VF.isScalar()) { if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; } else { @@ -5330,28 +6125,47 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, } // If trip count is known or estimated compile time constant, limit the - // interleave count to be less than the trip count divided by VF. + // interleave count to be less than the trip count divided by VF, provided it + // is at least 1. + // + // For scalable vectors we can't know if interleaving is beneficial. It may + // not be beneficial for small loops if none of the lanes in the second vector + // iterations is enabled. However, for larger loops, there is likely to be a + // similar benefit as for fixed-width vectors. For now, we choose to leave + // the InterleaveCount as if vscale is '1', although if some information about + // the vector is known (e.g. min vector size), we can make a better decision. if (BestKnownTC) { - MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); + MaxInterleaveCount = + std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); + // Make sure MaxInterleaveCount is greater than 0. + MaxInterleaveCount = std::max(1u, MaxInterleaveCount); } - // If we did not calculate the cost for VF (because the user selected the VF) - // then we calculate the cost of VF here. - if (LoopCost == 0) - LoopCost = expectedCost(VF).first; - - assert(LoopCost && "Non-zero loop cost expected"); + assert(MaxInterleaveCount > 0 && + "Maximum interleave count must be greater than 0"); // Clamp the calculated IC to be between the 1 and the max interleave count // that the target and trip count allows. 
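The clamping around this point boils down to: the trip-count-derived bound is min(BestKnownTC / VF, target maximum) but never below 1, and the computed interleave count is then clamped into [1, MaxInterleaveCount]. A small worked sketch with example numbers:

#include <algorithm>
#include <cstdio>

int main() {
  unsigned BestKnownTC = 20, VF = 8, TargetMaxIC = 8;

  // Trip-count-derived bound, kept at least 1 (the std::max guard added
  // in the hunk above keeps a small trip count from zeroing it out).
  unsigned MaxInterleaveCount =
      std::max(1u, std::min(BestKnownTC / VF, TargetMaxIC)); // 2

  unsigned IC = 4; // e.g. derived from register pressure elsewhere
  IC = std::min(IC, MaxInterleaveCount);
  IC = std::max(1u, IC);
  std::printf("interleave count = %u\n", IC); // 2
}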
if (IC > MaxInterleaveCount) IC = MaxInterleaveCount; - else if (IC < 1) - IC = 1; + else + // Make sure IC is greater than 0. + IC = std::max(1u, IC); + + assert(IC > 0 && "Interleave count must be greater than 0."); + + // If we did not calculate the cost for VF (because the user selected the VF) + // then we calculate the cost of VF here. + if (LoopCost == 0) { + assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); + LoopCost = *expectedCost(VF).first.getValue(); + } + + assert(LoopCost && "Non-zero loop cost expected"); // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. - if (VF > 1 && !Legal->getReductionVars().empty()) { + if (VF.isVector() && HasReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -5359,11 +6173,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // Note that if we've already vectorized the loop we will have done the // runtime check and so interleaving won't require further checks. bool InterleavingRequiresRuntimePointerCheck = - (VF == 1 && Legal->getRuntimePointerChecking()->Need); + (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. - LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); + LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' + << "LV: IC is " << IC << '\n' + << "LV: VF is " << VF << '\n'); + const bool AggressivelyInterleaveReductions = + TTI.enableAggressiveInterleaving(HasReductions); if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and interleave until the cost of the @@ -5382,7 +6200,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // by this point), we can increase the critical path length if the loop // we're interleaving is inside another loop. Limit, by default to 2, so the // critical path only gets increased by one reduction operation. - if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { + if (HasReductions && TheLoop->getLoopDepth() > 1) { unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); SmallIC = std::min(SmallIC, F); StoresIC = std::min(StoresIC, F); @@ -5396,14 +6214,23 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, return std::max(StoresIC, LoadsIC); } - LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); - return SmallIC; + // If there are scalar reductions and TTI has enabled aggressive + // interleaving for reductions, we will interleave to expose ILP. + if (InterleaveSmallLoopScalarReduction && VF.isScalar() && + AggressivelyInterleaveReductions) { + LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); + // Interleave no less than SmallIC but not as aggressive as the normal IC + // to satisfy the rare situation when resources are too limited. + return std::max(IC / 2, SmallIC); + } else { + LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); + return SmallIC; + } } // Interleave if this is a large loop (small loops are already dealt with by // this point) that could benefit from interleaving. 
- bool HasReductions = !Legal->getReductionVars().empty(); - if (TTI.enableAggressiveInterleaving(HasReductions)) { + if (AggressivelyInterleaveReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); return IC; } @@ -5413,7 +6240,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, } SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> -LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { +LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -5485,26 +6312,17 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { TransposeEnds[Interval.second].push_back(Interval.first); SmallPtrSet<Instruction *, 8> OpenIntervals; - - // Get the size of the widest register. - unsigned MaxSafeDepDist = -1U; - if (Legal->getMaxSafeDepDistBytes() != -1U) - MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; - unsigned WidestRegister = - std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); - const DataLayout &DL = TheFunction->getParent()->getDataLayout(); - SmallVector<RegisterUsage, 8> RUs(VFs.size()); SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); // A lambda that gets the register usage for the given type and VF. - auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { - if (Ty->isTokenTy()) + const auto &TTICapture = TTI; + auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { + if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) return 0U; - unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); - return std::max<unsigned>(1, VF * TypeSize / WidestRegister); + return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -5528,7 +6346,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { // Count the number of live intervals. SmallMapVector<unsigned, unsigned, 4> RegUsage; - if (VFs[j] == 1) { + if (VFs[j].isScalar()) { for (auto Inst : OpenIntervals) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); if (RegUsage.find(ClassID) == RegUsage.end()) @@ -5557,7 +6375,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { } } } - + for (auto& pair : RegUsage) { if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); @@ -5575,10 +6393,12 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { for (unsigned i = 0, e = VFs.size(); i < e; ++i) { SmallMapVector<unsigned, unsigned, 4> Invariant; - + for (auto Inst : LoopInvariants) { - unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); - unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); + unsigned Usage = + VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned ClassID = + TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); if (Invariant.find(ClassID) == Invariant.end()) Invariant[ClassID] = Usage; else @@ -5626,12 +6446,13 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ NumPredStores > NumberOfStoresToPredicate); } -void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { +void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { // If we aren't vectorizing the loop, or if we've already collected the // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. - if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) + if (VF.isScalar() || VF.isZero() || + InstsToScalarize.find(VF) != InstsToScalarize.end()) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's @@ -5660,14 +6481,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { } int LoopVectorizationCostModel::computePredInstDiscount( - Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, - unsigned VF) { + Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { assert(!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"); // Initialize the discount to zero, meaning that the scalar version and the // vector version cost the same. - int Discount = 0; + InstructionCost Discount = 0; // Holds instructions to analyze. The instructions we visit are mapped in // ScalarCosts. Those instructions are the ones that would be scalarized if @@ -5722,22 +6542,27 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the cost of the vector instruction. Note that this cost already // includes the scalarization overhead of the predicated instruction. - unsigned VectorCost = getInstructionCost(I, VF).first; + InstructionCost VectorCost = getInstructionCost(I, VF).first; // Compute the cost of the scalarized instruction. This cost is the cost of // the instruction as if it wasn't if-converted and instead remained in the // predicated block. We will scale this cost by block probability after // computing the scalarization overhead. - unsigned ScalarCost = VF * getInstructionCost(I, 1).first; + assert(!VF.isScalable() && "scalable vectors not yet supported."); + InstructionCost ScalarCost = + VF.getKnownMinValue() * + getInstructionCost(I, ElementCount::getFixed(1)).first; // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. 
if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(I->getType(), VF)), - APInt::getAllOnesValue(VF), true, false); - ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI, - TTI::TCK_RecipThroughput); + APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + ScalarCost += + VF.getKnownMinValue() * + TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); } // Compute the scalarization overhead of needed extractelement @@ -5750,10 +6575,12 @@ int LoopVectorizationCostModel::computePredInstDiscount( "Instruction has non-scalar type"); if (canBeScalarized(J)) Worklist.push_back(J); - else if (needsExtract(J, VF)) + else if (needsExtract(J, VF)) { + assert(!VF.isScalable() && "scalable vectors not yet supported."); ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(J->getType(), VF)), - APInt::getAllOnesValue(VF), false, true); + APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); + } } // Scale the total scalar cost by block probability. @@ -5765,11 +6592,11 @@ int LoopVectorizationCostModel::computePredInstDiscount( ScalarCosts[I] = ScalarCost; } - return Discount; + return *Discount.getValue(); } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::expectedCost(unsigned VF) { +LoopVectorizationCostModel::expectedCost(ElementCount VF) { VectorizationCostTy Cost; // For each block. @@ -5779,14 +6606,15 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { // For each instruction in the old loop. for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. - if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I))) + if (ValuesToIgnore.count(&I) || + (VF.isVector() && VecValuesToIgnore.count(&I))) continue; VectorizationCostTy C = getInstructionCost(&I, VF); // Check if we should override the cost. if (ForceTargetInstructionCost.getNumOccurrences() > 0) - C.first = ForceTargetInstructionCost; + C.first = InstructionCost(ForceTargetInstructionCost); BlockCost.first += C.first; BlockCost.second |= C.second; @@ -5799,9 +6627,10 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { // if-converted. This means that the block's instructions (aside from // stores and instructions that may divide by zero) will now be // unconditionally executed. For the scalar case, we may not always execute - // the predicated block. Thus, scale the block's cost by the probability of - // executing it. - if (VF == 1 && blockNeedsPredication(BB)) + // the predicated block, if it is an if-else block. Thus, scale the block's + // cost by the probability of executing it. blockNeedsPredication from + // Legal is used so as to not include all blocks in tail folded loops. 
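The division by getReciprocalPredBlockProb() in this area encodes the working assumption that a predicated block executes on roughly half of the scalar iterations, so its scalar cost is halved; the same factor scales the scalarized cost in the predication-discount computation. A tiny numeric sketch under that assumption (illustrative, not the LLVM helper itself):

#include <cstdio>

// Assumption used for illustration: a predicated block runs on about
// 1 in 2 iterations, expressed here as the reciprocal probability 2.
unsigned reciprocalPredBlockProb() { return 2; }

int main() {
  unsigned ScalarBlockCost = 12; // made-up per-iteration cost of the block
  unsigned Scaled = ScalarBlockCost / reciprocalPredBlockProb();
  std::printf("scaled predicated block cost = %u\n", Scaled); // 6
}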
+ if (VF.isScalar() && Legal->blockNeedsPredication(BB)) BlockCost.first /= getReciprocalPredBlockProb(); Cost.first += BlockCost.first; @@ -5846,9 +6675,12 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { Legal->hasStride(I->getOperand(1)); } -unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, - unsigned VF) { - assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); +InstructionCost +LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, + ElementCount VF) { + assert(VF.isVector() && + "Scalarization cost of instruction implies vectorization."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); @@ -5861,14 +6693,15 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); // Get the cost of the scalar memory instruction and address computation. - unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + InstructionCost Cost = + VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. const Align Alignment = getLoadStoreAlignment(I); - Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment, AS, - TTI::TCK_RecipThroughput); + Cost += VF.getKnownMinValue() * + TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, + AS, TTI::TCK_RecipThroughput); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. @@ -5889,8 +6722,9 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, return Cost; } -unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, - unsigned VF) { +InstructionCost +LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); @@ -5901,7 +6735,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Stride should be 1 or -1 for consecutive memory access"); const Align Alignment = getLoadStoreAlignment(I); - unsigned Cost = 0; + InstructionCost Cost = 0; if (Legal->isMaskRequired(I)) Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, CostKind); @@ -5915,8 +6749,11 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, return Cost; } -unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, - unsigned VF) { +InstructionCost +LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, + ElementCount VF) { + assert(Legal->isUniformMemOp(*I)); + Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -5937,11 +6774,12 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, (isLoopInvariantStoreValue ? 
0 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, - VF - 1)); + VF.getKnownMinValue() - 1)); } -unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, - unsigned VF) { +InstructionCost +LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -5953,8 +6791,9 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, TargetTransformInfo::TCK_RecipThroughput, I); } -unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, - unsigned VF) { +InstructionCost +LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); @@ -5963,7 +6802,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); - auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); // Holds the indices of existing members in an interleaved load group. // An interleaved store group doesn't need this as it doesn't allow gaps. @@ -5977,7 +6817,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, // Calculate the cost of the whole interleaved group. bool UseMaskForGaps = Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); - unsigned Cost = TTI.getInterleavedMemoryOpCost( + InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); @@ -5991,11 +6831,122 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, return Cost; } -unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, - unsigned VF) { +InstructionCost LoopVectorizationCostModel::getReductionPatternCost( + Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { + // Early exit for no inloop reductions + if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) + return InstructionCost::getInvalid(); + auto *VectorTy = cast<VectorType>(Ty); + + // We are looking for a pattern of, and finding the minimal acceptable cost: + // reduce(mul(ext(A), ext(B))) or + // reduce(mul(A, B)) or + // reduce(ext(A)) or + // reduce(A). + // The basic idea is that we walk down the tree to do that, finding the root + // reduction instruction in InLoopReductionImmediateChains. From there we find + // the pattern of mul/ext and test the cost of the entire pattern vs the cost + // of the components. If the reduction cost is lower then we return it for the + // reduction instruction and 0 for the other instructions in the pattern. If + // it is not we return an invalid cost specifying the orignal cost method + // should be used. 
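For illustration (a standalone sketch, not part of the patch), this is the kind of source loop whose reduction chain matches the reduce(mul(ext(A), ext(B))) form described in the comment above: each i8 load is sign-extended, the widened values are multiplied, and the product feeds the reducing add.

// Illustrative only: add(mul(sext(load A[i]), sext(load B[i]))) feeding the
// reduction phi 'Sum', i.e. the reduce(mul(ext(A), ext(B))) pattern.
int dot(const signed char *A, const signed char *B, int N) {
  int Sum = 0;                     // reduction phi
  for (int I = 0; I < N; ++I)
    Sum += (int)A[I] * (int)B[I];  // ext, ext, mul, then the reducing add
  return Sum;
}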
+ Instruction *RetI = I; + if ((RetI->getOpcode() == Instruction::SExt || + RetI->getOpcode() == Instruction::ZExt)) { + if (!RetI->hasOneUser()) + return InstructionCost::getInvalid(); + RetI = RetI->user_back(); + } + if (RetI->getOpcode() == Instruction::Mul && + RetI->user_back()->getOpcode() == Instruction::Add) { + if (!RetI->hasOneUser()) + return InstructionCost::getInvalid(); + RetI = RetI->user_back(); + } + + // Test if the found instruction is a reduction, and if not return an invalid + // cost specifying the parent to use the original cost modelling. + if (!InLoopReductionImmediateChains.count(RetI)) + return InstructionCost::getInvalid(); + + // Find the reduction this chain is a part of and calculate the basic cost of + // the reduction on its own. + Instruction *LastChain = InLoopReductionImmediateChains[RetI]; + Instruction *ReductionPhi = LastChain; + while (!isa<PHINode>(ReductionPhi)) + ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; + + RecurrenceDescriptor RdxDesc = + Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; + unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), + VectorTy, false, CostKind); + + // Get the operand that was not the reduction chain and match it to one of the + // patterns, returning the better cost if it is found. + Instruction *RedOp = RetI->getOperand(1) == LastChain + ? dyn_cast<Instruction>(RetI->getOperand(0)) + : dyn_cast<Instruction>(RetI->getOperand(1)); + + VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); + + if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && + !TheLoop->isLoopInvariant(RedOp)) { + bool IsUnsigned = isa<ZExtInst>(RedOp); + auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); + InstructionCost RedCost = TTI.getExtendedAddReductionCost( + /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, + CostKind); + + unsigned ExtCost = + TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, + TTI::CastContextHint::None, CostKind, RedOp); + if (RedCost.isValid() && RedCost < BaseCost + ExtCost) + return I == RetI ? *RedCost.getValue() : 0; + } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { + Instruction *Mul = RedOp; + Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); + Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); + if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && + Op0->getOpcode() == Op1->getOpcode() && + Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && + !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { + bool IsUnsigned = isa<ZExtInst>(Op0); + auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); + // reduce(mul(ext, ext)) + unsigned ExtCost = + TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, + TTI::CastContextHint::None, CostKind, Op0); + unsigned MulCost = + TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); + + InstructionCost RedCost = TTI.getExtendedAddReductionCost( + /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, + CostKind); + + if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) + return I == RetI ? *RedCost.getValue() : 0; + } else { + unsigned MulCost = + TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); + + InstructionCost RedCost = TTI.getExtendedAddReductionCost( + /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, + CostKind); + + if (RedCost.isValid() && RedCost < MulCost + BaseCost) + return I == RetI ? 
*RedCost.getValue() : 0; + } + } + + return I == RetI ? BaseCost : InstructionCost::getInvalid(); +} + +InstructionCost +LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, + ElementCount VF) { // Calculate scalar cost only. Vectorization cost should be ready at this // moment. - if (VF == 1) { + if (VF.isScalar()) { Type *ValTy = getMemInstValueType(I); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); @@ -6008,43 +6959,52 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { +LoopVectorizationCostModel::getInstructionCost(Instruction *I, + ElementCount VF) { // If we know that this instruction will remain uniform, check the cost of // the scalar version. if (isUniformAfterVectorization(I, VF)) - VF = 1; + VF = ElementCount::getFixed(1); - if (VF > 1 && isProfitableToScalarize(I, VF)) + if (VF.isVector() && isProfitableToScalarize(I, VF)) return VectorizationCostTy(InstsToScalarize[VF][I], false); // Forced scalars do not have any scalarization overhead. auto ForcedScalar = ForcedScalars.find(VF); - if (VF > 1 && ForcedScalar != ForcedScalars.end()) { + if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { auto InstSet = ForcedScalar->second; if (InstSet.count(I)) - return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); + return VectorizationCostTy( + (getInstructionCost(I, ElementCount::getFixed(1)).first * + VF.getKnownMinValue()), + false); } Type *VectorTy; - unsigned C = getInstructionCost(I, VF, VectorTy); + InstructionCost C = getInstructionCost(I, VF, VectorTy); bool TypeNotScalarized = - VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; + VF.isVector() && VectorTy->isVectorTy() && + TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); return VectorizationCostTy(C, TypeNotScalarized); } -unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, - unsigned VF) { +InstructionCost +LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, + ElementCount VF) { - if (VF == 1) + assert(!VF.isScalable() && + "cannot compute scalarization overhead for scalable vectorization"); + if (VF.isScalar()) return 0; - unsigned Cost = 0; + InstructionCost Cost = 0; Type *RetTy = ToVectorTy(I->getType(), VF); if (!RetTy->isVoidTy() && (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead( - cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false); + cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), + true, false); // Some targets keep addresses scalar. if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) @@ -6061,11 +7021,11 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, // Skip operands that do not require extraction/scalarization and do not incur // any overhead. 
return Cost + TTI.getOperandsScalarizationOverhead( - filterExtractingOperands(Ops, VF), VF); + filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); } -void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { - if (VF == 1) +void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { + if (VF.isScalar()) return; NumPredStores = 0; for (BasicBlock *BB : TheLoop->blocks()) { @@ -6082,23 +7042,19 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) NumPredStores++; - if (Legal->isUniform(Ptr) && - // Conditional loads and stores should be scalarized and predicated. - // isScalarWithPredication cannot be used here since masked - // gather/scatters are not considered scalar with predication. - !Legal->blockNeedsPredication(I.getParent())) { + if (Legal->isUniformMemOp(I)) { // TODO: Avoid replicating loads and stores instead of // relying on instcombine to remove them. // Load: Scalar load + broadcast // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract - unsigned Cost = getUniformMemOpCost(&I, VF); + InstructionCost Cost = getUniformMemOpCost(&I, VF); setWideningDecision(&I, VF, CM_Scalarize, Cost); continue; } // We assume that widening is the best solution when possible. if (memoryInstructionCanBeWidened(&I, VF)) { - unsigned Cost = getConsecutiveMemOpCost(&I, VF); + InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); int ConsecutiveStride = Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && @@ -6110,7 +7066,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { } // Choose between Interleaving, Gather/Scatter or Scalarization. - unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); + InstructionCost InterleaveCost = std::numeric_limits<int>::max(); unsigned NumAccesses = 1; if (isAccessInterleaved(&I)) { auto Group = getInterleavedAccessGroup(&I); @@ -6125,17 +7081,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { InterleaveCost = getInterleaveGroupCost(&I, VF); } - unsigned GatherScatterCost = + InstructionCost GatherScatterCost = isLegalGatherOrScatter(&I) ? getGatherScatterCost(&I, VF) * NumAccesses - : std::numeric_limits<unsigned>::max(); + : std::numeric_limits<int>::max(); - unsigned ScalarizationCost = + InstructionCost ScalarizationCost = getMemInstScalarizationCost(&I, VF) * NumAccesses; // Choose better solution for the current VF, // write down this decision and use it during vectorization. - unsigned Cost; + InstructionCost Cost; InstWidening Decision; if (InterleaveCost <= GatherScatterCost && InterleaveCost < ScalarizationCost) { @@ -6179,8 +7135,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { // Add all instructions used to generate the addresses. SmallVector<Instruction *, 4> Worklist; - for (auto *I : AddrDefs) - Worklist.push_back(I); + append_range(Worklist, AddrDefs); while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); for (auto &Op : I->operands()) @@ -6199,14 +7154,18 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { InstWidening Decision = getWideningDecision(I, VF); if (Decision == CM_Widen || Decision == CM_Widen_Reverse) // Scalarize a widened load of address. 
- setWideningDecision(I, VF, CM_Scalarize, - (VF * getMemoryInstructionCost(I, 1))); + setWideningDecision( + I, VF, CM_Scalarize, + (VF.getKnownMinValue() * + getMemoryInstructionCost(I, ElementCount::getFixed(1)))); else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) - setWideningDecision(Member, VF, CM_Scalarize, - (VF * getMemoryInstructionCost(Member, 1))); + setWideningDecision( + Member, VF, CM_Scalarize, + (VF.getKnownMinValue() * + getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); } } } else @@ -6216,9 +7175,9 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { } } -unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, - unsigned VF, - Type *&VectorTy) { +InstructionCost +LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, + Type *&VectorTy) { Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); @@ -6240,19 +7199,22 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // blocks requires also an extract of its vector compare i1 element. bool ScalarPredicatedBB = false; BranchInst *BI = cast<BranchInst>(I); - if (VF > 1 && BI->isConditional() && + if (VF.isVector() && BI->isConditional() && (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { // Return cost for branches around scalarized and predicated blocks. + assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *Vec_i1Ty = - FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); - return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), - false, true) + - (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF)); - } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) + VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); + return (TTI.getScalarizationOverhead( + Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), + false, true) + + (TTI.getCFInstrCost(Instruction::Br, CostKind) * + VF.getKnownMinValue())); + } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) // The back-edge branch will remain, as will all scalar branches. return TTI.getCFInstrCost(Instruction::Br, CostKind); else @@ -6267,20 +7229,20 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // First-order recurrences are replaced by vector shuffles inside the loop. // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. - if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) - return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - cast<VectorType>(VectorTy), VF - 1, - FixedVectorType::get(RetTy, 1)); + if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) + return TTI.getShuffleCost( + TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), + VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi // node, where N is the number of incoming values. 
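A minimal sketch of the case that comment covers (illustrative source, not from the patch): after if-conversion the two-input phi merging X collapses into a single select, i.e. N - 1 = 1 select, which is what the cost computed just below models.

// Illustrative only: 'X' is merged by a non-header phi with two incoming
// values; flattening the branch turns that phi into one select.
int clampPositive(const int *A, int I) {
  int X;
  if (A[I] > 0)
    X = A[I];   // value from the "then" block
  else
    X = 0;      // value from the "else" block; phi(X) becomes select(A[I] > 0, A[I], 0)
  return X;
}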
- if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) + if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( Instruction::Select, ToVectorTy(Phi->getType(), VF), ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), - CostKind); + CmpInst::BAD_ICMP_PREDICATE, CostKind); return TTI.getCFInstrCost(Instruction::PHI, CostKind); } @@ -6292,17 +7254,19 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. - if (VF > 1 && isScalarWithPredication(I)) { - unsigned Cost = 0; + if (VF.isVector() && isScalarWithPredication(I)) { + InstructionCost Cost = 0; // These instructions have a non-void type, so account for the phi nodes // that we will create. This cost is likely to be zero. The phi node // cost, if any, should be scaled by the block probability because it // models a copy at the end of each predicated block. - Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind); + Cost += VF.getKnownMinValue() * + TTI.getCFInstrCost(Instruction::PHI, CostKind); // The cost of the non-predicated instruction. - Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); + Cost += VF.getKnownMinValue() * + TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); // The cost of insertelement and extractelement instructions needed for // scalarization. @@ -6331,6 +7295,13 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // Since we will replace the stride by 1 the multiplication should go away. if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) return 0; + + // Detect reduction patterns + InstructionCost RedCost; + if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + .isValid()) + return RedCost; + // Certain instructions can be cheaper to vectorize if they have a constant // second vector operand. One example of this are shifts on x86. Value *Op2 = I->getOperand(1); @@ -6341,14 +7312,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Op2VK = TargetTransformInfo::OK_UniformValue; SmallVector<const Value *, 4> Operands(I->operand_values()); - unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { - unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, @@ -6362,10 +7334,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) - CondTy = FixedVectorType::get(CondTy, VF); - + CondTy = VectorType::get(CondTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, - CostKind, I); + CmpInst::BAD_ICMP_PREDICATE, CostKind, I); } case Instruction::ICmp: case Instruction::FCmp: { @@ -6374,18 +7345,18 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, - I); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, + CmpInst::BAD_ICMP_PREDICATE, CostKind, I); } case Instruction::Store: case Instruction::Load: { - unsigned Width = VF; - if (Width > 1) { + ElementCount Width = VF; + if (Width.isVector()) { InstWidening Decision = getWideningDecision(I, Width); assert(Decision != CM_Unknown && "CM decision should be taken at this point"); if (Decision == CM_Scalarize) - Width = 1; + Width = ElementCount::getFixed(1); } VectorTy = ToVectorTy(getMemInstValueType(I), Width); return getMemoryInstructionCost(I, VF); @@ -6402,15 +7373,62 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { + // Computes the CastContextHint from a Load/Store instruction. + auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { + assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && + "Expected a load or a store!"); + + if (VF.isScalar() || !TheLoop->contains(I)) + return TTI::CastContextHint::Normal; + + switch (getWideningDecision(I, VF)) { + case LoopVectorizationCostModel::CM_GatherScatter: + return TTI::CastContextHint::GatherScatter; + case LoopVectorizationCostModel::CM_Interleave: + return TTI::CastContextHint::Interleave; + case LoopVectorizationCostModel::CM_Scalarize: + case LoopVectorizationCostModel::CM_Widen: + return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked + : TTI::CastContextHint::Normal; + case LoopVectorizationCostModel::CM_Widen_Reverse: + return TTI::CastContextHint::Reversed; + case LoopVectorizationCostModel::CM_Unknown: + llvm_unreachable("Instr did not go through cost modelling?"); + } + + llvm_unreachable("Unhandled case!"); + }; + + unsigned Opcode = I->getOpcode(); + TTI::CastContextHint CCH = TTI::CastContextHint::None; + // For Trunc, the context is the only user, which must be a StoreInst. + if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { + if (I->hasOneUse()) + if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) + CCH = ComputeCCH(Store); + } + // For Z/Sext, the context is the operand, which must be a LoadInst. + else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || + Opcode == Instruction::FPExt) { + if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) + CCH = ComputeCCH(Load); + } + // We optimize the truncation of induction variables having constant // integer steps. The cost of these truncations is the same as the scalar // operation. 
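For example (an illustrative loop, not taken from the patch), the truncation below operates on the primary induction variable, which steps by a constant, so the truncated value is itself an induction and the truncation is costed like the scalar operation, as handled just below.

// Illustrative only: 'trunc i64 %iv to i32' where the source IV has a
// constant step, the case isOptimizableIVTruncate() accepts.
void storeIndices(int *A, long long N) {
  for (long long I = 0; I < N; ++I)
    A[I] = (int)I;   // truncation of the induction variable
}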
if (isOptimizableIVTruncate(I, VF)) { auto *Trunc = cast<TruncInst>(I); return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), - Trunc->getSrcTy(), CostKind, Trunc); + Trunc->getSrcTy(), CCH, CostKind, Trunc); } + // Detect reduction patterns + InstructionCost RedCost; + if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + .isValid()) + return RedCost; + Type *SrcScalarTy = I->getOperand(0)->getType(); Type *SrcVecTy = VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; @@ -6421,35 +7439,39 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // // Calculate the modified src and dest types. Type *MinVecTy = VectorTy; - if (I->getOpcode() == Instruction::Trunc) { + if (Opcode == Instruction::Trunc) { SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); - } else if (I->getOpcode() == Instruction::ZExt || - I->getOpcode() == Instruction::SExt) { + } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); } } - unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; - return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, - CostKind, I); + assert(!VF.isScalable() && "VF is assumed to be non scalable"); + unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; + return N * + TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } case Instruction::Call: { bool NeedToScalarize; CallInst *CI = cast<CallInst>(I); - unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); - if (getVectorIntrinsicIDForCall(CI, TLI)) - return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); + InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); + if (getVectorIntrinsicIDForCall(CI, TLI)) { + InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); + return std::min(CallCost, IntrinsicCost); + } return CallCost; } + case Instruction::ExtractValue: + return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, - CostKind) + + return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( + Instruction::Mul, VectorTy, CostKind) + getScalarizationOverhead(I, VF); } // end of switch. } @@ -6502,7 +7524,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // detection. for (auto &Reduction : Legal->getReductionVars()) { RecurrenceDescriptor &RedDes = Reduction.second; - SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); + const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } // Ignore type-casting instructions we identified during induction @@ -6514,6 +7536,43 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { } } +void LoopVectorizationCostModel::collectInLoopReductions() { + for (auto &Reduction : Legal->getReductionVars()) { + PHINode *Phi = Reduction.first; + RecurrenceDescriptor &RdxDesc = Reduction.second; + + // We don't collect reductions that are type promoted (yet). + if (RdxDesc.getRecurrenceType() != Phi->getType()) + continue; + + // If the target would prefer this reduction to happen "in-loop", then we + // want to record it as such. 
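As a rough scalar model of the distinction being recorded (illustrative only, assuming a fixed VF of 4; this is not the generated IR), an in-loop reduction folds each vector of lanes into the scalar accumulator inside the loop, rather than keeping a vector accumulator that is reduced once after the loop.

// Illustrative only: every iteration performs a horizontal add of the current
// group of lanes and accumulates the scalar result in-loop.
int sumInLoop(const int *A, int N) {
  int Sum = 0;                                      // scalar reduction phi
  int I = 0;
  for (; I + 4 <= N; I += 4)
    Sum += A[I] + A[I + 1] + A[I + 2] + A[I + 3];   // reduce, then accumulate
  for (; I < N; ++I)                                // scalar remainder
    Sum += A[I];
  return Sum;
}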
+ unsigned Opcode = RdxDesc.getOpcode(); + if (!PreferInLoopReductions && + !TTI.preferInLoopReduction(Opcode, Phi->getType(), + TargetTransformInfo::ReductionFlags())) + continue; + + // Check that we can correctly put the reductions into the loop, by + // finding the chain of operations that leads from the phi to the loop + // exit value. + SmallVector<Instruction *, 4> ReductionOperations = + RdxDesc.getReductionOpChain(Phi, TheLoop); + bool InLoop = !ReductionOperations.empty(); + if (InLoop) { + InLoopReductionChains[Phi] = ReductionOperations; + // Add the elements to InLoopReductionImmediateChains for cost modelling. + Instruction *LastChain = Phi; + for (auto *I : ReductionOperations) { + InLoopReductionImmediateChains[I] = LastChain; + LastChain = I; + } + } + LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") + << " reduction for phi: " << *Phi << "\n"); + } +} + // TODO: we could return a pair of values that specify the max VF and // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment @@ -6527,37 +7586,40 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { - unsigned VF = UserVF; +LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { + assert(!UserVF.isScalable() && "scalable vectors not yet supported"); + ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in // the vectorization pipeline. - if (!OrigLoop->empty()) { + if (!OrigLoop->isInnermost()) { // If the user doesn't provide a vectorization factor, determine a // reasonable one. - if (!UserVF) { - VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); + if (UserVF.isZero()) { + VF = ElementCount::getFixed( + determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. - if (VPlanBuildStressTest && VF < 2) { + if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " << "overriding computed VF.\n"); - VF = 4; + VF = ElementCount::getFixed(4); } } assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); - LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF - << " to build VPlans.\n"); + assert(isPowerOf2_32(VF.getKnownMinValue()) && + "VF needs to be a power of two"); + LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") + << "VF " << VF << " to build VPlans.\n"); buildVPlans(VF, VF); // For VPlan build stress testing, we bail out after VPlan construction. 
if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0}; + return {VF, 0 /*Cost*/}; } LLVM_DEBUG( @@ -6566,10 +7628,10 @@ LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { return VectorizationFactor::Disabled(); } -Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF, - unsigned UserIC) { - assert(OrigLoop->empty() && "Inner loop expected."); - Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); +Optional<VectorizationFactor> +LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { + assert(OrigLoop->isInnermost() && "Inner loop expected."); + Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -6587,40 +7649,55 @@ Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF, CM.invalidateCostModelingDecisions(); } - if (UserVF) { - LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); - assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); + ElementCount MaxVF = MaybeMaxVF.getValue(); + assert(MaxVF.isNonZero() && "MaxVF is zero."); + + bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); + if (!UserVF.isZero() && + (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { + // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable + // VFs here, this should be reverted to only use legal UserVFs once the + // loop below supports scalable VFs. + ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; + LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") + << " VF " << VF << ".\n"); + assert(isPowerOf2_32(VF.getKnownMinValue()) && + "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - CM.selectUserVectorizationFactor(UserVF); - buildVPlansWithVPRecipes(UserVF, UserVF); + CM.selectUserVectorizationFactor(VF); + CM.collectInLoopReductions(); + buildVPlansWithVPRecipes(VF, VF); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0}}; + return {{VF, 0}}; } - unsigned MaxVF = MaybeMaxVF.getValue(); - assert(MaxVF != 0 && "MaxVF is zero."); + assert(!MaxVF.isScalable() && + "Scalable vectors not yet supported beyond this point"); - for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { + for (ElementCount VF = ElementCount::getFixed(1); + ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { // Collect Uniform and Scalar instructions after vectorization with VF. CM.collectUniformsAndScalars(VF); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - if (VF > 1) + if (VF.isVector()) CM.collectInstsToScalarize(VF); } - buildVPlansWithVPRecipes(1, MaxVF); + CM.collectInLoopReductions(); + + buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); LLVM_DEBUG(printPlans(dbgs())); - if (MaxVF == 1) + if (MaxVF.isScalar()) return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. return CM.selectVectorizationFactor(MaxVF); } -void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { +void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); BestVF = VF; @@ -6639,13 +7716,23 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
VPCallbackILV CallbackILV(ILV); - VPTransformState State{BestVF, BestUF, LI, - DT, ILV.Builder, ILV.VectorLoopValueMap, - &ILV, CallbackILV}; + assert(BestVF.hasValue() && "Vectorization Factor is missing"); + + VPTransformState State{*BestVF, + BestUF, + OrigLoop, + LI, + DT, + ILV.Builder, + ILV.VectorLoopValueMap, + &ILV, + CallbackILV}; State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; + ILV.printDebugTracesAtStart(); + //===------------------------------------------------===// // // Notice: any optimization or new instruction that go @@ -6661,25 +7748,48 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. ILV.fixVectorizedLoop(); + + ILV.printDebugTracesAtEnd(); } void LoopVectorizationPlanner::collectTriviallyDeadInstructions( SmallPtrSetImpl<Instruction *> &DeadInstructions) { - BasicBlock *Latch = OrigLoop->getLoopLatch(); - // We create new control-flow for the vectorized loop, so the original - // condition will be dead after vectorization if it's only used by the - // branch. - auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); - if (Cmp && Cmp->hasOneUse()) - DeadInstructions.insert(Cmp); + // We create new control-flow for the vectorized loop, so the original exit + // conditions will be dead after vectorization if it's only used by the + // terminator + SmallVector<BasicBlock*> ExitingBlocks; + OrigLoop->getExitingBlocks(ExitingBlocks); + for (auto *BB : ExitingBlocks) { + auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); + if (!Cmp || !Cmp->hasOneUse()) + continue; + + // TODO: we should introduce a getUniqueExitingBlocks on Loop + if (!DeadInstructions.insert(Cmp).second) + continue; + + // The operands of the icmp is often a dead trunc, used by IndUpdate. + // TODO: can recurse through operands in general + for (Value *Op : Cmp->operands()) { + if (isa<TruncInst>(Op) && Op->hasOneUse()) + DeadInstructions.insert(cast<Instruction>(Op)); + } + } // We create new "steps" for induction variable updates to which the original // induction variables map. An original update instruction will be dead if // all its users except the induction variable are dead. + auto *Latch = OrigLoop->getLoopLatch(); for (auto &Induction : Legal->getInductionVars()) { PHINode *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); + + // If the tail is to be folded by masking, the primary induction variable, + // if exists, isn't dead: it will be used for masking. Don't kill it. + if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) + continue; + if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { return U == Ind || DeadInstructions.count(cast<Instruction>(U)); })) @@ -6754,12 +7864,284 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } } +//===--------------------------------------------------------------------===// +// EpilogueVectorizerMainLoop +//===--------------------------------------------------------------------===// + +/// This function is partially responsible for generating the control flow +/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
+BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { + MDNode *OrigLoopID = OrigLoop->getLoopID(); + Loop *Lp = createVectorLoopSkeleton(""); + + // Generate the code to check the minimum iteration count of the vector + // epilogue (see below). + EPI.EpilogueIterationCountCheck = + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); + EPI.EpilogueIterationCountCheck->setName("iter.check"); + + // Generate the code to check any assumptions that we've made for SCEV + // expressions. + BasicBlock *SavedPreHeader = LoopVectorPreHeader; + emitSCEVChecks(Lp, LoopScalarPreHeader); + + // If a safety check was generated save it. + if (SavedPreHeader != LoopVectorPreHeader) + EPI.SCEVSafetyCheck = SavedPreHeader; + + // Generate the code that checks at runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + SavedPreHeader = LoopVectorPreHeader; + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); + + // If a safety check was generated save/overwite it. + if (SavedPreHeader != LoopVectorPreHeader) + EPI.MemSafetyCheck = SavedPreHeader; + + // Generate the iteration count check for the main loop, *after* the check + // for the epilogue loop, so that the path-length is shorter for the case + // that goes directly through the vector epilogue. The longer-path length for + // the main loop is compensated for, by the gain from vectorizing the larger + // trip count. Note: the branch will get updated later on when we vectorize + // the epilogue. + EPI.MainLoopIterationCountCheck = + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); + + // Generate the induction variable. + OldInduction = Legal->getPrimaryInduction(); + Type *IdxTy = Legal->getWidestInductionType(); + Value *StartIdx = ConstantInt::get(IdxTy, 0); + Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + EPI.VectorTripCount = CountRoundDown; + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + + // Skip induction resume value creation here because they will be created in + // the second pass. If we created them here, they wouldn't be used anyway, + // because the vplan in the second pass still contains the inductions from the + // original loop. + + return completeLoopSkeleton(Lp, OrigLoopID); +} + +void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { + LLVM_DEBUG({ + dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" + << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() + << ", Main Loop UF:" << EPI.MainLoopUF + << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() + << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + }); +} + +void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { + DEBUG_WITH_TYPE(VerboseDebug, { + dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; + }); +} + +BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( + Loop *L, BasicBlock *Bypass, bool ForEpilogue) { + assert(L && "Expected valid Loop."); + assert(Bypass && "Expected valid bypass basic block."); + unsigned VFactor = + ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); + unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; + Value *Count = getOrCreateTripCount(L); + // Reuse existing vector loop preheader for TC checks. + // Note that new preheader block is generated for vector loop. 
+ BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + IRBuilder<> Builder(TCCheckBlock->getTerminator()); + + // Generate code to check if the loop's trip count is less than VF * UF of the + // main vector loop. + auto P = + Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + + Value *CheckMinIters = Builder.CreateICmp( + P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), + "min.iters.check"); + + if (!ForEpilogue) + TCCheckBlock->setName("vector.main.loop.iter.check"); + + // Create new preheader for vector loop. + LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), + DT, LI, nullptr, "vector.ph"); + + if (ForEpilogue) { + assert(DT->properlyDominates(DT->getNode(TCCheckBlock), + DT->getNode(Bypass)->getIDom()) && + "TC check is expected to dominate Bypass"); + + // Update dominator for Bypass & LoopExit. + DT->changeImmediateDominator(Bypass, TCCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); + + LoopBypassBlocks.push_back(TCCheckBlock); + + // Save the trip count so we don't have to regenerate it in the + // vec.epilog.iter.check. This is safe to do because the trip count + // generated here dominates the vector epilog iter check. + EPI.TripCount = Count; + } + + ReplaceInstWithInst( + TCCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + + return TCCheckBlock; +} + +//===--------------------------------------------------------------------===// +// EpilogueVectorizerEpilogueLoop +//===--------------------------------------------------------------------===// + +/// This function is partially responsible for generating the control flow +/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. +BasicBlock * +EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { + MDNode *OrigLoopID = OrigLoop->getLoopID(); + Loop *Lp = createVectorLoopSkeleton("vec.epilog."); + + // Now, compare the remaining count and if there aren't enough iterations to + // execute the vectorized epilogue skip to the scalar part. + BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; + VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); + LoopVectorPreHeader = + SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, + LI, nullptr, "vec.epilog.ph"); + emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, + VecEpilogueIterationCountCheck); + + // Adjust the control flow taking the state info from the main loop + // vectorization into account. 
+ assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && + "expected this to be saved from the previous pass."); + EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopVectorPreHeader); + + DT->changeImmediateDominator(LoopVectorPreHeader, + EPI.MainLoopIterationCountCheck); + + EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + + if (EPI.SCEVSafetyCheck) + EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + if (EPI.MemSafetyCheck) + EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + + DT->changeImmediateDominator( + VecEpilogueIterationCountCheck, + VecEpilogueIterationCountCheck->getSinglePredecessor()); + + DT->changeImmediateDominator(LoopScalarPreHeader, + EPI.EpilogueIterationCountCheck); + DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); + + // Keep track of bypass blocks, as they feed start values to the induction + // phis in the scalar loop preheader. + if (EPI.SCEVSafetyCheck) + LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); + if (EPI.MemSafetyCheck) + LoopBypassBlocks.push_back(EPI.MemSafetyCheck); + LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); + + // Generate a resume induction for the vector epilogue and put it in the + // vector epilogue preheader + Type *IdxTy = Legal->getWidestInductionType(); + PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", + LoopVectorPreHeader->getFirstNonPHI()); + EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); + EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), + EPI.MainLoopIterationCountCheck); + + // Generate the induction variable. + OldInduction = Legal->getPrimaryInduction(); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); + Value *StartIdx = EPResumeVal; + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + + // Generate induction resume values. These variables save the new starting + // indexes for the scalar loop. They are used to test if there are any tail + // iterations left once the vector loop has completed. + // Note that when the vectorized epilogue is skipped due to iteration count + // check, then the resume value for the induction variable comes from + // the trip count of the main vector loop, hence passing the AdditionalBypass + // argument. 
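In plain arithmetic (an illustrative sketch, not the emitted IR), each of the iteration-count guards created for epilogue vectorization, including the one emitted by emitMinimumVectorEpilogueIterCountCheck below, compares an available iteration count against VF * UF of the vector loop it protects; the ICMP_ULT form is shown here, and the code switches to ICMP_ULE when a scalar epilogue is required.

// Illustrative only: skip the guarded vector loop when too few iterations
// remain to execute even one of its VF * UF steps.
#include <cstdint>
bool skipVectorLoop(uint64_t IterationsAvailable, unsigned VF, unsigned UF) {
  return IterationsAvailable < (uint64_t)VF * UF;   // min.iters.check form
}
// For the epilogue guard, the available iterations are what the main vector
// loop left behind: TripCount - VectorTripCount ("n.vec.remaining").
uint64_t remainingAfterMainLoop(uint64_t TripCount, uint64_t VectorTripCount) {
  return TripCount - VectorTripCount;
}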
+ createInductionResumeValues(Lp, CountRoundDown, + {VecEpilogueIterationCountCheck, + EPI.VectorTripCount} /* AdditionalBypass */); + + AddRuntimeUnrollDisableMetaData(Lp); + return completeLoopSkeleton(Lp, OrigLoopID); +} + +BasicBlock * +EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( + Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { + + assert(EPI.TripCount && + "Expected trip count to have been safed in the first pass."); + assert( + (!isa<Instruction>(EPI.TripCount) || + DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && + "saved trip count does not dominate insertion point."); + Value *TC = EPI.TripCount; + IRBuilder<> Builder(Insert->getTerminator()); + Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); + + // Generate code to check if the loop's trip count is less than VF * UF of the + // vector epilogue loop. + auto P = + Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + + Value *CheckMinIters = Builder.CreateICmp( + P, Count, + ConstantInt::get(Count->getType(), + EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), + "min.epilog.iters.check"); + + ReplaceInstWithInst( + Insert->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + + LoopBypassBlocks.push_back(Insert); + return Insert; +} + +void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { + LLVM_DEBUG({ + dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" + << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() + << ", Main Loop UF:" << EPI.MainLoopUF + << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() + << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + }); +} + +void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { + DEBUG_WITH_TYPE(VerboseDebug, { + dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; + }); +} + bool LoopVectorizationPlanner::getDecisionAndClampRange( - const std::function<bool(unsigned)> &Predicate, VFRange &Range) { - assert(Range.End > Range.Start && "Trying to test an empty VF range."); + const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { + assert(!Range.isEmpty() && "Trying to test an empty VF range."); bool PredicateAtRangeStart = Predicate(Range.Start); - for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) + for (ElementCount TmpVF = Range.Start * 2; + ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) if (Predicate(TmpVF) != PredicateAtRangeStart) { Range.End = TmpVF; break; @@ -6773,9 +8155,11 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange( /// of VF's starting at a given VF and extending it as much as possible. Each /// vectorization decision can potentially shorten this sub-range during /// buildVPlan(). 
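A self-contained restatement of the range clamping used above (fixed-width VFs only; the struct and function names here are illustrative, not the LLVM API): walk the powers of two in [Start, End) and cut the range at the first VF whose decision differs from the decision at Start.

// Illustrative only: mirrors getDecisionAndClampRange for plain unsigned VFs.
#include <functional>
struct VFRangeSketch { unsigned Start, End; };   // [Start, End), powers of two
bool decideAndClamp(const std::function<bool(unsigned)> &Predicate,
                    VFRangeSketch &Range) {
  bool AtStart = Predicate(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != AtStart) {
      Range.End = VF;   // all VFs below this point agree with Start's decision
      break;
    }
  return AtStart;
}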
-void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { - for (unsigned VF = MinVF; VF < MaxVF + 1;) { - VFRange SubRange = {VF, MaxVF + 1}; +void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, + ElementCount MaxVF) { + auto MaxVFPlusOne = MaxVF.getWithIncrement(1); + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { + VFRange SubRange = {VF, MaxVFPlusOne}; VPlans.push_back(buildVPlan(SubRange)); VF = SubRange.End; } @@ -6800,7 +8184,13 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) return EdgeMaskCache[Edge] = SrcMask; - VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); + // If source is an exiting block, we know the exit edge is dynamically dead + // in the vector loop, and thus we don't need to restrict the mask. Avoid + // adding uses of an otherwise potentially dead instruction. + if (OrigLoop->isLoopExiting(Src)) + return EdgeMaskCache[Edge] = SrcMask; + + VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); assert(EdgeMask && "No Edge Mask found for condition"); if (BI->getSuccessor(0) != Dst) @@ -6828,23 +8218,34 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { if (!CM.blockNeedsPredication(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. + // Create the block in mask as the first non-phi instruction in the block. + VPBuilder::InsertPointGuard Guard(Builder); + auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); + Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); + // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. // Start by constructing the desired canonical IV. VPValue *IV = nullptr; if (Legal->getPrimaryInduction()) - IV = Plan->getVPValue(Legal->getPrimaryInduction()); + IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); else { auto IVRecipe = new VPWidenCanonicalIVRecipe(); - Builder.getInsertBlock()->appendRecipe(IVRecipe); + Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); IV = IVRecipe->getVPValue(); } VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); bool TailFolded = !CM.isScalarEpilogueAllowed(); - if (TailFolded && CM.TTI.emitGetActiveLaneMask()) - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC}); - else + + if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { + // While ActiveLaneMask is a binary op that consumes the loop tripcount + // as a second argument, we only pass the IV here and extract the + // tripcount from the transform state where codegen of the VP instructions + // happen. 
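Per lane, the header block mask built just below reduces to the comparison sketched here (a scalar model, illustrative only); IV <= BTC is used rather than IV < TC because TC = BTC + 1 may wrap.

// Illustrative only: lane L of the iteration whose canonical IV is 'IV' is
// active iff IV + L <= BackedgeTakenCount, which is what ICmpULE (or the
// active-lane-mask form) evaluates element-wise.
#include <cstdint>
bool laneIsActive(uint64_t IV, unsigned Lane, uint64_t BackedgeTakenCount) {
  return IV + Lane <= BackedgeTakenCount;
}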
+ BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); + } else { BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); + } return BlockMaskCache[BB] = BlockMask; } @@ -6865,14 +8266,13 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { return BlockMaskCache[BB] = BlockMask; } -VPWidenMemoryInstructionRecipe * -VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan) { +VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan) { assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Must be called with either a load or store"); - auto willWiden = [&](unsigned VF) -> bool { - if (VF == 1) + auto willWiden = [&](ElementCount VF) -> bool { + if (VF.isScalar()) return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); @@ -6903,20 +8303,22 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, } VPWidenIntOrFpInductionRecipe * -VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { +VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. InductionDescriptor II = Legal->getInductionVars().lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) - return new VPWidenIntOrFpInductionRecipe(Phi); + II.getKind() == InductionDescriptor::IK_FpInduction) { + VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); + return new VPWidenIntOrFpInductionRecipe(Phi, Start); + } return nullptr; } VPWidenIntOrFpInductionRecipe * -VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, - VFRange &Range) const { +VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, + VPlan &Plan) const { // Optimize the special case where the source is a constant integer // induction variable. Notice that we can only optimize the 'trunc' case // because (a) FP conversions lose precision, (b) sext/zext may wrap, and @@ -6925,15 +8327,21 @@ VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, // Determine whether \p K is a truncation based on an induction variable that // can be optimized. 
auto isOptimizableIVTruncate = - [&](Instruction *K) -> std::function<bool(unsigned)> { - return - [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; + [&](Instruction *K) -> std::function<bool(ElementCount)> { + return [=](ElementCount VF) -> bool { + return CM.isOptimizableIVTruncate(K, VF); + }; }; if (LoopVectorizationPlanner::getDecisionAndClampRange( - isOptimizableIVTruncate(I), Range)) + isOptimizableIVTruncate(I), Range)) { + + InductionDescriptor II = + Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); + VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), - I); + Start, I); + } return nullptr; } @@ -6962,7 +8370,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, VPlan &Plan) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, + [this, CI](ElementCount VF) { + return CM.isScalarWithPredication(CI, VF); + }, Range); if (IsPredicated) @@ -6970,19 +8380,23 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || - ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) + ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || + ID == Intrinsic::pseudoprobe || + ID == Intrinsic::experimental_noalias_scope_decl)) return nullptr; - auto willWiden = [&](unsigned VF) -> bool { + auto willWiden = [&](ElementCount VF) -> bool { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // The following case may be scalarized depending on the VF. // The flag shows whether we use Intrinsic or a usual Call for vectorized // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? bool NeedToScalarize = false; - unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); - bool UseVectorIntrinsic = - ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; + InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; + bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; + assert(IntrinsicCost.isValid() && CallCost.isValid() && + "Cannot have invalid costs while widening"); return UseVectorIntrinsic || !NeedToScalarize; }; @@ -6997,7 +8411,7 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { !isa<StoreInst>(I) && "Instruction should have been handled earlier"); // Instruction should be widened, unless it is scalar after vectorization, // scalarization is profitable or it is predicated. 
- auto WillScalarize = [this, I](unsigned VF) -> bool { + auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I, VF); @@ -7060,15 +8474,17 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, VPlanPtr &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, + [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); + [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, + Range); auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), IsUniform, IsPredicated); setRecipe(I, Recipe); + Plan->addVPValue(I, Recipe); // Find if I uses a predicated instruction. If so, it will use its scalar // value. Avoid hoisting the insert-element which packs the scalar value into @@ -7110,8 +8526,9 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, assert(Instr->getParent() && "Predicated instruction not in any basic block"); auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); - auto *PHIRecipe = - Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); + auto *PHIRecipe = Instr->getType()->isVoidTy() + ? nullptr + : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); @@ -7139,13 +8556,21 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (auto Phi = dyn_cast<PHINode>(Instr)) { if (Phi->getParent() != OrigLoop->getHeader()) return tryToBlend(Phi, Plan); - if ((Recipe = tryToOptimizeInductionPHI(Phi))) + if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan))) return Recipe; + + if (Legal->isReductionVariable(Phi)) { + RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + VPValue *StartV = + Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue()); + return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV); + } + return new VPWidenPHIRecipe(Phi); } - if (isa<TruncInst>(Instr) && - (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) + if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( + cast<TruncInst>(Instr), Range, *Plan))) return Recipe; if (!shouldWiden(Instr, Range)) @@ -7165,35 +8590,9 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, return tryToWiden(Instr, *Plan); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, - unsigned MaxVF) { - assert(OrigLoop->empty() && "Inner loop expected."); - - // Collect conditions feeding internal conditional branches; they need to be - // represented in VPlan for it to model masking. 
- SmallPtrSet<Value *, 1> NeedDef; - - auto *Latch = OrigLoop->getLoopLatch(); - for (BasicBlock *BB : OrigLoop->blocks()) { - if (BB == Latch) - continue; - BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); - if (Branch && Branch->isConditional()) - NeedDef.insert(Branch->getCondition()); - } - - // If the tail is to be folded by masking, the primary induction variable, if - // exists needs to be represented in VPlan for it to model early-exit masking. - // Also, both the Phi and the live-out instruction of each reduction are - // required in order to introduce a select between them in VPlan. - if (CM.foldTailByMasking()) { - if (Legal->getPrimaryInduction()) - NeedDef.insert(Legal->getPrimaryInduction()); - for (auto &Reduction : Legal->getReductionVars()) { - NeedDef.insert(Reduction.first); - NeedDef.insert(Reduction.second.getLoopExitInstr()); - } - } +void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, + ElementCount MaxVF) { + assert(OrigLoop->isInnermost() && "Inner loop expected."); // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. For @@ -7216,17 +8615,17 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, for (Instruction *I : DeadInstructions) SinkAfter.erase(I); - for (unsigned VF = MinVF; VF < MaxVF + 1;) { - VFRange SubRange = {VF, MaxVF + 1}; - VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, - DeadInstructions, SinkAfter)); + auto MaxVFPlusOne = MaxVF.getWithIncrement(1); + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { + VFRange SubRange = {VF, MaxVFPlusOne}; + VPlans.push_back( + buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); VF = SubRange.End; } } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, - SmallPtrSetImpl<Instruction *> &DeadInstructions, + VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, const DenseMap<Instruction *, Instruction *> &SinkAfter) { // Hold a mapping from predicated instructions to their recipes, in order to @@ -7249,14 +8648,28 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RecipeBuilder.recordRecipeOf(Entry.first); RecipeBuilder.recordRecipeOf(Entry.second); } + for (auto &Reduction : CM.getInLoopReductionChains()) { + PHINode *Phi = Reduction.first; + RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); + const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; + + RecipeBuilder.recordRecipeOf(Phi); + for (auto &R : ReductionOperations) { + RecipeBuilder.recordRecipeOf(R); + // For min/max reducitons, where we have a pair of icmp/select, we also + // need to record the ICmp recipe, so it can be removed later. + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) + RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); + } + } // For each interleave group which is relevant for this (possibly trimmed) // Range, add it to the set of groups to be later applied to the VPlan and add // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. 
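The loop that follows walks the interleave groups found by interleaved-access analysis. As a reminder of what such a group looks like in source form, here is a small example that is not taken from the patch; the member recipes recorded above become placeholders and are later replaced by one VPInterleaveRecipe covering the whole group.

// Two strided loads from one packed buffer form a single interleave group:
// the vectorizer can emit one wide load of {re0, im0, re1, im1, ...} plus
// shuffles instead of two separate gathers.
float sum_magnitudes(const float *Packed, int N) {
  float Sum = 0.0f;
  for (int I = 0; I < N; ++I) {
    float Re = Packed[2 * I];     // group member at offset 0, stride 2
    float Im = Packed[2 * I + 1]; // group member at offset 1, stride 2
    Sum += Re * Re + Im * Im;
  }
  return Sum;
}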
for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](unsigned VF) -> bool { - return (VF >= 2 && // Query is illegal for VF == 1 + auto applyIG = [IG, this](ElementCount VF) -> bool { + return (VF.isVector() && // Query is illegal for VF == 1 CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); }; @@ -7278,10 +8691,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); Plan->setEntry(VPBB); - // Represent values that will have defs inside VPlan. - for (Value *V : NeedDef) - Plan->addVPValue(V); - // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); @@ -7308,6 +8717,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( if (auto Recipe = RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { + for (auto *Def : Recipe->definedValues()) { + auto *UV = Def->getUnderlyingValue(); + Plan->addVPValue(UV, Def); + } + RecipeBuilder.setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); continue; @@ -7343,6 +8757,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( for (auto &Entry : SinkAfter) { VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); + // If the target is in a replication region, make sure to move Sink to the + // block after it, not into the replication region itself. + if (auto *Region = + dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { + if (Region->isReplicator()) { + assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); + VPBasicBlock *NextBlock = + cast<VPBasicBlock>(Region->getSuccessors().front()); + Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); + continue; + } + } Sink->moveAfter(Target); } @@ -7352,33 +8778,52 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( for (auto IG : InterleaveGroups) { auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( RecipeBuilder.getRecipe(IG->getInsertPos())); - (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) - ->insertBefore(Recipe); + SmallVector<VPValue *, 4> StoredValues; + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) + StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); + auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, + Recipe->getMask()); + VPIG->insertBefore(Recipe); + unsigned J = 0; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *Member = IG->getMember(i)) { + if (!Member->getType()->isVoidTy()) { + VPValue *OriginalV = Plan->getVPValue(Member); + Plan->removeVPValueFor(Member); + Plan->addVPValue(Member, VPIG->getVPValue(J)); + OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); + J++; + } RecipeBuilder.getRecipe(Member)->eraseFromParent(); } } + // Adjust the recipes for any inloop reductions. + if (Range.Start.isVector()) + adjustRecipesForInLoopReductions(Plan, RecipeBuilder); + // Finally, if tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the end of the latch. 
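The comment above is implemented by the hunk that follows: when the tail is folded by masking, each reduction gets a select at the latch between its phi and its live-out update, so lanes beyond the trip count contribute nothing. Below is a scalar, per-lane model of that transformation, purely illustrative; the mask corresponds to the header block-in mask.

// Tail-folded sum reduction, emulated lane by lane with VF = 4. Inactive
// lanes take the phi's old value through the select, so the padded final
// iteration cannot change the result.
int folded_sum(const int *A, int N) {
  const int VF = 4;
  int Sum = 0;                                   // reduction phi
  for (int Base = 0; Base < N; Base += VF) {
    for (int Lane = 0; Lane < VF; ++Lane) {
      bool Active = Base + Lane < N;             // lane mask: index <= N - 1
      int Idx = Active ? Base + Lane : Base;     // keep the access in bounds
      int LiveOut = Sum + A[Idx];                // the reduction's live-out
      Sum = Active ? LiveOut : Sum;              // select(Cond, Red, Phi)
    }
  }
  return Sum;
}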
- if (CM.foldTailByMasking()) { + if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { Builder.setInsertPoint(VPBB); auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); for (auto &Reduction : Legal->getReductionVars()) { - VPValue *Phi = Plan->getVPValue(Reduction.first); - VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); + if (CM.isInLoopReduction(Reduction.first)) + continue; + VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); + VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); } } std::string PlanName; raw_string_ostream RSO(PlanName); - unsigned VF = Range.Start; + ElementCount VF = Range.Start; Plan->addVF(VF); RSO << "Initial VPlan for VF={" << VF; - for (VF *= 2; VF < Range.End; VF *= 2) { + for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { Plan->addVF(VF); RSO << "," << VF; } @@ -7394,7 +8839,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in // the vectorization pipeline. - assert(!OrigLoop->empty()); + assert(!OrigLoop->isInnermost()); assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan @@ -7404,7 +8849,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); HCFGBuilder.buildHierarchicalCFG(); - for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) + for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); + VF *= 2) Plan->addVF(VF); if (EnableVPlanPredication) { @@ -7422,6 +8868,67 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { return Plan; } +// Adjust the recipes for any inloop reductions. The chain of instructions +// leading from the loop exit instr to the phi need to be converted to +// reductions, with one operand being vector and the other being the scalar +// reduction chain. +void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( + VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { + for (auto &Reduction : CM.getInLoopReductionChains()) { + PHINode *Phi = Reduction.first; + RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; + + // ReductionOperations are orders top-down from the phi's use to the + // LoopExitValue. We keep a track of the previous item (the Chain) to tell + // which of the two operands will remain scalar and which will be reduced. + // For minmax the chain will be the select instructions. + Instruction *Chain = Phi; + for (Instruction *R : ReductionOperations) { + VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); + RecurKind Kind = RdxDesc.getRecurrenceKind(); + + VPValue *ChainOp = Plan->getVPValue(Chain); + unsigned FirstOpId; + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { + assert(isa<VPWidenSelectRecipe>(WidenRecipe) && + "Expected to replace a VPWidenSelectSC"); + FirstOpId = 1; + } else { + assert(isa<VPWidenRecipe>(WidenRecipe) && + "Expected to replace a VPWidenSC"); + FirstOpId = 0; + } + unsigned VecOpId = + R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; + VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); + + auto *CondOp = CM.foldTailByMasking() + ? 
RecipeBuilder.createBlockInMask(R->getParent(), Plan) + : nullptr; + VPReductionRecipe *RedRecipe = new VPReductionRecipe( + &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); + WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); + Plan->removeVPValueFor(R); + Plan->addVPValue(R, RedRecipe); + WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); + WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); + WidenRecipe->eraseFromParent(); + + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { + VPRecipeBase *CompareRecipe = + RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); + assert(isa<VPWidenRecipe>(CompareRecipe) && + "Expected to replace a VPWidenSC"); + assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && + "Expected no remaining users"); + CompareRecipe->eraseFromParent(); + } + Chain = R; + } + } +} + Value* LoopVectorizationPlanner::VPCallbackILV:: getOrCreateVectorValues(Value *V, unsigned Part) { return ILV.getOrCreateVectorValue(V, Part); @@ -7449,29 +8956,35 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } void VPWidenCallRecipe::execute(VPTransformState &State) { - State.ILV->widenCallInstruction(Ingredient, User, State); + State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, + *this, State); } void VPWidenSelectRecipe::execute(VPTransformState &State) { - State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State); + State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), + this, *this, InvariantCond, State); } void VPWidenRecipe::execute(VPTransformState &State) { - State.ILV->widenInstruction(Ingredient, User, State); + State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); } void VPWidenGEPRecipe::execute(VPTransformState &State) { - State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant, + State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, + *this, State.UF, State.VF, IsPtrLoopInvariant, IsIndexLoopInvariant, State); } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction(IV, Trunc); + State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), + Trunc); } void VPWidenPHIRecipe::execute(VPTransformState &State) { - State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); + Value *StartV = + getStartValue() ? 
getStartValue()->getLiveInIRValue() : nullptr; + State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF); } void VPBlendRecipe::execute(VPTransformState &State) { @@ -7515,22 +9028,59 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); + State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), + getStoredValues(), getMask()); +} + +void VPReductionRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "Reduction being replicated."); + for (unsigned Part = 0; Part < State.UF; ++Part) { + RecurKind Kind = RdxDesc->getRecurrenceKind(); + Value *NewVecOp = State.get(getVecOp(), Part); + if (VPValue *Cond = getCondOp()) { + Value *NewCond = State.get(Cond, Part); + VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); + Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( + Kind, VecTy->getElementType()); + Constant *IdenVec = + ConstantVector::getSplat(VecTy->getElementCount(), Iden); + Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); + NewVecOp = Select; + } + Value *NewRed = + createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); + Value *PrevInChain = State.get(getChainOp(), Part); + Value *NextInChain; + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { + NextInChain = + createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), + NewRed, PrevInChain); + } else { + NextInChain = State.Builder.CreateBinOp( + (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, + PrevInChain); + } + State.set(this, getUnderlyingInstr(), NextInChain, Part); + } } void VPReplicateRecipe::execute(VPTransformState &State) { if (State.Instance) { // Generate a single instance. - State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, - IsPredicated, State); + assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); + State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, + *State.Instance, IsPredicated, State); // Insert scalar instance packing it into a vector. - if (AlsoPack && State.VF > 1) { - // If we're constructing lane 0, initialize to start from undef. + if (AlsoPack && State.VF.isVector()) { + // If we're constructing lane 0, initialize to start from poison. if (State.Instance->Lane == 0) { - Value *Undef = UndefValue::get( - FixedVectorType::get(Ingredient->getType(), State.VF)); - State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); + assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); + Value *Poison = PoisonValue::get( + VectorType::get(getUnderlyingValue()->getType(), State.VF)); + State.ValueMap.setVectorValue(getUnderlyingInstr(), + State.Instance->Part, Poison); } - State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); + State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(), + *State.Instance); } return; } @@ -7538,10 +9088,12 @@ void VPReplicateRecipe::execute(VPTransformState &State) { // Generate scalar instances for all VF lanes of all UF parts, unless the // instruction is uniform inwhich case generate only the first lane for each // of the UF parts. - unsigned EndLane = IsUniform ? 1 : State.VF; + unsigned EndLane = IsUniform ? 
1 : State.VF.getKnownMinValue(); + assert((!State.VF.isScalable() || IsUniform) && + "Can't scalarize a scalable vector"); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane}, + State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane}, IsPredicated, State); } @@ -7573,8 +9125,8 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) { void VPPredInstPHIRecipe::execute(VPTransformState &State) { assert(State.Instance && "Predicated instruction PHI works per instance."); - Instruction *ScalarPredInst = cast<Instruction>( - State.ValueMap.getScalarValue(PredInst, *State.Instance)); + Instruction *ScalarPredInst = + cast<Instruction>(State.get(getOperand(0), *State.Instance)); BasicBlock *PredicatedBB = ScalarPredInst->getParent(); BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); assert(PredicatingBB && "Predicated block has no single predecessor."); @@ -7586,6 +9138,8 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { // also do that packing, thereby "hoisting" the insert-element sequence. // Otherwise, a phi node for the scalar value is needed. unsigned Part = State.Instance->Part; + Instruction *PredInst = + cast<Instruction>(getOperand(0)->getUnderlyingValue()); if (State.ValueMap.hasVectorValue(PredInst, Part)) { Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); @@ -7596,16 +9150,17 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { } else { Type *PredInstType = PredInst->getType(); PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); - Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); + Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); Phi->addIncoming(ScalarPredInst, PredicatedBB); State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); } } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; - State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, - getMask()); + VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; + State.ILV->vectorizeMemoryInstruction(&Ingredient, State, + StoredValue ? nullptr : getVPValue(), + getAddr(), StoredValue, getMask()); } // Determine how to lower the scalar epilogue, which depends on 1) optimising @@ -7617,35 +9172,53 @@ static ScalarEpilogueLowering getScalarEpilogueLowering( BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, LoopVectorizationLegality &LVL) { - bool OptSize = - F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, - PGSOQueryType::IRPass); // 1) OptSize takes precedence over all other options, i.e. if this is set, // don't look at hints or options, and don't request a scalar epilogue. - if (OptSize) + // (For PGSO, as shouldOptimizeForSize isn't currently accessible from + // LoopAccessInfo (due to code dependency and not being able to reliably get + // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection + // of strides in LoopAccessInfo::analyzeLoop() and vectorize without + // versioning when the vectorization is forced, unlike hasOptSize. So revert + // back to the old way and vectorize with versioning when forced. See D81345.) 
+ if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass) && + Hints.getForce() != LoopVectorizeHints::FK_Enabled)) return CM_ScalarEpilogueNotAllowedOptSize; - bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && - !PreferPredicateOverEpilog; + // 2) If set, obey the directives + if (PreferPredicateOverEpilogue.getNumOccurrences()) { + switch (PreferPredicateOverEpilogue) { + case PreferPredicateTy::ScalarEpilogue: + return CM_ScalarEpilogueAllowed; + case PreferPredicateTy::PredicateElseScalarEpilogue: + return CM_ScalarEpilogueNotNeededUsePredicate; + case PreferPredicateTy::PredicateOrDontVectorize: + return CM_ScalarEpilogueNotAllowedUsePredicate; + }; + } - // 2) Next, if disabling predication is requested on the command line, honour - // this and request a scalar epilogue. - if (PredicateOptDisabled) + // 3) If set, obey the hints + switch (Hints.getPredicate()) { + case LoopVectorizeHints::FK_Enabled: + return CM_ScalarEpilogueNotNeededUsePredicate; + case LoopVectorizeHints::FK_Disabled: return CM_ScalarEpilogueAllowed; + }; - // 3) and 4) look if enabling predication is requested on the command line, - // with a loop hint, or if the TTI hook indicates this is profitable, request - // predication . - if (PreferPredicateOverEpilog || - Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || - (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, - LVL.getLAI()) && - Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) + // 4) if the TTI hook indicates this is profitable, request predication. + if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, + LVL.getLAI())) return CM_ScalarEpilogueNotNeededUsePredicate; return CM_ScalarEpilogueAllowed; } +void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, + unsigned Part) { + set(Def, V, Part); + ILV->setVectorValue(IRDef, Part, V); +} + // Process the loop in the VPlan-native vectorization path. This path builds // VPlan upfront in the vectorization pipeline, which allows to apply // VPlan-to-VPlan transformations from the very beginning without modifying the @@ -7657,7 +9230,7 @@ static bool processLoopInVPlanNativePath( OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { - if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) { + if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); return false; } @@ -7676,7 +9249,7 @@ static bool processLoopInVPlanNativePath( LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); // Get user vectorization factor. - const unsigned UserVF = Hints.getWidth(); + ElementCount UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. 
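Condensing the rewritten getScalarEpilogueLowering above: size optimization is checked first, then the explicit prefer-predicate-over-epilogue option, then the loop's predicate hint, and only then the target hook. The following standalone restatement uses made-up enum and parameter names and collapses the two size checks into simple flags, so treat it as a summary rather than the real interface.

enum class Epilogue {
  Allowed,
  NotAllowedOptSize,
  NotNeededUsePredicate,
  NotAllowedUsePredicate
};
enum class PredOption { Unset, ScalarEpilogue, PredicateElseScalarEpilogue,
                        PredicateOrDontVectorize };
enum class PredHint { Unset, Enabled, Disabled };

Epilogue chooseScalarEpilogue(bool HasOptSize, bool ProfileSaysCold,
                              bool ForceVectorize, PredOption Option,
                              PredHint Hint, bool TTIPrefersPredication) {
  // 1) Size optimization wins. The function attribute always does; the
  //    profile-guided size check is skipped when vectorization is forced
  //    (the caveat the comment above attributes to D81345).
  if (HasOptSize || (ProfileSaysCold && !ForceVectorize))
    return Epilogue::NotAllowedOptSize;
  // 2) An explicit command-line option is obeyed next.
  switch (Option) {
  case PredOption::ScalarEpilogue:              return Epilogue::Allowed;
  case PredOption::PredicateElseScalarEpilogue: return Epilogue::NotNeededUsePredicate;
  case PredOption::PredicateOrDontVectorize:    return Epilogue::NotAllowedUsePredicate;
  case PredOption::Unset:                       break;
  }
  // 3) Then the loop hint.
  if (Hint == PredHint::Enabled)
    return Epilogue::NotNeededUsePredicate;
  if (Hint == PredHint::Disabled)
    return Epilogue::Allowed;
  // 4) Finally ask the target whether it prefers predication over an epilogue.
  return TTIPrefersPredication ? Epilogue::NotNeededUsePredicate
                               : Epilogue::Allowed;
}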
const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); @@ -7691,7 +9264,7 @@ static bool processLoopInVPlanNativePath( LVP.setBestPlan(VF.Width, 1); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM); + &CM, BFI, PSI); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(LB, DT); @@ -7710,7 +9283,7 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) !EnableLoopVectorization) {} bool LoopVectorizePass::processLoop(Loop *L) { - assert((EnableVPlanNativePath || L->empty()) && + assert((EnableVPlanNativePath || L->isInnermost()) && "VPlan-native path is not enabled. Only process inner loops."); #ifndef NDEBUG @@ -7755,7 +9328,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check if it is legal to vectorize the loop. LoopVectorizationRequirements Requirements(*ORE); LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, - &Requirements, &Hints, DB, AC); + &Requirements, &Hints, DB, AC, BFI, PSI); if (!LVL.canVectorize(EnableVPlanNativePath)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); Hints.emitRemarkWithHints(); @@ -7772,11 +9345,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { // even evaluating whether vectorization is profitable. Since we cannot modify // the incoming IR, we need to build VPlan upfront in the vectorization // pipeline. - if (!L->empty()) + if (!L->isInnermost()) return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, ORE, BFI, PSI, Hints); - assert(L->empty() && "Inner loop expected."); + assert(L->isInnermost() && "Inner loop expected."); // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. @@ -7841,7 +9414,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); // Get user vectorization factor and interleave count. - unsigned UserVF = Hints.getWidth(); + ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); // Plan how to best vectorize, return the best VF and its cost. @@ -7866,7 +9439,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - if (VF.Width == 1) { + if (VF.Width.isScalar()) { LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); VecDiagMsg = std::make_pair( "VectorizationNotBeneficial", @@ -7955,8 +9528,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. - InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, - &CM); + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, + BFI, PSI); LVP.executePlan(Unroller, DT); ORE->emit([&]() { @@ -7967,16 +9540,51 @@ bool LoopVectorizePass::processLoop(Loop *L) { }); } else { // If we decided that it is *legal* to vectorize the loop, then do it. - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM); - LVP.executePlan(LB, DT); - ++LoopsVectorized; - // Add metadata to disable runtime unrolling a scalar loop when there are - // no runtime checks about strides and memory. A scalar loop that is - // rarely used is not worth unrolling. - if (!LB.areSafetyChecksAdded()) - DisableRuntimeUnroll = true; + // Consider vectorizing the epilogue too if it's profitable. 
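The branch introduced below runs executePlan twice when an epilogue VF is profitable: once for the main loop and once for what used to be the scalar remainder. The shape of the code that results is roughly the following, with illustrative fixed factors of 8 and 4 that are not taken from the patch.

// for (i) C[i] = A[i] + B[i];  vectorized with a vector epilogue.
// The inner fixed-count loops stand in for single wide instructions.
void add_arrays(const int *A, const int *B, int *C, int N) {
  int I = 0;
  // First pass: main vector loop at the selected VF (here 8).
  for (; I + 8 <= N; I += 8)
    for (int L = 0; L < 8; ++L)
      C[I + L] = A[I + L] + B[I + L];
  // Second pass: the epilogue, vectorized again at a smaller VF (here 4)
  // instead of being left as a scalar remainder loop.
  for (; I + 4 <= N; I += 4)
    for (int L = 0; L < 4; ++L)
      C[I + L] = A[I + L] + B[I + L];
  // Scalar tail for the final 0-3 iterations.
  for (; I < N; ++I)
    C[I] = A[I] + B[I];
}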
+ VectorizationFactor EpilogueVF = + CM.selectEpilogueVectorizationFactor(VF.Width, LVP); + if (EpilogueVF.Width.isVector()) { + + // The first pass vectorizes the main loop and creates a scalar epilogue + // to be vectorized by executing the plan (potentially with a different + // factor) again shortly afterwards. + EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, + EpilogueVF.Width.getKnownMinValue(), 1); + EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, + &LVL, &CM, BFI, PSI); + + LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); + LVP.executePlan(MainILV, DT); + ++LoopsVectorized; + + simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); + formLCSSARecursively(*L, *DT, LI, SE); + + // Second pass vectorizes the epilogue and adjusts the control flow + // edges from the first pass. + LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); + EPI.MainLoopVF = EPI.EpilogueVF; + EPI.MainLoopUF = EPI.EpilogueUF; + EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, + ORE, EPI, &LVL, &CM, BFI, PSI); + LVP.executePlan(EpilogILV, DT); + ++LoopsEpilogueVectorized; + + if (!MainILV.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } else { + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, + &LVL, &CM, BFI, PSI); + LVP.executePlan(LB, DT); + ++LoopsVectorized; + + // Add metadata to disable runtime unrolling a scalar loop when there are + // no runtime checks about strides and memory. A scalar loop that is + // rarely used is not worth unrolling. + if (!LB.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } // Report the vectorization decision. ORE->emit([&]() { @@ -8090,7 +9698,8 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, + TLI, TTI, nullptr, MSSA}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }; auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); |
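Taken together, the recurring change in this part of the diff is the move from unsigned vectorization factors to ElementCount, which carries a known-minimum lane count plus a scalable flag; that is why comparisons appear as ElementCount::isKnownLT and why getKnownMinValue shows up only where scalability has been ruled out or does not matter. Below is a simplified stand-in, not LLVM's class, that makes the "unknown order" case explicit.

#include <optional>

// Stand-in for an ElementCount-like factor: KnownMin lanes, or, when
// Scalable is set, (vscale x KnownMin) lanes for a target-dependent vscale.
struct VecFactor {
  unsigned KnownMin;
  bool Scalable;

  bool isScalar() const { return !Scalable && KnownMin == 1; }
  bool isVector() const { return Scalable || KnownMin > 1; }
};

// A fixed and a scalable factor are not always ordered: 8 versus vscale x 2
// depends on the runtime vscale. LLVM's isKnownLT answers false when the
// relation is not provable; this stand-in conservatively treats any
// fixed/scalable mix as unknown and returns nullopt instead.
std::optional<bool> knownLT(VecFactor A, VecFactor B) {
  if (A.Scalable == B.Scalable)
    return A.KnownMin < B.KnownMin;
  return std::nullopt;
}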