author | Dimitry Andric <dim@FreeBSD.org> | 2021-07-29 20:15:26 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2021-07-29 20:15:26 +0000 |
commit | 344a3780b2e33f6ca763666c380202b18aab72a3 (patch) | |
tree | f0b203ee6eb71d7fdd792373e3c81eb18d6934dd /llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | |
parent | b60736ec1405bb0a8dd40989f67ef4c93da068ab (diff) | |
vendor/llvm-project/llvmorg-13-init-16847-g88e66fa60ae5
vendor/llvm-project/llvmorg-12.0.1-rc2-0-ge7dac564cd0e
vendor/llvm-project/llvmorg-12.0.1-0-gfed41342a82f
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3868 |
1 file changed, 2299 insertions, 1569 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ea0d7673edf6..f24ae6b100d5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -69,8 +69,8 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -117,6 +117,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -198,6 +199,11 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); +static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( + "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum allowed number of runtime memory checks with a " + "vectorize(enable) pragma.")); + // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, // that predication is preferred, and this lists all options. I.e., the // vectorizer will try to fold the tail-loop (epilogue) into the vector body @@ -326,6 +332,11 @@ static cl::opt<bool> cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference.")); +cl::opt<bool> EnableStrictReductions( + "enable-strict-reductions", cl::init(false), cl::Hidden, + cl::desc("Enable the vectorisation of loops with in-order (strict) " + "FP reductions")); + static cl::opt<bool> PreferPredicatedReductionSelect( "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc( @@ -361,30 +372,17 @@ cl::opt<bool> llvm::EnableLoopVectorization( "vectorize-loops", cl::init(true), cl::Hidden, cl::desc("Run the Loop vectorization passes")); -/// A helper function that returns the type of loaded or stored value. -static Type *getMemInstValueType(Value *I) { - assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && - "Expected Load or Store instruction"); - if (auto *LI = dyn_cast<LoadInst>(I)) - return LI->getType(); - return cast<StoreInst>(I)->getValueOperand()->getType(); -} +cl::opt<bool> PrintVPlansInDotFormat( + "vplan-print-in-dot-format", cl::init(false), cl::Hidden, + cl::desc("Use dot format instead of plain text when dumping VPlans")); /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an -/// element of the corresponding vector type at the given vectorization factor. -static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { - // Determine if an array of VF elements of type Ty is "bitcast compatible" - // with a <VF x Ty> vector. - if (VF.isVector()) { - auto *VectorTy = VectorType::get(Ty, VF); - return TypeSize::get(VF.getKnownMinValue() * - DL.getTypeAllocSize(Ty).getFixedValue(), - VF.isScalable()) != DL.getTypeStoreSize(VectorTy); - } - - // If the vectorization factor is one, we just check if an array of type Ty - // requires padding between elements. +/// element of the corresponding vector type. +static bool hasIrregularType(Type *Ty, const DataLayout &DL) { + // Determine if an array of N elements of type Ty is "bitcast compatible" + // with a <N x Ty> vector. 
+ // This is only true if there is no padding between the array elements. return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); } @@ -396,19 +394,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { /// we always assume predicated blocks have a 50% chance of executing. static unsigned getReciprocalPredBlockProb() { return 2; } -/// A helper function that adds a 'fast' flag to floating-point operations. -static Value *addFastMathFlag(Value *V) { - if (isa<FPMathOperator>(V)) - cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast()); - return V; -} - -static Value *addFastMathFlag(Value *V, FastMathFlags FMF) { - if (isa<FPMathOperator>(V)) - cast<Instruction>(V)->setFastMathFlags(FMF); - return V; -} - /// A helper function that returns an integer or floating-point constant with /// value C. static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { @@ -439,6 +424,9 @@ static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { return None; } +// Forward declare GeneratedRTChecks. +class GeneratedRTChecks; + namespace llvm { /// InnerLoopVectorizer vectorizes loops which contain only one basic @@ -464,12 +452,11 @@ public: OptimizationRemarkEmitter *ORE, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) + ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), - Builder(PSE.getSE()->getContext()), - VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), - BFI(BFI), PSI(PSI) { + Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), + PSI(PSI), RTChecks(RTChecks) { // Query this against the original loop and save it here because the profile // of the original loop header may change as the transformation happens. OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( @@ -500,7 +487,7 @@ public: bool InvariantCond, VPTransformState &State); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. - void fixVectorizedLoop(); + void fixVectorizedLoop(VPTransformState &State); // Return true if any runtime check is added. bool areSafetyChecksAdded() { return AddedSafetyChecks; } @@ -516,62 +503,31 @@ public: unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); - /// Vectorize a single PHINode in a block. This method handles the induction - /// variable canonicalization. It supports both VF = 1 for unrolled loops and - /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, - Value *StartV, unsigned UF, ElementCount VF); + /// Vectorize a single first-order recurrence or pointer induction PHINode in + /// a block. This method handles the induction variable canonicalization. It + /// supports both VF = 1 for unrolled loops and arbitrary length vectors. + void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, + VPTransformState &State); /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, /// inclusive. Uses the VPValue operands from \p Operands instead of \p /// Instr's operands. 
- void scalarizeInstruction(Instruction *Instr, VPUser &Operands, + void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State); /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, Value *Start, - TruncInst *Trunc = nullptr); - - /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a - /// vector or scalar value on-demand if one is not yet available. When - /// vectorizing a loop, we visit the definition of an instruction before its - /// uses. When visiting the definition, we either vectorize or scalarize the - /// instruction, creating an entry for it in the corresponding map. (In some - /// cases, such as induction variables, we will create both vector and scalar - /// entries.) Then, as we encounter uses of the definition, we derive values - /// for each scalar or vector use unless such a value is already available. - /// For example, if we scalarize a definition and one of its uses is vector, - /// we build the required vector on-demand with an insertelement sequence - /// when visiting the use. Otherwise, if the use is scalar, we can use the - /// existing scalar definition. - /// - /// Return a value in the new loop corresponding to \p V from the original - /// loop at unroll index \p Part. If the value has already been vectorized, - /// the corresponding vector entry in VectorLoopValueMap is returned. If, - /// however, the value has a scalar entry in VectorLoopValueMap, we construct - /// a new vector value on-demand by inserting the scalar values into a vector - /// with an insertelement sequence. If the value has been neither vectorized - /// nor scalarized, it must be loop invariant, so we simply broadcast the - /// value into a vector. - Value *getOrCreateVectorValue(Value *V, unsigned Part); - - void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) { - VectorLoopValueMap.setVectorValue(Scalar, Part, Vector); - } - - /// Return a value in the new loop corresponding to \p V from the original - /// loop at unroll and vector indices \p Instance. If the value has been - /// vectorized but not scalarized, the necessary extractelement instruction - /// will be generated. - Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance); + void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, + VPValue *Def, VPValue *CastDef, + VPTransformState &State); /// Construct the vector value of a scalarized value \p V one lane at a time. - void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); + void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, + VPTransformState &State); /// Try to vectorize interleaved access group \p Group with the base address /// given in \p Addr, optionally masking the vector operations if \p @@ -591,12 +547,24 @@ public: VPValue *Def, VPValue *Addr, VPValue *StoredValue, VPValue *BlockInMask); - /// Set the debug location in the builder using the debug location in - /// the instruction. - void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); + /// Set the debug location in the builder \p Ptr using the debug location in + /// \p V. If \p Ptr is None then it uses the class member's Builder. 
+ void setDebugLocFromInst(const Value *V, + Optional<IRBuilder<> *> CustomBuilder = None); /// Fix the non-induction PHIs in the OrigPHIsToFix vector. - void fixNonInductionPHIs(void); + void fixNonInductionPHIs(VPTransformState &State); + + /// Returns true if the reordering of FP operations is not allowed, but we are + /// able to vectorize with strict in-order reductions for the given RdxDesc. + bool useOrderedReductions(RecurrenceDescriptor &RdxDesc); + + /// Create a broadcast instruction. This method generates a broadcast + /// instruction (shuffle) for loop invariant values and for the induction + /// value. If this is the induction variable then we extend it to N, N+1, ... + /// this is needed because each iteration in the loop corresponds to a SIMD + /// element. + virtual Value *getBroadcastInstrs(Value *V); protected: friend class LoopVectorizationPlanner; @@ -620,25 +588,26 @@ protected: Value *Step, Instruction *DL); /// Handle all cross-iteration phis in the header. - void fixCrossIterationPHIs(); + void fixCrossIterationPHIs(VPTransformState &State); /// Fix a first-order recurrence. This is the second phase of vectorizing /// this phi node. - void fixFirstOrderRecurrence(PHINode *Phi); + void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State); /// Fix a reduction cross-iteration phi. This is the second phase of /// vectorizing this phi node. - void fixReduction(PHINode *Phi); + void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); /// Clear NSW/NUW flags from reduction instructions if necessary. - void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); + void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, + VPTransformState &State); /// Fixup the LCSSA phi nodes in the unique exit block. This simply /// means we need to add the appropriate incoming value from the middle /// block as exiting edges from the scalar epilogue loop (if present) are /// already in place, and we exit the vector loop exclusively to the middle /// block. - void fixLCSSAPHIs(); + void fixLCSSAPHIs(VPTransformState &State); /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. @@ -646,16 +615,10 @@ protected: /// Shrinks vector element sizes to the smallest bitwidth they can be legally /// represented as. - void truncateToMinimalBitwidths(); - - /// Create a broadcast instruction. This method generates a broadcast - /// instruction (shuffle) for loop invariant values and for the induction - /// value. If this is the induction variable then we extend it to N, N+1, ... - /// this is needed because each iteration in the loop corresponds to a SIMD - /// element. - virtual Value *getBroadcastInstrs(Value *V); + void truncateToMinimalBitwidths(VPTransformState &State); - /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) + /// This function adds + /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) /// to each vector element of Val. The sequence starts at StartIndex. /// \p Opcode is relevant for FP induction variable. virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, @@ -668,7 +631,8 @@ protected: /// Note that \p EntryVal doesn't have to be an induction variable - it /// can also be a truncate instruction. 
void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, - const InductionDescriptor &ID); + const InductionDescriptor &ID, VPValue *Def, + VPValue *CastDef, VPTransformState &State); /// Create a vector induction phi node based on an existing scalar one. \p /// EntryVal is the value from the original loop that maps to the vector phi @@ -677,7 +641,9 @@ protected: /// version of the IV truncated to \p EntryVal's type. void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, Value *Step, Value *Start, - Instruction *EntryVal); + Instruction *EntryVal, VPValue *Def, + VPValue *CastDef, + VPTransformState &State); /// Returns true if an instruction \p I should be scalarized instead of /// vectorized for the chosen vectorization factor. @@ -704,11 +670,10 @@ protected: /// latter case \p EntryVal is a TruncInst and we must not record anything for /// that IV, but it's error-prone to expect callers of this routine to care /// about that, hence this explicit parameter. - void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID, - const Instruction *EntryVal, - Value *VectorLoopValue, - unsigned Part, - unsigned Lane = UINT_MAX); + void recordVectorLoopValueForInductionCast( + const InductionDescriptor &ID, const Instruction *EntryVal, + Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, + unsigned Part, unsigned Lane = UINT_MAX); /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -729,11 +694,14 @@ protected: void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); /// Emit a bypass check to see if all of the SCEV assumptions we've - /// had to make are correct. - void emitSCEVChecks(Loop *L, BasicBlock *Bypass); + /// had to make are correct. Returns the block containing the checks or + /// nullptr if no checks have been added. + BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); /// Emit bypass checks to check any memory assumptions we may have made. - void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); + /// Returns the block containing the checks or nullptr if no checks have been + /// added. + BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); /// Compute the transformed value of Index at offset StartValue using step /// StepValue. @@ -848,7 +816,7 @@ protected: /// Middle Block between the vector and the scalar. BasicBlock *LoopMiddleBlock; - /// The (unique) ExitBlock of the scalar loop. Note that + /// The unique ExitBlock of the scalar loop if one exists. Note that /// there can be multiple exiting edges reaching this block. BasicBlock *LoopExitBlock; @@ -867,12 +835,6 @@ protected: /// The induction variable of the old basic block. PHINode *OldInduction = nullptr; - /// Maps values from the original loop to their corresponding values in the - /// vectorized loop. A key value can map to either vector values, scalar - /// values or both kinds of values, depending on whether the key was - /// vectorized and scalarized. - VectorizerValueMap VectorLoopValueMap; - /// Store instructions that were predicated. SmallVector<Instruction *, 4> PredicatedInstructions; @@ -906,6 +868,10 @@ protected: // Whether this loop should be optimized for size based on profile guided size // optimizatios. bool OptForSizeBasedOnProfile; + + /// Structure to hold information about generated runtime checks, responsible + /// for cleaning the checks, if vectorization turns out unprofitable. 
+ GeneratedRTChecks &RTChecks; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -917,10 +883,10 @@ public: OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) + ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), UnrollFactor, LVL, CM, - BFI, PSI) {} + BFI, PSI, Check) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -969,9 +935,11 @@ public: const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Checks) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI), + EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, + Checks), EPI(EPI) {} // Override this function to handle the more complex control flow around the @@ -1005,9 +973,10 @@ public: const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Check) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI) {} + EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; @@ -1027,17 +996,16 @@ protected: // their epilogues. class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { public: - EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, - LoopInfo *LI, DominatorTree *DT, - const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, - EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationLegality *LVL, - llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + EpilogueVectorizerEpilogueLoop( + Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Checks) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI) {} + EPI, LVL, CM, BFI, PSI, Checks) {} /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). 
BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; @@ -1064,8 +1032,8 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { if (I->getDebugLoc() != Empty) return I; - for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { - if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) + for (Use &Op : I->operands()) { + if (Instruction *OpInst = dyn_cast<Instruction>(Op)) if (OpInst->getDebugLoc() != Empty) return OpInst; } @@ -1073,34 +1041,38 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { return I; } -void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { - if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { +void InnerLoopVectorizer::setDebugLocFromInst( + const Value *V, Optional<IRBuilder<> *> CustomBuilder) { + IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder; + if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) { const DILocation *DIL = Inst->getDebugLoc(); + + // When a FSDiscriminator is enabled, we don't need to add the multiply + // factors to the discriminators. if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && - !isa<DbgInfoIntrinsic>(Inst)) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); + !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { + // FIXME: For scalable vectors, assume vscale=1. auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); if (NewDIL) - B.SetCurrentDebugLocation(NewDIL.getValue()); + B->SetCurrentDebugLocation(NewDIL.getValue()); else LLVM_DEBUG(dbgs() << "Failed to create new discriminator: " << DIL->getFilename() << " Line: " << DIL->getLine()); - } - else - B.SetCurrentDebugLocation(DIL); + } else + B->SetCurrentDebugLocation(DIL); } else - B.SetCurrentDebugLocation(DebugLoc()); + B->SetCurrentDebugLocation(DebugLoc()); } -/// Write a record \p DebugMsg about vectorization failure to the debug -/// output stream. If \p I is passed, it is an instruction that prevents -/// vectorization. +/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I +/// is passed, the message relates to that particular instruction. #ifndef NDEBUG -static void debugVectorizationFailure(const StringRef DebugMsg, - Instruction *I) { - dbgs() << "LV: Not vectorizing: " << DebugMsg; +static void debugVectorizationMessage(const StringRef Prefix, + const StringRef DebugMsg, + Instruction *I) { + dbgs() << "LV: " << Prefix << DebugMsg; if (I != nullptr) dbgs() << " " << *I; else @@ -1129,9 +1101,7 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, DL = I->getDebugLoc(); } - OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); - R << "loop not vectorized: "; - return R; + return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); } /// Return a value for Step multiplied by VF. @@ -1145,13 +1115,31 @@ static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { namespace llvm { +/// Return the runtime value for VF. +Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { + Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); + return VF.isScalable() ? 
B.CreateVScale(EC) : EC; +} + void reportVectorizationFailure(const StringRef DebugMsg, - const StringRef OREMsg, const StringRef ORETag, - OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { - LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); + const StringRef OREMsg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, + Instruction *I) { + LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); - ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), - ORETag, TheLoop, I) << OREMsg); + ORE->emit( + createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) + << "loop not vectorized: " << OREMsg); +} + +void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, + Instruction *I) { + LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); + LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); + ORE->emit( + createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) + << Msg); } } // end namespace llvm @@ -1220,6 +1208,16 @@ enum ScalarEpilogueLowering { CM_ScalarEpilogueNotAllowedUsePredicate }; +/// ElementCountComparator creates a total ordering for ElementCount +/// for the purposes of using it in a set structure. +struct ElementCountComparator { + bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { + return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < + std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); + } +}; +using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; + /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. /// In many cases vectorization is not profitable. This can happen because of @@ -1242,27 +1240,32 @@ public: TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} - /// \return An upper bound for the vectorization factor, or None if - /// vectorization and interleaving should be avoided up front. - Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); + /// \return An upper bound for the vectorization factors (both fixed and + /// scalable). If the factors are 0, vectorization and interleaving should be + /// avoided up front. + FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); /// \return True if runtime checks are required for vectorization, and false /// otherwise. bool runtimeChecksRequired(); /// \return The most profitable vectorization factor and the cost of that VF. - /// This method checks every power of two up to MaxVF. If UserVF is not ZERO + /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. - VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); + VectorizationFactor + selectVectorizationFactor(const ElementCountSet &CandidateVFs); + VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, const LoopVectorizationPlanner &LVP); /// Setup cost-based decisions for user vectorization factor. - void selectUserVectorizationFactor(ElementCount UserVF) { + /// \return true if the UserVF is a feasible VF to be chosen. 
+ bool selectUserVectorizationFactor(ElementCount UserVF) { collectUniformsAndScalars(UserVF); collectInstsToScalarize(UserVF); + return expectedCost(UserVF).first.isValid(); } /// \return The size (in bits) of the smallest and widest types in the code @@ -1304,10 +1307,22 @@ public: /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); + /// Collect all element types in the loop for which widening is needed. + void collectElementTypesForWidening(); + /// Split reductions into those that happen in the loop, and those that happen /// outside. In loop reductions are collected into InLoopReductionChains. void collectInLoopReductions(); + /// Returns true if we should use strict in-order reductions for the given + /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, + /// the IsOrdered flag of RdxDesc is set and we do not allow reordering + /// of FP operations. + bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { + return EnableStrictReductions && !Hints->allowReordering() && + RdxDesc.isOrdered(); + } + /// \returns The smallest bitwidth each instruction can be represented with. /// The vector equivalents of these instructions should be truncated to this /// type. @@ -1411,7 +1426,7 @@ public: /// Return the cost model decision for the given instruction \p I and vector /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. - InstWidening getWideningDecision(Instruction *I, ElementCount VF) { + InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { assert(VF.isVector() && "Expected VF to be a vector VF"); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. @@ -1479,30 +1494,18 @@ public: /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) { + bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType, Alignment); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) { + bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType, Alignment); } - /// Returns true if the target machine supports masked scatter operation - /// for the given \p DataType. - bool isLegalMaskedScatter(Type *DataType, Align Alignment) { - return TTI.isLegalMaskedScatter(DataType, Alignment); - } - - /// Returns true if the target machine supports masked gather operation - /// for the given \p DataType. - bool isLegalMaskedGather(Type *DataType, Align Alignment) { - return TTI.isLegalMaskedGather(DataType, Alignment); - } - /// Returns true if the target machine can represent \p V as a masked gather /// or scatter operation. 
bool isLegalGatherOrScatter(Value *V) { @@ -1510,10 +1513,19 @@ public: bool SI = isa<StoreInst>(V); if (!LI && !SI) return false; - auto *Ty = getMemInstValueType(V); + auto *Ty = getLoadStoreType(V); Align Align = getLoadStoreAlignment(V); - return (LI && isLegalMaskedGather(Ty, Align)) || - (SI && isLegalMaskedScatter(Ty, Align)); + return (LI && TTI.isLegalMaskedGather(Ty, Align)) || + (SI && TTI.isLegalMaskedScatter(Ty, Align)); + } + + /// Returns true if the target machine supports all of the reduction + /// variables found for the given VF. + bool canVectorizeReductions(ElementCount VF) const { + return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + return TTI.isLegalToVectorizeReduction(RdxDesc, VF); + })); } /// Returns true if \p I is an instruction that will be scalarized with @@ -1521,8 +1533,7 @@ public: /// instructions that may divide by zero. /// If a non-zero VF has been calculated, we check if I will be scalarized /// predication for that VF. - bool isScalarWithPredication(Instruction *I, - ElementCount VF = ElementCount::getFixed(1)); + bool isScalarWithPredication(Instruction *I) const; // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. @@ -1563,14 +1574,14 @@ public: /// Returns true if we're required to use a scalar epilogue for at least /// the final iteration of the original loop. - bool requiresScalarEpilogue() const { + bool requiresScalarEpilogue(ElementCount VF) const { if (!isScalarEpilogueAllowed()) return false; // If we might exit from anywhere but the latch, must run the exiting // iteration in scalar form. if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) return true; - return InterleaveInfo.requiresScalarEpilogue(); + return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); } /// Returns true if a scalar epilogue is not allowed due to optsize or a @@ -1582,7 +1593,7 @@ public: /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } - bool blockNeedsPredication(BasicBlock *BB) { + bool blockNeedsPredication(BasicBlock *BB) const { return foldTailByMasking() || Legal->blockNeedsPredication(BB); } @@ -1605,7 +1616,7 @@ public: /// Estimate cost of an intrinsic call instruction CI if it were vectorized /// with factor VF. Return the cost of the instruction, including /// scalarization overhead if it's needed. - InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF); + InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; /// Estimate cost of a call instruction CI if it were vectorized with factor /// VF. Return the cost of the instruction, including scalarization overhead @@ -1613,7 +1624,12 @@ public: /// scalarized - /// i.e. either vector version isn't available, or is too expensive. InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, - bool &NeedToScalarize); + bool &NeedToScalarize) const; + + /// Returns true if the per-lane cost of VectorizationFactor A is lower than + /// that of B. + bool isMoreProfitable(const VectorizationFactor &A, + const VectorizationFactor &B) const; /// Invalidates decisions already taken by the cost model. void invalidateCostModelingDecisions() { @@ -1625,26 +1641,48 @@ public: private: unsigned NumPredStores = 0; - /// \return An upper bound for the vectorization factor, a power-of-2 larger - /// than zero. 
One is returned if vectorization should best be avoided due - /// to cost. - ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF); + /// \return An upper bound for the vectorization factors for both + /// fixed and scalable vectorization, where the minimum-known number of + /// elements is a power-of-2 larger than zero. If scalable vectorization is + /// disabled or unsupported, then the scalable part will be equal to + /// ElementCount::getScalable(0). + FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF); + + /// \return the maximized element count based on the targets vector + /// registers and the loop trip-count, but limited to a maximum safe VF. + /// This is a helper function of computeFeasibleMaxVF. + /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure + /// issue that occurred on one of the buildbots which cannot be reproduced + /// without having access to the properietary compiler (see comments on + /// D98509). The issue is currently under investigation and this workaround + /// will be removed as soon as possible. + ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, + unsigned SmallestType, + unsigned WidestType, + const ElementCount &MaxSafeVF); + + /// \return the maximum legal scalable VF, based on the safe max number + /// of elements. + ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually - /// operate on - /// vector values after type legalization in the backend. If this latter value - /// is - /// false, then all operations will be scalarized (i.e. no vectorization has - /// actually taken place). + /// operate on vector values after type legalization in the backend. If this + /// latter value is false, then all operations will be scalarized (i.e. no + /// vectorization has actually taken place). using VectorizationCostTy = std::pair<InstructionCost, bool>; /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by - /// the factor width. - VectorizationCostTy expectedCost(ElementCount VF); + /// the factor width. If \p Invalid is not nullptr, this function + /// will add a pair(Instruction*, ElementCount) to \p Invalid for + /// each instruction that has an Invalid cost for the given VF. + using InstructionVFPair = std::pair<Instruction *, ElementCount>; + VectorizationCostTy + expectedCost(ElementCount VF, + SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. @@ -1657,9 +1695,9 @@ private: /// Return the cost of instructions in an inloop reduction pattern, if I is /// part of that pattern. - InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, - Type *VectorTy, - TTI::TargetCostKind CostKind); + Optional<InstructionCost> + getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, + TTI::TargetCostKind CostKind); /// Calculate vectorization cost of memory instruction \p I. InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); @@ -1685,7 +1723,8 @@ private: /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. 
- InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF); + InstructionCost getScalarizationOverhead(Instruction *I, + ElementCount VF) const; /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. @@ -1803,7 +1842,7 @@ private: /// Returns a range containing only operands needing to be extracted. SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, - ElementCount VF) { + ElementCount VF) const { return SmallVector<Value *, 4>(make_filter_range( Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } @@ -1861,12 +1900,216 @@ public: /// Values to ignore in the cost model when VF > 1. SmallPtrSet<const Value *, 16> VecValuesToIgnore; + /// All element types found in the loop. + SmallPtrSet<Type *, 16> ElementTypesInLoop; + /// Profitable vector factors. SmallVector<VectorizationFactor, 8> ProfitableVFs; }; - } // end namespace llvm +/// Helper struct to manage generating runtime checks for vectorization. +/// +/// The runtime checks are created up-front in temporary blocks to allow better +/// estimating the cost and un-linked from the existing IR. After deciding to +/// vectorize, the checks are moved back. If deciding not to vectorize, the +/// temporary blocks are completely removed. +class GeneratedRTChecks { + /// Basic block which contains the generated SCEV checks, if any. + BasicBlock *SCEVCheckBlock = nullptr; + + /// The value representing the result of the generated SCEV checks. If it is + /// nullptr, either no SCEV checks have been generated or they have been used. + Value *SCEVCheckCond = nullptr; + + /// Basic block which contains the generated memory runtime checks, if any. + BasicBlock *MemCheckBlock = nullptr; + + /// The value representing the result of the generated memory runtime checks. + /// If it is nullptr, either no memory runtime checks have been generated or + /// they have been used. + Instruction *MemRuntimeCheckCond = nullptr; + + DominatorTree *DT; + LoopInfo *LI; + + SCEVExpander SCEVExp; + SCEVExpander MemCheckExp; + +public: + GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, + const DataLayout &DL) + : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), + MemCheckExp(SE, DL, "scev.check") {} + + /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can + /// accurately estimate the cost of the runtime checks. The blocks are + /// un-linked from the IR and is added back during vector code generation. If + /// there is no vector code generation, the check blocks are removed + /// completely. + void Create(Loop *L, const LoopAccessInfo &LAI, + const SCEVUnionPredicate &UnionPred) { + + BasicBlock *LoopHeader = L->getHeader(); + BasicBlock *Preheader = L->getLoopPreheader(); + + // Use SplitBlock to create blocks for SCEV & memory runtime checks to + // ensure the blocks are properly added to LoopInfo & DominatorTree. Those + // may be used by SCEVExpander. The blocks will be un-linked from their + // predecessors and removed from LI & DT at the end of the function. + if (!UnionPred.isAlwaysTrue()) { + SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, + nullptr, "vector.scevcheck"); + + SCEVCheckCond = SCEVExp.expandCodeForPredicate( + &UnionPred, SCEVCheckBlock->getTerminator()); + } + + const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); + if (RtPtrChecking.Need) { + auto *Pred = SCEVCheckBlock ? 
SCEVCheckBlock : Preheader; + MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, + "vector.memcheck"); + + std::tie(std::ignore, MemRuntimeCheckCond) = + addRuntimeChecks(MemCheckBlock->getTerminator(), L, + RtPtrChecking.getChecks(), MemCheckExp); + assert(MemRuntimeCheckCond && + "no RT checks generated although RtPtrChecking " + "claimed checks are required"); + } + + if (!MemCheckBlock && !SCEVCheckBlock) + return; + + // Unhook the temporary block with the checks, update various places + // accordingly. + if (SCEVCheckBlock) + SCEVCheckBlock->replaceAllUsesWith(Preheader); + if (MemCheckBlock) + MemCheckBlock->replaceAllUsesWith(Preheader); + + if (SCEVCheckBlock) { + SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); + new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); + Preheader->getTerminator()->eraseFromParent(); + } + if (MemCheckBlock) { + MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); + new UnreachableInst(Preheader->getContext(), MemCheckBlock); + Preheader->getTerminator()->eraseFromParent(); + } + + DT->changeImmediateDominator(LoopHeader, Preheader); + if (MemCheckBlock) { + DT->eraseNode(MemCheckBlock); + LI->removeBlock(MemCheckBlock); + } + if (SCEVCheckBlock) { + DT->eraseNode(SCEVCheckBlock); + LI->removeBlock(SCEVCheckBlock); + } + } + + /// Remove the created SCEV & memory runtime check blocks & instructions, if + /// unused. + ~GeneratedRTChecks() { + SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); + SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); + if (!SCEVCheckCond) + SCEVCleaner.markResultUsed(); + + if (!MemRuntimeCheckCond) + MemCheckCleaner.markResultUsed(); + + if (MemRuntimeCheckCond) { + auto &SE = *MemCheckExp.getSE(); + // Memory runtime check generation creates compares that use expanded + // values. Remove them before running the SCEVExpanderCleaners. + for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { + if (MemCheckExp.isInsertedInstruction(&I)) + continue; + SE.forgetValue(&I); + SE.eraseValueFromMap(&I); + I.eraseFromParent(); + } + } + MemCheckCleaner.cleanup(); + SCEVCleaner.cleanup(); + + if (SCEVCheckCond) + SCEVCheckBlock->eraseFromParent(); + if (MemRuntimeCheckCond) + MemCheckBlock->eraseFromParent(); + } + + /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and + /// adjusts the branches to branch to the vector preheader or \p Bypass, + /// depending on the generated condition. + BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, + BasicBlock *LoopVectorPreHeader, + BasicBlock *LoopExitBlock) { + if (!SCEVCheckCond) + return nullptr; + if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) + if (C->isZero()) + return nullptr; + + auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); + + BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); + // Create new preheader for vector loop. + if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) + PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); + + SCEVCheckBlock->getTerminator()->eraseFromParent(); + SCEVCheckBlock->moveBefore(LoopVectorPreHeader); + Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, + SCEVCheckBlock); + + DT->addNewBlock(SCEVCheckBlock, Pred); + DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); + + ReplaceInstWithInst( + SCEVCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); + // Mark the check as used, to prevent it from being removed during cleanup. 
+ SCEVCheckCond = nullptr; + return SCEVCheckBlock; + } + + /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts + /// the branches to branch to the vector preheader or \p Bypass, depending on + /// the generated condition. + BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, + BasicBlock *LoopVectorPreHeader) { + // Check if we generated code that checks in runtime if arrays overlap. + if (!MemRuntimeCheckCond) + return nullptr; + + auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); + Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, + MemCheckBlock); + + DT->addNewBlock(MemCheckBlock, Pred); + DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); + MemCheckBlock->moveBefore(LoopVectorPreHeader); + + if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) + PL->addBasicBlockToLoop(MemCheckBlock, *LI); + + ReplaceInstWithInst( + MemCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); + MemCheckBlock->getTerminator()->setDebugLoc( + Pred->getTerminator()->getDebugLoc()); + + // Mark the check as used, to prevent it from being removed during cleanup. + MemRuntimeCheckCond = nullptr; + return MemCheckBlock; + } +}; + // Return true if \p OuterLp is an outer loop annotated with hints for explicit // vectorization. The loop needs to be annotated with #pragma omp simd // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the @@ -2031,7 +2274,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( const InductionDescriptor &II, Value *Step, Value *Start, - Instruction *EntryVal) { + Instruction *EntryVal, VPValue *Def, VPValue *CastDef, + VPTransformState &State) { assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); @@ -2063,16 +2307,20 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // Multiply the vectorization factor by the step using integer or // floating-point arithmetic as appropriate. - Value *ConstVF = - getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); - Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); + Type *StepType = Step->getType(); + if (Step->getType()->isFloatingPointTy()) + StepType = IntegerType::get(StepType->getContext(), + StepType->getScalarSizeInBits()); + Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); + if (Step->getType()->isFloatingPointTy()) + RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); + Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); // Create a vector splat to use in the induction update. // // FIXME: If the step is non-constant, we create the vector splat with // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. - assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *SplatVF = isa<Constant>(Mul) ? 
ConstantVector::getSplat(VF, cast<Constant>(Mul)) : Builder.CreateVectorSplat(VF, Mul); @@ -2085,14 +2333,15 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( VecInd->setDebugLoc(EntryVal->getDebugLoc()); Instruction *LastInduction = VecInd; for (unsigned Part = 0; Part < UF; ++Part) { - VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); + State.set(Def, LastInduction, Part); if (isa<TruncInst>(EntryVal)) addMetadata(LastInduction, EntryVal); - recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); + recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, + State, Part); - LastInduction = cast<Instruction>(addFastMathFlag( - Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); + LastInduction = cast<Instruction>( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); LastInduction->setDebugLoc(EntryVal->getDebugLoc()); } @@ -2125,7 +2374,8 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( const InductionDescriptor &ID, const Instruction *EntryVal, - Value *VectorLoopVal, unsigned Part, unsigned Lane) { + Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, + unsigned Part, unsigned Lane) { assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); @@ -2144,15 +2394,16 @@ void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( // Only the first Cast instruction in the Casts vector is of interest. // The rest of the Casts (if exist) have no uses outside the // induction update chain itself. - Instruction *CastInst = *Casts.begin(); if (Lane < UINT_MAX) - VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); + State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); else - VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); + State.set(CastDef, VectorLoopVal, Part); } void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, - TruncInst *Trunc) { + TruncInst *Trunc, VPValue *Def, + VPValue *CastDef, + VPTransformState &State) { assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); @@ -2214,13 +2465,19 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, Value *EntryPart = getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, ID.getInductionOpcode()); - VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); + State.set(Def, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); - recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); + recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, + State, Part); } }; + // Fast-math-flags propagate from the original induction instruction. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) + Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); + // Now do the actual transformations, and start with creating the step value. Value *Step = CreateStepValue(ID.getStep()); if (VF.isZero() || VF.isScalar()) { @@ -2234,7 +2491,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // least one user in the loop that is not widened. 
auto NeedsScalarIV = needsScalarInduction(EntryVal); if (!NeedsScalarIV) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, + State); return; } @@ -2242,13 +2500,14 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // create the phi node, we will splat the scalar induction variable in each // loop iteration. if (!shouldScalarizeInstruction(EntryVal)) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, + State); Value *ScalarIV = CreateScalarIV(Step); // Create scalar steps that can be used by instructions we will later // scalarize. Note that the addition of the scalar steps will not increase // the number of instructions in the loop in the common case prior to // InstCombine. We will be trading one vector extract for each scalar step. - buildScalarSteps(ScalarIV, Step, EntryVal, ID); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); return; } @@ -2258,14 +2517,14 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, Value *ScalarIV = CreateScalarIV(Step); if (!Cost->isScalarEpilogueAllowed()) CreateSplatIV(ScalarIV, Step); - buildScalarSteps(ScalarIV, Step, EntryVal, ID); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); } Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, Instruction::BinaryOps BinOp) { // Create and check the types. - auto *ValVTy = cast<FixedVectorType>(Val->getType()); - int VLen = ValVTy->getNumElements(); + auto *ValVTy = cast<VectorType>(Val->getType()); + ElementCount VLen = ValVTy->getElementCount(); Type *STy = Val->getType()->getScalarType(); assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && @@ -2274,52 +2533,44 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, SmallVector<Constant *, 8> Indices; - if (STy->isIntegerTy()) { - // Create a vector of consecutive numbers from zero to VF. - for (int i = 0; i < VLen; ++i) - Indices.push_back(ConstantInt::get(STy, StartIdx + i)); + // Create a vector of consecutive numbers from zero to VF. + VectorType *InitVecValVTy = ValVTy; + Type *InitVecValSTy = STy; + if (STy->isFloatingPointTy()) { + InitVecValSTy = + IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); + InitVecValVTy = VectorType::get(InitVecValSTy, VLen); + } + Value *InitVec = Builder.CreateStepVector(InitVecValVTy); + + // Add on StartIdx + Value *StartIdxSplat = Builder.CreateVectorSplat( + VLen, ConstantInt::get(InitVecValSTy, StartIdx)); + InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); - // Add the consecutive indices to the vector value. - Constant *Cv = ConstantVector::get(Indices); - assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); + if (STy->isIntegerTy()) { Step = Builder.CreateVectorSplat(VLen, Step); assert(Step->getType() == Val->getType() && "Invalid step vec"); // FIXME: The newly created binary instructions should contain nsw/nuw flags, // which can be found from the original scalar operations. - Step = Builder.CreateMul(Cv, Step); + Step = Builder.CreateMul(InitVec, Step); return Builder.CreateAdd(Val, Step, "induction"); } // Floating point induction. assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && "Binary Opcode should be specified for FP induction"); - // Create a vector of consecutive numbers from zero to VF. 
- for (int i = 0; i < VLen; ++i) - Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); - - // Add the consecutive indices to the vector value. - Constant *Cv = ConstantVector::get(Indices); - + InitVec = Builder.CreateUIToFP(InitVec, ValVTy); Step = Builder.CreateVectorSplat(VLen, Step); - - // Floating point operations had to be 'fast' to enable the induction. - FastMathFlags Flags; - Flags.setFast(); - - Value *MulOp = Builder.CreateFMul(Cv, Step); - if (isa<Instruction>(MulOp)) - // Have to check, MulOp may be a constant - cast<Instruction>(MulOp)->setFastMathFlags(Flags); - - Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); - if (isa<Instruction>(BOp)) - cast<Instruction>(BOp)->setFastMathFlags(Flags); - return BOp; + Value *MulOp = Builder.CreateFMul(InitVec, Step); + return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); } void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, - const InductionDescriptor &ID) { + const InductionDescriptor &ID, + VPValue *Def, VPValue *CastDef, + VPTransformState &State) { // We shouldn't have to build scalar steps if we aren't vectorizing. assert(VF.isVector() && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. @@ -2342,169 +2593,74 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // Determine the number of scalars we need to generate for each unroll // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. - unsigned Lanes = - Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) - ? 1 - : VF.getKnownMinValue(); - assert((!VF.isScalable() || Lanes == 1) && - "Should never scalarize a scalable vector"); - // Compute the scalar steps and save the results in VectorLoopValueMap. + bool IsUniform = + Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); + unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); + // Compute the scalar steps and save the results in State. 
+ Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), + ScalarIVTy->getScalarSizeInBits()); + Type *VecIVTy = nullptr; + Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; + if (!IsUniform && VF.isScalable()) { + VecIVTy = VectorType::get(ScalarIVTy, VF); + UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); + SplatStep = Builder.CreateVectorSplat(VF, Step); + SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); + } + for (unsigned Part = 0; Part < UF; ++Part) { - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), - ScalarIVTy->getScalarSizeInBits()); - Value *StartIdx = - createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); + Value *StartIdx0 = + createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); + + if (!IsUniform && VF.isScalable()) { + auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); + auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); if (ScalarIVTy->isFloatingPointTy()) - StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); - StartIdx = addFastMathFlag(Builder.CreateBinOp( - AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); + InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); + auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); + auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); + State.set(Def, Add, Part); + recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, + Part); + // It's useful to record the lane values too for the known minimum number + // of elements so we do those below. This improves the code quality when + // trying to extract the first element, for example. + } + + if (ScalarIVTy->isFloatingPointTy()) + StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); + + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + Value *StartIdx = Builder.CreateBinOp( + AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); // The step returned by `createStepForVF` is a runtime-evaluated value // when VF is scalable. Otherwise, it should be folded into a Constant. assert((VF.isScalable() || isa<Constant>(StartIdx)) && "Expected StartIdx to be folded to a constant when VF is not " "scalable"); - auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); - auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); - VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); - recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); - } - } -} - -Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { - assert(V != Induction && "The new induction variable should not be used."); - assert(!V->getType()->isVectorTy() && "Can't widen a vector"); - assert(!V->getType()->isVoidTy() && "Type does not produce a value"); - - // If we have a stride that is replaced by one, do it here. Defer this for - // the VPlan-native path until we start running Legal checks in that path. - if (!EnableVPlanNativePath && Legal->hasStride(V)) - V = ConstantInt::get(V->getType(), 1); - - // If we have a vector mapped to this value, return it. - if (VectorLoopValueMap.hasVectorValue(V, Part)) - return VectorLoopValueMap.getVectorValue(V, Part); - - // If the value has not been vectorized, check if it has been scalarized - // instead. If it has been scalarized, and we actually need the value in - // vector form, we will construct the vector values on demand. 
- if (VectorLoopValueMap.hasAnyScalarValue(V)) { - Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); - - // If we've scalarized a value, that value should be an instruction. - auto *I = cast<Instruction>(V); - - // If we aren't vectorizing, we can just copy the scalar map values over to - // the vector map. - if (VF.isScalar()) { - VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); - return ScalarValue; + auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); + auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); + State.set(Def, Add, VPIteration(Part, Lane)); + recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, + Part, Lane); } - - // Get the last scalar instruction we generated for V and Part. If the value - // is known to be uniform after vectorization, this corresponds to lane zero - // of the Part unroll iteration. Otherwise, the last instruction is the one - // we created for the last vector lane of the Part unroll iteration. - unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) - ? 0 - : VF.getKnownMinValue() - 1; - assert((!VF.isScalable() || LastLane == 0) && - "Scalable vectorization can't lead to any scalarized values."); - auto *LastInst = cast<Instruction>( - VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); - - // Set the insert point after the last scalarized instruction. This ensures - // the insertelement sequence will directly follow the scalar definitions. - auto OldIP = Builder.saveIP(); - auto NewIP = std::next(BasicBlock::iterator(LastInst)); - Builder.SetInsertPoint(&*NewIP); - - // However, if we are vectorizing, we need to construct the vector values. - // If the value is known to be uniform after vectorization, we can just - // broadcast the scalar value corresponding to lane zero for each unroll - // iteration. Otherwise, we construct the vector values using insertelement - // instructions. Since the resulting vectors are stored in - // VectorLoopValueMap, we will only generate the insertelements once. - Value *VectorValue = nullptr; - if (Cost->isUniformAfterVectorization(I, VF)) { - VectorValue = getBroadcastInstrs(ScalarValue); - VectorLoopValueMap.setVectorValue(V, Part, VectorValue); - } else { - // Initialize packing with insertelements to start from poison. - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF)); - VectorLoopValueMap.setVectorValue(V, Part, Poison); - for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) - packScalarIntoVectorValue(V, {Part, Lane}); - VectorValue = VectorLoopValueMap.getVectorValue(V, Part); - } - Builder.restoreIP(OldIP); - return VectorValue; } - - // If this scalar is unknown, assume that it is a constant or that it is - // loop invariant. Broadcast V and save the value for future uses. - Value *B = getBroadcastInstrs(V); - VectorLoopValueMap.setVectorValue(V, Part, B); - return B; } -Value * -InnerLoopVectorizer::getOrCreateScalarValue(Value *V, - const VPIteration &Instance) { - // If the value is not an instruction contained in the loop, it should - // already be scalar. - if (OrigLoop->isLoopInvariant(V)) - return V; - - assert(Instance.Lane > 0 - ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) - : true && "Uniform values only have lane zero"); - - // If the value from the original loop has not been vectorized, it is - // represented by UF x VF scalar values in the new loop. Return the requested - // scalar value. 
- if (VectorLoopValueMap.hasScalarValue(V, Instance)) - return VectorLoopValueMap.getScalarValue(V, Instance); - - // If the value has not been scalarized, get its entry in VectorLoopValueMap - // for the given unroll part. If this entry is not a vector type (i.e., the - // vectorization factor is one), there is no need to generate an - // extractelement instruction. - auto *U = getOrCreateVectorValue(V, Instance.Part); - if (!U->getType()->isVectorTy()) { - assert(VF.isScalar() && "Value not scalarized has non-vector type"); - return U; - } - - // Otherwise, the value from the original loop has been vectorized and is - // represented by UF vector values. Extract and return the requested scalar - // value from the appropriate vector lane. - return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); -} - -void InnerLoopVectorizer::packScalarIntoVectorValue( - Value *V, const VPIteration &Instance) { - assert(V != Induction && "The new induction variable should not be used."); - assert(!V->getType()->isVectorTy() && "Can't pack a vector"); - assert(!V->getType()->isVoidTy() && "Type does not produce a value"); - - Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); - Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); - VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, - Builder.getInt32(Instance.Lane)); - VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); +void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, + const VPIteration &Instance, + VPTransformState &State) { + Value *ScalarInst = State.get(Def, Instance); + Value *VectorValue = State.get(Def, Instance.Part); + VectorValue = Builder.CreateInsertElement( + VectorValue, ScalarInst, + Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); + State.set(Def, VectorValue, Instance.Part); } Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); - assert(!VF.isScalable() && "Cannot reverse scalable vectors"); - SmallVector<int, 8> ShuffleMask; - for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) - ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); - - return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); + return Builder.CreateVectorReverse(Vec, "reverse"); } // Return whether we allow using masked interleave-groups (for dealing with @@ -2554,7 +2710,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( const DataLayout &DL = Instr->getModule()->getDataLayout(); // Prepare for the vector type of the interleaved load/store. - Type *ScalarTy = getMemInstValueType(Instr); + Type *ScalarTy = getLoadStoreType(Instr); unsigned InterleaveFactor = Group->getFactor(); assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); @@ -2573,14 +2729,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // pointer operand of the interleaved access is supposed to be uniform. For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. 
- assert(!VF.isScalable() && - "scalable vector reverse operation is not implemented"); if (Group->isReverse()) Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { - Value *AddrPart = State.get(Addr, {Part, 0}); - setDebugLocFromInst(Builder, AddrPart); + Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); + setDebugLocFromInst(AddrPart); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -2606,12 +2760,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); } - setDebugLocFromInst(Builder, Instr); + setDebugLocFromInst(Instr); Value *PoisonVec = PoisonValue::get(VecTy); Value *MaskForGaps = nullptr; if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); } @@ -2628,7 +2781,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *GroupMask = MaskForGaps; if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); - assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *ShuffledMask = Builder.CreateShuffleVector( BlockInMaskPart, createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), @@ -2639,7 +2791,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( : ShuffledMask; } NewLoad = - Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), + Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), GroupMask, PoisonVec, "wide.masked.vec"); } else @@ -2659,7 +2811,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (!Member) continue; - assert(!VF.isScalable() && "scalable vectors not yet supported."); auto StrideMask = createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); for (unsigned Part = 0; Part < UF; Part++) { @@ -2676,7 +2827,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (Group->isReverse()) StridedVec = reverseVector(StridedVec); - State.set(VPDefs[J], Member, StridedVec, Part); + State.set(VPDefs[J], StridedVec, Part); } ++J; } @@ -2684,7 +2835,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } // The sub vector type for current instruction. - assert(!VF.isScalable() && "VF is assumed to be non scalable."); auto *SubVT = VectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. @@ -2712,7 +2862,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *WideVec = concatenateVectors(Builder, StoredVecs); // Interleave the elements in the wide vector. 
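The interleave-group lowering in this hunk leans on three shuffle-mask shapes: a replicated mask to widen the per-iteration block mask across the group members, a stride mask to extract one member from the wide load, and an interleave mask to build the wide vector for the store. A small standalone sketch of the stride and interleave index layouts for a fixed VF (the function names are illustrative; the formulas mirror the masks requested above):

#include <cstdio>
#include <vector>

// Stride mask: pick member `Start` out of a factor-`Stride` interleaved load.
std::vector<int> strideMask(int Start, int Stride, int VF) {
  std::vector<int> M;
  for (int I = 0; I < VF; ++I)
    M.push_back(Start + I * Stride);
  return M;
}

// Interleave mask: weave `Factor` concatenated member vectors of length VF
// into one wide vector for the store.
std::vector<int> interleaveMask(int VF, int Factor) {
  std::vector<int> M;
  for (int I = 0; I < VF; ++I)
    for (int J = 0; J < Factor; ++J)
      M.push_back(J * VF + I);
  return M;
}

int main() {
  for (int I : strideMask(1, 2, 4)) std::printf("%d ", I);   // 1 3 5 7
  std::printf("\n");
  for (int I : interleaveMask(4, 2)) std::printf("%d ", I);  // 0 4 1 5 2 6 3 7
  std::printf("\n");
  return 0;
}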
- assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *IVec = Builder.CreateShuffleVector( WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), "interleaved.vec"); @@ -2753,7 +2902,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( Decision == LoopVectorizationCostModel::CM_GatherScatter) && "CM decision is not to widen the memory instruction"); - Type *ScalarDataTy = getMemInstValueType(Instr); + Type *ScalarDataTy = getLoadStoreType(Instr); auto *DataTy = VectorType::get(ScalarDataTy, VF); const Align Alignment = getLoadStoreAlignment(Instr); @@ -2785,18 +2934,21 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( bool InBounds = false; if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) InBounds = gep->isInBounds(); - if (Reverse) { - assert(!VF.isScalable() && - "Reversing vectors is not yet supported for scalable vectors."); - // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. - PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( - ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); + // RunTimeVF = VScale * VF.getKnownMinValue() + // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() + Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); + // NumElt = -Part * RunTimeVF + Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); + // LastLane = 1 - RunTimeVF + Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); + PartPtr = + cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); PartPtr->setIsInBounds(InBounds); - PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( - ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); + PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); @@ -2813,7 +2965,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( // Handle Stores: if (SI) { - setDebugLocFromInst(Builder, SI); + setDebugLocFromInst(SI); for (unsigned Part = 0; Part < UF; ++Part) { Instruction *NewSI = nullptr; @@ -2831,7 +2983,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( // We don't want to update the value in the map as it might be used in // another expression. So don't call resetVectorValue(StoredVal). } - auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); + auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); if (isMaskRequired) NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, BlockInMaskParts[Part]); @@ -2845,21 +2997,21 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( // Handle loads. assert(LI && "Must have a load instruction"); - setDebugLocFromInst(Builder, LI); + setDebugLocFromInst(LI); for (unsigned Part = 0; Part < UF; ++Part) { Value *NewLI; if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(Addr, Part); - NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, + NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, nullptr, "wide.masked.gather"); addMetadata(NewLI, LI); } else { - auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); + auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); if (isMaskRequired) NewLI = Builder.CreateMaskedLoad( - VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), - "wide.masked.load"); + DataTy, VecPtr, Alignment, BlockInMaskParts[Part], + PoisonValue::get(DataTy), "wide.masked.load"); else NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); @@ -2870,11 +3022,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( NewLI = reverseVector(NewLI); } - State.set(Def, Instr, NewLI, Part); + State.set(Def, NewLI, Part); } } -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, + VPUser &User, const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State) { @@ -2883,10 +3036,10 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for // the first lane and part. if (isa<NoAliasScopeDeclInst>(Instr)) - if (Instance.Lane != 0 || Instance.Part != 0) + if (!Instance.isFirstIteration()) return; - setDebugLocFromInst(Builder, Instr); + setDebugLocFromInst(Instr); // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); @@ -2895,6 +3048,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); + State.Builder.SetInsertPoint(Builder.GetInsertBlock(), + Builder.GetInsertPoint()); // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { @@ -2902,7 +3057,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, auto InputInstance = Instance; if (!Operand || !OrigLoop->contains(Operand) || (Cost->isUniformAfterVectorization(Operand, State.VF))) - InputInstance.Lane = 0; + InputInstance.Lane = VPLane::getFirstLane(); auto *NewOp = State.get(User.getOperand(op), InputInstance); Cloned->setOperand(op, NewOp); } @@ -2911,15 +3066,11 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, // Place the cloned scalar in the new loop. Builder.Insert(Cloned); - // TODO: Set result for VPValue of VPReciplicateRecipe. This requires - // representing scalar values in VPTransformState. Add the cloned scalar to - // the scalar map entry. - VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); + State.set(Def, Cloned, Instance); // If we just cloned a new assumption, add it the assumption cache. - if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) - if (II->getIntrinsicID() == Intrinsic::assume) - AC->registerAssumption(II); + if (auto *II = dyn_cast<AssumeInst>(Cloned)) + AC->registerAssumption(II); // End if-block. 
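For the reverse consecutive-access case handled a little earlier in vectorizeMemoryInstruction, the per-part pointer is now derived from a runtime vector length instead of a compile-time constant. A standalone sketch of the element offsets the two GEPs produce (RTVF = 4 is an assumed stand-in for vscale times the known minimum VF):

#include <cstdio>

int main() {
  const long RTVF = 4;  // runtime vector length: vscale * known-min VF
  for (long Part = 0; Part < 2; ++Part) {
    // The first GEP steps back by Part * RTVF elements, the second by another
    // RTVF - 1, so the wide load covers the RTVF elements ending at
    // -Part * RTVF; the loaded vector is then reversed to restore the
    // original order.
    long First = -Part * RTVF + (1 - RTVF);
    std::printf("part %ld loads elements [%ld .. %ld]\n", Part, First,
                First + RTVF - 1);
  }
  return 0;
}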
if (IfPredicateInstr) @@ -2936,21 +3087,28 @@ PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, if (!Latch) Latch = Header; - IRBuilder<> Builder(&*Header->getFirstInsertionPt()); + IRBuilder<> B(&*Header->getFirstInsertionPt()); Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); - setDebugLocFromInst(Builder, OldInst); - auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); + setDebugLocFromInst(OldInst, &B); + auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); - Builder.SetInsertPoint(Latch->getTerminator()); - setDebugLocFromInst(Builder, OldInst); + B.SetInsertPoint(Latch->getTerminator()); + setDebugLocFromInst(OldInst, &B); // Create i+1 and fill the PHINode. - Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); + // + // If the tail is not folded, we know that End - Start >= Step (either + // statically or through the minimum iteration checks). We also know that both + // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + + // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned + // overflows and we can mark the induction increment as NUW. + Value *Next = B.CreateAdd(Induction, Step, "index.next", + /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); Induction->addIncoming(Start, L->getLoopPreheader()); Induction->addIncoming(Next, Latch); // Create the compare. - Value *ICmp = Builder.CreateICmpEQ(Next, End); - Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); + Value *ICmp = B.CreateICmpEQ(Next, End); + B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); // Now we have two terminators. Remove the old one from the block. Latch->getTerminator()->eraseFromParent(); @@ -3038,18 +3196,13 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // unroll factor (number of SIMD instructions). Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); - // There are two cases where we need to ensure (at least) the last iteration - // runs in the scalar remainder loop. Thus, if the step evenly divides - // the trip count, we set the remainder to be equal to the step. If the step - // does not evenly divide the trip count, no adjustment is necessary since - // there will already be scalar iterations. Note that the minimum iterations - // check ensures that N >= Step. The cases are: - // 1) If there is a non-reversed interleaved group that may speculatively - // access memory out-of-bounds. - // 2) If any instruction may follow a conditionally taken exit. That is, if - // the loop contains multiple exiting blocks, or a single exiting block - // which is not the latch. - if (VF.isVector() && Cost->requiresScalarEpilogue()) { + // There are cases where we *must* run at least one iteration in the remainder + // loop. See the cost model for when this can happen. If the step evenly + // divides the trip count, we set the remainder to be equal to the step. If + // the step does not evenly divide the trip count, no adjustment is necessary + // since there will already be scalar iterations. Note that the minimum + // iterations check ensures that N >= Step. + if (Cost->requiresScalarEpilogue(VF)) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -3103,8 +3256,8 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, // vector trip count is zero. 
This check also covers the case where adding one // to the backedge-taken count overflowed leading to an incorrect trip count // of zero. In this case we will also jump to the scalar loop. - auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE - : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE + : ICmpInst::ICMP_ULT; // If tail is to be folded, vector loop takes care of all iterations. Value *CheckMinIters = Builder.getFalse(); @@ -3122,9 +3275,13 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, DT->getNode(Bypass)->getIDom()) && "TC check is expected to dominate Bypass"); - // Update dominator for Bypass & LoopExit. + // Update dominator for Bypass & LoopExit (if needed). DT->changeImmediateDominator(Bypass, TCCheckBlock); - DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); + if (!Cost->requiresScalarEpilogue(VF)) + // If there is an epilogue which must run, there's no edge from the + // middle block to exit blocks and thus no need to update the immediate + // dominator of the exit blocks. + DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); ReplaceInstWithInst( TCCheckBlock->getTerminator(), @@ -3132,63 +3289,48 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, LoopBypassBlocks.push_back(TCCheckBlock); } -void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { - // Reuse existing vector loop preheader for SCEV checks. - // Note that new preheader block is generated for vector loop. - BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; - - // Generate the code to check that the SCEV assumptions that we made. - // We want the new basic block to start at the first instruction in a - // sequence of instructions that form a check. - SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), - "scev.check"); - Value *SCEVCheck = Exp.expandCodeForPredicate( - &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); - - if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) - if (C->isZero()) - return; +BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { + + BasicBlock *const SCEVCheckBlock = + RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); + if (!SCEVCheckBlock) + return nullptr; assert(!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile && Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && "Cannot SCEV check stride or overflow when optimizing for size"); - SCEVCheckBlock->setName("vector.scevcheck"); - // Create new preheader for vector loop. - LoopVectorPreHeader = - SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, - nullptr, "vector.ph"); // Update dominator only if this is first RT check. if (LoopBypassBlocks.empty()) { DT->changeImmediateDominator(Bypass, SCEVCheckBlock); - DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); + if (!Cost->requiresScalarEpilogue(VF)) + // If there is an epilogue which must run, there's no edge from the + // middle block to exit blocks and thus no need to update the immediate + // dominator of the exit blocks. 
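The minimum-iteration check above compares the trip count against the number of iterations consumed by one pass through the vector loop (VF * UF), using ICMP_ULE when a scalar epilogue is required and ICMP_ULT otherwise, so that in the former case at least one iteration is guaranteed to remain for the scalar loop. A minimal sketch of that guard decision (the helper name and concrete values are assumptions; Step stands for VF * UF):

#include <cstdio>

// Returns true when the vector loop must be skipped and control should
// branch straight to the scalar loop.
static bool skipVectorLoop(unsigned long TC, unsigned long Step,
                           bool RequiresScalarEpilogue) {
  return RequiresScalarEpilogue ? TC <= Step : TC < Step;
}

int main() {
  std::printf("%d\n", skipVectorLoop(8, 8, false)); // 0: vector loop can run
  std::printf("%d\n", skipVectorLoop(8, 8, true));  // 1: keep a scalar iteration
  return 0;
}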
+ DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); } - ReplaceInstWithInst( - SCEVCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; + return SCEVCheckBlock; } -void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { +BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, + BasicBlock *Bypass) { // VPlan-native path does not do any analysis for runtime checks currently. if (EnableVPlanNativePath) - return; + return nullptr; - // Reuse existing vector loop preheader for runtime memory checks. - // Note that new preheader block is generated for vector loop. - BasicBlock *const MemCheckBlock = L->getLoopPreheader(); + BasicBlock *const MemCheckBlock = + RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); - // Generate the code that checks in runtime if arrays overlap. We put the - // checks into a separate block to make the more common case of few elements - // faster. - auto *LAI = Legal->getLAI(); - const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); - if (!RtPtrChecking.Need) - return; + // Check if we generated code that checks in runtime if arrays overlap. We put + // the checks into a separate block to make the more common case of few + // elements faster. + if (!MemCheckBlock) + return nullptr; if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && @@ -3204,32 +3346,9 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { }); } - MemCheckBlock->setName("vector.memcheck"); - // Create new preheader for vector loop. - LoopVectorPreHeader = - SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, - "vector.ph"); - - auto *CondBranch = cast<BranchInst>( - Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); - ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); LoopBypassBlocks.push_back(MemCheckBlock); - AddedSafetyChecks = true; - // Update dominator only if this is first RT check. - if (LoopBypassBlocks.empty()) { - DT->changeImmediateDominator(Bypass, MemCheckBlock); - DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); - } - - Instruction *FirstCheckInst; - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, - RtPtrChecking.getChecks(), RtPtrChecking.getSE()); - assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " - "claimed checks are required"); - CondBranch->setCondition(MemRuntimeCheck); + AddedSafetyChecks = true; // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. 
@@ -3238,6 +3357,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, PSE.getSE()); LVer->prepareNoAliasMetadata(); + return MemCheckBlock; } Value *InnerLoopVectorizer::emitTransformedIndex( @@ -3247,8 +3367,8 @@ Value *InnerLoopVectorizer::emitTransformedIndex( SCEVExpander Exp(*SE, DL, "induction"); auto Step = ID.getStep(); auto StartValue = ID.getStartValue(); - assert(Index->getType() == Step->getType() && - "Index type does not match StepValue type"); + assert(Index->getType()->getScalarType() == Step->getType() && + "Index scalar type does not match StepValue type"); // Note: the IR at this point is broken. We cannot use SE to create any new // SCEV and then expand it, hoping that SCEV's simplification will give us @@ -3267,14 +3387,20 @@ Value *InnerLoopVectorizer::emitTransformedIndex( return B.CreateAdd(X, Y); }; + // We allow X to be a vector type, in which case Y will potentially be + // splatted into a vector with the same element count. auto CreateMul = [&B](Value *X, Value *Y) { - assert(X->getType() == Y->getType() && "Types don't match!"); + assert(X->getType()->getScalarType() == Y->getType() && + "Types don't match!"); if (auto *CX = dyn_cast<ConstantInt>(X)) if (CX->isOne()) return Y; if (auto *CY = dyn_cast<ConstantInt>(Y)) if (CY->isOne()) return X; + VectorType *XVTy = dyn_cast<VectorType>(X->getType()); + if (XVTy && !isa<VectorType>(Y->getType())) + Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); return B.CreateMul(X, Y); }; @@ -3290,8 +3416,11 @@ Value *InnerLoopVectorizer::emitTransformedIndex( return LoopVectorBody->getTerminator(); return &*B.GetInsertPoint(); }; + switch (ID.getKind()) { case InductionDescriptor::IK_IntInduction: { + assert(!isa<VectorType>(Index->getType()) && + "Vector indices not supported for integer inductions yet"); assert(Index->getType() == StartValue->getType() && "Index type does not match StartValue type"); if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) @@ -3306,9 +3435,12 @@ Value *InnerLoopVectorizer::emitTransformedIndex( return B.CreateGEP( StartValue->getType()->getPointerElementType(), StartValue, CreateMul(Index, - Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); + Exp.expandCodeFor(Step, Index->getType()->getScalarType(), + GetInsertPoint()))); } case InductionDescriptor::IK_FpInduction: { + assert(!isa<VectorType>(Index->getType()) && + "Vector indices not supported for FP inductions yet"); assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); auto InductionBinOp = ID.getInductionBinOp(); assert(InductionBinOp && @@ -3317,22 +3449,9 @@ Value *InnerLoopVectorizer::emitTransformedIndex( "Original bin op should be defined for FP induction"); Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); - - // Floating point operations had to be 'fast' to enable the induction. - FastMathFlags Flags; - Flags.setFast(); - Value *MulExp = B.CreateFMul(StepValue, Index); - if (isa<Instruction>(MulExp)) - // We have to check, the MulExp may be a constant. 
- cast<Instruction>(MulExp)->setFastMathFlags(Flags); - - Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, - "induction"); - if (isa<Instruction>(BOp)) - cast<Instruction>(BOp)->setFastMathFlags(Flags); - - return BOp; + return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, + "induction"); } case InductionDescriptor::IK_NoInduction: return nullptr; @@ -3343,9 +3462,10 @@ Value *InnerLoopVectorizer::emitTransformedIndex( Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopScalarBody = OrigLoop->getHeader(); LoopVectorPreHeader = OrigLoop->getLoopPreheader(); - LoopExitBlock = OrigLoop->getUniqueExitBlock(); - assert(LoopExitBlock && "Must have an exit block"); assert(LoopVectorPreHeader && "Invalid loop structure"); + LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr + assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && + "multiple exit loop without required epilogue?"); LoopMiddleBlock = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, @@ -3354,12 +3474,20 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, nullptr, Twine(Prefix) + "scalar.ph"); - // Set up branch from middle block to the exit and scalar preheader blocks. - // completeLoopSkeleton will update the condition to use an iteration check, - // if required to decide whether to execute the remainder. - BranchInst *BrInst = - BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); + + // Set up the middle block terminator. Two cases: + // 1) If we know that we must execute the scalar epilogue, emit an + // unconditional branch. + // 2) Otherwise, we must have a single unique exit block (due to how we + // implement the multiple exit case). In this case, set up a conditonal + // branch from the middle block to the loop scalar preheader, and the + // exit block. completeLoopSkeleton will update the condition to use an + // iteration check, if required to decide whether to execute the remainder. + BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? + BranchInst::Create(LoopScalarPreHeader) : + BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, + Builder.getTrue()); BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); @@ -3371,7 +3499,11 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { nullptr, nullptr, Twine(Prefix) + "vector.body"); // Update dominator for loop exit. - DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); + if (!Cost->requiresScalarEpilogue(VF)) + // If there is an epilogue which must run, there's no edge from the + // middle block to exit blocks and thus no need to update the immediate + // dominator of the exit blocks. + DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); // Create and register the new vector loop. Loop *Lp = LI->AllocateLoop(); @@ -3419,6 +3551,11 @@ void InnerLoopVectorizer::createInductionResumeValues( EndValue = VectorTripCount; } else { IRBuilder<> B(L->getLoopPreheader()->getTerminator()); + + // Fast-math-flags propagate from the original induction instruction. 
+ if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) + B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); + Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = CastInst::getCastOpcode(VectorTripCount, true, StepType, true); @@ -3468,10 +3605,14 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); // Add a check in the middle block to see if we have completed - // all of the iterations in the first vector loop. - // If (N - N%VF) == N, then we *don't* need to run the remainder. - // If tail is to be folded, we know we don't need to run the remainder. - if (!Cost->foldTailByMasking()) { + // all of the iterations in the first vector loop. Three cases: + // 1) If we require a scalar epilogue, there is no conditional branch as + // we unconditionally branch to the scalar preheader. Do nothing. + // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. + // Thus if tail is to be folded, we know we don't need to run the + // remainder and we can use the previous value for the condition (true). + // 3) Otherwise, construct a runtime check. + if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, VectorTripCount, "cmp.n", LoopMiddleBlock->getTerminator()); @@ -3535,23 +3676,32 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { | [ ]_| <-- vector loop. | | | v - | -[ ] <--- middle-block. - | / | - | / v - -|- >[ ] <--- new preheader. + \ -[ ] <--- middle-block. + \/ | + /\ v + | ->[ ] <--- new preheader. | | - | v + (opt) v <-- edge from middle to exit iff epilogue is not required. | [ ] \ - | [ ]_| <-- old scalar loop to handle remainder. + | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). \ | \ v - >[ ] <-- exit block. + >[ ] <-- exit block(s). ... */ // Get the metadata of the original loop before it gets modified. MDNode *OrigLoopID = OrigLoop->getLoopID(); + // Workaround! Compute the trip count of the original loop and cache it + // before we start modifying the CFG. This code has a systemic problem + // wherein it tries to run analysis over partially constructed IR; this is + // wrong, and not simply for SCEV. The trip count of the original loop + // simply happens to be prone to hitting this in practice. In theory, we + // can hit the same issue for any SCEV, or ValueTracking query done during + // mutation. See PR49900. + getOrCreateTripCount(OrigLoop); + // Create an empty vector loop, and prepare basic blocks for the runtime // checks. Loop *Lp = createVectorLoopSkeleton(""); @@ -3640,6 +3790,11 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, assert(isa<PHINode>(UI) && "Expected LCSSA form"); IRBuilder<> B(MiddleBlock->getTerminator()); + + // Fast-math-flags propagate from the original induction instruction. 
+ if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) + B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); + Value *CountMinusOne = B.CreateSub( CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); Value *CMO = @@ -3722,8 +3877,7 @@ static void cse(BasicBlock *BB) { InstructionCost LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, - bool &NeedToScalarize) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); + bool &NeedToScalarize) const { Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector<Type *, 4> Tys, ScalarTys; @@ -3770,13 +3924,31 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, return Cost; } +static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { + if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) + return Elt; + return VectorType::get(Elt, VF); +} + InstructionCost LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, - ElementCount VF) { + ElementCount VF) const { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); - - IntrinsicCostAttributes CostAttrs(ID, *CI, VF); + Type *RetTy = MaybeVectorizeType(CI->getType(), VF); + FastMathFlags FMF; + if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) + FMF = FPMO->getFastMathFlags(); + + SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); + FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); + SmallVector<Type *> ParamTys; + std::transform(FTy->param_begin(), FTy->param_end(), + std::back_inserter(ParamTys), + [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); + + IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, + dyn_cast<IntrinsicInst>(CI)); return TTI.getIntrinsicInstrCost(CostAttrs, TargetTransformInfo::TCK_RecipThroughput); } @@ -3793,27 +3965,27 @@ static Type *largestIntegerVectorType(Type *T1, Type *T2) { return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; } -void InnerLoopVectorizer::truncateToMinimalBitwidths() { +void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { // For every instruction `I` in MinBWs, truncate the operands, create a // truncated version of `I` and reextend its result. InstCombine runs // later and will remove any ext/trunc pairs. SmallPtrSet<Value *, 4> Erased; for (const auto &KV : Cost->getMinimalBitwidths()) { // If the value wasn't vectorized, we must maintain the original scalar - // type. The absence of the value from VectorLoopValueMap indicates that it + // type. The absence of the value from State indicates that it // wasn't vectorized. 
- if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) + VPValue *Def = State.Plan->getVPValue(KV.first); + if (!State.hasAnyVectorValue(Def)) continue; for (unsigned Part = 0; Part < UF; ++Part) { - Value *I = getOrCreateVectorValue(KV.first, Part); + Value *I = State.get(Def, Part); if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) continue; Type *OriginalTy = I->getType(); Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), KV.second); - auto *TruncatedTy = FixedVectorType::get( - ScalarTruncatedTy, - cast<FixedVectorType>(OriginalTy)->getNumElements()); + auto *TruncatedTy = VectorType::get( + ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); if (TruncatedTy == OriginalTy) continue; @@ -3863,35 +4035,31 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { break; } } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { - auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) - ->getNumElements(); + auto Elements0 = + cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); auto *O0 = B.CreateZExtOrTrunc( - SI->getOperand(0), - FixedVectorType::get(ScalarTruncatedTy, Elements0)); - auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) - ->getNumElements(); + SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); + auto Elements1 = + cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); auto *O1 = B.CreateZExtOrTrunc( - SI->getOperand(1), - FixedVectorType::get(ScalarTruncatedTy, Elements1)); + SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { // Don't do anything with the operands, just extend the result. continue; } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { - auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) - ->getNumElements(); + auto Elements = + cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); auto *O0 = B.CreateZExtOrTrunc( - IE->getOperand(0), - FixedVectorType::get(ScalarTruncatedTy, Elements)); + IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { - auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) - ->getNumElements(); + auto Elements = + cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); auto *O0 = B.CreateZExtOrTrunc( - EE->getOperand(0), - FixedVectorType::get(ScalarTruncatedTy, Elements)); + EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); NewI = B.CreateExtractElement(O0, EE->getOperand(2)); } else { // If we don't know what to do, be conservative and don't do anything. @@ -3904,58 +4072,65 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { I->replaceAllUsesWith(Res); cast<Instruction>(I)->eraseFromParent(); Erased.insert(I); - VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); + State.reset(Def, Res, Part); } } // We'll have created a bunch of ZExts that are now parentless. Clean up. for (const auto &KV : Cost->getMinimalBitwidths()) { // If the value wasn't vectorized, we must maintain the original scalar - // type. The absence of the value from VectorLoopValueMap indicates that it + // type. The absence of the value from State indicates that it // wasn't vectorized. 
- if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) + VPValue *Def = State.Plan->getVPValue(KV.first); + if (!State.hasAnyVectorValue(Def)) continue; for (unsigned Part = 0; Part < UF; ++Part) { - Value *I = getOrCreateVectorValue(KV.first, Part); + Value *I = State.get(Def, Part); ZExtInst *Inst = dyn_cast<ZExtInst>(I); if (Inst && Inst->use_empty()) { Value *NewI = Inst->getOperand(0); Inst->eraseFromParent(); - VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); + State.reset(Def, NewI, Part); } } } } -void InnerLoopVectorizer::fixVectorizedLoop() { +void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. if (VF.isVector()) - truncateToMinimalBitwidths(); + truncateToMinimalBitwidths(State); // Fix widened non-induction PHIs by setting up the PHI operands. if (OrigPHIsToFix.size()) { assert(EnableVPlanNativePath && "Unexpected non-induction PHIs for fixup in non VPlan-native path"); - fixNonInductionPHIs(); + fixNonInductionPHIs(State); } // At this point every instruction in the original loop is widened to a // vector form. Now we need to fix the recurrences in the loop. These PHI // nodes are currently empty because we did not want to introduce cycles. // This is the second stage of vectorizing recurrences. - fixCrossIterationPHIs(); + fixCrossIterationPHIs(State); // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); - // Fix-up external users of the induction variables. - for (auto &Entry : Legal->getInductionVars()) - fixupIVUsers(Entry.first, Entry.second, - getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), - IVEndValues[Entry.first], LoopMiddleBlock); + // If we inserted an edge from the middle block to the unique exit block, + // update uses outside the loop (phis) to account for the newly inserted + // edge. + if (!Cost->requiresScalarEpilogue(VF)) { + // Fix-up external users of the induction variables. + for (auto &Entry : Legal->getInductionVars()) + fixupIVUsers(Entry.first, Entry.second, + getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), + IVEndValues[Entry.first], LoopMiddleBlock); + + fixLCSSAPHIs(State); + } - fixLCSSAPHIs(); for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); @@ -3980,23 +4155,24 @@ void InnerLoopVectorizer::fixVectorizedLoop() { LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); } -void InnerLoopVectorizer::fixCrossIterationPHIs() { +void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { // In order to support recurrences we need to be able to vectorize Phi nodes. // Phi nodes have cycles, so we need to vectorize them in two stages. This is // stage #2: We now need to fix the recurrences by adding incoming edges to // the currently empty PHI nodes. At this point every instruction in the // original loop is widened to a vector form so we can use them to construct // the incoming edges. - for (PHINode &Phi : OrigLoop->getHeader()->phis()) { - // Handle first-order recurrences and reductions that need to be fixed. 
- if (Legal->isFirstOrderRecurrence(&Phi)) - fixFirstOrderRecurrence(&Phi); - else if (Legal->isReductionVariable(&Phi)) - fixReduction(&Phi); + VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) + fixReduction(ReductionPhi, State); + else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) + fixFirstOrderRecurrence(FOR, State); } } -void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { +void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, + VPTransformState &State) { // This is the second phase of vectorizing first-order recurrences. An // overview of the transformation is described below. Suppose we have the // following loop. @@ -4020,7 +4196,7 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // // In this example, s1 is a recurrence because it's value depends on the // previous iteration. In the first phase of vectorization, we created a - // temporary value for s1. We now complete the vectorization and produce the + // vector phi v1 for s1. We now complete the vectorization and produce the // shorthand vector IR shown below (for VF = 4, UF = 1). // // vector.ph: @@ -4046,97 +4222,24 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // After execution completes the vector loop, we extract the next value of // the recurrence (x) to use as the initial value in the scalar loop. - // Get the original loop preheader and single loop latch. - auto *Preheader = OrigLoop->getLoopPreheader(); - auto *Latch = OrigLoop->getLoopLatch(); - - // Get the initial and previous values of the scalar recurrence. - auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); - auto *Previous = Phi->getIncomingValueForBlock(Latch); - - // Create a vector from the initial value. - auto *VectorInit = ScalarInit; - if (VF.isVector()) { - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - VectorInit = Builder.CreateInsertElement( - PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, - Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); - } - - // We constructed a temporary phi node in the first phase of vectorization. - // This phi node will eventually be deleted. - Builder.SetInsertPoint( - cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); - - // Create a phi node for the new recurrence. The current value will either be - // the initial value inserted into a vector or loop-varying vector value. - auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); - VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); - - // Get the vectorized previous value of the last part UF - 1. It appears last - // among all unrolled iterations, due to the order of their construction. - Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); - - // Find and set the insertion point after the previous value if it is an - // instruction. - BasicBlock::iterator InsertPt; - // Note that the previous value may have been constant-folded so it is not - // guaranteed to be an instruction in the vector loop. - // FIXME: Loop invariant values do not form recurrences. We should deal with - // them earlier. 
- if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) - InsertPt = LoopVectorBody->getFirstInsertionPt(); - else { - Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); - if (isa<PHINode>(PreviousLastPart)) - // If the previous value is a phi node, we should insert after all the phi - // nodes in the block containing the PHI to avoid breaking basic block - // verification. Note that the basic block may be different to - // LoopVectorBody, in case we predicate the loop. - InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); - else - InsertPt = ++PreviousInst->getIterator(); - } - Builder.SetInsertPoint(&*InsertPt); - - // We will construct a vector for the recurrence by combining the values for - // the current and previous iterations. This is the required shuffle mask. - assert(!VF.isScalable()); - SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); - ShuffleMask[0] = VF.getKnownMinValue() - 1; - for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) - ShuffleMask[I] = I + VF.getKnownMinValue() - 1; - - // The vector from which to take the initial value for the current iteration - // (actual or unrolled). Initially, this is the vector phi node. - Value *Incoming = VecPhi; - - // Shuffle the current and previous vector and update the vector parts. - for (unsigned Part = 0; Part < UF; ++Part) { - Value *PreviousPart = getOrCreateVectorValue(Previous, Part); - Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); - auto *Shuffle = - VF.isVector() - ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) - : Incoming; - PhiPart->replaceAllUsesWith(Shuffle); - cast<Instruction>(PhiPart)->eraseFromParent(); - VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); - Incoming = PreviousPart; - } + auto *IdxTy = Builder.getInt32Ty(); + auto *VecPhi = cast<PHINode>(State.get(PhiR, 0)); // Fix the latch value of the new recurrence in the vector loop. + VPValue *PreviousDef = PhiR->getBackedgeValue(); + Value *Incoming = State.get(PreviousDef, UF - 1); VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); // Extract the last vector element in the middle block. This will be the // initial value for the recurrence when jumping to the scalar loop. auto *ExtractForScalar = Incoming; if (VF.isVector()) { + auto *One = ConstantInt::get(IdxTy, 1); Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - ExtractForScalar = Builder.CreateExtractElement( - ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), - "vector.recur.extract"); + auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); + auto *LastIdx = Builder.CreateSub(RuntimeVF, One); + ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, + "vector.recur.extract"); } // Extract the second last element in the middle block if the // Phi is used outside the loop. We need to extract the phi itself @@ -4144,20 +4247,23 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // will be the value when jumping to the exit block from the LoopMiddleBlock, // when the scalar loop is not run at all. 
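The recurrence fix-up extracts two lanes from the last unrolled part in the middle block: the last lane seeds the scalar remainder loop, and the second-to-last lane feeds users of the phi outside the loop, matching the shorthand vector IR in the comment above. A standalone sketch with an assumed fixed VF of 4 (with scalable vectors the two indices become RuntimeVF - 1 and RuntimeVF - 2, computed at run time):

#include <cstdio>
#include <vector>

int main() {
  const int VF = 4;
  std::vector<int> Prev = {0, 1, 2, 3};  // recurrence vector of the previous iteration
  std::vector<int> Cur  = {4, 5, 6, 7};  // values produced by the current iteration

  // In-loop splice: last lane of the previous vector, then the first VF - 1
  // lanes of the current one (shuffle mask <3, 4, 5, 6> in the comment above).
  std::vector<int> Splice(VF);
  Splice[0] = Prev[VF - 1];
  for (int I = 1; I < VF; ++I)
    Splice[I] = Cur[I - 1];              // {3, 4, 5, 6}

  int ScalarResume = Cur[VF - 1];        // "vector.recur.extract"         -> 7
  int PhiOutside   = Cur[VF - 2];        // "vector.recur.extract.for.phi" -> 6
  std::printf("%d %d %d\n", Splice[0], ScalarResume, PhiOutside);
  return 0;
}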
Value *ExtractForPhiUsedOutsideLoop = nullptr;
- if (VF.isVector())
+ if (VF.isVector()) {
+ auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
+ auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
- Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
- "vector.recur.extract.for.phi");
- // When loop is unrolled without vectorizing, initialize
- // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
- // `Incoming`. This is analogous to the vectorized case above: extracting the
- // second last element when VF > 1.
- else if (UF > 1)
- ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
+ Incoming, Idx, "vector.recur.extract.for.phi");
+ } else if (UF > 1)
+ // When loop is unrolled without vectorizing, initialize
+ // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
+ // of `Incoming`. This is analogous to the vectorized case above: extracting
+ // the second last element when VF > 1.
+ ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
// Fix the initial value of the original recurrence in the scalar loop.
Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
+ PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
+ auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
for (auto *BB : predecessors(LoopScalarPreHeader)) {
auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
Start->addIncoming(Incoming, BB);
}
@@ -4173,44 +4279,49 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// recurrence in the exit block, and then add an edge for the middle block.
// Note that LCSSA does not imply single entry when the original scalar loop
// had multiple exiting edges (as we always run the last iteration in the
- // scalar epilogue); in that case, the exiting path through middle will be
- // dynamically dead and the value picked for the phi doesn't matter.
- for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (any_of(LCSSAPhi.incoming_values(),
- [Phi](Value *V) { return V == Phi; }))
- LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
-}
-
-void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
+ // scalar epilogue); in that case, there is no edge from middle to exit and
+ // thus no phis that need updating.
+ if (!Cost->requiresScalarEpilogue(VF))
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis())
+ if (any_of(LCSSAPhi.incoming_values(),
+ [Phi](Value *V) { return V == Phi; }))
+ LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
+}
+
+void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
+ VPTransformState &State) {
+ PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
// Get its reduction variable descriptor.
- assert(Legal->isReductionVariable(Phi) && + assert(Legal->isReductionVariable(OrigPhi) && "Unable to find the reduction variable"); - RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; + const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); RecurKind RK = RdxDesc.getRecurrenceKind(); TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - setDebugLocFromInst(Builder, ReductionStartValue); - bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); + setDebugLocFromInst(ReductionStartValue); + VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst); // This is the vector-clone of the value that leaves the loop. - Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); + Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); // Wrap flags are in general invalid after vectorization, clear them. - clearReductionWrapFlags(RdxDesc); + clearReductionWrapFlags(RdxDesc, State); // Fix the vector-loop phi. // Reductions do not have to start at zero. They can start with // any loop invariant values. - BasicBlock *Latch = OrigLoop->getLoopLatch(); - Value *LoopVal = Phi->getIncomingValueForBlock(Latch); + BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); - Value *Val = getOrCreateVectorValue(LoopVal, Part); - cast<PHINode>(VecRdxPhi) - ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); + unsigned LastPartForNewPhi = PhiR->isOrdered() ? 1 : UF; + for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { + Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part); + Value *Val = State.get(PhiR->getBackedgeValue(), Part); + if (PhiR->isOrdered()) + Val = State.get(PhiR->getBackedgeValue(), UF - 1); + + cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch); } // Before each round, move the insertion point right between @@ -4219,16 +4330,16 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // instructions. Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - setDebugLocFromInst(Builder, LoopExitInst); + setDebugLocFromInst(LoopExitInst); + Type *PhiTy = OrigPhi->getType(); // If tail is folded by masking, the vector value to leave the loop should be // a Select choosing between the vectorized LoopExitInst and vectorized Phi, // instead of the former. For an inloop reduction the reduction will already // be predicated, and does not need to be handled here. 
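The comment at the end of this hunk explains why, under tail folding, the value leaving the loop must be the select between the vectorized exit value and the reduction phi rather than the raw update. A minimal standalone model of that choice for an add reduction (the lane values and mask are assumptions):

#include <cstdio>

int main() {
  const int VF = 4;
  long Phi[VF]  = {10, 20, 30, 40};  // partial sums entering the final iteration
  long New[VF]  = {11, 22, 33, 44};  // partial sums including the final iteration
  bool Mask[VF] = {true, true, false, false};  // only two scalar iterations remain

  long Sum = 0;
  for (int I = 0; I < VF; ++I)
    Sum += Mask[I] ? New[I] : Phi[I];  // the select keeps inactive lanes intact
  std::printf("%ld\n", Sum);           // 103; summing New directly would give 110
  return 0;
}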
- if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { + if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { for (unsigned Part = 0; Part < UF; ++Part) { - Value *VecLoopExitInst = - VectorLoopValueMap.getVectorValue(LoopExitInst, Part); + Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); Value *Sel = nullptr; for (User *U : VecLoopExitInst->users()) { if (isa<SelectInst>(U)) { @@ -4238,19 +4349,19 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); } assert(Sel && "Reduction exit feeds no select"); - VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); + State.reset(LoopExitInstDef, Sel, Part); // If the target can create a predicated operator for the reduction at no // extra cost in the loop (for example a predicated vadd), it can be // cheaper for the select to remain in the loop than be sunk out of it, // and so use the select value for the phi instead of the old // LoopExitValue. - RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; if (PreferPredicatedReductionSelect || TTI->preferPredicatedReductionSelect( - RdxDesc.getOpcode(), Phi->getType(), + RdxDesc.getOpcode(), PhiTy, TargetTransformInfo::ReductionFlags())) { - auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); + auto *VecRdxPhi = + cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part)); VecRdxPhi->setIncomingValueForBlock( LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); } @@ -4260,15 +4371,14 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. - if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { - assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); - assert(!VF.isScalable() && "scalable vectors not yet supported."); + if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { + assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); Builder.SetInsertPoint( LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); VectorParts RdxParts(UF); for (unsigned Part = 0; Part < UF; ++Part) { - RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); + RdxParts[Part] = State.get(LoopExitInstDef, Part); Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) : Builder.CreateZExt(Trunc, VecTy); @@ -4284,12 +4394,12 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); for (unsigned Part = 0; Part < UF; ++Part) { RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); - VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); + State.reset(LoopExitInstDef, RdxParts[Part], Part); } } // Reduce all of the unrolled parts into a single vector. - Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); + Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); unsigned Op = RecurrenceDescriptor::getOpcode(RK); // The middle block terminator has already been assigned a DebugLoc here (the @@ -4299,36 +4409,40 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // conditional branch, and (c) other passes may add new predecessors which // terminate on this line. 
This is the easiest way to ensure we don't // accidentally cause an extra step back into the loop while debugging. - setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); - for (unsigned Part = 1; Part < UF; ++Part) { - Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - // Floating point operations had to be 'fast' to enable the reduction. - ReducedPartRdx = addFastMathFlag( - Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, - ReducedPartRdx, "bin.rdx"), - RdxDesc.getFastMathFlags()); - else - ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); + setDebugLocFromInst(LoopMiddleBlock->getTerminator()); + if (PhiR->isOrdered()) + ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); + else { + // Floating-point operations should have some FMF to enable the reduction. + IRBuilderBase::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); + for (unsigned Part = 1; Part < UF; ++Part) { + Value *RdxPart = State.get(LoopExitInstDef, Part); + if (Op != Instruction::ICmp && Op != Instruction::FCmp) { + ReducedPartRdx = Builder.CreateBinOp( + (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); + } else { + ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); + } + } } // Create the reduction after the loop. Note that inloop reductions create the // target reduction in the loop using a Reduction recipe. - if (VF.isVector() && !IsInLoopReductionPhi) { + if (VF.isVector() && !PhiR->isInLoop()) { ReducedPartRdx = createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. - if (Phi->getType() != RdxDesc.getRecurrenceType()) - ReducedPartRdx = - RdxDesc.isSigned() - ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) - : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); + if (PhiTy != RdxDesc.getRecurrenceType()) + ReducedPartRdx = RdxDesc.isSigned() + ? Builder.CreateSExt(ReducedPartRdx, PhiTy) + : Builder.CreateZExt(ReducedPartRdx, PhiTy); } // Create a phi node that merges control-flow from the backedge-taken check // block and the middle block. - PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", + PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", LoopScalarPreHeader->getTerminator()); for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); @@ -4340,24 +4454,25 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // We know that the loop is in LCSSA form. We need to update the PHI nodes // in the exit blocks. See comment on analogous loop in // fixFirstOrderRecurrence for a more complete explaination of the logic. - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (any_of(LCSSAPhi.incoming_values(), - [LoopExitInst](Value *V) { return V == LoopExitInst; })) - LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); + if (!Cost->requiresScalarEpilogue(VF)) + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) + if (any_of(LCSSAPhi.incoming_values(), + [LoopExitInst](Value *V) { return V == LoopExitInst; })) + LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); // Fix the scalar loop reduction variable with the incoming reduction sum // from the vector body and from the backedge value. 
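// A standalone illustrative sketch (not from the patch): a scalar model of
// how the middle block combines the UF unrolled reduction parts (the
// "bin.rdx" chain above) and then folds the surviving vector into one value,
// which is what createTargetReduction emits.  The add reduction and the
// sizes here are illustrative assumptions.
#include <cstdio>
#include <vector>
int main() {
  const int UF = 2, VF = 4;
  // One "part" per unrolled copy of the vector loop body.
  std::vector<std::vector<int>> parts = {{1, 2, 3, 4}, {5, 6, 7, 8}};

  // Step 1: lane-wise combine parts 1..UF-1 into part 0 (the bin.rdx chain).
  std::vector<int> rdx = parts[0];
  for (int p = 1; p < UF; ++p)
    for (int l = 0; l < VF; ++l)
      rdx[l] += parts[p][l];

  // Step 2: horizontal reduction of the single remaining vector.
  int reduced = 0;
  for (int l = 0; l < VF; ++l) reduced += rdx[l];
  std::printf("reduced = %d\n", reduced);   // 36
  return 0;
}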
int IncomingEdgeBlockIdx = - Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); + OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); // Pick the other block. int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); - Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); + OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); + OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } -void InnerLoopVectorizer::clearReductionWrapFlags( - RecurrenceDescriptor &RdxDesc) { +void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, + VPTransformState &State) { RecurKind RK = RdxDesc.getRecurrenceKind(); if (RK != RecurKind::Add && RK != RecurKind::Mul) return; @@ -4373,7 +4488,7 @@ void InnerLoopVectorizer::clearReductionWrapFlags( Instruction *Cur = Worklist.pop_back_val(); if (isa<OverflowingBinaryOperator>(Cur)) for (unsigned Part = 0; Part < UF; ++Part) { - Value *V = getOrCreateVectorValue(Cur, Part); + Value *V = State.get(State.Plan->getVPValue(Cur), Part); cast<Instruction>(V)->dropPoisonGeneratingFlags(); } @@ -4386,7 +4501,7 @@ void InnerLoopVectorizer::clearReductionWrapFlags( } } -void InnerLoopVectorizer::fixLCSSAPHIs() { +void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) // Some phis were already hand updated by the reduction and recurrence @@ -4395,19 +4510,21 @@ void InnerLoopVectorizer::fixLCSSAPHIs() { auto *IncomingValue = LCSSAPhi.getIncomingValue(0); // Non-instruction incoming values will have only one value. - unsigned LastLane = 0; - if (isa<Instruction>(IncomingValue)) - LastLane = Cost->isUniformAfterVectorization( - cast<Instruction>(IncomingValue), VF) - ? 0 - : VF.getKnownMinValue() - 1; - assert((!VF.isScalable() || LastLane == 0) && - "scalable vectors dont support non-uniform scalars yet"); + + VPLane Lane = VPLane::getFirstLane(); + if (isa<Instruction>(IncomingValue) && + !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), + VF)) + Lane = VPLane::getLastLaneForVF(VF); + // Can be a loop invariant incoming value or the last scalar value to be // extracted from the vectorized loop. Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); Value *lastIncomingValue = - getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); + OrigLoop->isLoopInvariant(IncomingValue) + ? IncomingValue + : State.get(State.Plan->getVPValue(IncomingValue), + VPIteration(UF - 1, Lane)); LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); } } @@ -4450,12 +4567,22 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { while (!Worklist.empty()) { auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); - // We can't sink an instruction if it is a phi node, is already in the - // predicated block, is not in the loop, or may have side effects. - if (!I || isa<PHINode>(I) || I->getParent() == PredBB || - !VectorLoop->contains(I) || I->mayHaveSideEffects()) + // We can't sink an instruction if it is a phi node, is not in the loop, + // or may have side effects. + if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || + I->mayHaveSideEffects()) continue; + // If the instruction is already in PredBB, check if we can sink its + // operands. 
In that case, VPlan's sinkScalarOperands() succeeded in + // sinking the scalar instruction I, hence it appears in PredBB; but it + // may have failed to sink I's operands (recursively), which we try + // (again) here. + if (I->getParent() == PredBB) { + Worklist.insert(I->op_begin(), I->op_end()); + continue; + } + // It's legal to sink the instruction if all its uses occur in the // predicated block. Otherwise, there's nothing to do yet, and we may // need to reanalyze the instruction. @@ -4476,42 +4603,25 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { } while (Changed); } -void InnerLoopVectorizer::fixNonInductionPHIs() { +void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { for (PHINode *OrigPhi : OrigPHIsToFix) { - PHINode *NewPhi = - cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); - unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); - - SmallVector<BasicBlock *, 2> ScalarBBPredecessors( - predecessors(OrigPhi->getParent())); - SmallVector<BasicBlock *, 2> VectorBBPredecessors( - predecessors(NewPhi->getParent())); - assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && - "Scalar and Vector BB should have the same number of predecessors"); - - // The insertion point in Builder may be invalidated by the time we get - // here. Force the Builder insertion point to something valid so that we do - // not run into issues during insertion point restore in - // getOrCreateVectorValue calls below. + VPWidenPHIRecipe *VPPhi = + cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); + PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); + // Make sure the builder has a valid insert point. Builder.SetInsertPoint(NewPhi); - - // The predecessor order is preserved and we can rely on mapping between - // scalar and vector block predecessors. - for (unsigned i = 0; i < NumIncomingValues; ++i) { - BasicBlock *NewPredBB = VectorBBPredecessors[i]; - - // When looking up the new scalar/vector values to fix up, use incoming - // values from original phi. - Value *ScIncV = - OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); - - // Scalar incoming value may need a broadcast - Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); - NewPhi->addIncoming(NewIncV, NewPredBB); + for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { + VPValue *Inc = VPPhi->getIncomingValue(i); + VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); + NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); } } } +bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { + return Cost->useOrderedReductions(RdxDesc); +} + void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Operands, unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, @@ -4539,7 +4649,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, auto *Clone = Builder.Insert(GEP->clone()); for (unsigned Part = 0; Part < UF; ++Part) { Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - State.set(VPDef, GEP, EntryPart, Part); + State.set(VPDef, EntryPart, Part); addMetadata(EntryPart, GEP); } } else { @@ -4553,8 +4663,9 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, for (unsigned Part = 0; Part < UF; ++Part) { // The pointer operand of the new GEP. If it's loop-invariant, we // won't broadcast it. - auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) - : State.get(Operands.getOperand(0), Part); + auto *Ptr = IsPtrLoopInvariant + ? 
State.get(Operands.getOperand(0), VPIteration(0, 0)) + : State.get(Operands.getOperand(0), Part); // Collect all the indices for the new GEP. If any index is // loop-invariant, we won't broadcast it. @@ -4562,7 +4673,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { VPValue *Operand = Operands.getOperand(I); if (IsIndexLoopInvariant[I - 1]) - Indices.push_back(State.get(Operand, {0, 0})); + Indices.push_back(State.get(Operand, VPIteration(0, 0))); else Indices.push_back(State.get(Operand, Part)); } @@ -4576,27 +4687,26 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && "NewGEP is not a pointer vector"); - State.set(VPDef, GEP, NewGEP, Part); + State.set(VPDef, NewGEP, Part); addMetadata(NewGEP, GEP); } } } void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, - RecurrenceDescriptor *RdxDesc, - Value *StartV, unsigned UF, - ElementCount VF) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); + VPWidenPHIRecipe *PhiR, + VPTransformState &State) { PHINode *P = cast<PHINode>(PN); if (EnableVPlanNativePath) { // Currently we enter here in the VPlan-native path for non-induction // PHIs where all control flow is uniform. We simply widen these PHIs. // Create a vector phi with no operands - the vector phi operands will be // set at the end of vector code generation. - Type *VecTy = - (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF); + Type *VecTy = (State.VF.isScalar()) + ? PN->getType() + : VectorType::get(PN->getType(), State.VF); Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); - VectorLoopValueMap.setVectorValue(P, 0, VecPhi); + State.set(PhiR, VecPhi, 0); OrigPHIsToFix.push_back(P); return; @@ -4609,61 +4719,11 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, // Phi nodes have cycles, so we need to vectorize them in two stages. This is // stage #1: We create a new vector PHI node with no incoming edges. We'll use // this value when we vectorize all of the instructions that use the PHI. - if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { - Value *Iden = nullptr; - bool ScalarPHI = - (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); - Type *VecTy = - ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); - - if (RdxDesc) { - assert(Legal->isReductionVariable(P) && StartV && - "RdxDesc should only be set for reduction variables; in that case " - "a StartV is also required"); - RecurKind RK = RdxDesc->getRecurrenceKind(); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { - // MinMax reduction have the start value as their identify. 
- if (ScalarPHI) { - Iden = StartV; - } else { - IRBuilderBase::InsertPointGuard IPBuilder(Builder); - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident"); - } - } else { - Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( - RK, VecTy->getScalarType()); - Iden = IdenC; - - if (!ScalarPHI) { - Iden = ConstantVector::getSplat(VF, IdenC); - IRBuilderBase::InsertPointGuard IPBuilder(Builder); - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - Constant *Zero = Builder.getInt32(0); - StartV = Builder.CreateInsertElement(Iden, StartV, Zero); - } - } - } - - for (unsigned Part = 0; Part < UF; ++Part) { - // This is phase one of vectorizing PHIs. - Value *EntryPart = PHINode::Create( - VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); - VectorLoopValueMap.setVectorValue(P, Part, EntryPart); - if (StartV) { - // Make sure to add the reduction start value only to the - // first unroll part. - Value *StartVal = (Part == 0) ? StartV : Iden; - cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); - } - } - return; - } assert(!Legal->isReductionVariable(P) && - "reductions should be handled above"); + "reductions should be handled elsewhere"); - setDebugLocFromInst(Builder, P); + setDebugLocFromInst(P); // This PHINode must be an induction variable. // Make sure that we know about it. @@ -4684,24 +4744,49 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); - if (Cost->isScalarAfterVectorization(P, VF)) { + if (Cost->isScalarAfterVectorization(P, State.VF)) { // This is the normalized GEP that starts counting at zero. Value *PtrInd = Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. - unsigned Lanes = - Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue(); + bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); + unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); + + bool NeedsVectorIndex = !IsUniform && VF.isScalable(); + Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; + if (NeedsVectorIndex) { + Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); + UnitStepVec = Builder.CreateStepVector(VecIVTy); + PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); + } + for (unsigned Part = 0; Part < UF; ++Part) { + Value *PartStart = createStepForVF( + Builder, ConstantInt::get(PtrInd->getType(), Part), VF); + + if (NeedsVectorIndex) { + Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); + Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); + Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); + Value *SclrGep = + emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); + SclrGep->setName("next.gep"); + State.set(PhiR, SclrGep, Part); + // We've cached the whole vector, which means we can support the + // extraction of any lane. 
+ continue; + } + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - Constant *Idx = ConstantInt::get(PtrInd->getType(), - Lane + Part * VF.getKnownMinValue()); + Value *Idx = Builder.CreateAdd( + PartStart, ConstantInt::get(PtrInd->getType(), Lane)); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); SclrGep->setName("next.gep"); - VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); + State.set(PhiR, SclrGep, VPIteration(Part, Lane)); } } return; @@ -4724,32 +4809,34 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, SCEVExpander Exp(*PSE.getSE(), DL, "induction"); Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); + Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); + Value *NumUnrolledElems = + Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); Value *InductionGEP = GetElementPtrInst::Create( ScStValueType->getPointerElementType(), NewPointerPhi, - Builder.CreateMul( - ScalarStepValue, - ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), - "ptr.ind", InductionLoc); + Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", + InductionLoc); NewPointerPhi->addIncoming(InductionGEP, LoopLatch); // Create UF many actual address geps that use the pointer // phi as base and a vectorized version of the step value // (<step*0, ..., step*N>) as offset. - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Constant *, 8> Indices; + for (unsigned Part = 0; Part < State.UF; ++Part) { + Type *VecPhiType = VectorType::get(PhiType, State.VF); + Value *StartOffsetScalar = + Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); + Value *StartOffset = + Builder.CreateVectorSplat(State.VF, StartOffsetScalar); // Create a vector of consecutive numbers from zero to VF. - for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) - Indices.push_back( - ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); - Constant *StartOffset = ConstantVector::get(Indices); + StartOffset = + Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); Value *GEP = Builder.CreateGEP( ScStValueType->getPointerElementType(), NewPointerPhi, Builder.CreateMul( - StartOffset, - Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), + StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), "vector.gep")); - VectorLoopValueMap.setVectorValue(P, Part, GEP); + State.set(PhiR, GEP, Part); } } } @@ -4803,7 +4890,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, case Instruction::Or: case Instruction::Xor: { // Just widen unops and binops. - setDebugLocFromInst(Builder, &I); + setDebugLocFromInst(&I); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector<Value *, 2> Ops; @@ -4816,7 +4903,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, VecOp->copyIRFlags(&I); // Use this vector value for all users of the original instruction. - State.set(Def, &I, V, Part); + State.set(Def, V, Part); addMetadata(V, &I); } @@ -4827,7 +4914,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, // Widen compares. Generate vector compares. 
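// A standalone illustrative sketch (not from the patch): the per-part offset
// vectors built for the widened pointer induction above, computed with plain
// integers.  For part P the lanes address
//     Base + (P * RuntimeVF + <0, 1, ..., VF-1>) * Step,
// i.e. splat(RuntimeVF * P) plus a step vector, scaled by the step.  Here
// Step folds together the induction step and the element size (an assumed
// step of one 8-byte element), and vscale is taken as 1.
#include <cstdio>
int main() {
  const unsigned VF = 4, UF = 2;
  const long Base = 0x1000, Step = 8;
  for (unsigned Part = 0; Part < UF; ++Part) {
    std::printf("part %u:", Part);
    for (unsigned Lane = 0; Lane < VF; ++Lane) {
      long Offset = (long)(Part * VF + Lane);       // StartOffset + step vector
      std::printf(" %#lx", Base + Offset * Step);   // the lane's pointer
    }
    std::printf("\n");
  }
  return 0;
}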
bool FCmp = (I.getOpcode() == Instruction::FCmp); auto *Cmp = cast<CmpInst>(&I); - setDebugLocFromInst(Builder, Cmp); + setDebugLocFromInst(Cmp); for (unsigned Part = 0; Part < UF; ++Part) { Value *A = State.get(User.getOperand(0), Part); Value *B = State.get(User.getOperand(1), Part); @@ -4840,7 +4927,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, } else { C = Builder.CreateICmp(Cmp->getPredicate(), A, B); } - State.set(Def, &I, C, Part); + State.set(Def, C, Part); addMetadata(C, &I); } @@ -4860,7 +4947,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, case Instruction::FPTrunc: case Instruction::BitCast: { auto *CI = cast<CastInst>(&I); - setDebugLocFromInst(Builder, CI); + setDebugLocFromInst(CI); /// Vectorize casts. Type *DestTy = @@ -4869,7 +4956,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, for (unsigned Part = 0; Part < UF; ++Part) { Value *A = State.get(User.getOperand(0), Part); Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); - State.set(Def, &I, Cast, Part); + State.set(Def, Cast, Part); addMetadata(Cast, &I); } break; @@ -4886,7 +4973,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, VPTransformState &State) { assert(!isa<DbgInfoIntrinsic>(I) && "DbgInfoIntrinsic should have been dropped during VPlan construction"); - setDebugLocFromInst(Builder, &I); + setDebugLocFromInst(&I); Module *M = I.getParent()->getParent()->getParent(); auto *CI = cast<CallInst>(&I); @@ -4906,10 +4993,11 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; assert((UseVectorIntrinsic || !NeedToScalarize) && "Instruction should be scalarized elsewhere."); - assert(IntrinsicCost.isValid() && CallCost.isValid() && - "Cannot have invalid costs while widening"); + assert((IntrinsicCost.isValid() || CallCost.isValid()) && + "Either the intrinsic cost or vector call cost must be valid"); for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<Type *, 2> TysForDecl = {CI->getType()}; SmallVector<Value *, 4> Args; for (auto &I : enumerate(ArgOperands.operands())) { // Some intrinsics have a scalar argument - don't replace it with a @@ -4917,19 +5005,19 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, Value *Arg; if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) Arg = State.get(I.value(), Part); - else - Arg = State.get(I.value(), {0, 0}); + else { + Arg = State.get(I.value(), VPIteration(0, 0)); + if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) + TysForDecl.push_back(Arg->getType()); + } Args.push_back(Arg); } Function *VectorF; if (UseVectorIntrinsic) { // Use vector version of the intrinsic. 
- Type *TysForDecl[] = {CI->getType()}; - if (VF.isVector()) { - assert(!VF.isScalable() && "VF is assumed to be non scalable."); + if (VF.isVector()) TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); - } VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); assert(VectorF && "Can't retrieve vector intrinsic."); } else { @@ -4948,7 +5036,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, if (isa<FPMathOperator>(V)) V->copyFastMathFlags(CI); - State.set(Def, &I, V, Part); + State.set(Def, V, Part); addMetadata(V, &I); } } @@ -4957,14 +5045,15 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, bool InvariantCond, VPTransformState &State) { - setDebugLocFromInst(Builder, &I); + setDebugLocFromInst(&I); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. - auto *InvarCond = - InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; + auto *InvarCond = InvariantCond + ? State.get(Operands.getOperand(0), VPIteration(0, 0)) + : nullptr; for (unsigned Part = 0; Part < UF; ++Part) { Value *Cond = @@ -4972,7 +5061,7 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, Value *Op0 = State.get(Operands.getOperand(1), Part); Value *Op1 = State.get(Operands.getOperand(2), Part); Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); - State.set(VPDef, &I, Sel, Part); + State.set(VPDef, Sel, Part); addMetadata(Sel, &I); } } @@ -5034,13 +5123,12 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { if (isScalarPtrInduction(MemAccess, Ptr)) { Worklist.insert(cast<Instruction>(Ptr)); - Instruction *Update = cast<Instruction>( - cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); - Worklist.insert(Update); LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr << "\n"); - LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update - << "\n"); + + Instruction *Update = cast<Instruction>( + cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); + ScalarPtrs.insert(Update); return; } // We only care about bitcast and getelementptr instructions contained in @@ -5054,11 +5142,12 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { if (Worklist.count(I)) return; - // If the use of the pointer will be a scalar use, and all users of the - // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, - // place the pointer in PossibleNonScalarPtrs. - if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { - return isa<LoadInst>(U) || isa<StoreInst>(U); + // If all users of the pointer will be memory accesses and scalar, place the + // pointer in ScalarPtrs. Otherwise, place the pointer in + // PossibleNonScalarPtrs. 
+ if (llvm::all_of(I->users(), [&](User *U) { + return (isa<LoadInst>(U) || isa<StoreInst>(U)) && + isScalarUse(cast<Instruction>(U), Ptr); })) ScalarPtrs.insert(I); else @@ -5164,8 +5253,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, - ElementCount VF) { +bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { if (!blockNeedsPredication(I->getParent())) return false; switch(I->getOpcode()) { @@ -5176,20 +5264,12 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, if (!Legal->isMaskRequired(I)) return false; auto *Ptr = getLoadStorePointerOperand(I); - auto *Ty = getMemInstValueType(I); - // We have already decided how to vectorize this instruction, get that - // result. - if (VF.isVector()) { - InstWidening WideningDecision = getWideningDecision(I, VF); - assert(WideningDecision != CM_Unknown && - "Widening decision should be ready at this moment"); - return WideningDecision == CM_Scalarize; - } + auto *Ty = getLoadStoreType(I); const Align Alignment = getLoadStoreAlignment(I); return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || - isLegalMaskedGather(Ty, Alignment)) + TTI.isLegalMaskedGather(Ty, Alignment)) : !(isLegalMaskedStore(Ty, Ptr, Alignment) || - isLegalMaskedScatter(Ty, Alignment)); + TTI.isLegalMaskedScatter(Ty, Alignment)); } case Instruction::UDiv: case Instruction::SDiv: @@ -5211,8 +5291,8 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( // If the instruction's allocated size doesn't equal it's type size, it // requires padding and will be scalarized. auto &DL = I->getModule()->getDataLayout(); - auto *ScalarTy = getMemInstValueType(I); - if (hasIrregularType(ScalarTy, DL, VF)) + auto *ScalarTy = getLoadStoreType(I); + if (hasIrregularType(ScalarTy, DL)) return false; // Check if masking is required. @@ -5231,7 +5311,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( assert(useMaskedInterleavedAccesses(TTI) && "Masked interleave-groups for predicated accesses are not enabled."); - auto *Ty = getMemInstValueType(I); + auto *Ty = getLoadStoreType(I); const Align Alignment = getLoadStoreAlignment(I); return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) : TTI.isLegalMaskedStore(Ty, Alignment); @@ -5259,7 +5339,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( // requires padding and will be scalarized. auto &DL = I->getModule()->getDataLayout(); auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); - if (hasIrregularType(ScalarTy, DL, VF)) + if (hasIrregularType(ScalarTy, DL)) return false; return true; @@ -5302,7 +5382,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { << *I << "\n"); return; } - if (isScalarWithPredication(I, VF)) { + if (isScalarWithPredication(I)) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); return; @@ -5347,7 +5427,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // here is something which only demands lane 0 of the unrolled iterations; // it does not imply that all lanes produce the same value (e.g. 
this is not // the usual meaning of uniform) - SmallPtrSet<Value *, 8> HasUniformUse; + SetVector<Value *> HasUniformUse; // Scan the loop for instructions which are either a) known to have only // lane 0 demanded or b) are uses which demand only lane 0 of their operand. @@ -5483,7 +5563,158 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { return false; } -Optional<ElementCount> +ElementCount +LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { + if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { + reportVectorizationInfo( + "Disabling scalable vectorization, because target does not " + "support scalable vectors.", + "ScalableVectorsUnsupported", ORE, TheLoop); + return ElementCount::getScalable(0); + } + + if (Hints->isScalableVectorizationDisabled()) { + reportVectorizationInfo("Scalable vectorization is explicitly disabled", + "ScalableVectorizationDisabled", ORE, TheLoop); + return ElementCount::getScalable(0); + } + + auto MaxScalableVF = ElementCount::getScalable( + std::numeric_limits<ElementCount::ScalarTy>::max()); + + // Test that the loop-vectorizer can legalize all operations for this MaxVF. + // FIXME: While for scalable vectors this is currently sufficient, this should + // be replaced by a more detailed mechanism that filters out specific VFs, + // instead of invalidating vectorization for a whole set of VFs based on the + // MaxVF. + + // Disable scalable vectorization if the loop contains unsupported reductions. + if (!canVectorizeReductions(MaxScalableVF)) { + reportVectorizationInfo( + "Scalable vectorization not supported for the reduction " + "operations found in this loop.", + "ScalableVFUnfeasible", ORE, TheLoop); + return ElementCount::getScalable(0); + } + + // Disable scalable vectorization if the loop contains any instructions + // with element types not supported for scalable vectors. + if (any_of(ElementTypesInLoop, [&](Type *Ty) { + return !Ty->isVoidTy() && + !this->TTI.isElementTypeLegalForScalableVector(Ty); + })) { + reportVectorizationInfo("Scalable vectorization is not supported " + "for all element types found in this loop.", + "ScalableVFUnfeasible", ORE, TheLoop); + return ElementCount::getScalable(0); + } + + if (Legal->isSafeForAnyVectorWidth()) + return MaxScalableVF; + + // Limit MaxScalableVF by the maximum safe dependence distance. + Optional<unsigned> MaxVScale = TTI.getMaxVScale(); + MaxScalableVF = ElementCount::getScalable( + MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); + if (!MaxScalableVF) + reportVectorizationInfo( + "Max legal vector width too small, scalable vectorization " + "unfeasible.", + "ScalableVFUnfeasible", ORE, TheLoop); + + return MaxScalableVF; +} + +FixedScalableVFPair +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF) { + MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); + unsigned SmallestType, WidestType; + std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); + + // Get the maximum safe dependence distance in bits computed by LAA. + // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from + // the memory accesses that is most restrictive (involved in the smallest + // dependence distance). 
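// A standalone illustrative sketch (not from the patch): the arithmetic
// behind the MaxSafeElements computation that follows.  Assume LAA reported a
// maximum safe dependence distance of 384 bits and the widest loop type is
// i32: 384 / 32 = 12 elements, rounded down to the next power of two gives 8,
// so the max safe fixed VF is 8 and the scalable bound is derived from the
// same element count.  The helper below mirrors llvm::PowerOf2Floor.
#include <cstdint>
#include <cstdio>
static uint64_t powerOf2Floor(uint64_t x) {
  uint64_t p = 1;
  while (p * 2 <= x) p *= 2;
  return x ? p : 0;
}
int main() {
  unsigned MaxSafeVectorWidthInBits = 384;  // assumed LAA result
  unsigned WidestType = 32;                 // widest element type, in bits
  unsigned MaxSafeElements =
      powerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
  std::printf("MaxSafeElements = %u\n", MaxSafeElements);  // 8
  return 0;
}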
+ unsigned MaxSafeElements = + PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); + + auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); + auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); + + LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF + << ".\n"); + LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF + << ".\n"); + + // First analyze the UserVF, fall back if the UserVF should be ignored. + if (UserVF) { + auto MaxSafeUserVF = + UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; + + if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { + // If `VF=vscale x N` is safe, then so is `VF=N` + if (UserVF.isScalable()) + return FixedScalableVFPair( + ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); + else + return UserVF; + } + + assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); + + // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it + // is better to ignore the hint and let the compiler choose a suitable VF. + if (!UserVF.isScalable()) { + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe, clamping to max safe VF=" + << MaxSafeFixedVF << ".\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is unsafe, clamping to maximum safe vectorization factor " + << ore::NV("VectorizationFactor", MaxSafeFixedVF); + }); + return MaxSafeFixedVF; + } + + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe. Ignoring scalable UserVF.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is unsafe. Ignoring the hint to let the compiler pick a " + "suitable VF."; + }); + } + + LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType + << " / " << WidestType << " bits.\n"); + + FixedScalableVFPair Result(ElementCount::getFixed(1), + ElementCount::getScalable(0)); + if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, + WidestType, MaxSafeFixedVF)) + Result.FixedVF = MaxVF; + + if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, + WidestType, MaxSafeScalableVF)) + if (MaxVF.isScalable()) { + Result.ScalableVF = MaxVF; + LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF + << "\n"); + } + + return Result; +} + +FixedScalableVFPair LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically @@ -5492,7 +5723,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { "Not inserting runtime ptr check for divergent target", "runtime pointer checks needed. 
Not enabled for divergent target", "CantVersionLoopWithDivergentTarget", ORE, TheLoop); - return None; + return FixedScalableVFPair::getNone(); } unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); @@ -5501,14 +5732,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { reportVectorizationFailure("Single iteration (non) loop", "loop trip count is one, irrelevant for vectorization", "SingleIterationLoop", ORE, TheLoop); - return None; + return FixedScalableVFPair::getNone(); } - ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); - switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return MaxVF; + return computeFeasibleMaxVF(TC, UserVF); case CM_ScalarEpilogueNotAllowedUsePredicate: LLVM_FALLTHROUGH; case CM_ScalarEpilogueNotNeededUsePredicate: @@ -5530,7 +5759,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { // Bail if runtime checks are required, which are not good when optimising // for size. if (runtimeChecksRequired()) - return None; + return FixedScalableVFPair::getNone(); break; } @@ -5546,9 +5775,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return MaxVF; + return computeFeasibleMaxVF(TC, UserVF); } - return None; + return FixedScalableVFPair::getNone(); } // Now try the tail folding @@ -5563,33 +5792,44 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - assert(!MaxVF.isScalable() && - "Scalable vectors do not yet support tail folding"); - assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && - "MaxVF must be a power of 2"); - unsigned MaxVFtimesIC = - UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); - // Avoid tail folding if the trip count is known to be a multiple of any VF we - // chose. - ScalarEvolution *SE = PSE.getSE(); - const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); - const SCEV *ExitCount = SE->getAddExpr( - BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); - const SCEV *Rem = SE->getURemExpr( - ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); - if (Rem->isZero()) { - // Accept MaxVF if we do not have a tail. - LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); - return MaxVF; + FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF); + // Avoid tail folding if the trip count is known to be a multiple of any VF + // we chose. + // FIXME: The condition below pessimises the case for fixed-width vectors, + // when scalable VFs are also candidates for vectorization. + if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { + ElementCount MaxFixedVF = MaxFactors.FixedVF; + assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && + "MaxFixedVF must be a power of 2"); + unsigned MaxVFtimesIC = UserIC ? 
MaxFixedVF.getFixedValue() * UserIC + : MaxFixedVF.getFixedValue(); + ScalarEvolution *SE = PSE.getSE(); + const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + const SCEV *Rem = SE->getURemExpr( + SE->applyLoopGuards(ExitCount, TheLoop), + SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); + if (Rem->isZero()) { + // Accept MaxFixedVF if we do not have a tail. + LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); + return MaxFactors; + } } + // For scalable vectors, don't use tail folding as this is currently not yet + // supported. The code is likely to have ended up here if the tripcount is + // low, in which case it makes sense not to use scalable vectors. + if (MaxFactors.ScalableVF.isVector()) + MaxFactors.ScalableVF = ElementCount::getScalable(0); + // If we don't know the precise trip count, or if the trip count that we // found modulo the vectorization factor is not zero, try to fold the tail // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; - return MaxVF; + return MaxFactors; } // If there was a tail-folding hint/switch, but we can't fold the tail by @@ -5598,12 +5838,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return MaxVF; + return MaxFactors; } if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); - return None; + return FixedScalableVFPair::getNone(); } if (TC == 0) { @@ -5611,7 +5851,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { "Unable to calculate the loop count due to complex control flow", "unable to calculate the loop count due to complex control flow", "UnknownLoopCountComplexCFG", ORE, TheLoop); - return None; + return FixedScalableVFPair::getNone(); } reportVectorizationFailure( @@ -5620,137 +5860,67 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { "Enable vectorization of this loop with '#pragma clang loop " "vectorize(enable)' when compiling with -Os/-Oz", "NoTailLoopWithOptForSize", ORE, TheLoop); - return None; -} - -ElementCount -LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF) { - bool IgnoreScalableUserVF = UserVF.isScalable() && - !TTI.supportsScalableVectors() && - !ForceTargetSupportsScalableVectors; - if (IgnoreScalableUserVF) { - LLVM_DEBUG( - dbgs() << "LV: Ignoring VF=" << UserVF - << " because target does not support scalable vectors.\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "Ignoring VF=" << ore::NV("UserVF", UserVF) - << " because target does not support scalable vectors."; - }); - } - - // Beyond this point two scenarios are handled. If UserVF isn't specified - // then a suitable VF is chosen. If UserVF is specified and there are - // dependencies, check if it's legal. However, if a UserVF is specified and - // there are no dependencies, then there's nothing to do. 
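// A standalone illustrative sketch (not from the patch): the SCEV-based
// trip-count check in the hunk above, written out with plain integers.  If
// the trip count is a known multiple of MaxFixedVF * UserIC, every iteration
// lands in the vector body and no tail (and hence no tail folding) is
// needed.  The numbers are illustrative assumptions.
#include <cstdio>
int main() {
  unsigned TripCount = 64;   // BackedgeTakenCount + 1
  unsigned MaxFixedVF = 8;
  unsigned UserIC = 2;       // user-requested interleave count
  unsigned MaxVFtimesIC = UserIC ? MaxFixedVF * UserIC : MaxFixedVF;
  bool NoTail = (TripCount % MaxVFtimesIC) == 0;
  std::printf("%s\n", NoTail ? "no tail will remain for any chosen VF"
                             : "tail folding or a scalar epilogue is needed");
  return 0;
}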
- if (UserVF.isNonZero() && !IgnoreScalableUserVF && - Legal->isSafeForAnyVectorWidth()) - return UserVF; - - MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); - unsigned SmallestType, WidestType; - std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); - unsigned WidestRegister = TTI.getRegisterBitWidth(true); - - // Get the maximum safe dependence distance in bits computed by LAA. - // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from - // the memory accesses that is most restrictive (involved in the smallest - // dependence distance). - unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); - - // If the user vectorization factor is legally unsafe, clamp it to a safe - // value. Otherwise, return as is. - if (UserVF.isNonZero() && !IgnoreScalableUserVF) { - unsigned MaxSafeElements = - PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); - ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); - - if (UserVF.isScalable()) { - Optional<unsigned> MaxVScale = TTI.getMaxVScale(); - - // Scale VF by vscale before checking if it's safe. - MaxSafeVF = ElementCount::getScalable( - MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); - - if (MaxSafeVF.isZero()) { - // The dependence distance is too small to use scalable vectors, - // fallback on fixed. - LLVM_DEBUG( - dbgs() - << "LV: Max legal vector width too small, scalable vectorization " - "unfeasible. Using fixed-width vectorization instead.\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "Max legal vector width too small, scalable vectorization " - << "unfeasible. Using fixed-width vectorization instead."; - }); - return computeFeasibleMaxVF( - ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); - } - } - - LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); - - if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) - return UserVF; - - LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF - << " is unsafe, clamping to max safe VF=" << MaxSafeVF - << ".\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "User-specified vectorization factor " - << ore::NV("UserVectorizationFactor", UserVF) - << " is unsafe, clamping to maximum safe vectorization factor " - << ore::NV("VectorizationFactor", MaxSafeVF); - }); - return MaxSafeVF; - } - - WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); + return FixedScalableVFPair::getNone(); +} + +ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( + unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, + const ElementCount &MaxSafeVF) { + bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); + TypeSize WidestRegister = TTI.getRegisterBitWidth( + ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector); + + // Convenience function to return the minimum of two ElementCounts. + auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { + assert((LHS.isScalable() == RHS.isScalable()) && + "Scalable flags must match"); + return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; + }; // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. 
- unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); - - LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType - << " / " << WidestType << " bits.\n"); + auto MaxVectorElementCount = ElementCount::get( + PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), + ComputeScalableMaxVF); + MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " - << WidestRegister << " bits.\n"); - - assert(MaxVectorSize <= WidestRegister && - "Did not expect to pack so many elements" - " into one vector!"); - if (MaxVectorSize == 0) { - LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); - MaxVectorSize = 1; - return ElementCount::getFixed(MaxVectorSize); - } else if (ConstTripCount && ConstTripCount < MaxVectorSize && - isPowerOf2_32(ConstTripCount)) { + << (MaxVectorElementCount * WidestType) << " bits.\n"); + + if (!MaxVectorElementCount) { + LLVM_DEBUG(dbgs() << "LV: The target has no " + << (ComputeScalableMaxVF ? "scalable" : "fixed") + << " vector registers.\n"); + return ElementCount::getFixed(1); + } + + const auto TripCountEC = ElementCount::getFixed(ConstTripCount); + if (ConstTripCount && + ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && + isPowerOf2_32(ConstTripCount)) { // We need to clamp the VF to be the ConstTripCount. There is no point in - // choosing a higher viable VF as done in the loop below. + // choosing a higher viable VF as done in the loop below. If + // MaxVectorElementCount is scalable, we only fall back on a fixed VF when + // the TC is less than or equal to the known number of lanes. LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " << ConstTripCount << "\n"); - MaxVectorSize = ConstTripCount; - return ElementCount::getFixed(MaxVectorSize); + return TripCountEC; } - unsigned MaxVF = MaxVectorSize; - if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || + ElementCount MaxVF = MaxVectorElementCount; + if (TTI.shouldMaximizeVectorBandwidth() || (MaximizeBandwidth && isScalarEpilogueAllowed())) { + auto MaxVectorElementCountMaxBW = ElementCount::get( + PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), + ComputeScalableMaxVF); + MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); + // Collect all viable vectorization factors larger than the default MaxVF - // (i.e. MaxVectorSize). + // (i.e. MaxVectorElementCount). SmallVector<ElementCount, 8> VFs; - unsigned NewMaxVectorSize = WidestRegister / SmallestType; - for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) - VFs.push_back(ElementCount::getFixed(VS)); + for (ElementCount VS = MaxVectorElementCount * 2; + ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) + VFs.push_back(VS); // For each VF calculate its register usage. auto RUs = calculateRegisterUsage(VFs); @@ -5759,59 +5929,97 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, // ones. 
for (int i = RUs.size() - 1; i >= 0; --i) { bool Selected = true; - for (auto& pair : RUs[i].MaxLocalUsers) { + for (auto &pair : RUs[i].MaxLocalUsers) { unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); if (pair.second > TargetNumRegisters) Selected = false; } if (Selected) { - MaxVF = VFs[i].getKnownMinValue(); + MaxVF = VFs[i]; break; } } - if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { - if (MaxVF < MinVF) { + if (ElementCount MinVF = + TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { + if (ElementCount::isKnownLT(MaxVF, MinVF)) { LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF << ") with target's minimum: " << MinVF << '\n'); MaxVF = MinVF; } } } - return ElementCount::getFixed(MaxVF); + return MaxVF; } -VectorizationFactor -LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { - // FIXME: This can be fixed for scalable vectors later, because at this stage - // the LoopVectorizer will only consider vectorizing a loop with scalable - // vectors when the loop has a hint to enable vectorization for a given VF. - assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); +bool LoopVectorizationCostModel::isMoreProfitable( + const VectorizationFactor &A, const VectorizationFactor &B) const { + InstructionCost CostA = A.Cost; + InstructionCost CostB = B.Cost; + + unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); + + if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && + MaxTripCount) { + // If we are folding the tail and the trip count is a known (possibly small) + // constant, the trip count will be rounded up to an integer number of + // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), + // which we compare directly. When not folding the tail, the total cost will + // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is + // approximated with the per-lane cost below instead of using the tripcount + // as here. + auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); + auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); + return RTCostA < RTCostB; + } + + // When set to preferred, for now assume vscale may be larger than 1, so + // that scalable vectorization is slightly favorable over fixed-width + // vectorization. 
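// A standalone illustrative sketch (not from the patch): the per-lane cost
// comparison used by isMoreProfitable() in this hunk, with the division
// rewritten as a cross multiplication so no floating point is needed:
//     CostA / WidthA < CostB / WidthB  <=>  CostA * WidthB < CostB * WidthA
// Widths are the known minimum lane counts; the example costs are assumed.
#include <cstdio>
struct VFCost { unsigned Width; unsigned Cost; };   // cost per vector iteration
static bool isMoreProfitable(VFCost A, VFCost B) {
  return (unsigned long long)A.Cost * B.Width <
         (unsigned long long)B.Cost * A.Width;
}
int main() {
  VFCost VF4 = {4, 20};   // 5.0 per lane
  VFCost VF8 = {8, 36};   // 4.5 per lane
  std::printf("VF8 more profitable than VF4: %s\n",
              isMoreProfitable(VF8, VF4) ? "yes" : "no");   // yes
  return 0;
}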
+ if (Hints->isScalableVectorizationPreferred()) + if (A.Width.isScalable() && !B.Width.isScalable()) + return (CostA * B.Width.getKnownMinValue()) <= + (CostB * A.Width.getKnownMinValue()); + + // To avoid the need for FP division: + // (CostA / A.Width) < (CostB / B.Width) + // <=> (CostA * B.Width) < (CostB * A.Width) + return (CostA * B.Width.getKnownMinValue()) < + (CostB * A.Width.getKnownMinValue()); +} +VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( + const ElementCountSet &VFCandidates) { InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); + assert(VFCandidates.count(ElementCount::getFixed(1)) && + "Expected Scalar VF to be a candidate"); - unsigned Width = 1; - const float ScalarCost = *ExpectedCost.getValue(); - float Cost = ScalarCost; + const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); + VectorizationFactor ChosenFactor = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; - if (ForceVectorization && MaxVF.isVector()) { + if (ForceVectorization && VFCandidates.size() > 1) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost // evaluation. - Cost = std::numeric_limits<float>::max(); - } - - for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { - // Notice that the vector loop needs to be executed less times, so - // we need to divide the cost of the vector loops by the width of - // the vector elements. - VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); - assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); - float VectorCost = *C.first.getValue() / (float)i; - LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i - << " costs: " << (int)VectorCost << ".\n"); + ChosenFactor.Cost = InstructionCost::getMax(); + } + + SmallVector<InstructionVFPair> InvalidCosts; + for (const auto &i : VFCandidates) { + // The cost for scalar VF=1 is already calculated, so ignore it. + if (i.isScalar()) + continue; + + VectorizationCostTy C = expectedCost(i, &InvalidCosts); + VectorizationFactor Candidate(i, C.first); + LLVM_DEBUG( + dbgs() << "LV: Vector loop of width " << i << " costs: " + << (Candidate.Cost / Candidate.Width.getKnownMinValue()) + << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "") + << ".\n"); + if (!C.second && !ForceVectorization) { LLVM_DEBUG( dbgs() << "LV: Not considering vector loop of width " << i @@ -5820,32 +6028,86 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { } // If profitable add it to ProfitableVF list. - if (VectorCost < ScalarCost) { - ProfitableVFs.push_back(VectorizationFactor( - {ElementCount::getFixed(i), (unsigned)VectorCost})); - } - - if (VectorCost < Cost) { - Cost = VectorCost; - Width = i; - } + if (isMoreProfitable(Candidate, ScalarCost)) + ProfitableVFs.push_back(Candidate); + + if (isMoreProfitable(Candidate, ChosenFactor)) + ChosenFactor = Candidate; + } + + // Emit a report of VFs with invalid costs in the loop. + if (!InvalidCosts.empty()) { + // Group the remarks per instruction, keeping the instruction order from + // InvalidCosts. 
+ std::map<Instruction *, unsigned> Numbering; + unsigned I = 0; + for (auto &Pair : InvalidCosts) + if (!Numbering.count(Pair.first)) + Numbering[Pair.first] = I++; + + // Sort the list, first on instruction(number) then on VF. + llvm::sort(InvalidCosts, + [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { + if (Numbering[A.first] != Numbering[B.first]) + return Numbering[A.first] < Numbering[B.first]; + ElementCountComparator ECC; + return ECC(A.second, B.second); + }); + + // For a list of ordered instruction-vf pairs: + // [(load, vf1), (load, vf2), (store, vf1)] + // Group the instructions together to emit separate remarks for: + // load (vf1, vf2) + // store (vf1) + auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); + auto Subset = ArrayRef<InstructionVFPair>(); + do { + if (Subset.empty()) + Subset = Tail.take_front(1); + + Instruction *I = Subset.front().first; + + // If the next instruction is different, or if there are no other pairs, + // emit a remark for the collated subset. e.g. + // [(load, vf1), (load, vf2))] + // to emit: + // remark: invalid costs for 'load' at VF=(vf, vf2) + if (Subset == Tail || Tail[Subset.size()].first != I) { + std::string OutString; + raw_string_ostream OS(OutString); + assert(!Subset.empty() && "Unexpected empty range"); + OS << "Instruction with invalid costs prevented vectorization at VF=("; + for (auto &Pair : Subset) + OS << (Pair.second == Subset.front().second ? "" : ", ") + << Pair.second; + OS << "):"; + if (auto *CI = dyn_cast<CallInst>(I)) + OS << " call to " << CI->getCalledFunction()->getName(); + else + OS << " " << I->getOpcodeName(); + OS.flush(); + reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); + Tail = Tail.drop_front(Subset.size()); + Subset = {}; + } else + // Grow the subset by one element + Subset = Tail.take_front(Subset.size() + 1); + } while (!Tail.empty()); } if (!EnableCondStoresVectorization && NumPredStores) { reportVectorizationFailure("There are conditional stores.", "store that is conditionally executed prevents vectorization", "ConditionalStore", ORE, TheLoop); - Width = 1; - Cost = ScalarCost; + ChosenFactor = ScalarCost; } - LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() + LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && + ChosenFactor.Cost >= ScalarCost.Cost) dbgs() << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); - LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); - VectorizationFactor Factor = {ElementCount::getFixed(Width), - (unsigned)(Width * Cost)}; - return Factor; + LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); + return ChosenFactor; } bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( @@ -5880,6 +6142,12 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( })) return false; + // Epilogue vectorization code has not been auditted to ensure it handles + // non-latch exits properly. It may be fine, but it needs auditted and + // tested. 
+ if (L.getExitingBlock() != L.getLoopLatch()) + return false; + return true; } @@ -5958,7 +6226,8 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( for (auto &NextVF : ProfitableVFs) if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && - (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && + (Result.Width.getFixedValue() == 1 || + isMoreProfitable(NextVF, Result)) && LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) Result = NextVF; @@ -5973,7 +6242,17 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { unsigned MinWidth = -1U; unsigned MaxWidth = 8; const DataLayout &DL = TheFunction->getParent()->getDataLayout(); + for (Type *T : ElementTypesInLoop) { + MinWidth = std::min<unsigned>( + MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); + MaxWidth = std::max<unsigned>( + MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); + } + return {MinWidth, MaxWidth}; +} +void LoopVectorizationCostModel::collectElementTypesForWidening() { + ElementTypesInLoop.clear(); // For each block. for (BasicBlock *BB : TheLoop->blocks()) { // For each instruction in the loop. @@ -5993,8 +6272,8 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { if (auto *PN = dyn_cast<PHINode>(&I)) { if (!Legal->isReductionVariable(PN)) continue; - RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; - if (PreferInLoopReductions || + const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN]; + if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || TTI.preferInLoopReduction(RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), TargetTransformInfo::ReductionFlags())) @@ -6019,14 +6298,9 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) continue; - MinWidth = std::min(MinWidth, - (unsigned)DL.getTypeSizeInBits(T->getScalarType())); - MaxWidth = std::max(MaxWidth, - (unsigned)DL.getTypeSizeInBits(T->getScalarType())); + ElementTypesInLoop.insert(T); } } - - return {MinWidth, MaxWidth}; } unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, @@ -6157,8 +6431,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) { - assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); - LoopCost = *expectedCost(VF).first.getValue(); + InstructionCost C = expectedCost(VF).first; + assert(C.isValid() && "Expected to have chosen a VF with valid cost"); + LoopCost = *C.getValue(); } assert(LoopCost && "Non-zero loop cost expected"); @@ -6198,9 +6473,21 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // If we have a scalar reduction (vector reductions are already dealt with // by this point), we can increase the critical path length if the loop - // we're interleaving is inside another loop. Limit, by default to 2, so the - // critical path only gets increased by one reduction operation. + // we're interleaving is inside another loop. For tree-wise reductions + // set the limit to 2, and for ordered reductions it's best to disable + // interleaving entirely. 
if (HasReductions && TheLoop->getLoopDepth() > 1) { + bool HasOrderedReductions = + any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + return RdxDesc.isOrdered(); + }); + if (HasOrderedReductions) { + LLVM_DEBUG( + dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); + return 1; + } + unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); SmallIC = std::min(SmallIC, F); StoresIC = std::min(StoresIC, F); @@ -6319,10 +6606,14 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { // A lambda that gets the register usage for the given type and VF. const auto &TTICapture = TTI; - auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { + auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) - return 0U; - return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); + return 0; + InstructionCost::CostType RegUsage = + *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); + assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && + "Nonsensical values for register usage."); + return RegUsage; }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -6440,7 +6731,8 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ // from moving "masked load/store" check from legality to cost model. // Masked Load/Gather emulation was previously never allowed. // Limited number of Masked Store/Scatter emulation was allowed. - assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); + assert(isPredicatedInst(I) && + "Expecting a scalar emulated instruction"); return isa<LoadInst>(I) || (isa<StoreInst>(I) && NumPredStores > NumberOfStoresToPredicate); @@ -6469,9 +6761,11 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { for (Instruction &I : *BB) if (isScalarWithPredication(&I)) { ScalarCostsTy ScalarCosts; + // Do not apply discount if scalable, because that would lead to + // invalid scalarization costs. // Do not apply discount logic if hacked cost is needed // for emulated masked memrefs. - if (!useEmulatedMaskMemRefHack(&I) && + if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && computePredInstDiscount(&I, ScalarCosts, VF) >= 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); // Remember that BB will remain after vectorization. @@ -6548,9 +6842,8 @@ int LoopVectorizationCostModel::computePredInstDiscount( // the instruction as if it wasn't if-converted and instead remained in the // predicated block. We will scale this cost by block probability after // computing the scalarization overhead. 
- assert(!VF.isScalable() && "scalable vectors not yet supported."); InstructionCost ScalarCost = - VF.getKnownMinValue() * + VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1)).first; // Compute the scalarization overhead of needed insertelement instructions @@ -6558,10 +6851,9 @@ int LoopVectorizationCostModel::computePredInstDiscount( if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(I->getType(), VF)), - APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); - assert(!VF.isScalable() && "scalable vectors not yet supported."); + APInt::getAllOnesValue(VF.getFixedValue()), true, false); ScalarCost += - VF.getKnownMinValue() * + VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); } @@ -6576,10 +6868,9 @@ int LoopVectorizationCostModel::computePredInstDiscount( if (canBeScalarized(J)) Worklist.push_back(J); else if (needsExtract(J, VF)) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(J->getType(), VF)), - APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); + APInt::getAllOnesValue(VF.getFixedValue()), false, true); } } @@ -6596,7 +6887,8 @@ int LoopVectorizationCostModel::computePredInstDiscount( } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::expectedCost(ElementCount VF) { +LoopVectorizationCostModel::expectedCost( + ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { VectorizationCostTy Cost; // For each block. @@ -6613,9 +6905,14 @@ LoopVectorizationCostModel::expectedCost(ElementCount VF) { VectorizationCostTy C = getInstructionCost(&I, VF); // Check if we should override the cost. - if (ForceTargetInstructionCost.getNumOccurrences() > 0) + if (C.first.isValid() && + ForceTargetInstructionCost.getNumOccurrences() > 0) C.first = InstructionCost(ForceTargetInstructionCost); + // Keep a list of instructions with invalid costs. + if (Invalid && !C.first.isValid()) + Invalid->emplace_back(&I, VF); + BlockCost.first += C.first; BlockCost.second |= C.second; LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first @@ -6680,8 +6977,10 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, ElementCount VF) { assert(VF.isVector() && "Scalarization cost of instruction implies vectorization."); - assert(!VF.isScalable() && "scalable vectors not yet supported."); - Type *ValTy = getMemInstValueType(I); + if (VF.isScalable()) + return InstructionCost::getInvalid(); + + Type *ValTy = getLoadStoreType(I); auto SE = PSE.getSE(); unsigned AS = getLoadStoreAddressSpace(I); @@ -6707,12 +7006,20 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // we might create due to scalarization. Cost += getScalarizationOverhead(I, VF); - // If we have a predicated store, it may not be executed for each vector - // lane. Scale the cost by the probability of executing the predicated - // block. + // If we have a predicated load/store, it will need extra i1 extracts and + // conditional branches, but may not be executed for each vector lane. Scale + // the cost by the probability of executing the predicated block. 
if (isPredicatedInst(I)) { Cost /= getReciprocalPredBlockProb(); + // Add the cost of an i1 extract and a branch + auto *Vec_i1Ty = + VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); + Cost += TTI.getScalarizationOverhead( + Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), + /*Insert=*/false, /*Extract=*/true); + Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); + if (useEmulatedMaskMemRefHack(I)) // Artificially setting to a high enough value to practically disable // vectorization with such operations. @@ -6725,7 +7032,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, InstructionCost LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, ElementCount VF) { - Type *ValTy = getMemInstValueType(I); + Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getLoadStoreAddressSpace(I); @@ -6745,7 +7052,8 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, bool Reverse = ConsecutiveStride < 0; if (Reverse) - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + Cost += + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); return Cost; } @@ -6754,7 +7062,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, ElementCount VF) { assert(Legal->isUniformMemOp(*I)); - Type *ValTy = getMemInstValueType(I); + Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); @@ -6780,7 +7088,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, InstructionCost LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, ElementCount VF) { - Type *ValTy = getMemInstValueType(I); + Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); const Value *Ptr = getLoadStorePointerOperand(I); @@ -6794,7 +7102,12 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, InstructionCost LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, ElementCount VF) { - Type *ValTy = getMemInstValueType(I); + // TODO: Once we have support for interleaving with scalable vectors + // we can calculate the cost properly here. + if (VF.isScalable()) + return InstructionCost::getInvalid(); + + Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); @@ -6802,7 +7115,6 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); - assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); // Holds the indices of existing members in an interleaved load group. @@ -6825,17 +7137,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, // TODO: Add support for reversed masked interleaved access. 
assert(!Legal->isMaskRequired(I) && "Reverse masked interleaved access not supported."); - Cost += Group->getNumMembers() * - TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + Cost += + Group->getNumMembers() * + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); } return Cost; } -InstructionCost LoopVectorizationCostModel::getReductionPatternCost( +Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { + using namespace llvm::PatternMatch; // Early exit for no inloop reductions if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) - return InstructionCost::getInvalid(); + return None; auto *VectorTy = cast<VectorType>(Ty); // We are looking for a pattern of, and finding the minimal acceptable cost: @@ -6851,23 +7165,22 @@ InstructionCost LoopVectorizationCostModel::getReductionPatternCost( // it is not we return an invalid cost specifying the orignal cost method // should be used. Instruction *RetI = I; - if ((RetI->getOpcode() == Instruction::SExt || - RetI->getOpcode() == Instruction::ZExt)) { + if (match(RetI, m_ZExtOrSExt(m_Value()))) { if (!RetI->hasOneUser()) - return InstructionCost::getInvalid(); + return None; RetI = RetI->user_back(); } - if (RetI->getOpcode() == Instruction::Mul && + if (match(RetI, m_Mul(m_Value(), m_Value())) && RetI->user_back()->getOpcode() == Instruction::Add) { if (!RetI->hasOneUser()) - return InstructionCost::getInvalid(); + return None; RetI = RetI->user_back(); } // Test if the found instruction is a reduction, and if not return an invalid // cost specifying the parent to use the original cost modelling. if (!InLoopReductionImmediateChains.count(RetI)) - return InstructionCost::getInvalid(); + return None; // Find the reduction this chain is a part of and calculate the basic cost of // the reduction on its own. @@ -6876,10 +7189,17 @@ InstructionCost LoopVectorizationCostModel::getReductionPatternCost( while (!isa<PHINode>(ReductionPhi)) ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; - RecurrenceDescriptor RdxDesc = + const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; - unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), - VectorTy, false, CostKind); + + InstructionCost BaseCost = TTI.getArithmeticReductionCost( + RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); + + // If we're using ordered reductions then we can just return the base cost + // here, since getArithmeticReductionCost calculates the full ordered + // reduction cost when FP reassociation is not allowed. + if (useOrderedReductions(RdxDesc)) + return BaseCost; // Get the operand that was not the reduction chain and match it to one of the // patterns, returning the better cost if it is found. 
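A standalone C++ sketch (hypothetical cost values and a made-up helper name, independent of the LLVM APIs above) of the comparison the next hunk performs for a matched reduce(mul(ext(A), ext(B))) chain, the shape produced by a widening dot product:

    #include <algorithm>

    // Source pattern being matched, roughly:
    //   int32_t sum = 0;
    //   for (int i = 0; i < n; ++i)
    //     sum += (int32_t)a[i] * (int32_t)b[i];   // a, b: int8_t*
    //
    // Either cost the chain piecewise (two extends + a multiply + a plain add
    // reduction) or as one extending multiply-accumulate reduction, and keep
    // whichever is cheaper -- mirroring the
    // "RedCost < ExtCost * 2 + MulCost + BaseCost" check below.
    unsigned pickMulAccReductionCost(unsigned BaseCost, unsigned ExtCost,
                                     unsigned MulCost, unsigned FusedCost) {
      unsigned Piecewise = 2 * ExtCost + MulCost + BaseCost;
      return std::min(Piecewise, FusedCost);
    }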
@@ -6889,56 +7209,57 @@ InstructionCost LoopVectorizationCostModel::getReductionPatternCost( VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); - if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && + Instruction *Op0, *Op1; + if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && !TheLoop->isLoopInvariant(RedOp)) { + // Matched reduce(ext(A)) bool IsUnsigned = isa<ZExtInst>(RedOp); auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); InstructionCost RedCost = TTI.getExtendedAddReductionCost( /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); - unsigned ExtCost = + InstructionCost ExtCost = TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, TTI::CastContextHint::None, CostKind, RedOp); if (RedCost.isValid() && RedCost < BaseCost + ExtCost) - return I == RetI ? *RedCost.getValue() : 0; - } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { - Instruction *Mul = RedOp; - Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); - Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); - if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && + return I == RetI ? RedCost : 0; + } else if (RedOp && + match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { + if (match(Op0, m_ZExtOrSExt(m_Value())) && Op0->getOpcode() == Op1->getOpcode() && Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { bool IsUnsigned = isa<ZExtInst>(Op0); auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); - // reduce(mul(ext, ext)) - unsigned ExtCost = + // Matched reduce(mul(ext, ext)) + InstructionCost ExtCost = TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, TTI::CastContextHint::None, CostKind, Op0); - unsigned MulCost = - TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); + InstructionCost MulCost = + TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); InstructionCost RedCost = TTI.getExtendedAddReductionCost( /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) - return I == RetI ? *RedCost.getValue() : 0; + return I == RetI ? RedCost : 0; } else { - unsigned MulCost = - TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); + // Matched reduce(mul()) + InstructionCost MulCost = + TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); InstructionCost RedCost = TTI.getExtendedAddReductionCost( /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); if (RedCost.isValid() && RedCost < MulCost + BaseCost) - return I == RetI ? *RedCost.getValue() : 0; + return I == RetI ? RedCost : 0; } } - return I == RetI ? BaseCost : InstructionCost::getInvalid(); + return I == RetI ? Optional<InstructionCost>(BaseCost) : None; } InstructionCost @@ -6947,7 +7268,7 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, // Calculate scalar cost only. Vectorization cost should be ready at this // moment. 
if (VF.isScalar()) { - Type *ValTy = getMemInstValueType(I); + Type *ValTy = getLoadStoreType(I); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); @@ -6991,10 +7312,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, - ElementCount VF) { + ElementCount VF) const { + + // There is no mechanism yet to create a scalable scalarization loop, + // so this is currently Invalid. + if (VF.isScalable()) + return InstructionCost::getInvalid(); - assert(!VF.isScalable() && - "cannot compute scalarization overhead for scalable vectorization"); if (VF.isScalar()) return 0; @@ -7020,8 +7344,11 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, // Skip operands that do not require extraction/scalarization and do not incur // any overhead. + SmallVector<Type *> Tys; + for (auto *V : filterExtractingOperands(Ops, VF)) + Tys.push_back(MaybeVectorizeType(V->getType(), VF)); return Cost + TTI.getOperandsScalarizationOverhead( - filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); + filterExtractingOperands(Ops, VF), Tys); } void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { @@ -7047,8 +7374,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { // relying on instcombine to remove them. // Load: Scalar load + broadcast // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract - InstructionCost Cost = getUniformMemOpCost(&I, VF); - setWideningDecision(&I, VF, CM_Scalarize, Cost); + InstructionCost Cost; + if (isa<StoreInst>(&I) && VF.isScalable() && + isLegalGatherOrScatter(&I)) { + Cost = getGatherScatterCost(&I, VF); + setWideningDecision(&I, VF, CM_GatherScatter, Cost); + } else { + assert((isa<LoadInst>(&I) || !VF.isScalable()) && + "Cannot yet scalarize uniform stores"); + Cost = getUniformMemOpCost(&I, VF); + setWideningDecision(&I, VF, CM_Scalarize, Cost); + } continue; } @@ -7066,7 +7402,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { } // Choose between Interleaving, Gather/Scatter or Scalarization. - InstructionCost InterleaveCost = std::numeric_limits<int>::max(); + InstructionCost InterleaveCost = InstructionCost::getInvalid(); unsigned NumAccesses = 1; if (isAccessInterleaved(&I)) { auto Group = getInterleavedAccessGroup(&I); @@ -7084,7 +7420,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { InstructionCost GatherScatterCost = isLegalGatherOrScatter(&I) ? getGatherScatterCost(&I, VF) * NumAccesses - : std::numeric_limits<int>::max(); + : InstructionCost::getInvalid(); InstructionCost ScalarizationCost = getMemInstScalarizationCost(&I, VF) * NumAccesses; @@ -7181,10 +7517,40 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); - VectorTy = isScalarAfterVectorization(I, VF) ? 
RetTy : ToVectorTy(RetTy, VF); auto SE = PSE.getSE(); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + auto hasSingleCopyAfterVectorization = [this](Instruction *I, + ElementCount VF) -> bool { + if (VF.isScalar()) + return true; + + auto Scalarized = InstsToScalarize.find(VF); + assert(Scalarized != InstsToScalarize.end() && + "VF not yet analyzed for scalarization profitability"); + return !Scalarized->second.count(I) && + llvm::all_of(I->users(), [&](User *U) { + auto *UI = cast<Instruction>(U); + return !Scalarized->second.count(UI); + }); + }; + (void) hasSingleCopyAfterVectorization; + + if (isScalarAfterVectorization(I, VF)) { + // With the exception of GEPs and PHIs, after scalarization there should + // only be one copy of the instruction generated in the loop. This is + // because the VF is either 1, or any instructions that need scalarizing + // have already been dealt with by the the time we get here. As a result, + // it means we don't have to multiply the instruction cost by VF. + assert(I->getOpcode() == Instruction::GetElementPtr || + I->getOpcode() == Instruction::PHI || + (I->getOpcode() == Instruction::BitCast && + I->getType()->isPointerTy()) || + hasSingleCopyAfterVectorization(I, VF)); + VectorTy = RetTy; + } else + VectorTy = ToVectorTy(RetTy, VF); + // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { case Instruction::GetElementPtr: @@ -7205,15 +7571,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, ScalarPredicatedBB = true; if (ScalarPredicatedBB) { + // Not possible to scalarize scalable vector with predicated instructions. + if (VF.isScalable()) + return InstructionCost::getInvalid(); // Return cost for branches around scalarized and predicated blocks. - assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); - return (TTI.getScalarizationOverhead( - Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), - false, true) + - (TTI.getCFInstrCost(Instruction::Br, CostKind) * - VF.getKnownMinValue())); + return ( + TTI.getScalarizationOverhead( + Vec_i1Ty, APInt::getAllOnesValue(VF.getFixedValue()), false, + true) + + (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) // The back-edge branch will remain, as will all scalar branches. return TTI.getCFInstrCost(Instruction::Br, CostKind); @@ -7232,7 +7600,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) return TTI.getShuffleCost( TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), - VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); + None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi @@ -7297,10 +7665,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, return 0; // Detect reduction patterns - InstructionCost RedCost; - if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) - .isValid()) - return RedCost; + if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + return *RedCost; // Certain instructions can be cheaper to vectorize if they have a constant // second vector operand. One example of this are shifts on x86. 
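A short standalone illustration of the remark above (plain C++ with made-up function names, not part of the patch): with a uniform constant shift amount the backend can select an immediate-form vector shift, while a per-lane variable amount generally needs a costlier variable shift or scalarization, which is why operand-kind information is forwarded to TTI in the hunk below.

    #include <cstddef>
    #include <cstdint>

    // Second operand is a uniform constant: cheap immediate-form vector shift
    // (e.g. pslld on x86).
    void shift_by_constant(const uint32_t *a, uint32_t *b, size_t n) {
      for (size_t i = 0; i < n; ++i)
        b[i] = a[i] << 3;
    }

    // Second operand varies per lane: typically costlier on targets without a
    // native variable vector shift.
    void shift_by_lane(const uint32_t *a, const uint32_t *s, uint32_t *b,
                       size_t n) {
      for (size_t i = 0; i < n; ++i)
        b[i] = a[i] << (s[i] & 31u);
    }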
@@ -7312,26 +7678,40 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Op2VK = TargetTransformInfo::OK_UniformValue; SmallVector<const Value *, 4> Operands(I->operand_values()); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; - return N * TTI.getArithmeticInstrCost( - I->getOpcode(), VectorTy, CostKind, - TargetTransformInfo::OK_AnyValue, - Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); + return TTI.getArithmeticInstrCost( + I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, + Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; - return N * TTI.getArithmeticInstrCost( - I->getOpcode(), VectorTy, CostKind, - TargetTransformInfo::OK_AnyValue, - TargetTransformInfo::OK_AnyValue, - TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, - I->getOperand(0), I); + return TTI.getArithmeticInstrCost( + I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, + TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None, I->getOperand(0), I); } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); + + const Value *Op0, *Op1; + using namespace llvm::PatternMatch; + if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || + match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { + // select x, y, false --> x & y + // select x, true, y --> x | y + TTI::OperandValueProperties Op1VP = TTI::OP_None; + TTI::OperandValueProperties Op2VP = TTI::OP_None; + TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); + TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); + assert(Op0->getType()->getScalarSizeInBits() == 1 && + Op1->getType()->getScalarSizeInBits() == 1); + + SmallVector<const Value *, 2> Operands{Op0, Op1}; + return TTI.getArithmeticInstrCost( + match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy, + CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); + } + Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) CondTy = VectorType::get(CondTy, VF); @@ -7358,9 +7738,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, if (Decision == CM_Scalarize) Width = ElementCount::getFixed(1); } - VectorTy = ToVectorTy(getMemInstValueType(I), Width); + VectorTy = ToVectorTy(getLoadStoreType(I), Width); return getMemoryInstructionCost(I, VF); } + case Instruction::BitCast: + if (I->getType()->isPointerTy()) + return 0; + LLVM_FALLTHROUGH; case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -7371,8 +7755,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, case Instruction::SIToFP: case Instruction::UIToFP: case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { + case Instruction::FPTrunc: { // Computes the CastContextHint from a Load/Store instruction. 
auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && @@ -7424,10 +7807,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, } // Detect reduction patterns - InstructionCost RedCost; - if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) - .isValid()) - return RedCost; + if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + return *RedCost; Type *SrcScalarTy = I->getOperand(0)->getType(); Type *SrcVecTy = @@ -7450,10 +7831,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, } } - assert(!VF.isScalable() && "VF is assumed to be non scalable"); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; - return N * - TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); + return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } case Instruction::Call: { bool NeedToScalarize; @@ -7467,12 +7845,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, } case Instruction::ExtractValue: return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); + case Instruction::Alloca: + // We cannot easily widen alloca to a scalable alloca, as + // the result would need to be a vector of pointers. + if (VF.isScalable()) + return InstructionCost::getInvalid(); + LLVM_FALLTHROUGH; default: - // The cost of executing VF copies of the scalar instruction. This opcode - // is unknown. Assume that it is the same as 'mul'. - return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( - Instruction::Mul, VectorTy, CostKind) + - getScalarizationOverhead(I, VF); + // This opcode is unknown. Assume that it is the same as 'mul'. + return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); } // end of switch. } @@ -7548,7 +7929,7 @@ void LoopVectorizationCostModel::collectInLoopReductions() { // If the target would prefer this reduction to happen "in-loop", then we // want to record it as such. unsigned Opcode = RdxDesc.getOpcode(); - if (!PreferInLoopReductions && + if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && !TTI.preferInLoopReduction(Opcode, Phi->getType(), TargetTransformInfo::ReductionFlags())) continue; @@ -7597,8 +7978,10 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { // If the user doesn't provide a vectorization factor, determine a // reasonable one. if (UserVF.isZero()) { - VF = ElementCount::getFixed( - determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); + VF = ElementCount::getFixed(determineVPlanVF( + TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) + .getFixedSize(), + CM)); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. @@ -7631,8 +8014,8 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { Optional<VectorizationFactor> LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); - Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); - if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. + FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); + if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. return None; // Invalidate interleave groups if all blocks of loop will be predicated. 
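A standalone illustration of the in-order ("strict") FP reductions referred to in several hunks above (plain C++, independent of the patch): floating-point addition is not associative, so a tree-wise vector reduction that reorders the adds can change the result unless reassociation is allowed, which is why such reductions are kept in-loop and costed as ordered.

    #include <cstdio>

    int main() {
      float a = 1e16f, b = -1e16f, c = 1.0f;
      float in_order = (a + b) + c; // large terms cancel first -> 1.0f
      float reassoc  = a + (b + c); // c is absorbed into -1e16f -> 0.0f
      std::printf("in-order: %g  reassociated: %g\n", in_order, reassoc);
      return 0;
    }

A tree-wise vector reduction evaluates the sum in a reassociated order like the second expression, so it is only legal when the recurrence carries the appropriate fast-math flags.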
@@ -7649,34 +8032,35 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { CM.invalidateCostModelingDecisions(); } - ElementCount MaxVF = MaybeMaxVF.getValue(); - assert(MaxVF.isNonZero() && "MaxVF is zero."); - - bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); - if (!UserVF.isZero() && - (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { - // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable - // VFs here, this should be reverted to only use legal UserVFs once the - // loop below supports scalable VFs. - ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; - LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") - << " VF " << VF << ".\n"); - assert(isPowerOf2_32(VF.getKnownMinValue()) && + ElementCount MaxUserVF = + UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; + bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); + if (!UserVF.isZero() && UserVFIsLegal) { + assert(isPowerOf2_32(UserVF.getKnownMinValue()) && "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - CM.selectUserVectorizationFactor(VF); - CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(VF, VF); - LLVM_DEBUG(printPlans(dbgs())); - return {{VF, 0}}; + if (CM.selectUserVectorizationFactor(UserVF)) { + LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); + CM.collectInLoopReductions(); + buildVPlansWithVPRecipes(UserVF, UserVF); + LLVM_DEBUG(printPlans(dbgs())); + return {{UserVF, 0}}; + } else + reportVectorizationInfo("UserVF ignored because of invalid costs.", + "InvalidCost", ORE, OrigLoop); } - assert(!MaxVF.isScalable() && - "Scalable vectors not yet supported beyond this point"); + // Populate the set of Vectorization Factor Candidates. + ElementCountSet VFCandidates; + for (auto VF = ElementCount::getFixed(1); + ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) + VFCandidates.insert(VF); + for (auto VF = ElementCount::getScalable(1); + ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) + VFCandidates.insert(VF); - for (ElementCount VF = ElementCount::getFixed(1); - ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { + for (const auto &VF : VFCandidates) { // Collect Uniform and Scalar instructions after vectorization with VF. CM.collectUniformsAndScalars(VF); @@ -7687,14 +8071,38 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { } CM.collectInLoopReductions(); + buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); + buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); LLVM_DEBUG(printPlans(dbgs())); - if (MaxVF.isScalar()) + if (!MaxFactors.hasVector()) return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. - return CM.selectVectorizationFactor(MaxVF); + auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); + + // Check if it is profitable to vectorize with runtime checks. 
+ unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); + if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { + bool PragmaThresholdReached = + NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; + bool ThresholdReached = + NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; + if ((ThresholdReached && !Hints.allowReordering()) || + PragmaThresholdReached) { + ORE->emit([&]() { + return OptimizationRemarkAnalysisAliasing( + DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), + OrigLoop->getHeader()) + << "loop not vectorized: cannot prove it is safe to reorder " + "memory operations"; + }); + LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); + Hints.emitRemarkWithHints(); + return VectorizationFactor::Disabled(); + } + } + return SelectedVF; } void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { @@ -7714,19 +8122,11 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, // Perform the actual loop transformation. // 1. Create a new empty loop. Unlink the old loop and connect the new one. - VPCallbackILV CallbackILV(ILV); - assert(BestVF.hasValue() && "Vectorization Factor is missing"); + assert(VPlans.size() == 1 && "Not a single VPlan to execute."); - VPTransformState State{*BestVF, - BestUF, - OrigLoop, - LI, - DT, - ILV.Builder, - ILV.VectorLoopValueMap, - &ILV, - CallbackILV}; + VPTransformState State{ + *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; @@ -7742,16 +8142,25 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. - assert(VPlans.size() == 1 && "Not a single VPlan to execute."); VPlans.front()->execute(&State); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. - ILV.fixVectorizedLoop(); + ILV.fixVectorizedLoop(State); ILV.printDebugTracesAtEnd(); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void LoopVectorizationPlanner::printPlans(raw_ostream &O) { + for (const auto &Plan : VPlans) + if (PrintVPlansInDotFormat) + Plan->printDOT(O); + else + Plan->print(O); +} +#endif + void LoopVectorizationPlanner::collectTriviallyDeadInstructions( SmallPtrSetImpl<Instruction *> &DeadInstructions) { @@ -7822,9 +8231,9 @@ Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, if (Ty->isFloatingPointTy()) { Constant *C = ConstantFP::get(Ty, (double)StartIdx); - // Floating point operations had to be 'fast' to enable the unrolling. - Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); - return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); + // Floating-point operations inherit FMF via the builder's flags. + Value *MulOp = Builder.CreateFMul(C, Step); + return Builder.CreateBinOp(BinOp, Val, MulOp); } Constant *C = ConstantInt::get(Ty, StartIdx); return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); @@ -7882,22 +8291,12 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { // Generate the code to check any assumptions that we've made for SCEV // expressions. - BasicBlock *SavedPreHeader = LoopVectorPreHeader; - emitSCEVChecks(Lp, LoopScalarPreHeader); - - // If a safety check was generated save it. 
- if (SavedPreHeader != LoopVectorPreHeader) - EPI.SCEVSafetyCheck = SavedPreHeader; + EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); // Generate the code that checks at runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - SavedPreHeader = LoopVectorPreHeader; - emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - - // If a safety check was generated save/overwite it. - if (SavedPreHeader != LoopVectorPreHeader) - EPI.MemSafetyCheck = SavedPreHeader; + EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); // Generate the iteration count check for the main loop, *after* the check // for the epilogue loop, so that the path-length is shorter for the case @@ -7958,8 +8357,8 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( // Generate code to check if the loop's trip count is less than VF * UF of the // main vector loop. - auto P = - Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? + ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; Value *CheckMinIters = Builder.CreateICmp( P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), @@ -7979,7 +8378,11 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( // Update dominator for Bypass & LoopExit. DT->changeImmediateDominator(Bypass, TCCheckBlock); - DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); + if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) + // For loops with multiple exits, there's no edge from the middle block + // to exit blocks (as the epilogue must run) and thus no need to update + // the immediate dominator of the exit blocks. + DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); LoopBypassBlocks.push_back(TCCheckBlock); @@ -8043,7 +8446,12 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { DT->changeImmediateDominator(LoopScalarPreHeader, EPI.EpilogueIterationCountCheck); - DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); + if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) + // If there is an epilogue which must run, there's no edge from the + // middle block to exit blocks and thus no need to update the immediate + // dominator of the exit blocks. + DT->changeImmediateDominator(LoopExitBlock, + EPI.EpilogueIterationCountCheck); // Keep track of bypass blocks, as they feed start values to the induction // phis in the scalar loop preheader. @@ -8102,8 +8510,8 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( // Generate code to check if the loop's trip count is less than VF * UF of the // vector epilogue loop. - auto P = - Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? 
+ ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; Value *CheckMinIters = Builder.CreateICmp( P, Count, @@ -8122,9 +8530,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { LLVM_DEBUG({ dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" - << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() - << ", Main Loop UF:" << EPI.MainLoopUF - << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() + << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; }); } @@ -8196,8 +8602,15 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, if (BI->getSuccessor(0) != Dst) EdgeMask = Builder.createNot(EdgeMask); - if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. - EdgeMask = Builder.createAnd(EdgeMask, SrcMask); + if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. + // The condition is 'SrcMask && EdgeMask', which is equivalent to + // 'select i1 SrcMask, i1 EdgeMask, i1 false'. + // The select version does not introduce new UB if SrcMask is false and + // EdgeMask is poison. Using 'and' here introduces undefined behavior. + VPValue *False = Plan->getOrAddVPValue( + ConstantInt::getFalse(BI->getCondition()->getType())); + EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); + } return EdgeMaskCache[Edge] = EdgeMask; } @@ -8232,7 +8645,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { else { auto IVRecipe = new VPWidenCanonicalIVRecipe(); Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); - IV = IVRecipe->getVPValue(); + IV = IVRecipe->getVPSingleValue(); } VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); bool TailFolded = !CM.isScalarEpilogueAllowed(); @@ -8266,7 +8679,9 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { return BlockMaskCache[BB] = BlockMask; } -VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, +VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, + ArrayRef<VPValue *> Operands, + VFRange &Range, VPlanPtr &Plan) { assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Must be called with either a load or store"); @@ -8293,32 +8708,35 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, if (Legal->isMaskRequired(I)) Mask = createBlockInMask(I->getParent(), Plan); - VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); if (LoadInst *Load = dyn_cast<LoadInst>(I)) - return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); + return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask); StoreInst *Store = cast<StoreInst>(I); - VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); - return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); + return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], + Mask); } VPWidenIntOrFpInductionRecipe * -VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { +VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, + ArrayRef<VPValue *> Operands) const { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. 
InductionDescriptor II = Legal->getInductionVars().lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction || II.getKind() == InductionDescriptor::IK_FpInduction) { - VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); - return new VPWidenIntOrFpInductionRecipe(Phi, Start); + assert(II.getStartValue() == + Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); + const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); + return new VPWidenIntOrFpInductionRecipe( + Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); } return nullptr; } -VPWidenIntOrFpInductionRecipe * -VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, - VPlan &Plan) const { +VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( + TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, + VPlan &Plan) const { // Optimize the special case where the source is a constant integer // induction variable. Notice that we can only optimize the 'trunc' case // because (a) FP conversions lose precision, (b) sext/zext may wrap, and @@ -8340,39 +8758,49 @@ VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), - Start, I); + Start, nullptr, I); } return nullptr; } -VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { +VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, + ArrayRef<VPValue *> Operands, + VPlanPtr &Plan) { + // If all incoming values are equal, the incoming VPValue can be used directly + // instead of creating a new VPBlendRecipe. + VPValue *FirstIncoming = Operands[0]; + if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { + return FirstIncoming == Inc; + })) { + return Operands[0]; + } + // We know that all PHIs in non-header blocks are converted into selects, so // we don't have to worry about the insertion order and we can just use the // builder. At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. 
- - SmallVector<VPValue *, 2> Operands; + SmallVector<VPValue *, 2> OperandsWithMask; unsigned NumIncoming = Phi->getNumIncomingValues(); + for (unsigned In = 0; In < NumIncoming; In++) { VPValue *EdgeMask = createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); assert((EdgeMask || NumIncoming == 1) && "Multiple predecessors with one having a full mask"); - Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); + OperandsWithMask.push_back(Operands[In]); if (EdgeMask) - Operands.push_back(EdgeMask); + OperandsWithMask.push_back(EdgeMask); } - return new VPBlendRecipe(Phi, Operands); + return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); } -VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, - VPlan &Plan) const { +VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, + ArrayRef<VPValue *> Operands, + VFRange &Range) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](ElementCount VF) { - return CM.isScalarWithPredication(CI, VF); - }, + [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, Range); if (IsPredicated) @@ -8395,15 +8823,14 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; - assert(IntrinsicCost.isValid() && CallCost.isValid() && - "Cannot have invalid costs while widening"); return UseVectorIntrinsic || !NeedToScalarize; }; if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) return nullptr; - return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); + ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands()); + return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); } bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { @@ -8413,14 +8840,14 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { // scalarization is profitable or it is predicated. auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF) || - CM.isScalarWithPredication(I, VF); + CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); }; return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, Range); } -VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { +VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, + ArrayRef<VPValue *> Operands) const { auto IsVectorizableOpcode = [](unsigned Opcode) { switch (Opcode) { case Instruction::Add: @@ -8466,20 +8893,28 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { return nullptr; // Success: widen this instruction. 
- return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); + return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); +} + +void VPRecipeBuilder::fixHeaderPhis() { + BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); + for (VPWidenPHIRecipe *R : PhisToFix) { + auto *PN = cast<PHINode>(R->getUnderlyingValue()); + VPRecipeBase *IncR = + getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); + R->addOperand(IncR->getVPSingleValue()); + } } VPBasicBlock *VPRecipeBuilder::handleReplication( Instruction *I, VFRange &Range, VPBasicBlock *VPBB, - DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, VPlanPtr &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, - Range); + [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), IsUniform, IsPredicated); @@ -8489,10 +8924,16 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( // Find if I uses a predicated instruction. If so, it will use its scalar // value. Avoid hoisting the insert-element which packs the scalar value into // a vector value, as that happens iff all users use the vector value. - for (auto &Op : I->operands()) - if (auto *PredInst = dyn_cast<Instruction>(Op)) - if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) - PredInst2Recipe[PredInst]->setAlsoPack(false); + for (VPValue *Op : Recipe->operands()) { + auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); + if (!PredR) + continue; + auto *RepR = + cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); + assert(RepR->isPredicated() && + "expected Replicate recipe to be predicated"); + RepR->setAlsoPack(false); + } // Finalize the recipe for Instr, first if it is not predicated. if (!IsPredicated) { @@ -8504,7 +8945,6 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( assert(VPBB->getSuccessors().empty() && "VPBB has successors when handling predicated replication."); // Record predicated instructions for above packing optimizations. - PredInst2Recipe[I] = Recipe; VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); VPBlockUtils::insertBlockAfter(Region, VPBB); auto *RegSucc = new VPBasicBlock(); @@ -8529,6 +8969,10 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, auto *PHIRecipe = Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); + if (PHIRecipe) { + Plan->removeVPValueFor(Instr); + Plan->addVPValue(Instr, PHIRecipe); + } auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); @@ -8541,53 +8985,75 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, return Region; } -VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, - VFRange &Range, - VPlanPtr &Plan) { +VPRecipeOrVPValueTy +VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, + ArrayRef<VPValue *> Operands, + VFRange &Range, VPlanPtr &Plan) { // First, check for specific widening recipes that deal with calls, memory // operations, inductions and Phi nodes. 
if (auto *CI = dyn_cast<CallInst>(Instr)) - return tryToWidenCall(CI, Range, *Plan); + return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) - return tryToWidenMemory(Instr, Range, Plan); + return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); VPRecipeBase *Recipe; if (auto Phi = dyn_cast<PHINode>(Instr)) { if (Phi->getParent() != OrigLoop->getHeader()) - return tryToBlend(Phi, Plan); - if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan))) - return Recipe; - - if (Legal->isReductionVariable(Phi)) { - RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; - VPValue *StartV = - Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue()); - return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV); + return tryToBlend(Phi, Operands, Plan); + if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) + return toVPRecipeResult(Recipe); + + VPWidenPHIRecipe *PhiRecipe = nullptr; + if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { + VPValue *StartV = Operands[0]; + if (Legal->isReductionVariable(Phi)) { + RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + assert(RdxDesc.getRecurrenceStartValue() == + Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); + PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, + CM.isInLoopReduction(Phi), + CM.useOrderedReductions(RdxDesc)); + } else { + PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); + } + + // Record the incoming value from the backedge, so we can add the incoming + // value from the backedge after all recipes have been created. + recordRecipeOf(cast<Instruction>( + Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); + PhisToFix.push_back(PhiRecipe); + } else { + // TODO: record start and backedge value for remaining pointer induction + // phis. 
+ assert(Phi->getType()->isPointerTy() && + "only pointer phis should be handled here"); + PhiRecipe = new VPWidenPHIRecipe(Phi); } - return new VPWidenPHIRecipe(Phi); + return toVPRecipeResult(PhiRecipe); } - if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( - cast<TruncInst>(Instr), Range, *Plan))) - return Recipe; + if (isa<TruncInst>(Instr) && + (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, + Range, *Plan))) + return toVPRecipeResult(Recipe); if (!shouldWiden(Instr, Range)) return nullptr; if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) - return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), - OrigLoop); + return toVPRecipeResult(new VPWidenGEPRecipe( + GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); if (auto *SI = dyn_cast<SelectInst>(Instr)) { bool InvariantCond = PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); - return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), - InvariantCond); + return toVPRecipeResult(new VPWidenSelectRecipe( + *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); } - return tryToWiden(Instr, *Plan); + return toVPRecipeResult(tryToWiden(Instr, Operands)); } void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, @@ -8610,11 +9076,29 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, auto &ConditionalAssumes = Legal->getConditionalAssumes(); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); - DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); + MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); // Dead instructions do not need sinking. Remove them from SinkAfter. for (Instruction *I : DeadInstructions) SinkAfter.erase(I); + // Cannot sink instructions after dead instructions (there won't be any + // recipes for them). Instead, find the first non-dead previous instruction. + for (auto &P : Legal->getSinkAfter()) { + Instruction *SinkTarget = P.second; + Instruction *FirstInst = &*SinkTarget->getParent()->begin(); + (void)FirstInst; + while (DeadInstructions.contains(SinkTarget)) { + assert( + SinkTarget != FirstInst && + "Must find a live instruction (at least the one feeding the " + "first-order recurrence PHI) before reaching beginning of the block"); + SinkTarget = SinkTarget->getPrevNode(); + assert(SinkTarget != P.first && + "sink source equals target, no sinking required"); + } + P.second = SinkTarget; + } + auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; @@ -8626,12 +9110,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, - const DenseMap<Instruction *, Instruction *> &SinkAfter) { - - // Hold a mapping from predicated instructions to their recipes, in order to - // fix their AlsoPack behavior if a user is determined to replicate and use a - // scalar instead of vector value. 
- DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; + const MapVector<Instruction *, Instruction *> &SinkAfter) { SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; @@ -8715,8 +9194,29 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) continue; - if (auto Recipe = - RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { + SmallVector<VPValue *, 4> Operands; + auto *Phi = dyn_cast<PHINode>(Instr); + if (Phi && Phi->getParent() == OrigLoop->getHeader()) { + Operands.push_back(Plan->getOrAddVPValue( + Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); + } else { + auto OpRange = Plan->mapToVPValues(Instr->operands()); + Operands = {OpRange.begin(), OpRange.end()}; + } + if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( + Instr, Operands, Range, Plan)) { + // If Instr can be simplified to an existing VPValue, use it. + if (RecipeOrValue.is<VPValue *>()) { + auto *VPV = RecipeOrValue.get<VPValue *>(); + Plan->addVPValue(Instr, VPV); + // If the re-used value is a recipe, register the recipe for the + // instruction, in case the recipe for Instr needs to be recorded. + if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) + RecipeBuilder.setRecipe(Instr, R); + continue; + } + // Otherwise, add the new recipe. + VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); for (auto *Def : Recipe->definedValues()) { auto *UV = Def->getUnderlyingValue(); Plan->addVPValue(UV, Def); @@ -8729,8 +9229,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // Otherwise, if all widening options failed, Instruction is to be // replicated. This may create a successor for VPBB. - VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( - Instr, Range, VPBB, PredInst2Recipe, Plan); + VPBasicBlock *NextVPBB = + RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); if (NextVPBB != VPBB) { VPBB = NextVPBB; VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) @@ -8739,6 +9239,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } + RecipeBuilder.fixHeaderPhis(); + // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks // may also be empty, such as the last one VPBB, reflecting original // basic-blocks with no recipes. @@ -8754,22 +9256,89 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // --------------------------------------------------------------------------- // Apply Sink-After legal constraints. + auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { + auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); + if (Region && Region->isReplicator()) { + assert(Region->getNumSuccessors() == 1 && + Region->getNumPredecessors() == 1 && "Expected SESE region!"); + assert(R->getParent()->size() == 1 && + "A recipe in an original replicator region must be the only " + "recipe in its block"); + return Region; + } + return nullptr; + }; for (auto &Entry : SinkAfter) { VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); - // If the target is in a replication region, make sure to move Sink to the - // block after it, not into the replication region itself. 
- if (auto *Region = - dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { - if (Region->isReplicator()) { - assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); + + auto *TargetRegion = GetReplicateRegion(Target); + auto *SinkRegion = GetReplicateRegion(Sink); + if (!SinkRegion) { + // If the sink source is not a replicate region, sink the recipe directly. + if (TargetRegion) { + // The target is in a replication region, make sure to move Sink to + // the block after it, not into the replication region itself. VPBasicBlock *NextBlock = - cast<VPBasicBlock>(Region->getSuccessors().front()); + cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); - continue; - } + } else + Sink->moveAfter(Target); + continue; + } + + // The sink source is in a replicate region. Unhook the region from the CFG. + auto *SinkPred = SinkRegion->getSinglePredecessor(); + auto *SinkSucc = SinkRegion->getSingleSuccessor(); + VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); + VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); + VPBlockUtils::connectBlocks(SinkPred, SinkSucc); + + if (TargetRegion) { + // The target recipe is also in a replicate region, move the sink region + // after the target region. + auto *TargetSucc = TargetRegion->getSingleSuccessor(); + VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); + VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); + VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); + } else { + // The sink source is in a replicate region, we need to move the whole + // replicate region, which should only contain a single recipe in the + // main block. + auto *SplitBlock = + Target->getParent()->splitAt(std::next(Target->getIterator())); + + auto *SplitPred = SplitBlock->getSinglePredecessor(); + + VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); + VPBlockUtils::connectBlocks(SplitPred, SinkRegion); + VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); + if (VPBB == SplitPred) + VPBB = SplitBlock; } - Sink->moveAfter(Target); + } + + // Introduce a recipe to combine the incoming and previous values of a + // first-order recurrence. + for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { + auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); + if (!RecurPhi) + continue; + + auto *RecurSplice = cast<VPInstruction>( + Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, + {RecurPhi, RecurPhi->getBackedgeValue()})); + + VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); + if (auto *Region = GetReplicateRegion(PrevRecipe)) { + VPBasicBlock *Succ = cast<VPBasicBlock>(Region->getSingleSuccessor()); + RecurSplice->moveBefore(*Succ, Succ->getFirstNonPhi()); + } else + RecurSplice->moveAfter(PrevRecipe); + RecurPhi->replaceAllUsesWith(RecurSplice); + // Set the first operand of RecurSplice to RecurPhi again, after replacing + // all users. 
+ RecurSplice->setOperand(0, RecurPhi); } // Interleave memory: for each Interleave Group we marked earlier as relevant @@ -8780,8 +9349,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RecipeBuilder.getRecipe(IG->getInsertPos())); SmallVector<VPValue *, 4> StoredValues; for (unsigned i = 0; i < IG->getFactor(); ++i) - if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) - StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); + if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { + auto *StoreR = + cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); + StoredValues.push_back(StoreR->getStoredValue()); + } auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, Recipe->getMask()); @@ -8801,8 +9373,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } // Adjust the recipes for any inloop reductions. - if (Range.Start.isVector()) - adjustRecipesForInLoopReductions(Plan, RecipeBuilder); + adjustRecipesForInLoopReductions(Plan, RecipeBuilder, Range.Start); // Finally, if tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the end of the latch. @@ -8818,6 +9389,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } + VPlanTransforms::sinkScalarOperands(*Plan); + VPlanTransforms::mergeReplicateRegions(*Plan); + std::string PlanName; raw_string_ostream RSO(PlanName); ElementCount VF = Range.Start; @@ -8863,8 +9437,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { } SmallPtrSet<Instruction *, 1> DeadInstructions; - VPlanTransforms::VPInstructionsToVPRecipes( - OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); + VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, + Legal->getInductionVars(), + DeadInstructions, *PSE.getSE()); return Plan; } @@ -8873,12 +9448,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // reductions, with one operand being vector and the other being the scalar // reduction chain. void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( - VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { + VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { for (auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; + if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) + continue; + // ReductionOperations are orders top-down from the phi's use to the // LoopExitValue. We keep a track of the previous item (the Chain) to tell // which of the two operands will remain scalar and which will be reduced. @@ -8895,7 +9473,7 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( "Expected to replace a VPWidenSelectSC"); FirstOpId = 1; } else { - assert(isa<VPWidenRecipe>(WidenRecipe) && + assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) && "Expected to replace a VPWidenSC"); FirstOpId = 0; } @@ -8907,12 +9485,12 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( ? 
RecipeBuilder.createBlockInMask(R->getParent(), Plan) : nullptr; VPReductionRecipe *RedRecipe = new VPReductionRecipe( - &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); - WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); + &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); + WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); Plan->removeVPValueFor(R); Plan->addVPValue(R, RedRecipe); WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); - WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); + WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); WidenRecipe->eraseFromParent(); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { @@ -8929,19 +9507,10 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( } } -Value* LoopVectorizationPlanner::VPCallbackILV:: -getOrCreateVectorValues(Value *V, unsigned Part) { - return ILV.getOrCreateVectorValue(V, Part); -} - -Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( - Value *V, const VPIteration &Instance) { - return ILV.getOrCreateScalarValue(V, Instance); -} - +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; + O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); O << ", "; getAddr()->printAsOperand(O, SlotTracker); @@ -8952,8 +9521,9 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) - O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; + O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; } +#endif void VPWidenCallRecipe::execute(VPTransformState &State) { State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, @@ -8978,17 +9548,17 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), - Trunc); + getTruncInst(), getVPValue(0), + getCastValue(), State); } void VPWidenPHIRecipe::execute(VPTransformState &State) { - Value *StartV = - getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr; - State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF); + State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, + State); } void VPBlendRecipe::execute(VPTransformState &State) { - State.ILV->setDebugLocFromInst(State.Builder, Phi); + State.ILV->setDebugLocFromInst(Phi, &State.Builder); // We know that all PHIs in non-header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. 
@@ -9023,7 +9593,7 @@ void VPBlendRecipe::execute(VPTransformState &State) { } } for (unsigned Part = 0; Part < State.UF; ++Part) - State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); + State.set(this, Entry[Part], Part); } void VPInterleaveRecipe::execute(VPTransformState &State) { @@ -9034,53 +9604,66 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { void VPReductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Reduction being replicated."); + Value *PrevInChain = State.get(getChainOp(), 0); for (unsigned Part = 0; Part < State.UF; ++Part) { RecurKind Kind = RdxDesc->getRecurrenceKind(); + bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); Value *NewVecOp = State.get(getVecOp(), Part); if (VPValue *Cond = getCondOp()) { Value *NewCond = State.get(Cond, Part); VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( - Kind, VecTy->getElementType()); + Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); Constant *IdenVec = ConstantVector::getSplat(VecTy->getElementCount(), Iden); Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); NewVecOp = Select; } - Value *NewRed = - createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); - Value *PrevInChain = State.get(getChainOp(), Part); + Value *NewRed; Value *NextInChain; + if (IsOrdered) { + if (State.VF.isVector()) + NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, + PrevInChain); + else + NewRed = State.Builder.CreateBinOp( + (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), + PrevInChain, NewVecOp); + PrevInChain = NewRed; + } else { + PrevInChain = State.get(getChainOp(), Part); + NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); + } if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { NextInChain = createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), NewRed, PrevInChain); - } else { + } else if (IsOrdered) + NextInChain = NewRed; + else { NextInChain = State.Builder.CreateBinOp( (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, PrevInChain); } - State.set(this, getUnderlyingInstr(), NextInChain, Part); + State.set(this, NextInChain, Part); } } void VPReplicateRecipe::execute(VPTransformState &State) { if (State.Instance) { // Generate a single instance. assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); - State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, + State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, *State.Instance, IsPredicated, State); // Insert scalar instance packing it into a vector. if (AlsoPack && State.VF.isVector()) { // If we're constructing lane 0, initialize to start from poison. 
- if (State.Instance->Lane == 0) { + if (State.Instance->Lane.isFirstLane()) { assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); Value *Poison = PoisonValue::get( VectorType::get(getUnderlyingValue()->getType(), State.VF)); - State.ValueMap.setVectorValue(getUnderlyingInstr(), - State.Instance->Part, Poison); + State.set(this, Poison, State.Instance->Part); } - State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(), - *State.Instance); + State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); } return; } @@ -9093,15 +9676,16 @@ void VPReplicateRecipe::execute(VPTransformState &State) { "Can't scalarize a scalable vector"); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane}, - IsPredicated, State); + State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, + VPIteration(Part, Lane), IsPredicated, + State); } void VPBranchOnMaskRecipe::execute(VPTransformState &State) { assert(State.Instance && "Branch on Mask works only on single instance."); unsigned Part = State.Instance->Part; - unsigned Lane = State.Instance->Lane; + unsigned Lane = State.Instance->Lane.getKnownLane(); Value *ConditionBit = nullptr; VPValue *BlockInMask = getMask(); @@ -9130,6 +9714,8 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { BasicBlock *PredicatedBB = ScalarPredInst->getParent(); BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); assert(PredicatingBB && "Predicated block has no single predecessor."); + assert(isa<VPReplicateRecipe>(getOperand(0)) && + "operand must be VPReplicateRecipe"); // By current pack/unpack logic we need to generate only a single phi node: if // a vector value for the predicated instruction exists at this point it means @@ -9138,29 +9724,40 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { // also do that packing, thereby "hoisting" the insert-element sequence. // Otherwise, a phi node for the scalar value is needed. unsigned Part = State.Instance->Part; - Instruction *PredInst = - cast<Instruction>(getOperand(0)->getUnderlyingValue()); - if (State.ValueMap.hasVectorValue(PredInst, Part)) { - Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); + if (State.hasVectorValue(getOperand(0), Part)) { + Value *VectorValue = State.get(getOperand(0), Part); InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. - State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. + if (State.hasVectorValue(this, Part)) + State.reset(this, VPhi, Part); + else + State.set(this, VPhi, Part); + // NOTE: Currently we need to update the value of the operand, so the next + // predicated iteration inserts its generated value in the correct vector. 
+ State.reset(getOperand(0), VPhi, Part); } else { - Type *PredInstType = PredInst->getType(); + Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); - Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); + Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), + PredicatingBB); Phi->addIncoming(ScalarPredInst, PredicatedBB); - State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); + if (State.hasScalarValue(this, *State.Instance)) + State.reset(this, Phi, *State.Instance); + else + State.set(this, Phi, *State.Instance); + // NOTE: Currently we need to update the value of the operand, so the next + // predicated iteration inserts its generated value in the correct vector. + State.reset(getOperand(0), Phi, *State.Instance); } } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; - State.ILV->vectorizeMemoryInstruction(&Ingredient, State, - StoredValue ? nullptr : getVPValue(), - getAddr(), StoredValue, getMask()); + State.ILV->vectorizeMemoryInstruction( + &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), + StoredValue, getMask()); } // Determine how to lower the scalar epilogue, which depends on 1) optimising @@ -9213,10 +9810,71 @@ static ScalarEpilogueLowering getScalarEpilogueLowering( return CM_ScalarEpilogueAllowed; } -void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, - unsigned Part) { - set(Def, V, Part); - ILV->setVectorValue(IRDef, Part, V); +Value *VPTransformState::get(VPValue *Def, unsigned Part) { + // If Values have been set for this Def return the one relevant for \p Part. + if (hasVectorValue(Def, Part)) + return Data.PerPartOutput[Def][Part]; + + if (!hasScalarValue(Def, {Part, 0})) { + Value *IRV = Def->getLiveInIRValue(); + Value *B = ILV->getBroadcastInstrs(IRV); + set(Def, B, Part); + return B; + } + + Value *ScalarValue = get(Def, {Part, 0}); + // If we aren't vectorizing, we can just copy the scalar map values over + // to the vector map. + if (VF.isScalar()) { + set(Def, ScalarValue, Part); + return ScalarValue; + } + + auto *RepR = dyn_cast<VPReplicateRecipe>(Def); + bool IsUniform = RepR && RepR->isUniform(); + + unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; + // Check if there is a scalar value for the selected lane. + if (!hasScalarValue(Def, {Part, LastLane})) { + // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. + assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && + "unexpected recipe found to be invariant"); + IsUniform = true; + LastLane = 0; + } + + auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); + // Set the insert point after the last scalarized instruction or after the + // last PHI, if LastInst is a PHI. This ensures the insertelement sequence + // will directly follow the scalar definitions. + auto OldIP = Builder.saveIP(); + auto NewIP = + isa<PHINode>(LastInst) + ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) + : std::next(BasicBlock::iterator(LastInst)); + Builder.SetInsertPoint(&*NewIP); + + // However, if we are vectorizing, we need to construct the vector values. + // If the value is known to be uniform after vectorization, we can just + // broadcast the scalar value corresponding to lane zero for each unroll + // iteration. Otherwise, we construct the vector values using + // insertelement instructions. 
Since the resulting vectors are stored in + // State, we will only generate the insertelements once. + Value *VectorValue = nullptr; + if (IsUniform) { + VectorValue = ILV->getBroadcastInstrs(ScalarValue); + set(Def, VectorValue, Part); + } else { + // Initialize packing with insertelements to start from undef. + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); + set(Def, Undef, Part); + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) + ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); + VectorValue = get(Def, Part); + } + Builder.restoreIP(OldIP); + return VectorValue; } // Process the loop in the VPlan-native vectorization path. This path builds @@ -9228,7 +9886,8 @@ static bool processLoopInVPlanNativePath( LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { + ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, + LoopVectorizationRequirements &Requirements) { if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); @@ -9246,11 +9905,14 @@ static bool processLoopInVPlanNativePath( // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, + Requirements, ORE); // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); + CM.collectElementTypesForWidening(); + // Plan how to best vectorize, return the best VF and its cost. const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); @@ -9263,19 +9925,67 @@ static bool processLoopInVPlanNativePath( LVP.setBestPlan(VF.Width, 1); - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM, BFI, PSI); - LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" - << L->getHeader()->getParent()->getName() << "\"\n"); - LVP.executePlan(LB, DT); + { + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + F->getParent()->getDataLayout()); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, + &CM, BFI, PSI, Checks); + LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" + << L->getHeader()->getParent()->getName() << "\"\n"); + LVP.executePlan(LB, DT); + } // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(); - assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } +// Emit a remark if there are stores to floats that required a floating point +// extension. If the vectorized loop was generated with floating point there +// will be a performance penalty from the conversion overhead and the change in +// the vector width. +static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { + SmallVector<Instruction *, 4> Worklist; + for (BasicBlock *BB : L->getBlocks()) { + for (Instruction &Inst : *BB) { + if (auto *S = dyn_cast<StoreInst>(&Inst)) { + if (S->getValueOperand()->getType()->isFloatTy()) + Worklist.push_back(S); + } + } + } + + // Traverse the floating point stores upwards searching, for floating point + // conversions. 
+ SmallPtrSet<const Instruction *, 4> Visited; + SmallPtrSet<const Instruction *, 4> EmittedRemark; + while (!Worklist.empty()) { + auto *I = Worklist.pop_back_val(); + if (!L->contains(I)) + continue; + if (!Visited.insert(I).second) + continue; + + // Emit a remark if the floating point store required a floating + // point conversion. + // TODO: More work could be done to identify the root cause such as a + // constant or a function return type and point the user to it. + if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) + ORE->emit([&]() { + return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", + I->getDebugLoc(), L->getHeader()) + << "floating point conversion changes vector width. " + << "Mixed floating point precision requires an up/down " + << "cast that will negatively impact performance."; + }); + + for (Use &Op : I->operands()) + if (auto *OpI = dyn_cast<Instruction>(Op)) + Worklist.push_back(OpI); + } +} + LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || !EnableLoopInterleaving), @@ -9305,7 +10015,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { ? "enabled" : "?")) << " width=" << Hints.getWidth() - << " unroll=" << Hints.getInterleave() << "\n"); + << " interleave=" << Hints.getInterleave() << "\n"); // Function containing loop Function *F = L->getHeader()->getParent(); @@ -9326,7 +10036,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { PredicatedScalarEvolution PSE(*SE, *L); // Check if it is legal to vectorize the loop. - LoopVectorizationRequirements Requirements(*ORE); + LoopVectorizationRequirements Requirements; LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, &Requirements, &Hints, DB, AC, BFI, PSI); if (!LVL.canVectorize(EnableVPlanNativePath)) { @@ -9347,7 +10057,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // pipeline. if (!L->isInnermost()) return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, - ORE, BFI, PSI, Hints); + ORE, BFI, PSI, Hints, Requirements); assert(L->isInnermost() && "Inner loop expected."); @@ -9393,6 +10103,21 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } + if (!LVL.canVectorizeFPMath(EnableStrictReductions)) { + ORE->emit([&]() { + auto *ExactFPMathInst = Requirements.getExactFPInst(); + return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", + ExactFPMathInst->getDebugLoc(), + ExactFPMathInst->getParent()) + << "loop not vectorized: cannot prove it is safe to reorder " + "floating-point operations"; + }); + LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " + "reorder floating-point operations\n"); + Hints.emitRemarkWithHints(); + return false; + } + bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); @@ -9409,9 +10134,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); CM.collectValuesToIgnore(); + CM.collectElementTypesForWidening(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, + Requirements, ORE); // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); @@ -9426,19 +10153,12 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. 
- IC = CM.selectInterleaveCount(VF.Width, VF.Cost); + IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); } // Identify the diagnostic messages that should be produced. std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; - if (Requirements.doesNotMeet(F, L, Hints)) { - LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " - "requirements.\n"); - Hints.emitRemarkWithHints(); - return false; - } - if (VF.Width.isScalar()) { LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); VecDiagMsg = std::make_pair( @@ -9518,82 +10238,94 @@ bool LoopVectorizePass::processLoop(Loop *L) { LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } - LVP.setBestPlan(VF.Width, IC); - - using namespace ore; bool DisableRuntimeUnroll = false; MDNode *OrigLoopID = L->getLoopID(); - - if (!VectorizeLoop) { - assert(IC > 1 && "interleave count should not be 1 or 0"); - // If we decided that it is not legal to vectorize the loop, then - // interleave it. - InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, - BFI, PSI); - LVP.executePlan(Unroller, DT); - - ORE->emit([&]() { - return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), - L->getHeader()) - << "interleaved loop (interleaved count: " - << NV("InterleaveCount", IC) << ")"; - }); - } else { - // If we decided that it is *legal* to vectorize the loop, then do it. - - // Consider vectorizing the epilogue too if it's profitable. - VectorizationFactor EpilogueVF = - CM.selectEpilogueVectorizationFactor(VF.Width, LVP); - if (EpilogueVF.Width.isVector()) { - - // The first pass vectorizes the main loop and creates a scalar epilogue - // to be vectorized by executing the plan (potentially with a different - // factor) again shortly afterwards. - EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, - EpilogueVF.Width.getKnownMinValue(), 1); - EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, - &LVL, &CM, BFI, PSI); - - LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); - LVP.executePlan(MainILV, DT); - ++LoopsVectorized; - - simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); - formLCSSARecursively(*L, *DT, LI, SE); - - // Second pass vectorizes the epilogue and adjusts the control flow - // edges from the first pass. - LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); - EPI.MainLoopVF = EPI.EpilogueVF; - EPI.MainLoopUF = EPI.EpilogueUF; - EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, - ORE, EPI, &LVL, &CM, BFI, PSI); - LVP.executePlan(EpilogILV, DT); - ++LoopsEpilogueVectorized; - - if (!MainILV.areSafetyChecksAdded()) - DisableRuntimeUnroll = true; + { + // Optimistically generate runtime checks. Drop them if they turn out to not + // be profitable. Limit the scope of Checks, so the cleanup happens + // immediately after vector codegeneration is done. + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + F->getParent()->getDataLayout()); + if (!VF.Width.isScalar() || IC > 1) + Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); + LVP.setBestPlan(VF.Width, IC); + + using namespace ore; + if (!VectorizeLoop) { + assert(IC > 1 && "interleave count should not be 1 or 0"); + // If we decided that it is not legal to vectorize the loop, then + // interleave it. 
+ InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, + &CM, BFI, PSI, Checks); + LVP.executePlan(Unroller, DT); + + ORE->emit([&]() { + return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), + L->getHeader()) + << "interleaved loop (interleaved count: " + << NV("InterleaveCount", IC) << ")"; + }); } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM, BFI, PSI); - LVP.executePlan(LB, DT); - ++LoopsVectorized; - - // Add metadata to disable runtime unrolling a scalar loop when there are - // no runtime checks about strides and memory. A scalar loop that is - // rarely used is not worth unrolling. - if (!LB.areSafetyChecksAdded()) - DisableRuntimeUnroll = true; + // If we decided that it is *legal* to vectorize the loop, then do it. + + // Consider vectorizing the epilogue too if it's profitable. + VectorizationFactor EpilogueVF = + CM.selectEpilogueVectorizationFactor(VF.Width, LVP); + if (EpilogueVF.Width.isVector()) { + + // The first pass vectorizes the main loop and creates a scalar epilogue + // to be vectorized by executing the plan (potentially with a different + // factor) again shortly afterwards. + EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, + EpilogueVF.Width.getKnownMinValue(), + 1); + EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI, &LVL, &CM, BFI, PSI, Checks); + + LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); + LVP.executePlan(MainILV, DT); + ++LoopsVectorized; + + simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); + formLCSSARecursively(*L, *DT, LI, SE); + + // Second pass vectorizes the epilogue and adjusts the control flow + // edges from the first pass. + LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); + EPI.MainLoopVF = EPI.EpilogueVF; + EPI.MainLoopUF = EPI.EpilogueUF; + EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, + ORE, EPI, &LVL, &CM, BFI, PSI, + Checks); + LVP.executePlan(EpilogILV, DT); + ++LoopsEpilogueVectorized; + + if (!MainILV.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } else { + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, + &LVL, &CM, BFI, PSI, Checks); + LVP.executePlan(LB, DT); + ++LoopsVectorized; + + // Add metadata to disable runtime unrolling a scalar loop when there + // are no runtime checks about strides and memory. A scalar loop that is + // rarely used is not worth unrolling. + if (!LB.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } + // Report the vectorization decision. + ORE->emit([&]() { + return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), + L->getHeader()) + << "vectorized loop (vectorization width: " + << NV("VectorizationFactor", VF.Width) + << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; + }); } - // Report the vectorization decision. - ORE->emit([&]() { - return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), - L->getHeader()) - << "vectorized loop (vectorization width: " - << NV("VectorizationFactor", VF.Width) - << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; - }); + if (ORE->allowExtraAnalysis(LV_NAME)) + checkMixedPrecision(L, ORE); } Optional<MDNode *> RemainderLoopID = @@ -9719,8 +10451,6 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, PA.preserve<LoopAnalysis>(); PA.preserve<DominatorTreeAnalysis>(); } - PA.preserve<BasicAA>(); - PA.preserve<GlobalsAA>(); if (!Result.MadeCFGChange) PA.preserveSet<CFGAnalyses>(); return PA; |