Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2149
1 file changed, 1039 insertions, 1110 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3290439ecd07..b637b2d5ddae 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -58,7 +58,6 @@ #include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanHCFGBuilder.h" -#include "VPlanPredicator.h" #include "VPlanTransforms.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" @@ -112,7 +111,6 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" @@ -144,10 +142,10 @@ #include <algorithm> #include <cassert> #include <cstdint> -#include <cstdlib> #include <functional> #include <iterator> #include <limits> +#include <map> #include <memory> #include <string> #include <tuple> @@ -346,13 +344,6 @@ cl::opt<bool> EnableVPlanNativePath( cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization.")); -// FIXME: Remove this switch once we have divergence analysis. Currently we -// assume divergent non-backedge branches when this switch is true. -cl::opt<bool> EnableVPlanPredication( - "enable-vplan-predication", cl::init(false), cl::Hidden, - cl::desc("Enable VPlan-native vectorization path predicator with " - "support for outer loop vectorization.")); - // This flag enables the stress testing of the VPlan H-CFG construction in the // VPlan-native vectorization path. It must be used in conjuction with // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the @@ -481,7 +472,7 @@ public: VPTransformState &State); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. - void fixVectorizedLoop(VPTransformState &State); + void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); // Return true if any runtime check is added. bool areSafetyChecksAdded() { return AddedSafetyChecks; } @@ -491,12 +482,6 @@ public: /// new unrolled loop, where UF is the unroll factor. using VectorParts = SmallVector<Value *, 2>; - /// Vectorize a single first-order recurrence or pointer induction PHINode in - /// a block. This method handles the induction variable canonicalization. It - /// supports both VF = 1 for unrolled loops and arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, - VPTransformState &State); - /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, @@ -506,13 +491,6 @@ public: const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State); - /// Widen an integer or floating-point induction variable \p IV. If \p Trunc - /// is provided, the integer induction variable will first be truncated to - /// the corresponding type. \p CanonicalIV is the scalar value generated for - /// the canonical induction variable. - void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, - VPTransformState &State, Value *CanonicalIV); - /// Construct the vector value of a scalarized value \p V one lane at a time. 
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, VPTransformState &State); @@ -527,13 +505,8 @@ public: ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask = nullptr); - /// Set the debug location in the builder \p Ptr using the debug location in - /// \p V. If \p Ptr is None then it uses the class member's Builder. - void setDebugLocFromInst(const Value *V, - Optional<IRBuilder<> *> CustomBuilder = None); - - /// Fix the non-induction PHIs in the OrigPHIsToFix vector. - void fixNonInductionPHIs(VPTransformState &State); + /// Fix the non-induction PHIs in \p Plan. + void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); /// Returns true if the reordering of FP operations is not allowed, but we are /// able to vectorize with strict in-order reductions for the given RdxDesc. @@ -546,17 +519,6 @@ public: /// element. virtual Value *getBroadcastInstrs(Value *V); - /// Add metadata from one instruction to another. - /// - /// This includes both the original MDs from \p From and additional ones (\see - /// addNewMetadata). Use this for *newly created* instructions in the vector - /// loop. - void addMetadata(Instruction *To, Instruction *From); - - /// Similar to the previous function but it adds the metadata to a - /// vector of instructions. - void addMetadata(ArrayRef<Value *> To, Instruction *From); - // Returns the resume value (bc.merge.rdx) for a reduction as // generated by fixReduction. PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); @@ -575,13 +537,9 @@ protected: /// Set up the values of the IVs correctly when exiting the vector loop. void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, - Value *CountRoundDown, Value *EndValue, - BasicBlock *MiddleBlock); - - /// Introduce a conditional branch (on true, condition to be set later) at the - /// end of the header=latch connecting it to itself (across the backedge) and - /// to the exit block of \p L. - void createHeaderBranch(Loop *L); + Value *VectorTripCount, Value *EndValue, + BasicBlock *MiddleBlock, BasicBlock *VectorHeader, + VPlan &Plan); /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(VPTransformState &State); @@ -595,16 +553,9 @@ protected: void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); /// Clear NSW/NUW flags from reduction instructions if necessary. - void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, + void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, VPTransformState &State); - /// Fixup the LCSSA phi nodes in the unique exit block. This simply - /// means we need to add the appropriate incoming value from the middle - /// block as exiting edges from the scalar epilogue loop (if present) are - /// already in place, and we exit the vector loop exclusively to the middle - /// block. - void fixLCSSAPHIs(VPTransformState &State); - /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); @@ -613,30 +564,11 @@ protected: /// represented as. void truncateToMinimalBitwidths(VPTransformState &State); - /// Compute scalar induction steps. \p ScalarIV is the scalar induction - /// variable on which to base the steps, \p Step is the size of the step, and - /// \p EntryVal is the value from the original loop that maps to the steps. - /// Note that \p EntryVal doesn't have to be an induction variable - it - /// can also be a truncate instruction. 
- void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, - const InductionDescriptor &ID, VPValue *Def, - VPTransformState &State); - - /// Create a vector induction phi node based on an existing scalar one. \p - /// EntryVal is the value from the original loop that maps to the vector phi - /// node, and \p Step is the loop-invariant step. If \p EntryVal is a - /// truncate instruction, instead of widening the original IV, we widen a - /// version of the IV truncated to \p EntryVal's type. - void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, - Value *Step, Value *Start, - Instruction *EntryVal, VPValue *Def, - VPTransformState &State); - /// Returns (and creates if needed) the original loop trip count. - Value *getOrCreateTripCount(Loop *NewLoop); + Value *getOrCreateTripCount(BasicBlock *InsertBlock); /// Returns (and creates if needed) the trip count of the widened loop. - Value *getOrCreateVectorTripCount(Loop *NewLoop); + Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); /// Returns a bitcasted value to the requested vector type. /// Also handles bitcasts of vector<float> <-> vector<pointer> types. @@ -645,33 +577,21 @@ protected: /// Emit a bypass check to see if the vector trip count is zero, including if /// it overflows. - void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); + void emitIterationCountCheck(BasicBlock *Bypass); /// Emit a bypass check to see if all of the SCEV assumptions we've /// had to make are correct. Returns the block containing the checks or /// nullptr if no checks have been added. - BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); + BasicBlock *emitSCEVChecks(BasicBlock *Bypass); /// Emit bypass checks to check any memory assumptions we may have made. /// Returns the block containing the checks or nullptr if no checks have been /// added. - BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); - - /// Compute the transformed value of Index at offset StartValue using step - /// StepValue. - /// For integer induction, returns StartValue + Index * StepValue. - /// For pointer induction, returns StartValue[Index * StepValue]. - /// FIXME: The newly created binary instructions should contain nsw/nuw - /// flags, which can be found from the original scalar operations. - Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, - const DataLayout &DL, - const InductionDescriptor &ID, - BasicBlock *VectorHeader) const; + BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass); /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, - /// vector loop preheader, middle block and scalar preheader. Also - /// allocate a loop object for the new vector loop and return it. - Loop *createVectorLoopSkeleton(StringRef Prefix); + /// vector loop preheader, middle block and scalar preheader. + void createVectorLoopSkeleton(StringRef Prefix); /// Create new phi nodes for the induction variables to resume iteration count /// in the scalar epilogue, from where the vectorized loop left off. @@ -680,21 +600,12 @@ protected: /// block, the \p AdditionalBypass pair provides information about the bypass /// block and the end value on the edge from bypass to this loop. void createInductionResumeValues( - Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); /// Complete the loop skeleton by adding debug MDs, creating appropriate /// conditional branches in the middle block, preparing the builder and - /// running the verifier. 
Take in the vector loop \p L as argument, and return - /// the preheader of the completed vector loop. - BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); - - /// Add additional metadata to \p To that was not present on \p Orig. - /// - /// Currently this is used to add the noalias annotations based on the - /// inserted memchecks. Use this for instructions that are *cloned* into the - /// vector loop. - void addNewMetadata(Instruction *To, const Instruction *Orig); + /// running the verifier. Return the preheader of the completed vector loop. + BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID); /// Collect poison-generating recipes that may generate a poison value that is /// used after vectorization, even when their operands are not poison. Those @@ -741,13 +652,6 @@ protected: /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; - /// LoopVersioning. It's only set up (non-null) if memchecks were - /// used. - /// - /// This is currently only used to add no-alias metadata based on the - /// memchecks. The actually versioning is performed manually. - std::unique_ptr<LoopVersioning> LVer; - /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. ElementCount VF; @@ -774,9 +678,6 @@ protected: /// there can be multiple exiting edges reaching this block. BasicBlock *LoopExitBlock; - /// The vector loop body. - BasicBlock *LoopVectorBody; - /// The scalar loop body. BasicBlock *LoopScalarBody; @@ -805,10 +706,6 @@ protected: // so we can later fix-up the external users of the induction variables. DenseMap<PHINode *, Value *> IVEndValues; - // Vector of original scalar PHIs whose corresponding widened PHIs need to be - // fixed up at the end of vector code generation. - SmallVector<PHINode *, 8> OrigPHIsToFix; - /// BFI and PSI are used to check for profile guided size optimizations. BlockFrequencyInfo *BFI; ProfileSummaryInfo *PSI; @@ -936,8 +833,7 @@ protected: /// Emits an iteration count bypass check once for the main loop (when \p /// ForEpilogue is false) and once for the epilogue loop (when \p /// ForEpilogue is true). - BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, - bool ForEpilogue); + BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); void printDebugTracesAtStart() override; void printDebugTracesAtEnd() override; }; @@ -956,7 +852,9 @@ public: BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI, Checks) {} + EPI, LVL, CM, BFI, PSI, Checks) { + TripCount = EPI.TripCount; + } /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). std::pair<BasicBlock *, Value *> @@ -966,7 +864,7 @@ protected: /// Emits an iteration count bypass check after the main vector loop has /// finished to see if there are any iterations left to execute by either /// the vector epilogue or the scalar epilogue. - BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, + BasicBlock *emitMinimumVectorEpilogueIterCountCheck( BasicBlock *Bypass, BasicBlock *Insert); void printDebugTracesAtStart() override; @@ -993,31 +891,6 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { return I; } -void InnerLoopVectorizer::setDebugLocFromInst( - const Value *V, Optional<IRBuilder<> *> CustomBuilder) { - IRBuilder<> *B = (CustomBuilder == None) ? 
&Builder : *CustomBuilder; - if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) { - const DILocation *DIL = Inst->getDebugLoc(); - - // When a FSDiscriminator is enabled, we don't need to add the multiply - // factors to the discriminators. - if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && - !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { - // FIXME: For scalable vectors, assume vscale=1. - auto NewDIL = - DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); - if (NewDIL) - B->SetCurrentDebugLocation(NewDIL.getValue()); - else - LLVM_DEBUG(dbgs() - << "Failed to create new discriminator: " - << DIL->getFilename() << " Line: " << DIL->getLine()); - } else - B->SetCurrentDebugLocation(DIL); - } else - B->SetCurrentDebugLocation(DebugLoc()); -} - /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I /// is passed, the message relates to that particular instruction. #ifndef NDEBUG @@ -1059,7 +932,7 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, namespace llvm { /// Return a value for Step multiplied by VF. -Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, +Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step) { assert(Ty->isIntegerTy() && "Expected an integer step"); Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); @@ -1067,12 +940,13 @@ Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, } /// Return the runtime value for VF. -Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { +Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); return VF.isScalable() ? B.CreateVScale(EC) : EC; } -static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) { +static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, + ElementCount VF) { assert(FTy->isFloatingPointTy() && "Expected floating point type!"); Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); @@ -1119,14 +993,6 @@ static std::string getDebugLocString(const Loop *L) { } #endif -void InnerLoopVectorizer::addNewMetadata(Instruction *To, - const Instruction *Orig) { - // If the loop was versioned with memchecks, add the corresponding no-alias - // metadata. - if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) - LVer->annotateInstWithNoAlias(To, Orig); -} - void InnerLoopVectorizer::collectPoisonGeneratingRecipes( VPTransformState &State) { @@ -1151,6 +1017,7 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( // handled. 
if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || isa<VPInterleaveRecipe>(CurRec) || + isa<VPScalarIVStepsRecipe>(CurRec) || isa<VPCanonicalIVPHIRecipe>(CurRec)) continue; @@ -1176,10 +1043,10 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { for (VPRecipeBase &Recipe : *VPBB) { if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { - Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr(); + Instruction &UnderlyingInstr = WidenRec->getIngredient(); VPDef *AddrDef = WidenRec->getAddr()->getDef(); - if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr && - Legal->blockNeedsPredication(UnderlyingInstr->getParent())) + if (AddrDef && WidenRec->isConsecutive() && + Legal->blockNeedsPredication(UnderlyingInstr.getParent())) collectPoisonGeneratingInstrsInBackwardSlice( cast<VPRecipeBase>(AddrDef)); } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { @@ -1206,20 +1073,6 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( } } -void InnerLoopVectorizer::addMetadata(Instruction *To, - Instruction *From) { - propagateMetadata(To, From); - addNewMetadata(To, From); -} - -void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, - Instruction *From) { - for (Value *V : To) { - if (Instruction *I = dyn_cast<Instruction>(V)) - addMetadata(I, From); - } -} - PHINode *InnerLoopVectorizer::getReductionResumeValue( const RecurrenceDescriptor &RdxDesc) { auto It = ReductionResumeValues.find(&RdxDesc); @@ -1363,7 +1216,7 @@ public: /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, /// the IsOrdered flag of RdxDesc is set and we do not allow reordering /// of FP operations. - bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { + bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { return !Hints->allowReordering() && RdxDesc.isOrdered(); } @@ -1701,6 +1554,11 @@ public: private: unsigned NumPredStores = 0; + /// Convenience function that returns the value of vscale_range iff + /// vscale_range.min == vscale_range.max or otherwise returns the value + /// returned by the corresponding TLI method. + Optional<unsigned> getVScaleForTuning() const; + /// \return An upper bound for the vectorization factors for both /// fixed and scalable vectorization, where the minimum-known number of /// elements is a power-of-2 larger than zero. If scalable vectorization is @@ -1713,15 +1571,10 @@ private: /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. /// This is a helper function of computeFeasibleMaxVF. - /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure - /// issue that occurred on one of the buildbots which cannot be reproduced - /// without having access to the properietary compiler (see comments on - /// D98509). The issue is currently under investigation and this workaround - /// will be removed as soon as possible. ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF, + ElementCount MaxSafeVF, bool FoldTailByMasking); /// \return the maximum legal scalable VF, based on the safe max number @@ -2012,7 +1865,7 @@ public: /// there is no vector code generation, the check blocks are removed /// completely. 
void Create(Loop *L, const LoopAccessInfo &LAI, - const SCEVUnionPredicate &UnionPred) { + const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { BasicBlock *LoopHeader = L->getHeader(); BasicBlock *Preheader = L->getLoopPreheader(); @@ -2035,9 +1888,19 @@ public: MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, "vector.memcheck"); - MemRuntimeCheckCond = - addRuntimeChecks(MemCheckBlock->getTerminator(), L, - RtPtrChecking.getChecks(), MemCheckExp); + auto DiffChecks = RtPtrChecking.getDiffChecks(); + if (DiffChecks) { + MemRuntimeCheckCond = addDiffRuntimeChecks( + MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp, + [VF](IRBuilderBase &B, unsigned Bits) { + return getRuntimeVF(B, B.getIntNTy(Bits), VF); + }, + IC); + } else { + MemRuntimeCheckCond = + addRuntimeChecks(MemCheckBlock->getTerminator(), L, + RtPtrChecking.getChecks(), MemCheckExp); + } assert(MemRuntimeCheckCond && "no RT checks generated although RtPtrChecking " "claimed checks are required"); @@ -2109,12 +1972,16 @@ public: /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and /// adjusts the branches to branch to the vector preheader or \p Bypass, /// depending on the generated condition. - BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, + BasicBlock *emitSCEVChecks(BasicBlock *Bypass, BasicBlock *LoopVectorPreHeader, BasicBlock *LoopExitBlock) { if (!SCEVCheckCond) return nullptr; - if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) + + Value *Cond = SCEVCheckCond; + // Mark the check as used, to prevent it from being removed during cleanup. + SCEVCheckCond = nullptr; + if (auto *C = dyn_cast<ConstantInt>(Cond)) if (C->isZero()) return nullptr; @@ -2133,18 +2000,15 @@ public: DT->addNewBlock(SCEVCheckBlock, Pred); DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); - ReplaceInstWithInst( - SCEVCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); - // Mark the check as used, to prevent it from being removed during cleanup. - SCEVCheckCond = nullptr; + ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); return SCEVCheckBlock; } /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts /// the branches to branch to the vector preheader or \p Bypass, depending on /// the generated condition. - BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, + BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, BasicBlock *LoopVectorPreHeader) { // Check if we generated code that checks in runtime if arrays overlap. if (!MemRuntimeCheckCond) @@ -2341,7 +2205,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { /// \p Opcode is relevant for FP induction variable. static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, Instruction::BinaryOps BinOp, ElementCount VF, - IRBuilder<> &Builder) { + IRBuilderBase &Builder) { assert(VF.isVector() && "only vector VFs are supported"); // Create and check the types. @@ -2357,9 +2221,8 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, // Create a vector of consecutive numbers from zero to VF. 
VectorType *InitVecValVTy = ValVTy; - Type *InitVecValSTy = STy; if (STy->isFloatingPointTy()) { - InitVecValSTy = + Type *InitVecValSTy = IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); InitVecValVTy = VectorType::get(InitVecValSTy, VLen); } @@ -2389,199 +2252,12 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); } -void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( - const InductionDescriptor &II, Value *Step, Value *Start, - Instruction *EntryVal, VPValue *Def, VPTransformState &State) { - IRBuilder<> &Builder = State.Builder; - assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && - "Expected either an induction phi-node or a truncate of it!"); - - // Construct the initial value of the vector IV in the vector loop preheader - auto CurrIP = Builder.saveIP(); - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - if (isa<TruncInst>(EntryVal)) { - assert(Start->getType()->isIntegerTy() && - "Truncation requires an integer type"); - auto *TruncType = cast<IntegerType>(EntryVal->getType()); - Step = Builder.CreateTrunc(Step, TruncType); - Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); - } - - Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); - Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); - Value *SteppedStart = getStepVector( - SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); - - // We create vector phi nodes for both integer and floating-point induction - // variables. Here, we determine the kind of arithmetic we will perform. - Instruction::BinaryOps AddOp; - Instruction::BinaryOps MulOp; - if (Step->getType()->isIntegerTy()) { - AddOp = Instruction::Add; - MulOp = Instruction::Mul; - } else { - AddOp = II.getInductionOpcode(); - MulOp = Instruction::FMul; - } - - // Multiply the vectorization factor by the step using integer or - // floating-point arithmetic as appropriate. - Type *StepType = Step->getType(); - Value *RuntimeVF; - if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); - else - RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); - Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); - - // Create a vector splat to use in the induction update. - // - // FIXME: If the step is non-constant, we create the vector splat with - // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't - // handle a constant vector splat. - Value *SplatVF = isa<Constant>(Mul) - ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) - : Builder.CreateVectorSplat(State.VF, Mul); - Builder.restoreIP(CurrIP); - - // We may need to add the step a number of times, depending on the unroll - // factor. The last of those goes into the PHI. - PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", - &*LoopVectorBody->getFirstInsertionPt()); - VecInd->setDebugLoc(EntryVal->getDebugLoc()); - Instruction *LastInduction = VecInd; - for (unsigned Part = 0; Part < UF; ++Part) { - State.set(Def, LastInduction, Part); - - if (isa<TruncInst>(EntryVal)) - addMetadata(LastInduction, EntryVal); - - LastInduction = cast<Instruction>( - Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); - LastInduction->setDebugLoc(EntryVal->getDebugLoc()); - } - - // Move the last step to the end of the latch block. This ensures consistent - // placement of all induction updates. 
- auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); - auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); - LastInduction->moveBefore(Br); - LastInduction->setName("vec.ind.next"); - - VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); - VecInd->addIncoming(LastInduction, LoopVectorLatch); -} - -void InnerLoopVectorizer::widenIntOrFpInduction( - PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State, - Value *CanonicalIV) { - Value *Start = Def->getStartValue()->getLiveInIRValue(); - const InductionDescriptor &ID = Def->getInductionDescriptor(); - TruncInst *Trunc = Def->getTruncInst(); - IRBuilder<> &Builder = State.Builder; - assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); - assert(!State.VF.isZero() && "VF must be non-zero"); - - // The value from the original loop to which we are mapping the new induction - // variable. - Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; - - auto &DL = EntryVal->getModule()->getDataLayout(); - - // Generate code for the induction step. Note that induction steps are - // required to be loop-invariant - auto CreateStepValue = [&](const SCEV *Step) -> Value * { - assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && - "Induction step should be loop invariant"); - if (PSE.getSE()->isSCEVable(IV->getType())) { - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - return Exp.expandCodeFor(Step, Step->getType(), - State.CFG.VectorPreHeader->getTerminator()); - } - return cast<SCEVUnknown>(Step)->getValue(); - }; - - // The scalar value to broadcast. This is derived from the canonical - // induction variable. If a truncation type is given, truncate the canonical - // induction variable and step. Otherwise, derive these values from the - // induction descriptor. - auto CreateScalarIV = [&](Value *&Step) -> Value * { - Value *ScalarIV = CanonicalIV; - Type *NeededType = IV->getType(); - if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { - ScalarIV = - NeededType->isIntegerTy() - ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType) - : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); - ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, - State.CFG.PrevBB); - ScalarIV->setName("offset.idx"); - } - if (Trunc) { - auto *TruncType = cast<IntegerType>(Trunc->getType()); - assert(Step->getType()->isIntegerTy() && - "Truncation requires an integer step"); - ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); - Step = Builder.CreateTrunc(Step, TruncType); - } - return ScalarIV; - }; - - // Fast-math-flags propagate from the original induction instruction. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) - Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); - - // Now do the actual transformations, and start with creating the step value. 
- Value *Step = CreateStepValue(ID.getStep()); - if (State.VF.isScalar()) { - Value *ScalarIV = CreateScalarIV(Step); - Type *ScalarTy = IntegerType::get(ScalarIV->getContext(), - Step->getType()->getScalarSizeInBits()); - - Instruction::BinaryOps IncOp = ID.getInductionOpcode(); - if (IncOp == Instruction::BinaryOpsEnd) - IncOp = Instruction::Add; - for (unsigned Part = 0; Part < UF; ++Part) { - Value *StartIdx = ConstantInt::get(ScalarTy, Part); - Instruction::BinaryOps MulOp = Instruction::Mul; - if (Step->getType()->isFloatingPointTy()) { - StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType()); - MulOp = Instruction::FMul; - } - - Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); - Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction"); - State.set(Def, EntryPart, Part); - if (Trunc) { - assert(!Step->getType()->isFloatingPointTy() && - "fp inductions shouldn't be truncated"); - addMetadata(EntryPart, Trunc); - } - } - return; - } - - // Create a new independent vector induction variable, if one is needed. - if (Def->needsVectorIV()) - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); - - if (Def->needsScalarIV()) { - // Create scalar steps that can be used by instructions we will later - // scalarize. Note that the addition of the scalar steps will not increase - // the number of instructions in the loop in the common case prior to - // InstCombine. We will be trading one vector extract for each scalar step. - Value *ScalarIV = CreateScalarIV(Step); - buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); - } -} - -void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, - Instruction *EntryVal, - const InductionDescriptor &ID, - VPValue *Def, - VPTransformState &State) { - IRBuilder<> &Builder = State.Builder; +/// Compute scalar induction steps. \p ScalarIV is the scalar induction +/// variable on which to base the steps, \p Step is the size of the step. +static void buildScalarSteps(Value *ScalarIV, Value *Step, + const InductionDescriptor &ID, VPValue *Def, + VPTransformState &State) { + IRBuilderBase &Builder = State.Builder; // We shouldn't have to build scalar steps if we aren't vectorizing. assert(State.VF.isVector() && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. @@ -2652,6 +2328,103 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, } } +// Generate code for the induction step. Note that induction steps are +// required to be loop-invariant +static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, + Instruction *InsertBefore, + Loop *OrigLoop = nullptr) { + const DataLayout &DL = SE.getDataLayout(); + assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && + "Induction step should be loop invariant"); + if (auto *E = dyn_cast<SCEVUnknown>(Step)) + return E->getValue(); + + SCEVExpander Exp(SE, DL, "induction"); + return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); +} + +/// Compute the transformed value of Index at offset StartValue using step +/// StepValue. +/// For integer induction, returns StartValue + Index * StepValue. +/// For pointer induction, returns StartValue[Index * StepValue]. +/// FIXME: The newly created binary instructions should contain nsw/nuw +/// flags, which can be found from the original scalar operations. 
+static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, + Value *StartValue, Value *Step, + const InductionDescriptor &ID) { + assert(Index->getType()->getScalarType() == Step->getType() && + "Index scalar type does not match StepValue type"); + + // Note: the IR at this point is broken. We cannot use SE to create any new + // SCEV and then expand it, hoping that SCEV's simplification will give us + // a more optimal code. Unfortunately, attempt of doing so on invalid IR may + // lead to various SCEV crashes. So all we can do is to use builder and rely + // on InstCombine for future simplifications. Here we handle some trivial + // cases only. + auto CreateAdd = [&B](Value *X, Value *Y) { + assert(X->getType() == Y->getType() && "Types don't match!"); + if (auto *CX = dyn_cast<ConstantInt>(X)) + if (CX->isZero()) + return Y; + if (auto *CY = dyn_cast<ConstantInt>(Y)) + if (CY->isZero()) + return X; + return B.CreateAdd(X, Y); + }; + + // We allow X to be a vector type, in which case Y will potentially be + // splatted into a vector with the same element count. + auto CreateMul = [&B](Value *X, Value *Y) { + assert(X->getType()->getScalarType() == Y->getType() && + "Types don't match!"); + if (auto *CX = dyn_cast<ConstantInt>(X)) + if (CX->isOne()) + return Y; + if (auto *CY = dyn_cast<ConstantInt>(Y)) + if (CY->isOne()) + return X; + VectorType *XVTy = dyn_cast<VectorType>(X->getType()); + if (XVTy && !isa<VectorType>(Y->getType())) + Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); + return B.CreateMul(X, Y); + }; + + switch (ID.getKind()) { + case InductionDescriptor::IK_IntInduction: { + assert(!isa<VectorType>(Index->getType()) && + "Vector indices not supported for integer inductions yet"); + assert(Index->getType() == StartValue->getType() && + "Index type does not match StartValue type"); + if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) + return B.CreateSub(StartValue, Index); + auto *Offset = CreateMul(Index, Step); + return CreateAdd(StartValue, Offset); + } + case InductionDescriptor::IK_PtrInduction: { + assert(isa<Constant>(Step) && + "Expected constant step for pointer induction"); + return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); + } + case InductionDescriptor::IK_FpInduction: { + assert(!isa<VectorType>(Index->getType()) && + "Vector indices not supported for FP inductions yet"); + assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); + auto InductionBinOp = ID.getInductionBinOp(); + assert(InductionBinOp && + (InductionBinOp->getOpcode() == Instruction::FAdd || + InductionBinOp->getOpcode() == Instruction::FSub) && + "Original bin op should be defined for FP induction"); + + Value *MulExp = B.CreateFMul(Step, Index); + return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, + "induction"); + } + case InductionDescriptor::IK_NoInduction: + return nullptr; + } + llvm_unreachable("invalid enum"); +} + void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, VPTransformState &State) { @@ -2734,7 +2507,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); - setDebugLocFromInst(AddrPart); + State.setDebugLocFromInst(AddrPart); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. 
@@ -2760,7 +2533,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); } - setDebugLocFromInst(Instr); + State.setDebugLocFromInst(Instr); Value *PoisonVec = PoisonValue::get(VecTy); Value *MaskForGaps = nullptr; @@ -2915,8 +2688,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, if (!Instance.isFirstIteration()) return; - setDebugLocFromInst(Instr); - // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); @@ -2933,21 +2704,23 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) Cloned->dropPoisonGeneratingFlags(); - State.Builder.SetInsertPoint(Builder.GetInsertBlock(), - Builder.GetInsertPoint()); + if (Instr->getDebugLoc()) + State.setDebugLocFromInst(Instr); + // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. for (auto &I : enumerate(RepRecipe->operands())) { auto InputInstance = Instance; VPValue *Operand = I.value(); - if (State.Plan->isUniformAfterVectorization(Operand)) + VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); + if (OperandR && OperandR->isUniform()) InputInstance.Lane = VPLane::getFirstLane(); Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); } - addNewMetadata(Cloned, Instr); + State.addNewMetadata(Cloned, Instr); // Place the cloned scalar in the new loop. - Builder.Insert(Cloned); + State.Builder.Insert(Cloned); State.set(RepRecipe, Cloned, Instance); @@ -2960,29 +2733,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, PredicatedInstructions.push_back(Cloned); } -void InnerLoopVectorizer::createHeaderBranch(Loop *L) { - BasicBlock *Header = L->getHeader(); - assert(!L->getLoopLatch() && "loop should not have a latch at this point"); - - IRBuilder<> B(Header->getTerminator()); - Instruction *OldInst = - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); - setDebugLocFromInst(OldInst, &B); - - // Connect the header to the exit and header blocks and replace the old - // terminator. - B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); - - // Now we have two terminators. Remove the old one from the block. - Header->getTerminator()->eraseFromParent(); -} - -Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { +Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { if (TripCount) return TripCount; - assert(L && "Create Trip Count for null loop."); - IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + assert(InsertBlock); + IRBuilder<> Builder(InsertBlock->getTerminator()); // Find the loop boundaries. ScalarEvolution *SE = PSE.getSE(); const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); @@ -3006,7 +2762,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { const SCEV *ExitCount = SE->getAddExpr( BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); // Expand the trip count and place the new instructions in the preheader. // Notice that the pre-header does not change, only the loop body. @@ -3014,22 +2770,23 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { // Count holds the overall loop count (N). 
TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - L->getLoopPreheader()->getTerminator()); + InsertBlock->getTerminator()); if (TripCount->getType()->isPointerTy()) TripCount = CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", - L->getLoopPreheader()->getTerminator()); + InsertBlock->getTerminator()); return TripCount; } -Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { +Value * +InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { if (VectorTripCount) return VectorTripCount; - Value *TC = getOrCreateTripCount(L); - IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + Value *TC = getOrCreateTripCount(InsertBlock); + IRBuilder<> Builder(InsertBlock->getTerminator()); Type *Ty = TC->getType(); // This is where we can make the step a runtime constant. @@ -3041,6 +2798,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // overflows: the vector induction variable will eventually wrap to zero given // that it starts at zero and its Step is a power of two; the loop will then // exit, with the last early-exit vector comparison also producing all-true. + // For scalable vectors the VF is not guaranteed to be a power of 2, but this + // is accounted for in emitIterationCountCheck that adds an overflow check. if (Cost->foldTailByMasking()) { assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); @@ -3103,9 +2862,8 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } -void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, - BasicBlock *Bypass) { - Value *Count = getOrCreateTripCount(L); +void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { + Value *Count = getOrCreateTripCount(LoopVectorPreHeader); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -3120,10 +2878,23 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, : ICmpInst::ICMP_ULT; // If tail is to be folded, vector loop takes care of all iterations. + Type *CountTy = Count->getType(); Value *CheckMinIters = Builder.getFalse(); - if (!Cost->foldTailByMasking()) { - Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); + Value *Step = createStepForVF(Builder, CountTy, VF, UF); + if (!Cost->foldTailByMasking()) CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); + else if (VF.isScalable()) { + // vscale is not necessarily a power-of-2, which means we cannot guarantee + // an overflow to zero when updating induction variables and so an + // additional overflow check is required before entering the vector loop. + + // Get the maximum unsigned value for the type. + Value *MaxUIntTripCount = + ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); + Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); + + // Don't execute the vector loop if (UMax - n) < (VF * UF). + CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); } // Create new preheader for vector loop. 
LoopVectorPreHeader = @@ -3148,10 +2919,10 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, LoopBypassBlocks.push_back(TCCheckBlock); } -BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { +BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { BasicBlock *const SCEVCheckBlock = - RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); + RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); if (!SCEVCheckBlock) return nullptr; @@ -3176,14 +2947,13 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { return SCEVCheckBlock; } -BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, - BasicBlock *Bypass) { +BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { // VPlan-native path does not do any analysis for runtime checks currently. if (EnableVPlanNativePath) return nullptr; BasicBlock *const MemCheckBlock = - RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); + RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); // Check if we generated code that checks in runtime if arrays overlap. We put // the checks into a separate block to make the more common case of few @@ -3197,7 +2967,8 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, "to vectorize."); ORE->emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", - L->getStartLoc(), L->getHeader()) + OrigLoop->getStartLoc(), + OrigLoop->getHeader()) << "Code-size may be reduced by not forcing " "vectorization, or by source-code modifications " "eliminating the need for runtime checks " @@ -3209,116 +2980,10 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, AddedSafetyChecks = true; - // We currently don't use LoopVersioning for the actual loop cloning but we - // still use it to add the noalias metadata. - LVer = std::make_unique<LoopVersioning>( - *Legal->getLAI(), - Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, - DT, PSE.getSE()); - LVer->prepareNoAliasMetadata(); return MemCheckBlock; } -Value *InnerLoopVectorizer::emitTransformedIndex( - IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, - const InductionDescriptor &ID, BasicBlock *VectorHeader) const { - - SCEVExpander Exp(*SE, DL, "induction"); - auto Step = ID.getStep(); - auto StartValue = ID.getStartValue(); - assert(Index->getType()->getScalarType() == Step->getType() && - "Index scalar type does not match StepValue type"); - - // Note: the IR at this point is broken. We cannot use SE to create any new - // SCEV and then expand it, hoping that SCEV's simplification will give us - // a more optimal code. Unfortunately, attempt of doing so on invalid IR may - // lead to various SCEV crashes. So all we can do is to use builder and rely - // on InstCombine for future simplifications. Here we handle some trivial - // cases only. - auto CreateAdd = [&B](Value *X, Value *Y) { - assert(X->getType() == Y->getType() && "Types don't match!"); - if (auto *CX = dyn_cast<ConstantInt>(X)) - if (CX->isZero()) - return Y; - if (auto *CY = dyn_cast<ConstantInt>(Y)) - if (CY->isZero()) - return X; - return B.CreateAdd(X, Y); - }; - - // We allow X to be a vector type, in which case Y will potentially be - // splatted into a vector with the same element count. 
- auto CreateMul = [&B](Value *X, Value *Y) { - assert(X->getType()->getScalarType() == Y->getType() && - "Types don't match!"); - if (auto *CX = dyn_cast<ConstantInt>(X)) - if (CX->isOne()) - return Y; - if (auto *CY = dyn_cast<ConstantInt>(Y)) - if (CY->isOne()) - return X; - VectorType *XVTy = dyn_cast<VectorType>(X->getType()); - if (XVTy && !isa<VectorType>(Y->getType())) - Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); - return B.CreateMul(X, Y); - }; - - // Get a suitable insert point for SCEV expansion. For blocks in the vector - // loop, choose the end of the vector loop header (=VectorHeader), because - // the DomTree is not kept up-to-date for additional blocks generated in the - // vector loop. By using the header as insertion point, we guarantee that the - // expanded instructions dominate all their uses. - auto GetInsertPoint = [this, &B, VectorHeader]() { - BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); - if (InsertBB != LoopVectorBody && - LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB)) - return VectorHeader->getTerminator(); - return &*B.GetInsertPoint(); - }; - - switch (ID.getKind()) { - case InductionDescriptor::IK_IntInduction: { - assert(!isa<VectorType>(Index->getType()) && - "Vector indices not supported for integer inductions yet"); - assert(Index->getType() == StartValue->getType() && - "Index type does not match StartValue type"); - if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) - return B.CreateSub(StartValue, Index); - auto *Offset = CreateMul( - Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); - return CreateAdd(StartValue, Offset); - } - case InductionDescriptor::IK_PtrInduction: { - assert(isa<SCEVConstant>(Step) && - "Expected constant step for pointer induction"); - return B.CreateGEP( - ID.getElementType(), StartValue, - CreateMul(Index, - Exp.expandCodeFor(Step, Index->getType()->getScalarType(), - GetInsertPoint()))); - } - case InductionDescriptor::IK_FpInduction: { - assert(!isa<VectorType>(Index->getType()) && - "Vector indices not supported for FP inductions yet"); - assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); - auto InductionBinOp = ID.getInductionBinOp(); - assert(InductionBinOp && - (InductionBinOp->getOpcode() == Instruction::FAdd || - InductionBinOp->getOpcode() == Instruction::FSub) && - "Original bin op should be defined for FP induction"); - - Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); - Value *MulExp = B.CreateFMul(StepValue, Index); - return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, - "induction"); - } - case InductionDescriptor::IK_NoInduction: - return nullptr; - } - llvm_unreachable("invalid enum"); -} - -Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { +void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopScalarBody = OrigLoop->getHeader(); LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); @@ -3350,43 +3015,24 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); - // We intentionally don't let SplitBlock to update LoopInfo since - // LoopVectorBody should belong to another loop than LoopVectorPreHeader. - // LoopVectorBody is explicitly added to the correct place few lines later. 
- LoopVectorBody = - SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - nullptr, nullptr, Twine(Prefix) + "vector.body"); - - // Update dominator for loop exit. + // Update dominator for loop exit. During skeleton creation, only the vector + // pre-header and the middle block are created. The vector loop is entirely + // created during VPlan exection. if (!Cost->requiresScalarEpilogue(VF)) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); - - // Create and register the new vector loop. - Loop *Lp = LI->AllocateLoop(); - Loop *ParentLoop = OrigLoop->getParentLoop(); - - // Insert the new loop into the loop nest and register the new basic blocks - // before calling any utilities such as SCEV that require valid LoopInfo. - if (ParentLoop) { - ParentLoop->addChildLoop(Lp); - } else { - LI->addTopLevelLoop(Lp); - } - Lp->addBasicBlockToLoop(LoopVectorBody, *LI); - return Lp; } void InnerLoopVectorizer::createInductionResumeValues( - Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { + std::pair<BasicBlock *, Value *> AdditionalBypass) { assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && "Inconsistent information about additional bypass."); - Value *VectorTripCount = getOrCreateVectorTripCount(L); - assert(VectorTripCount && L && "Expected valid arguments"); + Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); + assert(VectorTripCount && "Expected valid arguments"); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3399,19 +3045,13 @@ void InnerLoopVectorizer::createInductionResumeValues( PHINode *OrigPhi = InductionEntry.first; InductionDescriptor II = InductionEntry.second; - // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = - PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", - LoopScalarPreHeader->getTerminator()); - // Copy original phi DL over to the new one. - BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; Value *EndValueFromAdditionalBypass = AdditionalBypass.second; if (OrigPhi == OldInduction) { // We know what the end value is. EndValue = VectorTripCount; } else { - IRBuilder<> B(L->getLoopPreheader()->getTerminator()); + IRBuilder<> B(LoopVectorPreHeader->getTerminator()); // Fast-math-flags propagate from the original induction instruction. 
if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) @@ -3420,10 +3060,10 @@ void InnerLoopVectorizer::createInductionResumeValues( Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = CastInst::getCastOpcode(VectorTripCount, true, StepType, true); - Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); - const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); - EndValue = - emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); + Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc"); + Value *Step = + CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); + EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); EndValue->setName("ind.end"); // Compute the end value for the additional bypass (if applicable). @@ -3431,13 +3071,23 @@ void InnerLoopVectorizer::createInductionResumeValues( B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, StepType, true); - CRD = - B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); + Value *Step = + CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); + VTC = + B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc"); EndValueFromAdditionalBypass = - emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); + emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); EndValueFromAdditionalBypass->setName("ind.end"); } } + + // Create phi nodes to merge from the backedge-taken check block. + PHINode *BCResumeVal = + PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", + LoopScalarPreHeader->getTerminator()); + // Copy original phi DL over to the new one. + BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); + // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); @@ -3456,13 +3106,10 @@ void InnerLoopVectorizer::createInductionResumeValues( } } -BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, - MDNode *OrigLoopID) { - assert(L && "Expected valid loop."); - +BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) { // The trip counts should be cached by now. - Value *Count = getOrCreateTripCount(L); - Value *VectorTripCount = getOrCreateVectorTripCount(L); + Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); @@ -3487,14 +3134,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); } - // Get ready to start creating new instructions into the vectorized body. - assert(LoopVectorPreHeader == L->getLoopPreheader() && - "Inconsistent vector loop preheader"); - Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); - #ifdef EXPENSIVE_CHECKS assert(DT->verify(DominatorTree::VerificationLevel::Fast)); - LI->verify(*DT); #endif return LoopVectorPreHeader; @@ -3517,7 +3158,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() { |/ | | v | [ ] \ - | [ ]_| <-- vector loop. + | [ ]_| <-- vector loop (created during VPlan execution). | | | v \ -[ ] <--- middle-block. @@ -3544,34 +3185,32 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() { // simply happens to be prone to hitting this in practice. 
In theory, we // can hit the same issue for any SCEV, or ValueTracking query done during // mutation. See PR49900. - getOrCreateTripCount(OrigLoop); + getOrCreateTripCount(OrigLoop->getLoopPreheader()); // Create an empty vector loop, and prepare basic blocks for the runtime // checks. - Loop *Lp = createVectorLoopSkeleton(""); + createVectorLoopSkeleton(""); // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. This check also covers the case where the // backedge-taken count is uint##_max: adding one to it will overflow leading // to an incorrect trip count of zero. In this (rare) case we will also jump // to the scalar loop. - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); + emitIterationCountCheck(LoopScalarPreHeader); // Generate the code to check any assumptions that we've made for SCEV // expressions. - emitSCEVChecks(Lp, LoopScalarPreHeader); + emitSCEVChecks(LoopScalarPreHeader); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - - createHeaderBranch(Lp); + emitMemRuntimeChecks(LoopScalarPreHeader); // Emit phis for the new starting index of the scalar loop. - createInductionResumeValues(Lp); + createInductionResumeValues(); - return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; + return {completeLoopSkeleton(OrigLoopID), nullptr}; } // Fix up external users of the induction variable. At this point, we are @@ -3580,8 +3219,9 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() { // value for the IV when arriving directly from the middle block. void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, - Value *CountRoundDown, Value *EndValue, - BasicBlock *MiddleBlock) { + Value *VectorTripCount, Value *EndValue, + BasicBlock *MiddleBlock, + BasicBlock *VectorHeader, VPlan &Plan) { // There are two kinds of external IV usages - those that use the value // computed in the last iteration (the PHI) and those that use the penultimate // value (the value that feeds into the phi from the loop latch). @@ -3608,8 +3248,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, for (User *U : OrigPhi->users()) { auto *UI = cast<Instruction>(U); if (!OrigLoop->contains(UI)) { - const DataLayout &DL = - OrigLoop->getHeader()->getModule()->getDataLayout(); assert(isa<PHINode>(UI) && "Expected LCSSA form"); IRBuilder<> B(MiddleBlock->getTerminator()); @@ -3619,15 +3257,18 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); Value *CountMinusOne = B.CreateSub( - CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); + VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); Value *CMO = !II.getStep()->getType()->isIntegerTy() ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, II.getStep()->getType()) : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); CMO->setName("cast.cmo"); + + Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), + VectorHeader->getTerminator()); Value *Escape = - emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody); + emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); Escape->setName("ind.escape"); MissingVals[UI] = Escape; } @@ -3640,8 +3281,10 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, // In this case, if IV1 has an external use, we need to avoid adding both // "last value of IV1" and "penultimate value of IV2". So, verify that we // don't already have an incoming value for the middle block. - if (PHI->getBasicBlockIndex(MiddleBlock) == -1) + if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { PHI->addIncoming(I.second, MiddleBlock); + Plan.removeLiveOut(PHI); + } } } @@ -3920,18 +3563,16 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { } } -void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { +void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, + VPlan &Plan) { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. if (VF.isVector()) truncateToMinimalBitwidths(State); // Fix widened non-induction PHIs by setting up the PHI operands. - if (OrigPHIsToFix.size()) { - assert(EnableVPlanNativePath && - "Unexpected non-induction PHIs for fixup in non VPlan-native path"); - fixNonInductionPHIs(State); - } + if (EnableVPlanNativePath) + fixNonInductionPHIs(Plan, State); // At this point every instruction in the original loop is widened to a // vector form. Now we need to fix the recurrences in the loop. These PHI @@ -3942,24 +3583,37 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); - // If we inserted an edge from the middle block to the unique exit block, - // update uses outside the loop (phis) to account for the newly inserted - // edge. - if (!Cost->requiresScalarEpilogue(VF)) { + VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); + Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); + if (Cost->requiresScalarEpilogue(VF)) { + // No edge from the middle block to the unique exit block has been inserted + // and there is nothing to fix from vector loop; phis should have incoming + // from scalar loop only. + Plan.clearLiveOuts(); + } else { + // If we inserted an edge from the middle block to the unique exit block, + // update uses outside the loop (phis) to account for the newly inserted + // edge. + // Fix-up external users of the induction variables. for (auto &Entry : Legal->getInductionVars()) fixupIVUsers(Entry.first, Entry.second, - getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), - IVEndValues[Entry.first], LoopMiddleBlock); - - fixLCSSAPHIs(State); + getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), + IVEndValues[Entry.first], LoopMiddleBlock, + VectorLoop->getHeader(), Plan); } + // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated + // in the exit block, so update the builder. + State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); + for (auto &KV : Plan.getLiveOuts()) + KV.second->fixPhi(Plan, State); + for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); // Remove redundant induction instructions. 
- cse(LoopVectorBody); + cse(VectorLoop->getHeader()); // Set/update profile weights for the vector and remainder loops as original // loop iterations are now distributed among them. Note that original loop @@ -3974,9 +3628,9 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // For scalable vectorization we can't know at compile time how many iterations // of the loop are handled in one vector iteration, so instead assume a pessimistic // vscale of '1'. - setProfileInfoAfterUnrolling( - LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), - LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); + setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, + LI->getLoopFor(LoopScalarBody), + VF.getKnownMinValue() * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { @@ -3986,7 +3640,8 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { // the currently empty PHI nodes. At this point every instruction in the // original loop is widened to a vector form so we can use them to construct // the incoming edges. - VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); + VPBasicBlock *Header = + State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); for (VPRecipeBase &R : Header->phis()) { if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) fixReduction(ReductionPhi, State); @@ -4102,8 +3757,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence( // and thus no phis which needed updated. if (!Cost->requiresScalarEpilogue(VF)) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) + if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) { LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); + State.Plan->removeLiveOut(&LCSSAPhi); + } } void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, @@ -4117,14 +3774,14 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, RecurKind RK = RdxDesc.getRecurrenceKind(); TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - setDebugLocFromInst(ReductionStartValue); + State.setDebugLocFromInst(ReductionStartValue); VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); // This is the vector-clone of the value that leaves the loop. Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); // Wrap flags are in general invalid after vectorization, clear them. - clearReductionWrapFlags(RdxDesc, State); + clearReductionWrapFlags(PhiR, State); // Before each round, move the insertion point right between // the PHIs and the values we are going to write. @@ -4132,9 +3789,13 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // instructions. Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - setDebugLocFromInst(LoopExitInst); + State.setDebugLocFromInst(LoopExitInst); Type *PhiTy = OrigPhi->getType(); + + VPBasicBlock *LatchVPBB = + PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock(); + BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; // If tail is folded by masking, the vector value to leave the loop should be // a Select choosing between the vectorized LoopExitInst and vectorized Phi, // instead of the former. 
For an inloop reduction the reduction will already @@ -4142,17 +3803,20 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); - Value *Sel = nullptr; + SelectInst *Sel = nullptr; for (User *U : VecLoopExitInst->users()) { if (isa<SelectInst>(U)) { assert(!Sel && "Reduction exit feeding two selects"); - Sel = U; + Sel = cast<SelectInst>(U); } else assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); } assert(Sel && "Reduction exit feeds no select"); State.reset(LoopExitInstDef, Sel, Part); + if (isa<FPMathOperator>(Sel)) + Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); + // If the target can create a predicated operator for the reduction at no // extra cost in the loop (for example a predicated vadd), it can be // cheaper for the select to remain in the loop than be sunk out of it, @@ -4164,8 +3828,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, TargetTransformInfo::ReductionFlags())) { auto *VecRdxPhi = cast<PHINode>(State.get(PhiR, Part)); - VecRdxPhi->setIncomingValueForBlock( - LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); + VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); } } } @@ -4176,8 +3839,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); - Builder.SetInsertPoint( - LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); + Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); VectorParts RdxParts(UF); for (unsigned Part = 0; Part < UF; ++Part) { RdxParts[Part] = State.get(LoopExitInstDef, Part); @@ -4208,7 +3870,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // conditional branch, and (c) other passes may add new predecessors which // terminate on this line. This is the easiest way to ensure we don't // accidentally cause an extra step back into the loop while debugging. - setDebugLocFromInst(LoopMiddleBlock->getTerminator()); + State.setDebugLocFromInst(LoopMiddleBlock->getTerminator()); if (PhiR->isOrdered()) ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); else { @@ -4265,6 +3927,17 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // Set the resume value for this reduction ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); + // If there were stores of the reduction value to a uniform memory address + // inside the loop, create the final store here. + if (StoreInst *SI = RdxDesc.IntermediateStore) { + StoreInst *NewSI = + Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); + propagateMetadata(NewSI, SI); + + // If the reduction value is used in other places, + // then let the code below create PHI's for that. + } + // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. @@ -4273,8 +3946,10 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // fixFirstOrderRecurrence for a more complete explaination of the logic. 
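Note: the IntermediateStore handling introduced above covers loops that store the running reduction value to a loop-invariant address; after vectorization a single store of the final reduced value suffices. A minimal scalar sketch of the before/after shapes, with illustrative names and no LLVM APIs:

    #include <cstddef>

    // Scalar form: the reduction value is stored to *out on every iteration.
    void sum_store_scalar(const int *a, std::size_t n, int *out) {
      int sum = 0;
      for (std::size_t i = 0; i < n; ++i) {
        sum += a[i];
        *out = sum; // loop-invariant address, rewritten each iteration
      }
    }

    // Shape after the transform sketched above: the in-loop store is dropped
    // and a single store of the final reduction value is emitted once the
    // loop is done (only if the loop executed at least once).
    void sum_store_transformed(const int *a, std::size_t n, int *out) {
      int sum = 0;
      for (std::size_t i = 0; i < n; ++i)
        sum += a[i];
      if (n > 0)
        *out = sum;
    }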
if (!Cost->requiresScalarEpilogue(VF)) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) + if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); + State.Plan->removeLiveOut(&LCSSAPhi); + } // Fix the scalar loop reduction variable with the incoming reduction sum // from the vector body and from the backedge value. @@ -4287,63 +3962,35 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } -void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, +void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, VPTransformState &State) { + const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); RecurKind RK = RdxDesc.getRecurrenceKind(); if (RK != RecurKind::Add && RK != RecurKind::Mul) return; - Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); - assert(LoopExitInstr && "null loop exit instruction"); - SmallVector<Instruction *, 8> Worklist; - SmallPtrSet<Instruction *, 8> Visited; - Worklist.push_back(LoopExitInstr); - Visited.insert(LoopExitInstr); + SmallVector<VPValue *, 8> Worklist; + SmallPtrSet<VPValue *, 8> Visited; + Worklist.push_back(PhiR); + Visited.insert(PhiR); while (!Worklist.empty()) { - Instruction *Cur = Worklist.pop_back_val(); - if (isa<OverflowingBinaryOperator>(Cur)) - for (unsigned Part = 0; Part < UF; ++Part) { - // FIXME: Should not rely on getVPValue at this point. - Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); - cast<Instruction>(V)->dropPoisonGeneratingFlags(); + VPValue *Cur = Worklist.pop_back_val(); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *V = State.get(Cur, Part); + if (!isa<OverflowingBinaryOperator>(V)) + break; + cast<Instruction>(V)->dropPoisonGeneratingFlags(); } - for (User *U : Cur->users()) { - Instruction *UI = cast<Instruction>(U); - if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && - Visited.insert(UI).second) - Worklist.push_back(UI); - } - } -} - -void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { - if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) - // Some phis were already hand updated by the reduction and recurrence - // code above, leave them alone. - continue; - - auto *IncomingValue = LCSSAPhi.getIncomingValue(0); - // Non-instruction incoming values will have only one value. - - VPLane Lane = VPLane::getFirstLane(); - if (isa<Instruction>(IncomingValue) && - !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), - VF)) - Lane = VPLane::getLastLaneForVF(VF); - - // Can be a loop invariant incoming value or the last scalar value to be - // extracted from the vectorized loop. - // FIXME: Should not rely on getVPValue at this point. - Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - Value *lastIncomingValue = - OrigLoop->isLoopInvariant(IncomingValue) - ? 
IncomingValue - : State.get(State.Plan->getVPValue(IncomingValue, true), - VPIteration(UF - 1, Lane)); - LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); + for (VPUser *U : Cur->users()) { + auto *UserRecipe = dyn_cast<VPRecipeBase>(U); + if (!UserRecipe) + continue; + for (VPValue *V : UserRecipe->definedValues()) + if (Visited.insert(V).second) + Worklist.push_back(V); + } } } @@ -4421,17 +4068,23 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { } while (Changed); } -void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { - for (PHINode *OrigPhi : OrigPHIsToFix) { - VPWidenPHIRecipe *VPPhi = - cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); - PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); - // Make sure the builder has a valid insert point. - Builder.SetInsertPoint(NewPhi); - for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { - VPValue *Inc = VPPhi->getIncomingValue(i); - VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); - NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); +void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, + VPTransformState &State) { + auto Iter = depth_first( + VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry())); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { + for (VPRecipeBase &P : VPBB->phis()) { + VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); + if (!VPPhi) + continue; + PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); + // Make sure the builder has a valid insert point. + Builder.SetInsertPoint(NewPhi); + for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { + VPValue *Inc = VPPhi->getIncomingValue(i); + VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); + NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); + } } } } @@ -4441,139 +4094,6 @@ bool InnerLoopVectorizer::useOrderedReductions( return Cost->useOrderedReductions(RdxDesc); } -void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, - VPWidenPHIRecipe *PhiR, - VPTransformState &State) { - PHINode *P = cast<PHINode>(PN); - if (EnableVPlanNativePath) { - // Currently we enter here in the VPlan-native path for non-induction - // PHIs where all control flow is uniform. We simply widen these PHIs. - // Create a vector phi with no operands - the vector phi operands will be - // set at the end of vector code generation. - Type *VecTy = (State.VF.isScalar()) - ? PN->getType() - : VectorType::get(PN->getType(), State.VF); - Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); - State.set(PhiR, VecPhi, 0); - OrigPHIsToFix.push_back(P); - - return; - } - - assert(PN->getParent() == OrigLoop->getHeader() && - "Non-header phis should have been handled elsewhere"); - - // In order to support recurrences we need to be able to vectorize Phi nodes. - // Phi nodes have cycles, so we need to vectorize them in two stages. This is - // stage #1: We create a new vector PHI node with no incoming edges. We'll use - // this value when we vectorize all of the instructions that use the PHI. - - assert(!Legal->isReductionVariable(P) && - "reductions should be handled elsewhere"); - - setDebugLocFromInst(P); - - // This PHINode must be an induction variable. - // Make sure that we know about it. 
- assert(Legal->getInductionVars().count(P) && "Not an induction variable"); - - InductionDescriptor II = Legal->getInductionVars().lookup(P); - const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); - - auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); - PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); - - // FIXME: The newly created binary instructions should contain nsw/nuw flags, - // which can be found from the original scalar operations. - switch (II.getKind()) { - case InductionDescriptor::IK_NoInduction: - llvm_unreachable("Unknown induction"); - case InductionDescriptor::IK_IntInduction: - case InductionDescriptor::IK_FpInduction: - llvm_unreachable("Integer/fp induction is handled elsewhere."); - case InductionDescriptor::IK_PtrInduction: { - // Handle the pointer induction variable case. - assert(P->getType()->isPointerTy() && "Unexpected type."); - - if (Cost->isScalarAfterVectorization(P, State.VF)) { - // This is the normalized GEP that starts counting at zero. - Value *PtrInd = - Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); - // Determine the number of scalars we need to generate for each unroll - // iteration. If the instruction is uniform, we only need to generate the - // first lane. Otherwise, we generate all VF values. - bool IsUniform = vputils::onlyFirstLaneUsed(PhiR); - assert((IsUniform || !State.VF.isScalable()) && - "Cannot scalarize a scalable VF"); - unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *PartStart = - createStepForVF(Builder, PtrInd->getType(), VF, Part); - - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - Value *Idx = Builder.CreateAdd( - PartStart, ConstantInt::get(PtrInd->getType(), Lane)); - Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); - Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), - DL, II, State.CFG.PrevBB); - SclrGep->setName("next.gep"); - State.set(PhiR, SclrGep, VPIteration(Part, Lane)); - } - } - return; - } - assert(isa<SCEVConstant>(II.getStep()) && - "Induction step not a SCEV constant!"); - Type *PhiType = II.getStep()->getType(); - - // Build a pointer phi - Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); - Type *ScStValueType = ScalarStartValue->getType(); - PHINode *NewPointerPhi = - PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); - NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); - - // A pointer induction, performed by using a gep - BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); - Instruction *InductionLoc = LoopLatch->getTerminator(); - const SCEV *ScalarStep = II.getStep(); - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - Value *ScalarStepValue = - Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); - Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); - Value *NumUnrolledElems = - Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); - Value *InductionGEP = GetElementPtrInst::Create( - II.getElementType(), NewPointerPhi, - Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", - InductionLoc); - NewPointerPhi->addIncoming(InductionGEP, LoopLatch); - - // Create UF many actual address geps that use the pointer - // phi as base and a vectorized version of the step value - // (<step*0, ..., step*N>) as offset. 
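Note: the pointer-induction lowering removed above (now produced via VPWidenPointerInductionRecipe) boils down to: for unroll part P and lane L, the pointer advances by (P * VF + L) * Step elements past the base. A small standalone sketch of that arithmetic, assuming a fixed VF and illustrative constants:

    #include <cstdio>

    // For a pointer induction with element step 'Step', unroll factor UF and
    // fixed vectorization factor VF, part P / lane L uses base + (P*VF + L)*Step.
    int main() {
      const unsigned VF = 4, UF = 2;
      const long Step = 1; // elements advanced per scalar iteration
      for (unsigned Part = 0; Part < UF; ++Part) {
        std::printf("part %u offsets:", Part);
        for (unsigned Lane = 0; Lane < VF; ++Lane)
          std::printf(" %ld", (long)(Part * VF + Lane) * Step);
        std::printf("\n"); // part 0: 0 1 2 3, part 1: 4 5 6 7
      }
      return 0;
    }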
- for (unsigned Part = 0; Part < State.UF; ++Part) { - Type *VecPhiType = VectorType::get(PhiType, State.VF); - Value *StartOffsetScalar = - Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); - Value *StartOffset = - Builder.CreateVectorSplat(State.VF, StartOffsetScalar); - // Create a vector of consecutive numbers from zero to VF. - StartOffset = - Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); - - Value *GEP = Builder.CreateGEP( - II.getElementType(), NewPointerPhi, - Builder.CreateMul( - StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), - "vector.gep")); - State.set(PhiR, GEP, Part); - } - } - } -} - /// A helper function for checking whether an integer division-related /// instruction may divide by zero (in which case it must be predicated if /// executed conditionally in the scalar code). @@ -4597,7 +4117,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, VPTransformState &State) { assert(!isa<DbgInfoIntrinsic>(I) && "DbgInfoIntrinsic should have been dropped during VPlan construction"); - setDebugLocFromInst(&I); + State.setDebugLocFromInst(&I); Module *M = I.getParent()->getParent()->getParent(); auto *CI = cast<CallInst>(&I); @@ -4627,13 +4147,13 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, // Some intrinsics have a scalar argument - don't replace it with a // vector. Value *Arg; - if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) + if (!UseVectorIntrinsic || + !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) Arg = State.get(I.value(), Part); - else { + else Arg = State.get(I.value(), VPIteration(0, 0)); - if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) - TysForDecl.push_back(Arg->getType()); - } + if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) + TysForDecl.push_back(Arg->getType()); Args.push_back(Arg); } @@ -4661,7 +4181,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, V->copyFastMathFlags(CI); State.set(Def, V, Part); - addMetadata(V, &I); + State.addMetadata(V, &I); } } @@ -4672,6 +4192,14 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && "This function should not be visited twice for the same VF"); + // This avoids any chances of creating a REPLICATE recipe during planning + // since that would result in generation of scalarized code during execution, + // which is not supported for scalable vectors. + if (VF.isScalable()) { + Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); + return; + } + SmallSetVector<Instruction *, 8> Worklist; // These sets are used to seed the analysis with pointers used by memory @@ -4761,7 +4289,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { } // Insert the forced scalars. - // FIXME: Currently widenPHIInstruction() often creates a dead vector + // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector // induction variable when the PHI user is scalarized. auto ForcedScalar = ForcedScalars.find(VF); if (ForcedScalar != ForcedScalars.end()) @@ -4888,6 +4416,27 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( if (hasIrregularType(ScalarTy, DL)) return false; + // If the group involves a non-integral pointer, we may not be able to + // losslessly cast all values to a common type. 
+ unsigned InterleaveFactor = Group->getFactor(); + bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); + for (unsigned i = 0; i < InterleaveFactor; i++) { + Instruction *Member = Group->getMember(i); + if (!Member) + continue; + auto *MemberTy = getLoadStoreType(Member); + bool MemberNI = DL.isNonIntegralPointerType(MemberTy); + // Don't coerce non-integral pointers to integers or vice versa. + if (MemberNI != ScalarNI) { + // TODO: Consider adding special nullptr value case here + return false; + } else if (MemberNI && ScalarNI && + ScalarTy->getPointerAddressSpace() != + MemberTy->getPointerAddressSpace()) { + return false; + } + } + // Check if masking is required. // A Group may need masking for one of two reasons: it resides in a block that // needs predication, or it was decided to use masking to deal with gaps @@ -5170,7 +4719,7 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { return true; } - if (!PSE.getUnionPredicate().getPredicates().empty()) { + if (!PSE.getPredicate().isAlwaysTrue()) { reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", "runtime SCEV checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " @@ -5461,14 +5010,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } } - // For scalable vectors don't use tail folding for low trip counts or - // optimizing for code size. We only permit this if the user has explicitly - // requested it. - if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate && - ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate && - MaxFactors.ScalableVF.isVector()) - MaxFactors.ScalableVF = ElementCount::getScalable(0); - // If we don't know the precise trip count, or if the trip count that we // found modulo the vectorization factor is not zero, try to fold the tail // by masking. @@ -5511,7 +5052,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF, bool FoldTailByMasking) { + ElementCount MaxSafeVF, bool FoldTailByMasking) { bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); TypeSize WidestRegister = TTI.getRegisterBitWidth( ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector @@ -5556,9 +5097,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( return ElementCount::getFixed(ClampedConstTripCount); } + TargetTransformInfo::RegisterKind RegKind = + ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector; ElementCount MaxVF = MaxVectorElementCount; - if (TTI.shouldMaximizeVectorBandwidth() || - (MaximizeBandwidth && isScalarEpilogueAllowed())) { + if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && + TTI.shouldMaximizeVectorBandwidth(RegKind))) { auto MaxVectorElementCountMaxBW = ElementCount::get( PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), ComputeScalableMaxVF); @@ -5596,10 +5140,27 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( MaxVF = MinVF; } } + + // Invalidate any widening decisions we might have made, in case the loop + // requires prediction (decided later), but we have already made some + // load/store widening decisions. 
+ invalidateCostModelingDecisions(); } return MaxVF; } +Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { + if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { + auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); + auto Min = Attr.getVScaleRangeMin(); + auto Max = Attr.getVScaleRangeMax(); + if (Max && Min == Max) + return Max; + } + + return TTI.getVScaleForTuning(); +} + bool LoopVectorizationCostModel::isMoreProfitable( const VectorizationFactor &A, const VectorizationFactor &B) const { InstructionCost CostA = A.Cost; @@ -5624,7 +5185,7 @@ bool LoopVectorizationCostModel::isMoreProfitable( // Improve estimate for the vector width if it is scalable. unsigned EstimatedWidthA = A.Width.getKnownMinValue(); unsigned EstimatedWidthB = B.Width.getKnownMinValue(); - if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) { + if (Optional<unsigned> VScale = getVScaleForTuning()) { if (A.Width.isScalable()) EstimatedWidthA *= VScale.getValue(); if (B.Width.isScalable()) @@ -5651,7 +5212,8 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( assert(VFCandidates.count(ElementCount::getFixed(1)) && "Expected Scalar VF to be a candidate"); - const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); + const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, + ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; @@ -5669,12 +5231,12 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( continue; VectorizationCostTy C = expectedCost(i, &InvalidCosts); - VectorizationFactor Candidate(i, C.first); + VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); #ifndef NDEBUG unsigned AssumedMinimumVscale = 1; - if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) - AssumedMinimumVscale = VScale.getValue(); + if (Optional<unsigned> VScale = getVScaleForTuning()) + AssumedMinimumVscale = *VScale; unsigned Width = Candidate.Width.isScalable() ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale @@ -5862,7 +5424,7 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); if (LVP.hasPlanWithVF(ForcedEC)) - return {ForcedEC, 0}; + return {ForcedEC, 0, 0}; else { LLVM_DEBUG( dbgs() @@ -5885,8 +5447,20 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( return Result; } + // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know + // the main loop handles 8 lanes per iteration. We could still benefit from + // vectorizing the epilogue loop with VF=4. 
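Note: a rough sketch of the estimate made here, in plain C++ rather than the cost-model API: the known-minimum element count of a scalable VF is scaled by the expected vscale so it can be compared against fixed-width candidates.

    #include <cstdio>
    #include <optional>

    // Estimated number of lanes a (possibly scalable) VF covers at run time,
    // given an expected vscale; fixed-width VFs are returned unchanged.
    unsigned estimatedRuntimeLanes(unsigned KnownMinVF, bool IsScalable,
                                   std::optional<unsigned> VScale) {
      if (IsScalable && VScale)
        return KnownMinVF * *VScale;
      return KnownMinVF;
    }

    int main() {
      // Main loop VF = vscale x 2 with vscale expected to be 4 -> 8 lanes,
      // so a fixed epilogue VF of 4 is still narrower and worth considering.
      std::printf("%u\n", estimatedRuntimeLanes(2, /*IsScalable=*/true, 4));  // 8
      std::printf("%u\n", estimatedRuntimeLanes(4, /*IsScalable=*/false, {})); // 4
      return 0;
    }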
+ ElementCount EstimatedRuntimeVF = MainLoopVF; + if (MainLoopVF.isScalable()) { + EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); + if (Optional<unsigned> VScale = getVScaleForTuning()) + EstimatedRuntimeVF *= *VScale; + } + for (auto &NextVF : ProfitableVFs) - if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && + if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && + ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || + ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && LVP.hasPlanWithVF(NextVF.Width)) Result = NextVF; @@ -6006,6 +5580,18 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) return 1; + // If we did not calculate the cost for VF (because the user selected the VF) + // then we calculate the cost of VF here. + if (LoopCost == 0) { + InstructionCost C = expectedCost(VF).first; + assert(C.isValid() && "Expected to have chosen a VF with valid cost"); + LoopCost = *C.getValue(); + + // Loop body is free and there is no need for interleaving. + if (LoopCost == 0) + return 1; + } + RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. @@ -6097,16 +5683,6 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, assert(IC > 0 && "Interleave count must be greater than 0."); - // If we did not calculate the cost for VF (because the user selected the VF) - // then we calculate the cost of VF here. - if (LoopCost == 0) { - InstructionCost C = expectedCost(VF).first; - assert(C.isValid() && "Expected to have chosen a VF with valid cost"); - LoopCost = *C.getValue(); - } - - assert(LoopCost && "Non-zero loop cost expected"); - // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. if (VF.isVector() && HasReductions) { @@ -6114,9 +5690,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, return IC; } - // Note that if we've already vectorized the loop we will have done the - // runtime check and so interleaving won't require further checks. - bool InterleavingRequiresRuntimePointerCheck = + // For any scalar loop that either requires runtime checks or predication we + // are better off leaving this to the unroller. Note that if we've already + // vectorized the loop we will have done the runtime check and so interleaving + // won't require further checks. 
+ bool ScalarInterleavingRequiresPredication = + (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { + return Legal->blockNeedsPredication(BB); + })); + bool ScalarInterleavingRequiresRuntimePointerCheck = (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); // We want to interleave small loops in order to reduce the loop overhead and @@ -6126,7 +5708,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, << "LV: VF is " << VF << '\n'); const bool AggressivelyInterleaveReductions = TTI.enableAggressiveInterleaving(HasReductions); - if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { + if (!ScalarInterleavingRequiresRuntimePointerCheck && + !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and interleave until the cost of the // loop overhead is about 5% of the cost of the loop. @@ -6289,16 +5872,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); - // A lambda that gets the register usage for the given type and VF. - const auto &TTICapture = TTI; - auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { + auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned { if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) return 0; - InstructionCost::CostType RegUsage = - *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); - assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && - "Nonsensical values for register usage."); - return RegUsage; + return TTI.getRegUsageForType(VectorType::get(Ty, VF)); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -7049,10 +6626,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, bool TypeNotScalarized = false; if (VF.isVector() && VectorTy->isVectorTy()) { - unsigned NumParts = TTI.getNumberOfParts(VectorTy); - if (NumParts) - TypeNotScalarized = NumParts < VF.getKnownMinValue(); - else + if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { + if (VF.isScalable()) + // <vscale x 1 x iN> is assumed to be profitable over iN because + // scalable registers are a distinct register class from scalar ones. + // If we ever find a target which wants to lower scalable vectors + // back to scalars, we'll need to update this code to explicitly + // ask TTI about the register class uses for each part. 
+ TypeNotScalarized = NumParts <= VF.getKnownMinValue(); + else + TypeNotScalarized = NumParts < VF.getKnownMinValue(); + } else C = InstructionCost::getInvalid(); } return VectorizationCostTy(C, TypeNotScalarized); @@ -7128,8 +6712,6 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { Cost = getGatherScatterCost(&I, VF); setWideningDecision(&I, VF, CM_GatherScatter, Cost); } else { - assert((isa<LoadInst>(&I) || !VF.isScalable()) && - "Cannot yet scalarize uniform stores"); Cost = getUniformMemOpCost(&I, VF); setWideningDecision(&I, VF, CM_Scalarize, Cost); } @@ -7487,8 +7069,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, InstWidening Decision = getWideningDecision(I, Width); assert(Decision != CM_Unknown && "CM decision should be taken at this point"); - if (Decision == CM_Scalarize) + if (Decision == CM_Scalarize) { + if (VF.isScalable() && isa<StoreInst>(I)) + // We can't scalarize a scalable vector store (even a uniform one + // currently), return an invalid cost so as to prevent vectorization. + return InstructionCost::getInvalid(); Width = ElementCount::getFixed(1); + } } VectorTy = ToVectorTy(getLoadStoreType(I), Width); return getMemoryInstructionCost(I, VF); @@ -7656,6 +7243,16 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); + // Find all stores to invariant variables. Since they are going to sink + // outside the loop we do not need calculate cost for them. + for (BasicBlock *BB : TheLoop->blocks()) + for (Instruction &I : *BB) { + StoreInst *SI; + if ((SI = dyn_cast<StoreInst>(&I)) && + Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) + ValuesToIgnore.insert(&I); + } + // Ignore type-promoting instructions we identified during reduction // detection. for (auto &Reduction : Legal->getReductionVars()) { @@ -7757,7 +7354,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/}; + return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; } LLVM_DEBUG( @@ -7766,6 +7363,14 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { return VectorizationFactor::Disabled(); } +bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const { + unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); + return (NumRuntimePointerChecks > + VectorizerParams::RuntimeMemoryCheckThreshold && + !Hints.allowReordering()) || + NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; +} + Optional<VectorizationFactor> LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -7800,7 +7405,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { CM.collectInLoopReductions(); buildVPlansWithVPRecipes(UserVF, UserVF); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0}}; + return {{UserVF, 0, 0}}; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); @@ -7834,30 +7439,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. - auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); - - // Check if it is profitable to vectorize with runtime checks. 
- unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); - if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { - bool PragmaThresholdReached = - NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; - bool ThresholdReached = - NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; - if ((ThresholdReached && !Hints.allowReordering()) || - PragmaThresholdReached) { - ORE->emit([&]() { - return OptimizationRemarkAnalysisAliasing( - DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), - OrigLoop->getHeader()) - << "loop not vectorized: cannot prove it is safe to reorder " - "memory operations"; - }); - LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); - Hints.emitRemarkWithHints(); - return VectorizationFactor::Disabled(); - } - } - return SelectedVF; + return CM.selectVectorizationFactor(VFCandidates); } VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { @@ -7910,17 +7492,36 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, InnerLoopVectorizer &ILV, - DominatorTree *DT) { + DominatorTree *DT, + bool IsEpilogueVectorization) { LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF << '\n'); // Perform the actual loop transformation. - // 1. Create a new empty loop. Unlink the old loop and connect the new one. + // 1. Set up the skeleton for vectorization, including vector pre-header and + // middle block. The vector loop is created during VPlan execution. VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; Value *CanonicalIVStartValue; std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = ILV.createVectorizedLoopSkeleton(); + + // Only use noalias metadata when using memory checks guaranteeing no overlap + // across all iterations. + const LoopAccessInfo *LAI = ILV.Legal->getLAI(); + if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && + !LAI->getRuntimePointerChecking()->getDiffChecks()) { + + // We currently don't use LoopVersioning for the actual loop cloning but we + // still use it to add the noalias metadata. + // TODO: Find a better way to re-use LoopVersioning functionality to add + // metadata. + State.LVer = std::make_unique<LoopVersioning>( + *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, + PSE.getSE()); + State.LVer->prepareNoAliasMetadata(); + } + ILV.collectPoisonGeneratingRecipes(State); ILV.printDebugTracesAtStart(); @@ -7936,7 +7537,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, // 2. Copy and widen instructions from the old loop into the new loop. 
BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), ILV.getOrCreateVectorTripCount(nullptr), - CanonicalIVStartValue, State); + CanonicalIVStartValue, State, + IsEpilogueVectorization); + BestVPlan.execute(&State); // Keep all loop hints from the original loop on the vector loop (we'll @@ -7947,8 +7550,10 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized}); - Loop *L = LI->getLoopFor(State.CFG.PrevBB); - if (VectorizedLoopID.hasValue()) + VPBasicBlock *HeaderVPBB = + BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); + Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); + if (VectorizedLoopID) L->setLoopID(VectorizedLoopID.getValue()); else { // Keep all loop hints from the original loop on the vector loop (we'll @@ -7965,7 +7570,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. - ILV.fixVectorizedLoop(State); + ILV.fixVectorizedLoop(State, BestVPlan); ILV.printDebugTracesAtEnd(); } @@ -8036,22 +7641,31 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } std::pair<BasicBlock *, Value *> EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); - Loop *Lp = createVectorLoopSkeleton(""); + + // Workaround! Compute the trip count of the original loop and cache it + // before we start modifying the CFG. This code has a systemic problem + // wherein it tries to run analysis over partially constructed IR; this is + // wrong, and not simply for SCEV. The trip count of the original loop + // simply happens to be prone to hitting this in practice. In theory, we + // can hit the same issue for any SCEV, or ValueTracking query done during + // mutation. See PR49900. + getOrCreateTripCount(OrigLoop->getLoopPreheader()); + createVectorLoopSkeleton(""); // Generate the code to check the minimum iteration count of the vector // epilogue (see below). EPI.EpilogueIterationCountCheck = - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); + emitIterationCountCheck(LoopScalarPreHeader, true); EPI.EpilogueIterationCountCheck->setName("iter.check"); // Generate the code to check any assumptions that we've made for SCEV // expressions. - EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); + EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); // Generate the code that checks at runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); + EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); // Generate the iteration count check for the main loop, *after* the check // for the epilogue loop, so that the path-length is shorter for the case @@ -8060,19 +7674,17 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { // trip count. Note: the branch will get updated later on when we vectorize // the epilogue. EPI.MainLoopIterationCountCheck = - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); + emitIterationCountCheck(LoopScalarPreHeader, false); // Generate the induction variable. 
- Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - EPI.VectorTripCount = CountRoundDown; - createHeaderBranch(Lp); + EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); // Skip induction resume value creation here because they will be created in // the second pass. If we created them here, they wouldn't be used anyway, // because the vplan in the second pass still contains the inductions from the // original loop. - return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; + return {completeLoopSkeleton(OrigLoopID), nullptr}; } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -8092,13 +7704,13 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { }); } -BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( - Loop *L, BasicBlock *Bypass, bool ForEpilogue) { - assert(L && "Expected valid Loop."); +BasicBlock * +EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, + bool ForEpilogue) { assert(Bypass && "Expected valid bypass basic block."); ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; - Value *Count = getOrCreateTripCount(L); + Value *Count = getOrCreateTripCount(LoopVectorPreHeader); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -8157,7 +7769,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( std::pair<BasicBlock *, Value *> EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); - Loop *Lp = createVectorLoopSkeleton("vec.epilog."); + createVectorLoopSkeleton("vec.epilog."); // Now, compare the remaining count and if there aren't enough iterations to // execute the vectorized epilogue skip to the scalar part. @@ -8166,7 +7778,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { LoopVectorPreHeader = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, LI, nullptr, "vec.epilog.ph"); - emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, + emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, VecEpilogueIterationCountCheck); // Adjust the control flow taking the state info from the main loop @@ -8238,9 +7850,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), EPI.MainLoopIterationCountCheck); - // Generate the induction variable. - createHeaderBranch(Lp); - // Generate induction resume values. These variables save the new starting // indexes for the scalar loop. They are used to test if there are any tail // iterations left once the vector loop has completed. @@ -8248,15 +7857,15 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { // check, then the resume value for the induction variable comes from // the trip count of the main vector loop, hence passing the AdditionalBypass // argument. 
- createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck, - EPI.VectorTripCount} /* AdditionalBypass */); + createInductionResumeValues({VecEpilogueIterationCountCheck, + EPI.VectorTripCount} /* AdditionalBypass */); - return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal}; + return {completeLoopSkeleton(OrigLoopID), EPResumeVal}; } BasicBlock * EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( - Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { + BasicBlock *Bypass, BasicBlock *Insert) { assert(EPI.TripCount && "Expected trip count to have been safed in the first pass."); @@ -8397,7 +8006,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { // constructing the desired canonical IV in the header block as its first // non-phi instructions. assert(CM.foldTailByMasking() && "must fold the tail"); - VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); + VPBasicBlock *HeaderVPBB = + Plan->getVectorLoopRegion()->getEntryBasicBlock(); auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); @@ -8439,8 +8049,6 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, "Must be called with either a load or store"); auto willWiden = [&](ElementCount VF) -> bool { - if (VF.isScalar()) - return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); assert(Decision != LoopVectorizationCostModel::CM_Unknown && @@ -8477,11 +8085,12 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, Mask, Consecutive, Reverse); } -static VPWidenIntOrFpInductionRecipe * -createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, - VPValue *Start, const InductionDescriptor &IndDesc, - LoopVectorizationCostModel &CM, Loop &OrigLoop, - VFRange &Range) { +/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also +/// insert a recipe to expand the step for the induction recipe. +static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( + PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, + const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM, + VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { // Returns true if an instruction \p I should be scalarized instead of // vectorized for the chosen vectorization factor. auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { @@ -8489,18 +8098,6 @@ createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, CM.isProfitableToScalarize(I, VF); }; - bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { - // Returns true if we should generate a scalar version of \p IV. 
- if (ShouldScalarizeInstruction(PhiOrTrunc, VF)) - return true; - auto isScalarInst = [&](User *U) -> bool { - auto *I = cast<Instruction>(U); - return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF); - }; - return any_of(PhiOrTrunc->users(), isScalarInst); - }, - Range); bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return ShouldScalarizeInstruction(PhiOrTrunc, VF); @@ -8508,30 +8105,38 @@ createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, Range); assert(IndDesc.getStartValue() == Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); + assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && + "step must be loop invariant"); + + VPValue *Step = + vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { - return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI, - NeedsScalarIV, !NeedsScalarIVOnly); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, + !NeedsScalarIVOnly); } assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); - return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV, + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, !NeedsScalarIVOnly); } -VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( - PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const { +VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( + PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) - return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop, - Range); + return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, + *PSE.getSE(), *OrigLoop, Range); + // Check if this is pointer induction. If so, build the recipe for it. + if (auto *II = Legal->getPointerInductionDescriptor(Phi)) + return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II, + *PSE.getSE()); return nullptr; } VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( - TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, - VPlan &Plan) const { + TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { // Optimize the special case where the source is a constant integer // induction variable. Notice that we can only optimize the 'trunc' case // because (a) FP conversions lose precision, (b) sext/zext may wrap, and @@ -8552,7 +8157,8 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( auto *Phi = cast<PHINode>(I->getOperand(0)); const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); - return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range); + return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, + *PSE.getSE(), *OrigLoop, Range); } return nullptr; } @@ -8569,13 +8175,30 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, return Operands[0]; } + unsigned NumIncoming = Phi->getNumIncomingValues(); + // For in-loop reductions, we do not need to create an additional select. 
+ VPValue *InLoopVal = nullptr; + for (unsigned In = 0; In < NumIncoming; In++) { + PHINode *PhiOp = + dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); + if (PhiOp && CM.isInLoopReduction(PhiOp)) { + assert(!InLoopVal && "Found more than one in-loop reduction!"); + InLoopVal = Operands[In]; + } + } + + assert((!InLoopVal || NumIncoming == 2) && + "Found an in-loop reduction for PHI with unexpected number of " + "incoming values"); + if (InLoopVal) + return Operands[Operands[0] == InLoopVal ? 1 : 0]; + // We know that all PHIs in non-header blocks are converted into selects, so // we don't have to worry about the insertion order and we can just use the // builder. At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. SmallVector<VPValue *, 2> OperandsWithMask; - unsigned NumIncoming = Phi->getNumIncomingValues(); for (unsigned In = 0; In < NumIncoming; In++) { VPValue *EdgeMask = @@ -8681,6 +8304,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, case Instruction::URem: case Instruction::Xor: case Instruction::ZExt: + case Instruction::Freeze: return true; } return false; @@ -8806,14 +8430,14 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, Plan->removeVPValueFor(Instr); Plan->addVPValue(Instr, PHIRecipe); } - auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); + auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); - VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); + VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); // Note: first set Entry as region entry and then connect successors starting // from it in order, to propagate the "parent" of each VPBasicBlock. - VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); - VPBlockUtils::connectBlocks(Pred, Exit); + VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); + VPBlockUtils::connectBlocks(Pred, Exiting); return Region; } @@ -8822,52 +8446,37 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, ArrayRef<VPValue *> Operands, VFRange &Range, VPlanPtr &Plan) { - // First, check for specific widening recipes that deal with calls, memory - // operations, inductions and Phi nodes. - if (auto *CI = dyn_cast<CallInst>(Instr)) - return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); - - if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) - return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); - + // First, check for specific widening recipes that deal with inductions, Phi + // nodes, calls and memory operations. 
VPRecipeBase *Recipe; if (auto Phi = dyn_cast<PHINode>(Instr)) { if (Phi->getParent() != OrigLoop->getHeader()) return tryToBlend(Phi, Operands, Plan); - if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) + if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) return toVPRecipeResult(Recipe); VPHeaderPHIRecipe *PhiRecipe = nullptr; - if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { - VPValue *StartV = Operands[0]; - if (Legal->isReductionVariable(Phi)) { - const RecurrenceDescriptor &RdxDesc = - Legal->getReductionVars().find(Phi)->second; - assert(RdxDesc.getRecurrenceStartValue() == - Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); - PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, - CM.isInLoopReduction(Phi), - CM.useOrderedReductions(RdxDesc)); - } else { - PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); - } - - // Record the incoming value from the backedge, so we can add the incoming - // value from the backedge after all recipes have been created. - recordRecipeOf(cast<Instruction>( - Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); - PhisToFix.push_back(PhiRecipe); + assert((Legal->isReductionVariable(Phi) || + Legal->isFirstOrderRecurrence(Phi)) && + "can only widen reductions and first-order recurrences here"); + VPValue *StartV = Operands[0]; + if (Legal->isReductionVariable(Phi)) { + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(Phi)->second; + assert(RdxDesc.getRecurrenceStartValue() == + Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); + PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, + CM.isInLoopReduction(Phi), + CM.useOrderedReductions(RdxDesc)); } else { - // TODO: record backedge value for remaining pointer induction phis. - assert(Phi->getType()->isPointerTy() && - "only pointer phis should be handled here"); - assert(Legal->getInductionVars().count(Phi) && - "Not an induction variable"); - InductionDescriptor II = Legal->getInductionVars().lookup(Phi); - VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); - PhiRecipe = new VPWidenPHIRecipe(Phi, Start); + PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); } + // Record the incoming value from the backedge, so we can add the incoming + // value from the backedge after all recipes have been created. + recordRecipeOf(cast<Instruction>( + Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); + PhisToFix.push_back(PhiRecipe); return toVPRecipeResult(PhiRecipe); } @@ -8876,6 +8485,17 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, Range, *Plan))) return toVPRecipeResult(Recipe); + // All widen recipes below deal only with VF > 1. + if (LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { return VF.isScalar(); }, Range)) + return nullptr; + + if (auto *CI = dyn_cast<CallInst>(Instr)) + return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); + + if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) + return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); + if (!shouldWiden(Instr, Range)) return nullptr; @@ -8949,15 +8569,13 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a // BranchOnCount VPInstruction to the latch. 
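Note: in scalar terms, the recipes added here produce a counter that starts at zero, steps by VF * UF per vector iteration, and exits when it equals the vector trip count. A plain C++ sketch of that control shape, with VF, UF and the trip count as assumed illustrative constants:

    #include <cstdio>

    int main() {
      const unsigned VF = 4, UF = 2;        // lanes per part, number of parts
      const unsigned VectorTripCount = 24;  // assumed to be a multiple of VF * UF
      // Canonical IV: phi [0, preheader], [iv + VF*UF, latch]; BranchOnCount
      // compares the incremented value against the vector trip count.
      for (unsigned IV = 0; IV != VectorTripCount; IV += VF * UF)
        std::printf("vector iteration starting at element %u\n", IV);
      return 0;
    }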
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - bool HasNUW, bool IsVPlanNative) { + bool HasNUW) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getOrAddVPValue(StartIdx); auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); - if (IsVPlanNative) - Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); Header->insert(CanonicalIVPHI, Header->begin()); auto *CanonicalIVIncrement = @@ -8966,11 +8584,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, {CanonicalIVPHI}, DL); CanonicalIVPHI->addOperand(CanonicalIVIncrement); - VPBasicBlock *EB = TopRegion->getExitBasicBlock(); - if (IsVPlanNative) { - EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); - EB->setCondBit(nullptr); - } + VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); EB->appendRecipe(CanonicalIVIncrement); auto *BranchOnCount = @@ -8979,6 +8593,26 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, EB->appendRecipe(BranchOnCount); } +// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the +// original exit block. +static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, + VPBasicBlock *MiddleVPBB, Loop *OrigLoop, + VPlan &Plan) { + BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); + BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); + // Only handle single-exit loops with unique exit blocks for now. + if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) + return; + + // Introduce VPUsers modeling the exit values. + for (PHINode &ExitPhi : ExitBB->phis()) { + Value *IncomingValue = + ExitPhi.getIncomingValueForBlock(ExitingBB); + VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); + Plan.addLiveOut(&ExitPhi, V); + } +} + VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, const MapVector<Instruction *, Instruction *> &SinkAfter) { @@ -9007,7 +8641,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RecipeBuilder.recordRecipeOf(Phi); for (auto &R : ReductionOperations) { RecipeBuilder.recordRecipeOf(R); - // For min/max reducitons, where we have a pair of icmp/select, we also + // For min/max reductions, where we have a pair of icmp/select, we also // need to record the ICmp recipe, so it can be removed later. assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && "Only min/max recurrences allowed for inloop reductions"); @@ -9039,18 +8673,25 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - // Create initial VPlan skeleton, with separate header and latch blocks. - VPBasicBlock *HeaderVPBB = new VPBasicBlock(); + // Create initial VPlan skeleton, starting with a block for the pre-header, + // followed by a region for the vector loop, followed by the middle block. The + // skeleton vector loop region contains a header and latch block. 
+ VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); + auto Plan = std::make_unique<VPlan>(Preheader); + + VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); - auto Plan = std::make_unique<VPlan>(TopRegion); + VPBlockUtils::insertBlockAfter(TopRegion, Preheader); + VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); + VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); Instruction *DLInst = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking(), false); + !CM.foldTailByMasking()); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -9063,11 +8704,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. unsigned VPBBsForBB = 0; - VPBB->setName(BB->getName()); + if (VPBB != HeaderVPBB) + VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. - // TODO: Model and preserve debug instrinsics in VPlan. + // TODO: Model and preserve debug intrinsics in VPlan. for (Instruction &I : BB->instructionsWithoutDebug()) { Instruction *Instr = &I; @@ -9085,6 +8727,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( auto OpRange = Plan->mapToVPValues(Instr->operands()); Operands = {OpRange.begin(), OpRange.end()}; } + + // Invariant stores inside loop will be deleted and a single store + // with the final reduction value will be added to the exit block + StoreInst *SI; + if ((SI = dyn_cast<StoreInst>(&I)) && + Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) + continue; + if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( Instr, Operands, Range, Plan)) { // If Instr can be simplified to an existing VPValue, use it. @@ -9135,14 +8785,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); } + HeaderVPBB->setName("vector.body"); + // Fold the last, empty block into its predecessor. VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); assert(VPBB && "expected to fold last (empty) block"); // After here, VPBB should not be used. VPBB = nullptr; - assert(isa<VPRegionBlock>(Plan->getEntry()) && - !Plan->getEntry()->getEntryBasicBlock()->empty() && + addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); + + assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && + !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); @@ -9222,12 +8876,13 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, + adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, RecipeBuilder, Range.Start); // Introduce a recipe to combine the incoming and previous values of a // first-order recurrence. 
- for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { + for (VPRecipeBase &R : + Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); if (!RecurPhi) continue; @@ -9236,7 +8891,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBasicBlock *InsertBlock = PrevRecipe->getParent(); auto *Region = GetReplicateRegion(PrevRecipe); if (Region) - InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); + InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor()); + if (!InsertBlock) { + InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); + VPBlockUtils::insertBlockAfter(InsertBlock, Region); + } if (Region || PrevRecipe->isPhi()) Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); else @@ -9283,13 +8942,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } - // From this point onwards, VPlan-to-VPlan transformations may change the plan - // in ways that accessing values using original IR values is incorrect. - Plan->disableValue2VPValue(); - - VPlanTransforms::sinkScalarOperands(*Plan); - VPlanTransforms::mergeReplicateRegions(*Plan); - std::string PlanName; raw_string_ostream RSO(PlanName); ElementCount VF = Range.Start; @@ -9303,10 +8955,20 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RSO.flush(); Plan->setName(PlanName); + // From this point onwards, VPlan-to-VPlan transformations may change the plan + // in ways that accessing values using original IR values is incorrect. + Plan->disableValue2VPValue(); + + VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); + VPlanTransforms::sinkScalarOperands(*Plan); + VPlanTransforms::mergeReplicateRegions(*Plan); + VPlanTransforms::removeDeadRecipes(*Plan); + VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); + // Fold Exit block into its predecessor if possible. // TODO: Fold block earlier once all VPlan transforms properly maintain a // VPBasicBlock as exit. - VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); + VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting()); assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; @@ -9331,23 +8993,20 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { VF *= 2) Plan->addVF(VF); - if (EnableVPlanPredication) { - VPlanPredicator VPP(*Plan); - VPP.predicate(); - - // Avoid running transformation to recipes until masked code generation in - // VPlan-native path is in place. - return Plan; - } - SmallPtrSet<Instruction *, 1> DeadInstructions; VPlanTransforms::VPInstructionsToVPRecipes( OrigLoop, Plan, [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, DeadInstructions, *PSE.getSE()); + // Remove the existing terminator of the exiting block of the top-most region. + // A BranchOnCount will be added instead when adding the canonical IV recipes. + auto *Term = + Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); + Term->eraseFromParent(); + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true, true); + true); return Plan; } @@ -9399,7 +9058,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); - auto *CondOp = CM.foldTailByMasking() + auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent()) ? 
RecipeBuilder.createBlockInMask(R->getParent(), Plan) : nullptr; @@ -9441,7 +9100,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // dedicated latch block. if (CM.foldTailByMasking()) { Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); - for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { + for (VPRecipeBase &R : + Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); if (!PhiR || PhiR->isInLoop()) continue; @@ -9493,7 +9153,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { void VPWidenSelectRecipe::execute(VPTransformState &State) { auto &I = *cast<SelectInst>(getUnderlyingInstr()); - State.ILV->setDebugLocFromInst(&I); + State.setDebugLocFromInst(&I); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. @@ -9508,7 +9168,7 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) { Value *Op1 = State.get(getOperand(2), Part); Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); State.set(this, Sel, Part); - State.ILV->addMetadata(Sel, &I); + State.addMetadata(Sel, &I); } } @@ -9542,7 +9202,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { case Instruction::Or: case Instruction::Xor: { // Just widen unops and binops. - State.ILV->setDebugLocFromInst(&I); + State.setDebugLocFromInst(&I); for (unsigned Part = 0; Part < State.UF; ++Part) { SmallVector<Value *, 2> Ops; @@ -9565,17 +9225,28 @@ void VPWidenRecipe::execute(VPTransformState &State) { // Use this vector value for all users of the original instruction. State.set(this, V, Part); - State.ILV->addMetadata(V, &I); + State.addMetadata(V, &I); } break; } + case Instruction::Freeze: { + State.setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Op = State.get(getOperand(0), Part); + + Value *Freeze = Builder.CreateFreeze(Op); + State.set(this, Freeze, Part); + } + break; + } case Instruction::ICmp: case Instruction::FCmp: { // Widen compares. Generate vector compares. bool FCmp = (I.getOpcode() == Instruction::FCmp); auto *Cmp = cast<CmpInst>(&I); - State.ILV->setDebugLocFromInst(Cmp); + State.setDebugLocFromInst(Cmp); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); @@ -9589,7 +9260,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { C = Builder.CreateICmp(Cmp->getPredicate(), A, B); } State.set(this, C, Part); - State.ILV->addMetadata(C, &I); + State.addMetadata(C, &I); } break; @@ -9608,7 +9279,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { case Instruction::FPTrunc: case Instruction::BitCast: { auto *CI = cast<CastInst>(&I); - State.ILV->setDebugLocFromInst(CI); + State.setDebugLocFromInst(CI); /// Vectorize casts. 
Type *DestTy = (State.VF.isScalar()) @@ -9619,7 +9290,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { Value *A = State.get(getOperand(0), Part); Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); State.set(this, Cast, Part); - State.ILV->addMetadata(Cast, &I); + State.addMetadata(Cast, &I); } break; } @@ -9655,7 +9326,7 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { for (unsigned Part = 0; Part < State.UF; ++Part) { Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); State.set(this, EntryPart, Part); - State.ILV->addMetadata(EntryPart, GEP); + State.addMetadata(EntryPart, GEP); } } else { // If the GEP has at least one loop-varying operand, we are sure to @@ -9693,32 +9364,276 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { // Create the new GEP. Note that this GEP may be a scalar if VF == 1, // but it should be a vector, otherwise. - auto *NewGEP = IsInBounds - ? State.Builder.CreateInBoundsGEP( - GEP->getSourceElementType(), Ptr, Indices) - : State.Builder.CreateGEP(GEP->getSourceElementType(), - Ptr, Indices); + auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, + Indices, "", IsInBounds); assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && "NewGEP is not a pointer vector"); State.set(this, NewGEP, Part); - State.ILV->addMetadata(NewGEP, GEP); + State.addMetadata(NewGEP, GEP); } } } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); - State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV); + + Value *Start = getStartValue()->getLiveInIRValue(); + const InductionDescriptor &ID = getInductionDescriptor(); + TruncInst *Trunc = getTruncInst(); + IRBuilderBase &Builder = State.Builder; + assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); + assert(State.VF.isVector() && "must have vector VF"); + + // The value from the original loop to which we are mapping the new induction + // variable. + Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; + + // Fast-math-flags propagate from the original induction instruction. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) + Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); + + // Now do the actual transformations, and start with fetching the step value. 
+ Value *Step = State.get(getStepValue(), VPIteration(0, 0)); + + assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && + "Expected either an induction phi-node or a truncate of it!"); + + // Construct the initial value of the vector IV in the vector loop preheader + auto CurrIP = Builder.saveIP(); + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + Builder.SetInsertPoint(VectorPH->getTerminator()); + if (isa<TruncInst>(EntryVal)) { + assert(Start->getType()->isIntegerTy() && + "Truncation requires an integer type"); + auto *TruncType = cast<IntegerType>(EntryVal->getType()); + Step = Builder.CreateTrunc(Step, TruncType); + Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); + } + + Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); + Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); + Value *SteppedStart = getStepVector( + SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); + + // We create vector phi nodes for both integer and floating-point induction + // variables. Here, we determine the kind of arithmetic we will perform. + Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + if (Step->getType()->isIntegerTy()) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = ID.getInductionOpcode(); + MulOp = Instruction::FMul; + } + + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Type *StepType = Step->getType(); + Value *RuntimeVF; + if (Step->getType()->isFloatingPointTy()) + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); + else + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); + Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); + + // Create a vector splat to use in the induction update. + // + // FIXME: If the step is non-constant, we create the vector splat with + // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't + // handle a constant vector splat. + Value *SplatVF = isa<Constant>(Mul) + ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); + Builder.restoreIP(CurrIP); + + // We may need to add the step a number of times, depending on the unroll + // factor. The last of those goes into the PHI. + PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", + &*State.CFG.PrevBB->getFirstInsertionPt()); + VecInd->setDebugLoc(EntryVal->getDebugLoc()); + Instruction *LastInduction = VecInd; + for (unsigned Part = 0; Part < State.UF; ++Part) { + State.set(this, LastInduction, Part); + + if (isa<TruncInst>(EntryVal)) + State.addMetadata(LastInduction, EntryVal); + + LastInduction = cast<Instruction>( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + LastInduction->setDebugLoc(EntryVal->getDebugLoc()); + } + + LastInduction->setName("vec.ind.next"); + VecInd->addIncoming(SteppedStart, VectorPH); + // Add induction update using an incorrect block temporarily. The phi node + // will be fixed after VPlan execution. Note that at this point the latch + // block cannot be used, as it does not exist yet. + // TODO: Model increment value in VPlan, by turning the recipe into a + // multi-def and a subclass of VPHeaderPHIRecipe. 
+ VecInd->addIncoming(LastInduction, VectorPH); +} + +void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { + assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && + "Not a pointer induction according to InductionDescriptor!"); + assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && + "Unexpected type."); + + auto *IVR = getParent()->getPlan()->getCanonicalIV(); + PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); + + if (onlyScalarsGenerated(State.VF)) { + // This is the normalized GEP that starts counting at zero. + Value *PtrInd = State.Builder.CreateSExtOrTrunc( + CanonicalIV, IndDesc.getStep()->getType()); + // Determine the number of scalars we need to generate for each unroll + // iteration. If the instruction is uniform, we only need to generate the + // first lane. Otherwise, we generate all VF values. + bool IsUniform = vputils::onlyFirstLaneUsed(this); + assert((IsUniform || !State.VF.isScalable()) && + "Cannot scalarize a scalable VF"); + unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *PartStart = + createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); + + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + Value *Idx = State.Builder.CreateAdd( + PartStart, ConstantInt::get(PtrInd->getType(), Lane)); + Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); + + Value *Step = CreateStepValue(IndDesc.getStep(), SE, + State.CFG.PrevBB->getTerminator()); + Value *SclrGep = emitTransformedIndex( + State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); + SclrGep->setName("next.gep"); + State.set(this, SclrGep, VPIteration(Part, Lane)); + } + } + return; + } + + assert(isa<SCEVConstant>(IndDesc.getStep()) && + "Induction step not a SCEV constant!"); + Type *PhiType = IndDesc.getStep()->getType(); + + // Build a pointer phi + Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); + Type *ScStValueType = ScalarStartValue->getType(); + PHINode *NewPointerPhi = + PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); + + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); + + // A pointer induction, performed by using a gep + const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout(); + Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); + + const SCEV *ScalarStep = IndDesc.getStep(); + SCEVExpander Exp(SE, DL, "induction"); + Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); + Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); + Value *NumUnrolledElems = + State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); + Value *InductionGEP = GetElementPtrInst::Create( + IndDesc.getElementType(), NewPointerPhi, + State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", + InductionLoc); + // Add induction update using an incorrect block temporarily. The phi node + // will be fixed after VPlan execution. Note that at this point the latch + // block cannot be used, as it does not exist yet. + // TODO: Model increment value in VPlan, by turning the recipe into a + // multi-def and a subclass of VPHeaderPHIRecipe. + NewPointerPhi->addIncoming(InductionGEP, VectorPH); + + // Create UF many actual address geps that use the pointer + // phi as base and a vectorized version of the step value + // (<step*0, ..., step*N>) as offset. 
+ for (unsigned Part = 0; Part < State.UF; ++Part) { + Type *VecPhiType = VectorType::get(PhiType, State.VF); + Value *StartOffsetScalar = + State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); + Value *StartOffset = + State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); + // Create a vector of consecutive numbers from zero to VF. + StartOffset = State.Builder.CreateAdd( + StartOffset, State.Builder.CreateStepVector(VecPhiType)); + + Value *GEP = State.Builder.CreateGEP( + IndDesc.getElementType(), NewPointerPhi, + State.Builder.CreateMul( + StartOffset, + State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), + "vector.gep")); + State.set(this, GEP, Part); + } } -void VPWidenPHIRecipe::execute(VPTransformState &State) { - State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, - State); +void VPScalarIVStepsRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); + + // Fast-math-flags propagate from the original induction instruction. + IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); + if (IndDesc.getInductionBinOp() && + isa<FPMathOperator>(IndDesc.getInductionBinOp())) + State.Builder.setFastMathFlags( + IndDesc.getInductionBinOp()->getFastMathFlags()); + + Value *Step = State.get(getStepValue(), VPIteration(0, 0)); + auto CreateScalarIV = [&](Value *&Step) -> Value * { + Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); + auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); + if (!isCanonical() || CanonicalIV->getType() != Ty) { + ScalarIV = + Ty->isIntegerTy() + ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) + : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); + ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, + getStartValue()->getLiveInIRValue(), Step, + IndDesc); + ScalarIV->setName("offset.idx"); + } + if (TruncToTy) { + assert(Step->getType()->isIntegerTy() && + "Truncation requires an integer step"); + ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); + Step = State.Builder.CreateTrunc(Step, TruncToTy); + } + return ScalarIV; + }; + + Value *ScalarIV = CreateScalarIV(Step); + if (State.VF.isVector()) { + buildScalarSteps(ScalarIV, Step, IndDesc, this, State); + return; + } + + for (unsigned Part = 0; Part < State.UF; ++Part) { + assert(!State.VF.isScalable() && "scalable vectors not yet supported."); + Value *EntryPart; + if (Step->getType()->isFloatingPointTy()) { + Value *StartIdx = + getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); + // Floating-point operations inherit FMF via the builder's flags. + Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); + EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), + ScalarIV, MulOp); + } else { + Value *StartIdx = + getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); + EntryPart = State.Builder.CreateAdd( + ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); + } + State.set(this, EntryPart, Part); + } } void VPBlendRecipe::execute(VPTransformState &State) { - State.ILV->setDebugLocFromInst(Phi, &State.Builder); + State.setDebugLocFromInst(Phi); // We know that all PHIs in non-header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. 
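Aside, for orientation: the pointer-induction and scalar-IV-steps recipes above both materialize per-part, per-lane values of the form Start + (Part * VF + Lane) * Step. The following is a minimal standalone sketch of that arithmetic (plain C++, no LLVM APIs; the function name and types are illustrative and not part of this patch), assuming a fixed VF:

#include <cstdint>
#include <vector>

// Illustrative only: enumerate the scalar values a widened pointer/int
// induction produces for one vector iteration with unroll factor UF.
// Lane L of part P holds Start + (P * VF + L) * Step, mirroring the
// PartStart/Lane arithmetic in the recipes above.
std::vector<int64_t> unrolledInductionLanes(int64_t Start, int64_t Step,
                                            unsigned VF, unsigned UF) {
  std::vector<int64_t> Lanes;
  Lanes.reserve(static_cast<size_t>(VF) * UF);
  for (unsigned Part = 0; Part < UF; ++Part) {
    int64_t PartStart = static_cast<int64_t>(Part) * VF; // cf. createStepForVF
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      Lanes.push_back(Start + (PartStart + Lane) * Step);
  }
  return Lanes;
}

With Start = 0, Step = 1, VF = 4 and UF = 2 this yields 0..7, i.e. two consecutive groups of VF lanes, one per unrolled part.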
@@ -9979,7 +9894,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { // Handle Stores: if (SI) { - State.ILV->setDebugLocFromInst(SI); + State.setDebugLocFromInst(SI); for (unsigned Part = 0; Part < State.UF; ++Part) { Instruction *NewSI = nullptr; @@ -10005,14 +9920,14 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { else NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); } - State.ILV->addMetadata(NewSI, SI); + State.addMetadata(NewSI, SI); } return; } // Handle loads. assert(LI && "Must have a load instruction"); - State.ILV->setDebugLocFromInst(LI); + State.setDebugLocFromInst(LI); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *NewLI; if (CreateGatherScatter) { @@ -10020,7 +9935,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { Value *VectorGep = State.get(getAddr(), Part); NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, nullptr, "wide.masked.gather"); - State.ILV->addMetadata(NewLI, LI); + State.addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); @@ -10033,12 +9948,12 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); // Add metadata to the load, but setVectorValue to the reverse shuffle. - State.ILV->addMetadata(NewLI, LI); + State.addMetadata(NewLI, LI); if (Reverse) NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); } - State.set(this, NewLI, Part); + State.set(getVPSingleValue(), NewLI, Part); } } @@ -10119,7 +10034,8 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) { // Check if there is a scalar value for the selected lane. if (!hasScalarValue(Def, {Part, LastLane})) { // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. - assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && + assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) || + isa<VPScalarIVStepsRecipe>(Def->getDef())) && "unexpected recipe found to be invariant"); IsUniform = true; LastLane = 0; @@ -10201,8 +10117,7 @@ static bool processLoopInVPlanNativePath( // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. // Also, do not attempt to vectorize if no vector code will be produced. - if (VPlanBuildStressTest || EnableVPlanPredication || - VectorizationFactor::Disabled() == VF) + if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) return false; VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); @@ -10214,7 +10129,7 @@ static bool processLoopInVPlanNativePath( &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); - LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); + LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); } // Mark the loop as already vectorized to avoid vectorizing again. 
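Aside: in VPWidenMemoryInstructionRecipe::execute above, a consecutive access with negative stride is emitted as one contiguous wide load followed by CreateVectorReverse. A minimal scalar-equivalent sketch of that pattern (illustrative C++ only, not the recipe's actual codegen):

#include <algorithm>
#include <array>
#include <cstddef>

// Illustrative only: a consecutive-but-reversed access A[i], A[i-1], ...,
// A[i-VF+1] can be implemented as one contiguous "wide load" starting at
// the lowest address, followed by an element reverse, mirroring the
// CreateVectorReverse call in the recipe above.
template <size_t VF>
std::array<int, VF> reversedWideLoad(const int *A, size_t i) {
  std::array<int, VF> Wide;
  const int *Base = A + i - (VF - 1);        // lowest address of the group
  std::copy(Base, Base + VF, Wide.begin());  // contiguous wide load
  std::reverse(Wide.begin(), Wide.end());    // "reverse" shuffle
  return Wide;
}

For example, reversedWideLoad<4>(A, 7) returns {A[7], A[6], A[5], A[4]}.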
@@ -10282,8 +10197,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { const std::string DebugLocStr = getDebugLocString(L); #endif /* NDEBUG */ - LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" - << L->getHeader()->getParent()->getName() << "\" from " + LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" + << L->getHeader()->getParent()->getName() << "' from " << DebugLocStr << "\n"); LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); @@ -10438,10 +10353,30 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + F->getParent()->getDataLayout()); if (MaybeVF) { + if (LVP.requiresTooManyRuntimeChecks()) { + ORE->emit([&]() { + return OptimizationRemarkAnalysisAliasing( + DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), + L->getHeader()) + << "loop not vectorized: cannot prove it is safe to reorder " + "memory operations"; + }); + LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); + Hints.emitRemarkWithHints(); + return false; + } VF = *MaybeVF; // Select the interleave count. IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); + + unsigned SelectedIC = std::max(IC, UserIC); + // Optimistically generate runtime checks if they are needed. Drop them if + // they turn out to not be profitable. + if (VF.Width.isVector() || SelectedIC > 1) + Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); } // Identify the diagnostic messages that should be produced. @@ -10529,14 +10464,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool DisableRuntimeUnroll = false; MDNode *OrigLoopID = L->getLoopID(); { - // Optimistically generate runtime checks. Drop them if they turn out to not - // be profitable. Limit the scope of Checks, so the cleanup happens - // immediately after vector codegeneration is done. - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, - F->getParent()->getDataLayout()); - if (!VF.Width.isScalar() || IC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); - using namespace ore; if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0"); @@ -10546,7 +10473,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { &CM, BFI, PSI, Checks); VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); - LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); + LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); ORE->emit([&]() { return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), @@ -10571,12 +10498,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, - DT); + DT, true); ++LoopsVectorized; - simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); - formLCSSARecursively(*L, *DT, LI, SE); - // Second pass vectorizes the epilogue and adjusts the control flow // edges from the first pass. EPI.MainLoopVF = EPI.EpilogueVF; @@ -10586,23 +10510,24 @@ bool LoopVectorizePass::processLoop(Loop *L) { Checks); VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); + VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); + VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); + Header->setName("vec.epilog.vector.body"); // Ensure that the start values for any VPReductionPHIRecipes are // updated before vectorising the epilogue loop. 
- VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock(); for (VPRecipeBase &R : Header->phis()) { if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { if (auto *Resume = MainILV.getReductionResumeValue( ReductionPhi->getRecurrenceDescriptor())) { - VPValue *StartVal = new VPValue(Resume); - BestEpiPlan.addExternalDef(StartVal); + VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume); ReductionPhi->setOperand(0, StartVal); } } } LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, - DT); + DT, true); ++LoopsEpilogueVectorized; if (!MainILV.areSafetyChecksAdded()) @@ -10612,7 +10537,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { &LVL, &CM, BFI, PSI, Checks); VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); - LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); + LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there @@ -10638,7 +10563,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { Optional<MDNode *> RemainderLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupEpilogue}); - if (RemainderLoopID.hasValue()) { + if (RemainderLoopID) { L->setLoopID(RemainderLoopID.getValue()); } else { if (DisableRuntimeUnroll) @@ -10720,8 +10645,12 @@ LoopVectorizeResult LoopVectorizePass::runImpl( PreservedAnalyses LoopVectorizePass::run(Function &F, FunctionAnalysisManager &AM) { - auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); auto &LI = AM.getResult<LoopAnalysis>(F); + // There are no loops in the function. Return before computing other expensive + // analyses. + if (LI.empty()) + return PreservedAnalyses::all(); + auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); auto &TTI = AM.getResult<TargetIRAnalysis>(F); auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); |