Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 831
 1 file changed, 451 insertions(+), 380 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4747f34fcc62..d11f4146b590 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -470,10 +470,11 @@ public: /// on, while the old loop will be used as the scalar remainder. Control flow /// is generated around the vectorized (and scalar epilogue) loops consisting /// of various checks and bypasses. Return the pre-header block of the new - /// loop. - /// In the case of epilogue vectorization, this function is overriden to - /// handle the more complex control flow around the loops. - virtual BasicBlock *createVectorizedLoopSkeleton(); + /// loop and the start value for the canonical induction, if it is != 0. The + /// latter is the case when vectorizing the epilogue loop. In the case of + /// epilogue vectorization, this function is overriden to handle the more + /// complex control flow around the loops. + virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(); /// Widen a single call instruction within the innermost loop. void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, @@ -507,10 +508,10 @@ public: /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to - /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID, - Value *Start, TruncInst *Trunc, VPValue *Def, - VPTransformState &State); + /// the corresponding type. \p CanonicalIV is the scalar value generated for + /// the canonical induction variable. + void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, + VPTransformState &State, Value *CanonicalIV); /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, @@ -556,6 +557,10 @@ public: /// vector of instructions. void addMetadata(ArrayRef<Value *> To, Instruction *From); + // Returns the resume value (bc.merge.rdx) for a reduction as + // generated by fixReduction. + PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); + protected: friend class LoopVectorizationPlanner; @@ -573,16 +578,18 @@ protected: Value *CountRoundDown, Value *EndValue, BasicBlock *MiddleBlock); - /// Create a new induction variable inside L. - PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, - Value *Step, Instruction *DL); + /// Introduce a conditional branch (on true, condition to be set later) at the + /// end of the header=latch connecting it to itself (across the backedge) and + /// to the exit block of \p L. + void createHeaderBranch(Loop *L); /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(VPTransformState &State); /// Create the exit value of first order recurrences in the middle block and /// update their users. - void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State); + void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, + VPTransformState &State); /// Create code for the loop exit value of the reduction. void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); @@ -606,14 +613,6 @@ protected: /// represented as. void truncateToMinimalBitwidths(VPTransformState &State); - /// This function adds - /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) - /// to each vector element of Val. 
The sequence starts at StartIndex. - /// \p Opcode is relevant for FP induction variable. - virtual Value * - getStepVector(Value *Val, Value *StartIdx, Value *Step, - Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd); - /// Compute scalar induction steps. \p ScalarIV is the scalar induction /// variable on which to base the steps, \p Step is the size of the step, and /// \p EntryVal is the value from the original loop that maps to the steps. @@ -640,9 +639,6 @@ protected: /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; - /// Generate a shuffle sequence that will reverse the vector Vec. - virtual Value *reverseVector(Value *Vec); - /// Returns (and creates if needed) the original loop trip count. Value *getOrCreateTripCount(Loop *NewLoop); @@ -685,14 +681,13 @@ protected: Loop *createVectorLoopSkeleton(StringRef Prefix); /// Create new phi nodes for the induction variables to resume iteration count - /// in the scalar epilogue, from where the vectorized loop left off (given by - /// \p VectorTripCount). + /// in the scalar epilogue, from where the vectorized loop left off. /// In cases where the loop skeleton is more complicated (eg. epilogue /// vectorization) and the resume values can come from an additional bypass /// block, the \p AdditionalBypass pair provides information about the bypass /// block and the end value on the edge from bypass to this loop. void createInductionResumeValues( - Loop *L, Value *VectorTripCount, + Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); /// Complete the loop skeleton by adding debug MDs, creating appropriate @@ -795,12 +790,6 @@ protected: /// A list of all bypass blocks. The first block is the entry of the loop. SmallVector<BasicBlock *, 4> LoopBypassBlocks; - /// The new Induction variable which was added to the new block. - PHINode *Induction = nullptr; - - /// The induction variable of the old basic block. - PHINode *OldInduction = nullptr; - /// Store instructions that were predicated. SmallVector<Instruction *, 4> PredicatedInstructions; @@ -838,6 +827,11 @@ protected: /// Structure to hold information about generated runtime checks, responsible /// for cleaning the checks, if vectorization turns out unprofitable. GeneratedRTChecks &RTChecks; + + // Holds the resume values for reductions in the loops, used to set the + // correct start value of reduction PHIs when vectorizing the epilogue. + SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4> + ReductionResumeValues; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -856,10 +850,6 @@ public: private: Value *getBroadcastInstrs(Value *V) override; - Value *getStepVector( - Value *Val, Value *StartIdx, Value *Step, - Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override; - Value *reverseVector(Value *Vec) override; }; /// Encapsulate information regarding vectorization of a loop and its epilogue. @@ -909,14 +899,16 @@ public: // Override this function to handle the more complex control flow around the // three loops. - BasicBlock *createVectorizedLoopSkeleton() final override { + std::pair<BasicBlock *, Value *> + createVectorizedLoopSkeleton() final override { return createEpilogueVectorizedLoopSkeleton(); } /// The interface for creating a vectorized skeleton using one of two /// different strategies, each corresponding to one execution of the vplan /// as described above. 
- virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; + virtual std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton() = 0; /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. This setup helps us avoid @@ -944,7 +936,8 @@ public: EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton() final override; protected: /// Emits an iteration count bypass check once for the main loop (when \p @@ -973,7 +966,8 @@ public: EPI, LVL, CM, BFI, PSI, Checks) {} /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton() final override; protected: /// Emits an iteration count bypass check after the main vector loop has @@ -1069,16 +1063,16 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); } +namespace llvm { + /// Return a value for Step multiplied by VF. -static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, - int64_t Step) { +Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, + int64_t Step) { assert(Ty->isIntegerTy() && "Expected an integer step"); Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; } -namespace llvm { - /// Return the runtime value for VF. Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); @@ -1163,7 +1157,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( // will lead to gather/scatter instructions, which don't need to be // handled. if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || - isa<VPInterleaveRecipe>(CurRec)) + isa<VPInterleaveRecipe>(CurRec) || + isa<VPCanonicalIVPHIRecipe>(CurRec)) continue; // This recipe contributes to the address computation of a widen @@ -1232,6 +1227,14 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, } } +PHINode *InnerLoopVectorizer::getReductionResumeValue( + const RecurrenceDescriptor &RdxDesc) { + auto It = ReductionResumeValues.find(&RdxDesc); + assert(It != ReductionResumeValues.end() && + "Expected to find a resume value for the reduction."); + return It->second; +} + namespace llvm { // Loop vectorization cost-model hints how the scalar epilogue loop should be @@ -1556,13 +1559,16 @@ public: /// Returns true if the target machine can represent \p V as a masked gather /// or scatter operation. - bool isLegalGatherOrScatter(Value *V) { + bool isLegalGatherOrScatter(Value *V, + ElementCount VF = ElementCount::getFixed(1)) { bool LI = isa<LoadInst>(V); bool SI = isa<StoreInst>(V); if (!LI && !SI) return false; auto *Ty = getLoadStoreType(V); Align Align = getLoadStoreAlignment(V); + if (VF.isVector()) + Ty = VectorType::get(Ty, VF); return (LI && TTI.isLegalMaskedGather(Ty, Align)) || (SI && TTI.isLegalMaskedScatter(Ty, Align)); } @@ -1577,16 +1583,17 @@ public: } /// Returns true if \p I is an instruction that will be scalarized with - /// predication. 
Such instructions include conditional stores and - /// instructions that may divide by zero. - /// If a non-zero VF has been calculated, we check if I will be scalarized - /// predication for that VF. - bool isScalarWithPredication(Instruction *I) const; + /// predication when vectorizing \p I with vectorization factor \p VF. Such + /// instructions include conditional stores and instructions that may divide + /// by zero. + bool isScalarWithPredication(Instruction *I, ElementCount VF) const; // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. + // \p VF is the vectorization factor that will be used to vectorize \p I. // Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) { + bool isPredicatedInst(Instruction *I, ElementCount VF, + bool IsKnownUniform = false) { // When we know the load is uniform and the original scalar loop was not // predicated we don't need to mark it as a predicated instruction. Any // vectorised blocks created when tail-folding are something artificial we @@ -1602,7 +1609,7 @@ public: // instructions. if (isa<LoadInst>(I) || isa<StoreInst>(I)) return Legal->isMaskRequired(I); - return isScalarWithPredication(I); + return isScalarWithPredication(I, VF); } /// Returns true if \p I is a memory instruction with consecutive memory @@ -1794,7 +1801,7 @@ private: /// Returns true if an artificially high cost for emulated masked memrefs /// should be used. - bool useEmulatedMaskMemRefHack(Instruction *I); + bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated @@ -2078,8 +2085,8 @@ public: /// Remove the created SCEV & memory runtime check blocks & instructions, if /// unused. ~GeneratedRTChecks() { - SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); - SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); + SCEVExpanderCleaner SCEVCleaner(SCEVExp); + SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); if (!SCEVCheckCond) SCEVCleaner.markResultUsed(); @@ -2335,6 +2342,60 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } +/// This function adds +/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) +/// to each vector element of Val. The sequence starts at StartIndex. +/// \p Opcode is relevant for FP induction variable. +static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, + Instruction::BinaryOps BinOp, ElementCount VF, + IRBuilder<> &Builder) { + assert(VF.isVector() && "only vector VFs are supported"); + + // Create and check the types. + auto *ValVTy = cast<VectorType>(Val->getType()); + ElementCount VLen = ValVTy->getElementCount(); + + Type *STy = Val->getType()->getScalarType(); + assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && + "Induction Step must be an integer or FP"); + assert(Step->getType() == STy && "Step has wrong type"); + + SmallVector<Constant *, 8> Indices; + + // Create a vector of consecutive numbers from zero to VF. 
+ VectorType *InitVecValVTy = ValVTy; + Type *InitVecValSTy = STy; + if (STy->isFloatingPointTy()) { + InitVecValSTy = + IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); + InitVecValVTy = VectorType::get(InitVecValSTy, VLen); + } + Value *InitVec = Builder.CreateStepVector(InitVecValVTy); + + // Splat the StartIdx + Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); + + if (STy->isIntegerTy()) { + InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); + Step = Builder.CreateVectorSplat(VLen, Step); + assert(Step->getType() == Val->getType() && "Invalid step vec"); + // FIXME: The newly created binary instructions should contain nsw/nuw + // flags, which can be found from the original scalar operations. + Step = Builder.CreateMul(InitVec, Step); + return Builder.CreateAdd(Val, Step, "induction"); + } + + // Floating point induction. + assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && + "Binary Opcode should be specified for FP induction"); + InitVec = Builder.CreateUIToFP(InitVec, ValVTy); + InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); + + Step = Builder.CreateVectorSplat(VLen, Step); + Value *MulOp = Builder.CreateFMul(InitVec, Step); + return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); +} + void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( const InductionDescriptor &II, Value *Step, Value *Start, Instruction *EntryVal, VPValue *Def, VPTransformState &State) { @@ -2355,8 +2416,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); - Value *SteppedStart = - getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); + Value *SteppedStart = getStepVector( + SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); // We create vector phi nodes for both integer and floating-point induction // variables. Here, we determine the kind of arithmetic we will perform. @@ -2411,8 +2472,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // placement of all induction updates. auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); - auto *ICmp = cast<Instruction>(Br->getCondition()); - LastInduction->moveBefore(ICmp); + LastInduction->moveBefore(Br); LastInduction->setName("vec.ind.next"); VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); @@ -2434,15 +2494,15 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { return llvm::any_of(IV->users(), isScalarInst); } -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, - const InductionDescriptor &ID, - Value *Start, TruncInst *Trunc, - VPValue *Def, - VPTransformState &State) { +void InnerLoopVectorizer::widenIntOrFpInduction( + PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State, + Value *CanonicalIV) { + Value *Start = Def->getStartValue()->getLiveInIRValue(); + const InductionDescriptor &ID = Def->getInductionDescriptor(); + TruncInst *Trunc = Def->getTruncInst(); IRBuilder<> &Builder = State.Builder; - assert((IV->getType()->isIntegerTy() || IV != OldInduction) && - "Primary induction variable must have an integer type"); assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); + assert(!State.VF.isZero() && "VF must be non-zero"); // The value from the original loop to which we are mapping the new induction // variable. 
@@ -2468,12 +2528,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, // induction variable and step. Otherwise, derive these values from the // induction descriptor. auto CreateScalarIV = [&](Value *&Step) -> Value * { - Value *ScalarIV = Induction; - if (IV != OldInduction) { - ScalarIV = IV->getType()->isIntegerTy() - ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) - : Builder.CreateCast(Instruction::SIToFP, Induction, - IV->getType()); + Value *ScalarIV = CanonicalIV; + Type *NeededType = IV->getType(); + if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { + ScalarIV = + NeededType->isIntegerTy() + ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType) + : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, State.CFG.PrevBB); ScalarIV->setName("offset.idx"); @@ -2493,7 +2554,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - assert(!State.VF.isScalable() && "scalable vectors not yet supported."); Value *StartIdx; if (Step->getType()->isFloatingPointTy()) StartIdx = @@ -2502,7 +2562,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part); Value *EntryPart = - getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); + getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(), + State.VF, State.Builder); State.set(Def, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); @@ -2516,9 +2577,31 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, // Now do the actual transformations, and start with creating the step value. Value *Step = CreateStepValue(ID.getStep()); - if (State.VF.isZero() || State.VF.isScalar()) { + if (State.VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); - CreateSplatIV(ScalarIV, Step); + Type *ScalarTy = IntegerType::get(ScalarIV->getContext(), + Step->getType()->getScalarSizeInBits()); + + Instruction::BinaryOps IncOp = ID.getInductionOpcode(); + if (IncOp == Instruction::BinaryOpsEnd) + IncOp = Instruction::Add; + for (unsigned Part = 0; Part < UF; ++Part) { + Value *StartIdx = ConstantInt::get(ScalarTy, Part); + Instruction::BinaryOps MulOp = Instruction::Mul; + if (Step->getType()->isFloatingPointTy()) { + StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType()); + MulOp = Instruction::FMul; + } + + Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); + Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction"); + State.set(Def, EntryPart, Part); + if (Trunc) { + assert(!Step->getType()->isFloatingPointTy() && + "fp inductions shouldn't be truncated"); + addMetadata(EntryPart, Trunc); + } + } return; } @@ -2554,54 +2637,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); } -Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, - Value *Step, - Instruction::BinaryOps BinOp) { - // Create and check the types. 
- auto *ValVTy = cast<VectorType>(Val->getType()); - ElementCount VLen = ValVTy->getElementCount(); - - Type *STy = Val->getType()->getScalarType(); - assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && - "Induction Step must be an integer or FP"); - assert(Step->getType() == STy && "Step has wrong type"); - - SmallVector<Constant *, 8> Indices; - - // Create a vector of consecutive numbers from zero to VF. - VectorType *InitVecValVTy = ValVTy; - Type *InitVecValSTy = STy; - if (STy->isFloatingPointTy()) { - InitVecValSTy = - IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); - InitVecValVTy = VectorType::get(InitVecValSTy, VLen); - } - Value *InitVec = Builder.CreateStepVector(InitVecValVTy); - - // Splat the StartIdx - Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); - - if (STy->isIntegerTy()) { - InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); - Step = Builder.CreateVectorSplat(VLen, Step); - assert(Step->getType() == Val->getType() && "Invalid step vec"); - // FIXME: The newly created binary instructions should contain nsw/nuw flags, - // which can be found from the original scalar operations. - Step = Builder.CreateMul(InitVec, Step); - return Builder.CreateAdd(Val, Step, "induction"); - } - - // Floating point induction. - assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && - "Binary Opcode should be specified for FP induction"); - InitVec = Builder.CreateUIToFP(InitVec, ValVTy); - InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); - - Step = Builder.CreateVectorSplat(VLen, Step); - Value *MulOp = Builder.CreateFMul(InitVec, Step); - return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); -} - void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID, @@ -2691,11 +2726,6 @@ void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, State.set(Def, VectorValue, Instance.Part); } -Value *InnerLoopVectorizer::reverseVector(Value *Vec) { - assert(Vec->getType()->isVectorTy() && "Invalid type"); - return Builder.CreateVectorReverse(Vec, "reverse"); -} - // Return whether we allow using masked interleave-groups (for dealing with // strided loads/stores that reside in predicated blocks, or for dealing // with gaps). @@ -2858,7 +2888,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } if (Group->isReverse()) - StridedVec = reverseVector(StridedVec); + StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); State.set(VPDefs[J], StridedVec, Part); } @@ -2894,7 +2924,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *StoredVec = State.get(StoredValues[i], Part); if (Group->isReverse()) - StoredVec = reverseVector(StoredVec); + StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); // If this member has different type, cast it to a unified type. @@ -2993,43 +3023,21 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, PredicatedInstructions.push_back(Cloned); } -PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, - Value *End, Value *Step, - Instruction *DL) { +void InnerLoopVectorizer::createHeaderBranch(Loop *L) { BasicBlock *Header = L->getHeader(); - BasicBlock *Latch = L->getLoopLatch(); - // As we're just creating this loop, it's possible no latch exists - // yet. If so, use the header as this will be a single block loop. 
- if (!Latch) - Latch = Header; - - IRBuilder<> B(&*Header->getFirstInsertionPt()); - Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); - setDebugLocFromInst(OldInst, &B); - auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); + assert(!L->getLoopLatch() && "loop should not have a latch at this point"); - B.SetInsertPoint(Latch->getTerminator()); + IRBuilder<> B(Header->getTerminator()); + Instruction *OldInst = + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); setDebugLocFromInst(OldInst, &B); - // Create i+1 and fill the PHINode. - // - // If the tail is not folded, we know that End - Start >= Step (either - // statically or through the minimum iteration checks). We also know that both - // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + - // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned - // overflows and we can mark the induction increment as NUW. - Value *Next = B.CreateAdd(Induction, Step, "index.next", - /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); - Induction->addIncoming(Start, L->getLoopPreheader()); - Induction->addIncoming(Next, Latch); - // Create the compare. - Value *ICmp = B.CreateICmpEQ(Next, End); - B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); + // Connect the header to the exit and header blocks and replace the old + // terminator. + B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); // Now we have two terminators. Remove the old one from the block. - Latch->getTerminator()->eraseFromParent(); - - return Induction; + Header->getTerminator()->eraseFromParent(); } Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { @@ -3099,10 +3107,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { if (Cost->foldTailByMasking()) { assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); - assert(!VF.isScalable() && - "Tail folding not yet supported for scalable vectors"); + Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); TC = Builder.CreateAdd( - TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); + TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); } // Now we need to generate the expression for the part of the loop that the @@ -3436,12 +3443,13 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { } void InnerLoopVectorizer::createInductionResumeValues( - Loop *L, Value *VectorTripCount, - std::pair<BasicBlock *, Value *> AdditionalBypass) { - assert(VectorTripCount && L && "Expected valid arguments"); + Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && "Inconsistent information about additional bypass."); + + Value *VectorTripCount = getOrCreateVectorTripCount(L); + assert(VectorTripCount && L && "Expected valid arguments"); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3449,6 +3457,7 @@ void InnerLoopVectorizer::createInductionResumeValues( // iteration in the vectorized loop. // If we come from a bypass edge then we need to start from the original // start value. 
+ Instruction *OldInduction = Legal->getPrimaryInduction(); for (auto &InductionEntry : Legal->getInductionVars()) { PHINode *OrigPhi = InductionEntry.first; InductionDescriptor II = InductionEntry.second; @@ -3546,25 +3555,6 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, "Inconsistent vector loop preheader"); Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); - Optional<MDNode *> VectorizedLoopID = - makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, - LLVMLoopVectorizeFollowupVectorized}); - if (VectorizedLoopID.hasValue()) { - L->setLoopID(VectorizedLoopID.getValue()); - - // Do not setAlreadyVectorized if loop attributes have been defined - // explicitly. - return LoopVectorPreHeader; - } - - // Keep all loop hints from the original loop on the vector loop (we'll - // replace the vectorizer-specific hints below). - if (MDNode *LID = OrigLoop->getLoopID()) - L->setLoopID(LID); - - LoopVectorizeHints Hints(L, true, *ORE, TTI); - Hints.setAlreadyVectorized(); - #ifdef EXPENSIVE_CHECKS assert(DT->verify(DominatorTree::VerificationLevel::Fast)); LI->verify(*DT); @@ -3573,7 +3563,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, return LoopVectorPreHeader; } -BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { +std::pair<BasicBlock *, Value *> +InnerLoopVectorizer::createVectorizedLoopSkeleton() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3638,33 +3629,12 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // faster. emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - // Some loops have a single integer induction variable, while other loops - // don't. One example is c++ iterators that often have multiple pointer - // induction variables. In the code below we also support a case where we - // don't have a single induction variable. - // - // We try to obtain an induction variable from the original loop as hard - // as possible. However if we don't find one that: - // - is an integer - // - counts from zero, stepping by one - // - is the size of the widest induction variable type - // then we create a new one. - OldInduction = Legal->getPrimaryInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - Value *StartIdx = ConstantInt::get(IdxTy, 0); - // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); - Value *Step = createStepForVF(Builder, IdxTy, VF, UF); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); + createHeaderBranch(Lp); // Emit phis for the new starting index of the scalar loop. - createInductionResumeValues(Lp, CountRoundDown); + createInductionResumeValues(Lp); - return completeLoopSkeleton(Lp, OrigLoopID); + return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; } // Fix up external users of the induction variable. 
At this point, we are @@ -4088,8 +4058,8 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { } } -void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, - VPTransformState &State) { +void InnerLoopVectorizer::fixFirstOrderRecurrence( + VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { // This is the second phase of vectorizing first-order recurrences. An // overview of the transformation is described below. Suppose we have the // following loop. @@ -4334,13 +4304,29 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, : Builder.CreateZExt(ReducedPartRdx, PhiTy); } + PHINode *ResumePhi = + dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); + // Create a phi node that merges control-flow from the backedge-taken check // block and the middle block. PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", LoopScalarPreHeader->getTerminator()); - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) - BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); - BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); + + // If we are fixing reductions in the epilogue loop then we should already + // have created a bc.merge.rdx Phi after the main vector body. Ensure that + // we carry over the incoming values correctly. + for (auto *Incoming : predecessors(LoopScalarPreHeader)) { + if (Incoming == LoopMiddleBlock) + BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); + else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) + BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), + Incoming); + else + BCBlockPhi->addIncoming(ReductionStartValue, Incoming); + } + + // Set the resume value for this reduction + ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. @@ -4557,6 +4543,9 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, InductionDescriptor II = Legal->getInductionVars().lookup(P); const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); + PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); + // FIXME: The newly created binary instructions should contain nsw/nuw flags, // which can be found from the original scalar operations. switch (II.getKind()) { @@ -4572,7 +4561,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, if (Cost->isScalarAfterVectorization(P, State.VF)) { // This is the normalized GEP that starts counting at zero. Value *PtrInd = - Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); + Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. 
@@ -4602,10 +4591,10 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Type *PhiType = II.getStep()->getType(); // Build a pointer phi - Value *ScalarStartValue = II.getStartValue(); + Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); Type *ScStValueType = ScalarStartValue->getType(); PHINode *NewPointerPhi = - PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); + PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); // A pointer induction, performed by using a gep @@ -4916,7 +4905,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { +bool LoopVectorizationCostModel::isScalarWithPredication( + Instruction *I, ElementCount VF) const { if (!blockNeedsPredicationForAnyReason(I->getParent())) return false; switch(I->getOpcode()) { @@ -4928,11 +4918,14 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { return false; auto *Ptr = getLoadStorePointerOperand(I); auto *Ty = getLoadStoreType(I); + Type *VTy = Ty; + if (VF.isVector()) + VTy = VectorType::get(Ty, VF); const Align Alignment = getLoadStoreAlignment(I); return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || - TTI.isLegalMaskedGather(Ty, Alignment)) + TTI.isLegalMaskedGather(VTy, Alignment)) : !(isLegalMaskedStore(Ty, Ptr, Alignment) || - TTI.isLegalMaskedScatter(Ty, Alignment)); + TTI.isLegalMaskedScatter(VTy, Alignment)); } case Instruction::UDiv: case Instruction::SDiv: @@ -5005,7 +4998,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( // If the instruction is a store located in a predicated block, it will be // scalarized. - if (isScalarWithPredication(I)) + if (isScalarWithPredication(I, VF)) return false; // If the instruction's allocated size doesn't equal it's type size, it @@ -5056,7 +5049,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { << *I << "\n"); return; } - if (isScalarWithPredication(I)) { + if (isScalarWithPredication(I, VF)) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); return; @@ -5531,10 +5524,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } } - // For scalable vectors, don't use tail folding as this is currently not yet - // supported. The code is likely to have ended up here if the tripcount is - // low, in which case it makes sense not to use scalable vectors. - if (MaxFactors.ScalableVF.isVector()) + // For scalable vectors don't use tail folding for low trip counts or + // optimizing for code size. We only permit this if the user has explicitly + // requested it. + if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate && + ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate && + MaxFactors.ScalableVF.isVector()) MaxFactors.ScalableVF = ElementCount::getScalable(0); // If we don't know the precise trip count, or if the trip count that we @@ -5849,10 +5844,8 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( const Loop &L, ElementCount VF) const { // Cross iteration phis such as reductions need special handling and are // currently unsupported. 
- if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { - return Legal->isFirstOrderRecurrence(&Phi) || - Legal->isReductionVariable(&Phi); - })) + if (any_of(L.getHeader()->phis(), + [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) return false; // Phis with uses outside of the loop require special handling and are @@ -5978,11 +5971,29 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { unsigned MinWidth = -1U; unsigned MaxWidth = 8; const DataLayout &DL = TheFunction->getParent()->getDataLayout(); - for (Type *T : ElementTypesInLoop) { - MinWidth = std::min<unsigned>( - MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); - MaxWidth = std::max<unsigned>( - MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); + // For in-loop reductions, no element types are added to ElementTypesInLoop + // if there are no loads/stores in the loop. In this case, check through the + // reduction variables to determine the maximum width. + if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { + // Reset MaxWidth so that we can find the smallest type used by recurrences + // in the loop. + MaxWidth = -1U; + for (auto &PhiDescriptorPair : Legal->getReductionVars()) { + const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; + // When finding the min width used by the recurrence we need to account + // for casts on the input operands of the recurrence. + MaxWidth = std::min<unsigned>( + MaxWidth, std::min<unsigned>( + RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), + RdxDesc.getRecurrenceType()->getScalarSizeInBits())); + } + } else { + for (Type *T : ElementTypesInLoop) { + MinWidth = std::min<unsigned>( + MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); + MaxWidth = std::max<unsigned>( + MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); + } } return {MinWidth, MaxWidth}; } @@ -6022,18 +6033,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { if (auto *ST = dyn_cast<StoreInst>(&I)) T = ST->getValueOperand()->getType(); - // Ignore loaded pointer types and stored pointer types that are not - // vectorizable. - // - // FIXME: The check here attempts to predict whether a load or store will - // be vectorized. We only know this for certain after a VF has - // been selected. Here, we assume that if an access can be - // vectorized, it will be. We should also look at extending this - // optimization to non-pointer types. - // - if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && - !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) - continue; + assert(T->isSized() && + "Expected the load/store/recurrence type to be sized"); ElementTypesInLoop.insert(T); } @@ -6475,7 +6476,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { return RUs; } -bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ +bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, + ElementCount VF) { // TODO: Cost model for emulated masked load/store is completely // broken. This hack guides the cost model to use an artificially // high enough value to practically disable vectorization with such @@ -6484,8 +6486,7 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ // from moving "masked load/store" check from legality to cost model. // Masked Load/Gather emulation was previously never allowed. // Limited number of Masked Store/Scatter emulation was allowed. 
- assert(isPredicatedInst(I) && - "Expecting a scalar emulated instruction"); + assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); return isa<LoadInst>(I) || (isa<StoreInst>(I) && NumPredStores > NumberOfStoresToPredicate); @@ -6512,13 +6513,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { if (!blockNeedsPredicationForAnyReason(BB)) continue; for (Instruction &I : *BB) - if (isScalarWithPredication(&I)) { + if (isScalarWithPredication(&I, VF)) { ScalarCostsTy ScalarCosts; // Do not apply discount if scalable, because that would lead to // invalid scalarization costs. // Do not apply discount logic if hacked cost is needed // for emulated masked memrefs. - if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && + if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && computePredInstDiscount(&I, ScalarCosts, VF) >= 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); // Remember that BB will remain after vectorization. @@ -6554,7 +6555,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // If the instruction is scalar with predication, it will be analyzed // separately. We ignore it within the context of PredInst. - if (isScalarWithPredication(I)) + if (isScalarWithPredication(I, VF)) return false; // If any of the instruction's operands are uniform after vectorization, @@ -6601,7 +6602,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. - if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { + if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(I->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), true, false); @@ -6764,7 +6765,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // If we have a predicated load/store, it will need extra i1 extracts and // conditional branches, but may not be executed for each vector lane. Scale // the cost by the probability of executing the predicated block. - if (isPredicatedInst(I)) { + if (isPredicatedInst(I, VF)) { Cost /= getReciprocalPredBlockProb(); // Add the cost of an i1 extract and a branch @@ -6775,7 +6776,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, /*Insert=*/false, /*Extract=*/true); Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); - if (useEmulatedMaskMemRefHack(I)) + if (useEmulatedMaskMemRefHack(I, VF)) // Artificially setting to a high enough value to practically disable // vectorization with such operations. Cost = 3000000; @@ -7182,7 +7183,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { // predicated uniform stores. Today they are treated as any other // predicated store (see added test cases in // invariant-store-vectorization.ll). - if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) + if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) NumPredStores++; if (Legal->isUniformMemOp(I)) { @@ -7192,7 +7193,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { // Store: Scalar store + isLoopInvariantStoreValue ? 
0 : extract InstructionCost Cost; if (isa<StoreInst>(&I) && VF.isScalable() && - isLegalGatherOrScatter(&I)) { + isLegalGatherOrScatter(&I, VF)) { Cost = getGatherScatterCost(&I, VF); setWideningDecision(&I, VF, CM_GatherScatter, Cost); } else { @@ -7234,7 +7235,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { } InstructionCost GatherScatterCost = - isLegalGatherOrScatter(&I) + isLegalGatherOrScatter(&I, VF) ? getGatherScatterCost(&I, VF) * NumAccesses : InstructionCost::getInvalid(); @@ -7437,7 +7438,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. - if (VF.isVector() && isScalarWithPredication(I)) { + if (VF.isVector() && isScalarWithPredication(I, VF)) { InstructionCost Cost = 0; // These instructions have a non-void type, so account for the phi nodes @@ -7941,6 +7942,40 @@ VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { llvm_unreachable("No plan found!"); } +static void AddRuntimeUnrollDisableMetaData(Loop *L) { + SmallVector<Metadata *, 4> MDs; + // Reserve first location for self reference to the LoopID metadata node. + MDs.push_back(nullptr); + bool IsUnrollMetadata = false; + MDNode *LoopID = L->getLoopID(); + if (LoopID) { + // First find existing loop unrolling disable metadata. + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (MD) { + const auto *S = dyn_cast<MDString>(MD->getOperand(0)); + IsUnrollMetadata = + S && S->getString().startswith("llvm.loop.unroll.disable"); + } + MDs.push_back(LoopID->getOperand(i)); + } + } + + if (!IsUnrollMetadata) { + // Add runtime unroll disable metadata. + LLVMContext &Context = L->getHeader()->getContext(); + SmallVector<Metadata *, 1> DisableOperands; + DisableOperands.push_back( + MDString::get(Context, "llvm.loop.unroll.runtime.disable")); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + MDs.push_back(DisableNode); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + L->setLoopID(NewLoopID); + } +} + void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, InnerLoopVectorizer &ILV, @@ -7952,9 +7987,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, // 1. Create a new empty loop. Unlink the old loop and connect the new one. VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; - State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); - State.TripCount = ILV.getOrCreateTripCount(nullptr); - State.CanonicalIV = ILV.Induction; + Value *CanonicalIVStartValue; + std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = + ILV.createVectorizedLoopSkeleton(); ILV.collectPoisonGeneratingRecipes(State); ILV.printDebugTracesAtStart(); @@ -7968,8 +8003,35 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. 
+ BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), + ILV.getOrCreateVectorTripCount(nullptr), + CanonicalIVStartValue, State); BestVPlan.execute(&State); + // Keep all loop hints from the original loop on the vector loop (we'll + // replace the vectorizer-specific hints below). + MDNode *OrigLoopID = OrigLoop->getLoopID(); + + Optional<MDNode *> VectorizedLoopID = + makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, + LLVMLoopVectorizeFollowupVectorized}); + + Loop *L = LI->getLoopFor(State.CFG.PrevBB); + if (VectorizedLoopID.hasValue()) + L->setLoopID(VectorizedLoopID.getValue()); + else { + // Keep all loop hints from the original loop on the vector loop (we'll + // replace the vectorizer-specific hints below). + if (MDNode *LID = OrigLoop->getLoopID()) + L->setLoopID(LID); + + LoopVectorizeHints Hints(L, true, *ORE); + Hints.setAlreadyVectorized(); + } + // Disable runtime unrolling when vectorizing the epilogue loop. + if (CanonicalIVStartValue) + AddRuntimeUnrollDisableMetaData(L); + // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. ILV.fixVectorizedLoop(State); @@ -8032,66 +8094,16 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions( } } -Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } - Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } -Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx, - Value *Step, - Instruction::BinaryOps BinOp) { - // When unrolling and the VF is 1, we only need to add a simple scalar. - Type *Ty = Val->getType(); - assert(!Ty->isVectorTy() && "Val must be a scalar"); - - if (Ty->isFloatingPointTy()) { - // Floating-point operations inherit FMF via the builder's flags. - Value *MulOp = Builder.CreateFMul(StartIdx, Step); - return Builder.CreateBinOp(BinOp, Val, MulOp); - } - return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction"); -} - -static void AddRuntimeUnrollDisableMetaData(Loop *L) { - SmallVector<Metadata *, 4> MDs; - // Reserve first location for self reference to the LoopID metadata node. - MDs.push_back(nullptr); - bool IsUnrollMetadata = false; - MDNode *LoopID = L->getLoopID(); - if (LoopID) { - // First find existing loop unrolling disable metadata. - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); - if (MD) { - const auto *S = dyn_cast<MDString>(MD->getOperand(0)); - IsUnrollMetadata = - S && S->getString().startswith("llvm.loop.unroll.disable"); - } - MDs.push_back(LoopID->getOperand(i)); - } - } - - if (!IsUnrollMetadata) { - // Add runtime unroll disable metadata. - LLVMContext &Context = L->getHeader()->getContext(); - SmallVector<Metadata *, 1> DisableOperands; - DisableOperands.push_back( - MDString::get(Context, "llvm.loop.unroll.runtime.disable")); - MDNode *DisableNode = MDNode::get(Context, DisableOperands); - MDs.push_back(DisableNode); - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); - L->setLoopID(NewLoopID); - } -} - //===--------------------------------------------------------------------===// // EpilogueVectorizerMainLoop //===--------------------------------------------------------------------===// /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
-BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { +std::pair<BasicBlock *, Value *> +EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); Loop *Lp = createVectorLoopSkeleton(""); @@ -8120,24 +8132,16 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); // Generate the induction variable. - OldInduction = Legal->getPrimaryInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - Value *StartIdx = ConstantInt::get(IdxTy, 0); - - IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); - Value *Step = getRuntimeVF(B, IdxTy, VF * UF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); EPI.VectorTripCount = CountRoundDown; - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); + createHeaderBranch(Lp); // Skip induction resume value creation here because they will be created in // the second pass. If we created them here, they wouldn't be used anyway, // because the vplan in the second pass still contains the inductions from the // original loop. - return completeLoopSkeleton(Lp, OrigLoopID); + return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -8219,7 +8223,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -BasicBlock * +std::pair<BasicBlock *, Value *> EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); Loop *Lp = createVectorLoopSkeleton("vec.epilog."); @@ -8275,6 +8279,25 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { LoopBypassBlocks.push_back(EPI.MemSafetyCheck); LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); + // The vec.epilog.iter.check block may contain Phi nodes from reductions which + // merge control-flow from the latch block and the middle block. Update the + // incoming values here and move the Phi into the preheader. + SmallVector<PHINode *, 4> PhisInBlock; + for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) + PhisInBlock.push_back(&Phi); + + for (PHINode *Phi : PhisInBlock) { + Phi->replaceIncomingBlockWith( + VecEpilogueIterationCountCheck->getSinglePredecessor(), + VecEpilogueIterationCountCheck); + Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); + if (EPI.SCEVSafetyCheck) + Phi->removeIncomingValue(EPI.SCEVSafetyCheck); + if (EPI.MemSafetyCheck) + Phi->removeIncomingValue(EPI.MemSafetyCheck); + Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); + } + // Generate a resume induction for the vector epilogue and put it in the // vector epilogue preheader Type *IdxTy = Legal->getWidestInductionType(); @@ -8285,13 +8308,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { EPI.MainLoopIterationCountCheck); // Generate the induction variable. 
- OldInduction = Legal->getPrimaryInduction(); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); - Value *StartIdx = EPResumeVal; - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); + createHeaderBranch(Lp); // Generate induction resume values. These variables save the new starting // indexes for the scalar loop. They are used to test if there are any tail @@ -8300,12 +8317,10 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { // check, then the resume value for the induction variable comes from // the trip count of the main vector loop, hence passing the AdditionalBypass // argument. - createInductionResumeValues(Lp, CountRoundDown, - {VecEpilogueIterationCountCheck, - EPI.VectorTripCount} /* AdditionalBypass */); + createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck, + EPI.VectorTripCount} /* AdditionalBypass */); - AddRuntimeUnrollDisableMetaData(Lp); - return completeLoopSkeleton(Lp, OrigLoopID); + return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal}; } BasicBlock * @@ -8447,33 +8462,22 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. // Introduce the early-exit compare IV <= BTC to form header block mask. - // This is used instead of IV < TC because TC may wrap, unlike BTC. - // Start by constructing the desired canonical IV in the header block. - VPValue *IV = nullptr; - if (Legal->getPrimaryInduction()) - IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); - else { - VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); - auto *IVRecipe = new VPWidenCanonicalIVRecipe(); - HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi()); - IV = IVRecipe; - } + // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by + // constructing the desired canonical IV in the header block as its first + // non-phi instructions. + assert(CM.foldTailByMasking() && "must fold the tail"); + VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); + auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); + auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); + HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); - // Create the block in mask as the first non-phi instruction in the block. VPBuilder::InsertPointGuard Guard(Builder); - auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); - Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); - - VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); - bool TailFolded = !CM.isScalarEpilogueAllowed(); - - if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { - // While ActiveLaneMask is a binary op that consumes the loop tripcount - // as a second argument, we only pass the IV here and extract the - // tripcount from the transform state where codegen of the VP instructions - // happen. 
- BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); + Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); + if (CM.TTI.emitGetActiveLaneMask()) { + VPValue *TC = Plan->getOrCreateTripCount(); + BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); } else { + VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); } return BlockMaskCache[BB] = BlockMask; @@ -8621,7 +8625,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, + [this, CI](ElementCount VF) { + return CM.isScalarWithPredication(CI, VF); + }, Range); if (IsPredicated) @@ -8661,7 +8667,8 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { // scalarization is profitable or it is predicated. auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); + CM.isProfitableToScalarize(I, VF) || + CM.isScalarWithPredication(I, VF); }; return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, Range); @@ -8719,7 +8726,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, void VPRecipeBuilder::fixHeaderPhis() { BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); - for (VPWidenPHIRecipe *R : PhisToFix) { + for (VPHeaderPHIRecipe *R : PhisToFix) { auto *PN = cast<PHINode>(R->getUnderlyingValue()); VPRecipeBase *IncR = getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); @@ -8735,7 +8742,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); }, + [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, Range); // Even if the instruction is not marked as uniform, there are certain @@ -8861,7 +8868,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) return toVPRecipeResult(Recipe); - VPWidenPHIRecipe *PhiRecipe = nullptr; + VPHeaderPHIRecipe *PhiRecipe = nullptr; if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { VPValue *StartV = Operands[0]; if (Legal->isReductionVariable(Phi)) { @@ -8882,11 +8889,14 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); PhisToFix.push_back(PhiRecipe); } else { - // TODO: record start and backedge value for remaining pointer induction - // phis. + // TODO: record backedge value for remaining pointer induction phis. assert(Phi->getType()->isPointerTy() && "only pointer phis should be handled here"); - PhiRecipe = new VPWidenPHIRecipe(Phi); + assert(Legal->getInductionVars().count(Phi) && + "Not an induction variable"); + InductionDescriptor II = Legal->getInductionVars().lookup(Phi); + VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); + PhiRecipe = new VPWidenPHIRecipe(Phi, Start); } return toVPRecipeResult(PhiRecipe); @@ -8966,6 +8976,40 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, } } +// Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a +// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a +// BranchOnCount VPInstruction to the latch. 
+static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, + bool HasNUW, bool IsVPlanNative) { + Value *StartIdx = ConstantInt::get(IdxTy, 0); + auto *StartV = Plan.getOrAddVPValue(StartIdx); + + auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); + if (IsVPlanNative) + Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); + Header->insert(CanonicalIVPHI, Header->begin()); + + auto *CanonicalIVIncrement = + new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW + : VPInstruction::CanonicalIVIncrement, + {CanonicalIVPHI}, DL); + CanonicalIVPHI->addOperand(CanonicalIVIncrement); + + VPBasicBlock *EB = TopRegion->getExitBasicBlock(); + if (IsVPlanNative) { + EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); + EB->setCondBit(nullptr); + } + EB->appendRecipe(CanonicalIVIncrement); + + auto *BranchOnCount = + new VPInstruction(VPInstruction::BranchOnCount, + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); + EB->appendRecipe(BranchOnCount); +} + VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, const MapVector<Instruction *, Instruction *> &SinkAfter) { @@ -9033,6 +9077,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); auto Plan = std::make_unique<VPlan>(TopRegion); + Instruction *DLInst = + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), + DLInst ? DLInst->getDebugLoc() : DebugLoc(), + !CM.foldTailByMasking(), false); + // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); @@ -9194,6 +9244,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } + VPlanTransforms::removeRedundantCanonicalIVs(*Plan); VPlanTransforms::removeRedundantInductionCasts(*Plan); // Now that sink-after is done, move induction recipes for optimized truncates @@ -9325,6 +9376,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { OrigLoop, Plan, [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, DeadInstructions, *PSE.getSE()); + + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), + true, true); return Plan; } @@ -9414,16 +9468,19 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( } // If tail is folded by masking, introduce selects between the phi - // and the live-out instruction of each reduction, at the end of the latch. + // and the live-out instruction of each reduction, at the beginning of the + // dedicated latch block. 
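The per-lane effect of those selects can be sketched in plain C++ (illustrative names, assuming a fixed VF of 4 and an integer add reduction): on the final, partially active vector iteration the header mask is false for out-of-range lanes, and the select keeps the phi's previous value there so masked-off lanes cannot corrupt the reduction result.

    #include <array>
    #include <cstddef>
    #include <cstdio>

    constexpr std::size_t VF = 4;
    using Vec = std::array<long, VF>;
    using Mask = std::array<bool, VF>;

    // Sketch of: Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}).
    static Vec selectReduction(const Mask &Cond, const Vec &Red,
                               const Vec &Phi) {
      Vec Result{};
      for (std::size_t Lane = 0; Lane != VF; ++Lane)
        Result[Lane] = Cond[Lane] ? Red[Lane] : Phi[Lane];
      return Result;
    }

    int main() {
      Vec Phi{10, 20, 30, 40};  // Partial sums entering the iteration.
      Vec Red{11, 22, 33, 44};  // Phi plus new elements (garbage in dead lanes).
      Mask Cond{true, true, false, false}; // Only two lanes still in range.
      Vec Next = selectReduction(Cond, Red, Phi);
      for (long V : Next)
        std::printf("%ld ", V); // Prints: 11 22 30 40
      std::printf("\n");
    }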
   if (CM.foldTailByMasking()) {
+    Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
     for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
       if (!PhiR || PhiR->isInLoop())
         continue;
-      Builder.setInsertPoint(LatchVPBB);
       VPValue *Cond =
           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
       VPValue *Red = PhiR->getBackedgeValue();
+      assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
+             "reduction recipe must be defined before latch");
       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
     }
   }
@@ -9682,9 +9739,8 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
 
 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Int or FP induction being replicated.");
-  State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
-                                   getStartValue()->getLiveInIRValue(),
-                                   getTruncInst(), getVPValue(0), State);
+  auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
+  State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV);
 }
 
 void VPWidenPHIRecipe::execute(VPTransformState &State) {
@@ -10013,7 +10069,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
       NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
     }
 
-    State.set(getVPSingleValue(), NewLI, Part);
+    State.set(this, NewLI, Part);
   }
 }
 
@@ -10561,6 +10617,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                            Checks);
 
         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
+
+        // Ensure that the start values for any VPReductionPHIRecipes are
+        // updated before vectorizing the epilogue loop.
+        VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock();
+        for (VPRecipeBase &R : Header->phis()) {
+          if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
+            if (auto *Resume = MainILV.getReductionResumeValue(
+                    ReductionPhi->getRecurrenceDescriptor())) {
+              VPValue *StartVal = new VPValue(Resume);
+              BestEpiPlan.addExternalDef(StartVal);
+              ReductionPhi->setOperand(0, StartVal);
+            }
+          }
+        }
+
         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                         DT);
         ++LoopsEpilogueVectorized;
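As a closing illustration of the resume-value plumbing above, a scalar C++ sketch (illustrative names, not the actual pass) of why the epilogue plan's reduction phis are re-seeded: the main vector loop has already folded the first part of the iteration space into a partial result, so the epilogue loop must start from that value, the bc.merge.rdx phi returned by getReductionResumeValue, rather than from the reduction's neutral start value.

    #include <cstdint>
    #include <cstdio>

    // Sketch only: both "loops" are written scalar for brevity.
    static long sumWithEpilogue(const long *A, uint64_t N, uint64_t MainVTC) {
      long Rdx = 0;                       // Neutral start value for an add.
      for (uint64_t I = 0; I != MainVTC; ++I)
        Rdx += A[I];                      // Main vector loop's share.
      long Resume = Rdx;                  // Re-seeds the epilogue's phi.
      for (uint64_t I = MainVTC; I != N; ++I)
        Resume += A[I];                   // Epilogue vector loop's share.
      return Resume;
    }

    int main() {
      long A[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
      std::printf("%ld\n", sumWithEpilogue(A, 10, /*MainVTC=*/8)); // Prints 55.
    }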