Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 947
1 file changed, 590 insertions, 357 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f24ae6b100d5..23bb6f0860c9 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -87,7 +87,6 @@ #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" -#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -332,8 +331,8 @@ static cl::opt<bool> cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference.")); -cl::opt<bool> EnableStrictReductions( - "enable-strict-reductions", cl::init(false), cl::Hidden, +static cl::opt<bool> ForceOrderedReductions( + "force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions")); @@ -545,7 +544,8 @@ public: /// vectorized loop. void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask); + VPValue *StoredValue, VPValue *BlockInMask, + bool ConsecutiveStride, bool Reverse); /// Set the debug location in the builder \p Ptr using the debug location in /// \p V. If \p Ptr is None then it uses the class member's Builder. @@ -590,12 +590,11 @@ protected: /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(VPTransformState &State); - /// Fix a first-order recurrence. This is the second phase of vectorizing - /// this phi node. + /// Create the exit value of first order recurrences in the middle block and + /// update their users. void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State); - /// Fix a reduction cross-iteration phi. This is the second phase of - /// vectorizing this phi node. + /// Create code for the loop exit value of the reduction. void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); /// Clear NSW/NUW flags from reduction instructions if necessary. @@ -621,9 +620,9 @@ protected: /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) /// to each vector element of Val. The sequence starts at StartIndex. /// \p Opcode is relevant for FP induction variable. - virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, - Instruction::BinaryOps Opcode = - Instruction::BinaryOpsEnd); + virtual Value * + getStepVector(Value *Val, Value *StartIdx, Value *Step, + Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd); /// Compute scalar induction steps. 
\p ScalarIV is the scalar induction /// variable on which to base the steps, \p Step is the size of the step, and @@ -890,9 +889,9 @@ public: private: Value *getBroadcastInstrs(Value *V) override; - Value *getStepVector(Value *Val, int StartIdx, Value *Step, - Instruction::BinaryOps Opcode = - Instruction::BinaryOpsEnd) override; + Value *getStepVector( + Value *Val, Value *StartIdx, Value *Step, + Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override; Value *reverseVector(Value *Vec) override; }; @@ -911,10 +910,9 @@ struct EpilogueLoopVectorizationInfo { Value *TripCount = nullptr; Value *VectorTripCount = nullptr; - EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, - unsigned EUF) - : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), - EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { + EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, + ElementCount EVF, unsigned EUF) + : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { assert(EUF == 1 && "A high UF for the epilogue loop is likely not beneficial."); } @@ -1105,11 +1103,10 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, } /// Return a value for Step multiplied by VF. -static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { - assert(isa<ConstantInt>(Step) && "Expected an integer step"); - Constant *StepVal = ConstantInt::get( - Step->getType(), - cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); +static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, + int64_t Step) { + assert(Ty->isIntegerTy() && "Expected an integer step"); + Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; } @@ -1121,6 +1118,13 @@ Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { return VF.isScalable() ? B.CreateVScale(EC) : EC; } +static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) { + assert(FTy->isFloatingPointTy() && "Expected floating point type!"); + Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); + Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); + return B.CreateUIToFP(RuntimeVF, FTy); +} + void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, @@ -1319,8 +1323,7 @@ public: /// the IsOrdered flag of RdxDesc is set and we do not allow reordering /// of FP operations. bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { - return EnableStrictReductions && !Hints->allowReordering() && - RdxDesc.isOrdered(); + return !Hints->allowReordering() && RdxDesc.isOrdered(); } /// \returns The smallest bitwidth each instruction can be represented with. @@ -1495,14 +1498,14 @@ public: /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { - return Legal->isConsecutivePtr(Ptr) && + return Legal->isConsecutivePtr(DataType, Ptr) && TTI.isLegalMaskedStore(DataType, Alignment); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. 
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { - return Legal->isConsecutivePtr(Ptr) && + return Legal->isConsecutivePtr(DataType, Ptr) && TTI.isLegalMaskedLoad(DataType, Alignment); } @@ -1539,7 +1542,7 @@ public: // through scalar predication or masked load/store or masked gather/scatter. // Superset of instructions that return true for isScalarWithPredication. bool isPredicatedInst(Instruction *I) { - if (!blockNeedsPredication(I->getParent())) + if (!blockNeedsPredicationForAnyReason(I->getParent())) return false; // Loads and stores that need some form of masked operation are predicated // instructions. @@ -1593,7 +1596,10 @@ public: /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } - bool blockNeedsPredication(BasicBlock *BB) const { + /// Returns true if the instructions in this block requires predication + /// for any reason, e.g. because tail folding now requires a predicate + /// or because the block in the original loop was predicated. + bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { return foldTailByMasking() || Legal->blockNeedsPredication(BB); } @@ -1928,7 +1934,7 @@ class GeneratedRTChecks { /// The value representing the result of the generated memory runtime checks. /// If it is nullptr, either no memory runtime checks have been generated or /// they have been used. - Instruction *MemRuntimeCheckCond = nullptr; + Value *MemRuntimeCheckCond = nullptr; DominatorTree *DT; LoopInfo *LI; @@ -1971,7 +1977,7 @@ public: MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, "vector.memcheck"); - std::tie(std::ignore, MemRuntimeCheckCond) = + MemRuntimeCheckCond = addRuntimeChecks(MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(), MemCheckExp); assert(MemRuntimeCheckCond && @@ -2030,7 +2036,6 @@ public: if (MemCheckExp.isInsertedInstruction(&I)) continue; SE.forgetValue(&I); - SE.eraseValueFromMap(&I); I.eraseFromParent(); } } @@ -2289,9 +2294,11 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( Step = Builder.CreateTrunc(Step, TruncType); Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); } + + Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); Value *SplatStart = Builder.CreateVectorSplat(VF, Start); Value *SteppedStart = - getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); + getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); // We create vector phi nodes for both integer and floating-point induction // variables. Here, we determine the kind of arithmetic we will perform. @@ -2308,12 +2315,11 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // Multiply the vectorization factor by the step using integer or // floating-point arithmetic as appropriate. Type *StepType = Step->getType(); + Value *RuntimeVF; if (Step->getType()->isFloatingPointTy()) - StepType = IntegerType::get(StepType->getContext(), - StepType->getScalarSizeInBits()); - Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); - if (Step->getType()->isFloatingPointTy()) - RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF); + else + RuntimeVF = getRuntimeVF(Builder, StepType, VF); Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); // Create a vector splat to use in the induction update. 
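Several hunks above replace compile-time VF*UF constants with the new createStepForVF / getRuntimeVF / getRuntimeVFAsFloat helpers so the same code paths work when VF is scalable. A minimal sketch of the pattern those helpers implement, assuming an LLVM tree of this vintage; the names emitStep and emitStepAsFloat are illustrative, not part of the patch:

#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Materialize Step * VF as an IR value. For a fixed VF this folds to a
// plain integer constant; for a scalable VF the value is only known at run
// time, so it is emitted as vscale * (Step * VF.getKnownMinValue()).
static Value *emitStep(IRBuilder<> &B, Type *IntTy, ElementCount VF,
                       int64_t Step) {
  Constant *MinStep = ConstantInt::get(IntTy, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(MinStep) : MinStep;
}

// The floating-point variant used for FP inductions: compute the quantity
// as an integer first, then convert, mirroring getRuntimeVFAsFloat above.
static Value *emitStepAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF,
                              int64_t Step) {
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  return B.CreateUIToFP(emitStep(B, IntTy, VF, Step), FTy);
}

For example, with VF = vscale x 4 and UF = 2, the rewritten minimum-iteration check compares the trip count against vscale * 8 rather than a fixed constant 8.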
@@ -2388,9 +2394,13 @@ void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( if (isa<TruncInst>(EntryVal)) return; - const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); - if (Casts.empty()) + if (!CastDef) { + assert(ID.getCastInsts().empty() && + "there are casts for ID, but no CastDef"); return; + } + assert(!ID.getCastInsts().empty() && + "there is a CastDef, but no casts for ID"); // Only the first Cast instruction in the Casts vector is of interest. // The rest of the Casts (if exist) have no uses outside the // induction update chain itself. @@ -2462,9 +2472,14 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { assert(!VF.isScalable() && "scalable vectors not yet supported."); + Value *StartIdx; + if (Step->getType()->isFloatingPointTy()) + StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part); + else + StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part); + Value *EntryPart = - getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, - ID.getInductionOpcode()); + getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); State.set(Def, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); @@ -2520,7 +2535,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); } -Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, +Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, + Value *Step, Instruction::BinaryOps BinOp) { // Create and check the types. auto *ValVTy = cast<VectorType>(Val->getType()); @@ -2543,12 +2559,11 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, } Value *InitVec = Builder.CreateStepVector(InitVecValVTy); - // Add on StartIdx - Value *StartIdxSplat = Builder.CreateVectorSplat( - VLen, ConstantInt::get(InitVecValSTy, StartIdx)); - InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); + // Splat the StartIdx + Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); if (STy->isIntegerTy()) { + InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); Step = Builder.CreateVectorSplat(VLen, Step); assert(Step->getType() == Val->getType() && "Invalid step vec"); // FIXME: The newly created binary instructions should contain nsw/nuw flags, @@ -2561,6 +2576,8 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && "Binary Opcode should be specified for FP induction"); InitVec = Builder.CreateUIToFP(InitVec, ValVTy); + InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); + Step = Builder.CreateVectorSplat(VLen, Step); Value *MulOp = Builder.CreateFMul(InitVec, Step); return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); @@ -2609,8 +2626,7 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, } for (unsigned Part = 0; Part < UF; ++Part) { - Value *StartIdx0 = - createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); + Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part); if (!IsUniform && VF.isScalable()) { auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); @@ -2838,12 +2854,25 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( auto *SubVT = VectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. 
+ MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); + assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && + "masked interleaved groups are not allowed."); + assert((!MaskForGaps || !VF.isScalable()) && + "masking gaps for scalable vectors is not yet supported."); for (unsigned Part = 0; Part < UF; Part++) { // Collect the stored vector from each member. SmallVector<Value *, 4> StoredVecs; for (unsigned i = 0; i < InterleaveFactor; i++) { - // Interleaved store group doesn't allow a gap, so each index has a member - assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); + assert((Group->getMember(i) || MaskForGaps) && + "Fail to get a member from an interleaved store group"); + Instruction *Member = Group->getMember(i); + + // Skip the gaps in the group. + if (!Member) { + Value *Undef = PoisonValue::get(SubVT); + StoredVecs.push_back(Undef); + continue; + } Value *StoredVec = State.get(StoredValues[i], Part); @@ -2867,16 +2896,21 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( "interleaved.vec"); Instruction *NewStoreInstr; - if (BlockInMask) { - Value *BlockInMaskPart = State.get(BlockInMask, Part); - Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, - createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), - "interleaved.mask"); - NewStoreInstr = Builder.CreateMaskedStore( - IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); - } - else + if (BlockInMask || MaskForGaps) { + Value *GroupMask = MaskForGaps; + if (BlockInMask) { + Value *BlockInMaskPart = State.get(BlockInMask, Part); + Value *ShuffledMask = Builder.CreateShuffleVector( + BlockInMaskPart, + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), + "interleaved.mask"); + GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, + ShuffledMask, MaskForGaps) + : ShuffledMask; + } + NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], + Group->getAlign(), GroupMask); + } else NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); @@ -2886,7 +2920,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( void InnerLoopVectorizer::vectorizeMemoryInstruction( Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask) { + VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride, + bool Reverse) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); @@ -2895,31 +2930,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( assert((!SI || StoredValue) && "No stored value provided for widened store"); assert((!LI || !StoredValue) && "Stored value provided for widened load"); - LoopVectorizationCostModel::InstWidening Decision = - Cost->getWideningDecision(Instr, VF); - assert((Decision == LoopVectorizationCostModel::CM_Widen || - Decision == LoopVectorizationCostModel::CM_Widen_Reverse || - Decision == LoopVectorizationCostModel::CM_GatherScatter) && - "CM decision is not to widen the memory instruction"); - Type *ScalarDataTy = getLoadStoreType(Instr); auto *DataTy = VectorType::get(ScalarDataTy, VF); const Align Alignment = getLoadStoreAlignment(Instr); - - // Determine if the pointer operand of the access is either consecutive or - // reverse consecutive. 
- bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); - bool ConsecutiveStride = - Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); - bool CreateGatherScatter = - (Decision == LoopVectorizationCostModel::CM_GatherScatter); - - // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector - // gather/scatter. Otherwise Decision should have been to Scalarize. - assert((ConsecutiveStride || CreateGatherScatter) && - "The instruction should be scalarized"); - (void)ConsecutiveStride; + bool CreateGatherScatter = !ConsecutiveStride; VectorParts BlockInMaskParts(UF); bool isMaskRequired = BlockInMask; @@ -2953,7 +2968,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( if (isMaskRequired) // Reverse of a null all-one mask is a null mask. BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { - Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); + Value *Increment = + createStepForVF(Builder, Builder.getInt32Ty(), VF, Part); PartPtr = cast<GetElementPtrInst>( Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); PartPtr->setIsInBounds(InBounds); @@ -3172,7 +3188,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { Type *Ty = TC->getType(); // This is where we can make the step a runtime constant. - Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); + Value *Step = createStepForVF(Builder, Ty, VF, UF); // If the tail is to be folded by masking, round the number of iterations N // up to a multiple of Step instead of rounding down. This is done by first @@ -3262,8 +3278,7 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, // If tail is to be folded, vector loop takes care of all iterations. Value *CheckMinIters = Builder.getFalse(); if (!Cost->foldTailByMasking()) { - Value *Step = - createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); + Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); } // Create new preheader for vector loop. @@ -3433,7 +3448,7 @@ Value *InnerLoopVectorizer::emitTransformedIndex( assert(isa<SCEVConstant>(Step) && "Expected constant step for pointer induction"); return B.CreateGEP( - StartValue->getType()->getPointerElementType(), StartValue, + ID.getElementType(), StartValue, CreateMul(Index, Exp.expandCodeFor(Step, Index->getType()->getScalarType(), GetInsertPoint()))); @@ -3739,7 +3754,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // The loop step is equal to the vectorization factor (num of SIMD elements) // times the unroll factor (num of SIMD instructions). Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); - Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); + Value *Step = createStepForVF(Builder, IdxTy, VF, UF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); Induction = createInductionVariable(Lp, StartIdx, CountRoundDown, Step, @@ -3857,21 +3872,19 @@ struct CSEDenseMapInfo { static void cse(BasicBlock *BB) { // Perform simple cse. SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { - Instruction *In = &*I++; - - if (!CSEDenseMapInfo::canHandle(In)) + for (Instruction &In : llvm::make_early_inc_range(*BB)) { + if (!CSEDenseMapInfo::canHandle(&In)) continue; // Check if we can replace this instruction with any of the // visited instructions. 
- if (Instruction *V = CSEMap.lookup(In)) { - In->replaceAllUsesWith(V); - In->eraseFromParent(); + if (Instruction *V = CSEMap.lookup(&In)) { + In.replaceAllUsesWith(V); + In.eraseFromParent(); continue; } - CSEMap[In] = In; + CSEMap[&In] = &In; } } @@ -3881,7 +3894,7 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector<Type *, 4> Tys, ScalarTys; - for (auto &ArgOp : CI->arg_operands()) + for (auto &ArgOp : CI->args()) ScalarTys.push_back(ArgOp->getType()); // Estimate cost of scalarized vector call. The source operands are assumed @@ -3940,7 +3953,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) FMF = FPMO->getFastMathFlags(); - SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); + SmallVector<const Value *> Arguments(CI->args()); FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); SmallVector<Type *> ParamTys; std::transform(FTy->param_begin(), FTy->param_end(), @@ -3974,7 +3987,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { // If the value wasn't vectorized, we must maintain the original scalar // type. The absence of the value from State indicates that it // wasn't vectorized. - VPValue *Def = State.Plan->getVPValue(KV.first); + // FIXME: Should not rely on getVPValue at this point. + VPValue *Def = State.Plan->getVPValue(KV.first, true); if (!State.hasAnyVectorValue(Def)) continue; for (unsigned Part = 0; Part < UF; ++Part) { @@ -4081,7 +4095,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { // If the value wasn't vectorized, we must maintain the original scalar // type. The absence of the value from State indicates that it // wasn't vectorized. - VPValue *Def = State.Plan->getVPValue(KV.first); + // FIXME: Should not rely on getVPValue at this point. + VPValue *Def = State.Plan->getVPValue(KV.first, true); if (!State.hasAnyVectorValue(Def)) continue; for (unsigned Part = 0; Part < UF; ++Part) { @@ -4222,17 +4237,12 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, // After execution completes the vector loop, we extract the next value of // the recurrence (x) to use as the initial value in the scalar loop. - auto *IdxTy = Builder.getInt32Ty(); - auto *VecPhi = cast<PHINode>(State.get(PhiR, 0)); - - // Fix the latch value of the new recurrence in the vector loop. - VPValue *PreviousDef = PhiR->getBackedgeValue(); - Value *Incoming = State.get(PreviousDef, UF - 1); - VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); - // Extract the last vector element in the middle block. This will be the // initial value for the recurrence when jumping to the scalar loop. + VPValue *PreviousDef = PhiR->getBackedgeValue(); + Value *Incoming = State.get(PreviousDef, UF - 1); auto *ExtractForScalar = Incoming; + auto *IdxTy = Builder.getInt32Ty(); if (VF.isVector()) { auto *One = ConstantInt::get(IdxTy, 1); Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); @@ -4283,8 +4293,7 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, // and thus no phis which needed updated. 
if (!Cost->requiresScalarEpilogue(VF)) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (any_of(LCSSAPhi.incoming_values(), - [Phi](Value *V) { return V == Phi; })) + if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); } @@ -4301,29 +4310,13 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); setDebugLocFromInst(ReductionStartValue); - VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst); + VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); // This is the vector-clone of the value that leaves the loop. Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); // Wrap flags are in general invalid after vectorization, clear them. clearReductionWrapFlags(RdxDesc, State); - // Fix the vector-loop phi. - - // Reductions do not have to start at zero. They can start with - // any loop invariant values. - BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); - - unsigned LastPartForNewPhi = PhiR->isOrdered() ? 1 : UF; - for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part); - Value *Val = State.get(PhiR->getBackedgeValue(), Part); - if (PhiR->isOrdered()) - Val = State.get(PhiR->getBackedgeValue(), UF - 1); - - cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch); - } - // Before each round, move the insertion point right between // the PHIs and the values we are going to write. // This allows us to write both PHINodes and the extractelement @@ -4361,7 +4354,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, RdxDesc.getOpcode(), PhiTy, TargetTransformInfo::ReductionFlags())) { auto *VecRdxPhi = - cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part)); + cast<PHINode>(State.get(PhiR, Part)); VecRdxPhi->setIncomingValueForBlock( LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); } @@ -4382,13 +4375,10 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) : Builder.CreateZExt(Trunc, VecTy); - for (Value::user_iterator UI = RdxParts[Part]->user_begin(); - UI != RdxParts[Part]->user_end();) - if (*UI != Trunc) { - (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); + for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) + if (U != Trunc) { + U->replaceUsesOfWith(RdxParts[Part], Extnd); RdxParts[Part] = Extnd; - } else { - ++UI; } } Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); @@ -4421,9 +4411,11 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, if (Op != Instruction::ICmp && Op != Instruction::FCmp) { ReducedPartRdx = Builder.CreateBinOp( (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); - } else { + } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) + ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, + ReducedPartRdx, RdxPart); + else ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); - } } } @@ -4431,7 +4423,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // target reduction in the loop using a Reduction recipe. 
if (VF.isVector() && !PhiR->isInLoop()) { ReducedPartRdx = - createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); + createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (PhiTy != RdxDesc.getRecurrenceType()) @@ -4456,8 +4448,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // fixFirstOrderRecurrence for a more complete explaination of the logic. if (!Cost->requiresScalarEpilogue(VF)) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (any_of(LCSSAPhi.incoming_values(), - [LoopExitInst](Value *V) { return V == LoopExitInst; })) + if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); // Fix the scalar loop reduction variable with the incoming reduction sum @@ -4488,7 +4479,8 @@ void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &Rd Instruction *Cur = Worklist.pop_back_val(); if (isa<OverflowingBinaryOperator>(Cur)) for (unsigned Part = 0; Part < UF; ++Part) { - Value *V = State.get(State.Plan->getVPValue(Cur), Part); + // FIXME: Should not rely on getVPValue at this point. + Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); cast<Instruction>(V)->dropPoisonGeneratingFlags(); } @@ -4519,11 +4511,12 @@ void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { // Can be a loop invariant incoming value or the last scalar value to be // extracted from the vectorized loop. + // FIXME: Should not rely on getVPValue at this point. Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); Value *lastIncomingValue = OrigLoop->isLoopInvariant(IncomingValue) ? IncomingValue - : State.get(State.Plan->getVPValue(IncomingValue), + : State.get(State.Plan->getVPValue(IncomingValue, true), VPIteration(UF - 1, Lane)); LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); } @@ -4763,10 +4756,18 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, } for (unsigned Part = 0; Part < UF; ++Part) { - Value *PartStart = createStepForVF( - Builder, ConstantInt::get(PtrInd->getType(), Part), VF); + Value *PartStart = + createStepForVF(Builder, PtrInd->getType(), VF, Part); if (NeedsVectorIndex) { + // Here we cache the whole vector, which means we can support the + // extraction of any lane. However, in some cases the extractelement + // instruction that is generated for scalar uses of this vector (e.g. + // a load instruction) is not folded away. Therefore we still + // calculate values for the first n lanes to avoid redundant moves + // (when extracting the 0th element) and to produce scalar code (i.e. + // additional add/gep instructions instead of expensive extractelement + // instructions) when extracting higher-order elements. Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); @@ -4774,9 +4775,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); SclrGep->setName("next.gep"); State.set(PhiR, SclrGep, Part); - // We've cached the whole vector, which means we can support the - // extraction of any lane. 
- continue; } for (unsigned Lane = 0; Lane < Lanes; ++Lane) { @@ -4813,7 +4811,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Value *NumUnrolledElems = Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); Value *InductionGEP = GetElementPtrInst::Create( - ScStValueType->getPointerElementType(), NewPointerPhi, + II.getElementType(), NewPointerPhi, Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", InductionLoc); NewPointerPhi->addIncoming(InductionGEP, LoopLatch); @@ -4832,7 +4830,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); Value *GEP = Builder.CreateGEP( - ScStValueType->getPointerElementType(), NewPointerPhi, + II.getElementType(), NewPointerPhi, Builder.CreateMul( StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), "vector.gep")); @@ -4979,7 +4977,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, auto *CI = cast<CallInst>(&I); SmallVector<Type *, 4> Tys; - for (Value *ArgOperand : CI->arg_operands()) + for (Value *ArgOperand : CI->args()) Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); @@ -5128,8 +5126,14 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { Instruction *Update = cast<Instruction>( cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); - ScalarPtrs.insert(Update); - return; + + // If there is more than one user of Update (Ptr), we shouldn't assume it + // will be scalar after vectorisation as other users of the instruction + // may require widening. Otherwise, add it to ScalarPtrs. + if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) { + ScalarPtrs.insert(Update); + return; + } } // We only care about bitcast and getelementptr instructions contained in // the loop. @@ -5142,12 +5146,11 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { if (Worklist.count(I)) return; - // If all users of the pointer will be memory accesses and scalar, place the - // pointer in ScalarPtrs. Otherwise, place the pointer in - // PossibleNonScalarPtrs. - if (llvm::all_of(I->users(), [&](User *U) { - return (isa<LoadInst>(U) || isa<StoreInst>(U)) && - isScalarUse(cast<Instruction>(U), Ptr); + // If the use of the pointer will be a scalar use, and all users of the + // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, + // place the pointer in PossibleNonScalarPtrs. + if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { + return isa<LoadInst>(U) || isa<StoreInst>(U); })) ScalarPtrs.insert(I); else @@ -5254,7 +5257,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { } bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { - if (!blockNeedsPredication(I->getParent())) + if (!blockNeedsPredicationForAnyReason(I->getParent())) return false; switch(I->getOpcode()) { default: @@ -5297,12 +5300,20 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( // Check if masking is required. // A Group may need masking for one of two reasons: it resides in a block that - // needs predication, or it was decided to use masking to deal with gaps. + // needs predication, or it was decided to use masking to deal with gaps + // (either a gap at the end of a load-access that may result in a speculative + // load, or any gaps in a store-access). 
bool PredicatedAccessRequiresMasking = - Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); - bool AccessWithGapsRequiresMasking = - Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); - if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) + blockNeedsPredicationForAnyReason(I->getParent()) && + Legal->isMaskRequired(I); + bool LoadAccessWithGapsRequiresEpilogMasking = + isa<LoadInst>(I) && Group->requiresScalarEpilogue() && + !isScalarEpilogueAllowed(); + bool StoreAccessWithGapsRequiresMasking = + isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); + if (!PredicatedAccessRequiresMasking && + !LoadAccessWithGapsRequiresEpilogMasking && + !StoreAccessWithGapsRequiresMasking) return true; // If masked interleaving is required, we expect that the user/target had @@ -5311,6 +5322,9 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( assert(useMaskedInterleavedAccesses(TTI) && "Masked interleave-groups for predicated accesses are not enabled."); + if (Group->isReverse()) + return false; + auto *Ty = getLoadStoreType(I); const Align Alignment = getLoadStoreAlignment(I); return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) @@ -5320,14 +5334,13 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( Instruction *I, ElementCount VF) { // Get and ensure we have a valid memory instruction. - LoadInst *LI = dyn_cast<LoadInst>(I); - StoreInst *SI = dyn_cast<StoreInst>(I); - assert((LI || SI) && "Invalid memory instruction"); + assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); auto *Ptr = getLoadStorePointerOperand(I); + auto *ScalarTy = getLoadStoreType(I); // In order to be widened, the pointer should be consecutive, first of all. - if (!Legal->isConsecutivePtr(Ptr)) + if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) return false; // If the instruction is a store located in a predicated block, it will be @@ -5338,7 +5351,6 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( // If the instruction's allocated size doesn't equal it's type size, it // requires padding and will be scalarized. auto &DL = I->getModule()->getDataLayout(); - auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); if (hasIrregularType(ScalarTy, DL)) return false; @@ -5369,12 +5381,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { return (!I || !TheLoop->contains(I)); }; + // Worklist containing uniform instructions demanding lane 0. SetVector<Instruction *> Worklist; BasicBlock *Latch = TheLoop->getLoopLatch(); - // Instructions that are scalar with predication must not be considered - // uniform after vectorization, because that would create an erroneous - // replicating region where only a single instance out of VF should be formed. + // Add uniform instructions demanding lane 0 to the worklist. Instructions + // that are scalar with predication must not be considered uniform after + // vectorization, because that would create an erroneous replicating region + // where only a single instance out of VF should be formed. // TODO: optimize such seldom cases if found important, see PR40816. auto addToWorklistIfAllowed = [&](Instruction *I) -> void { if (isOutOfScope(I)) { @@ -5433,6 +5447,30 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 
for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::sideeffect: + case Intrinsic::experimental_noalias_scope_decl: + case Intrinsic::assume: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + if (TheLoop->hasLoopInvariantOperands(&I)) + addToWorklistIfAllowed(&I); + break; + default: + break; + } + } + + // ExtractValue instructions must be uniform, because the operands are + // known to be loop-invariant. + if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { + assert(isOutOfScope(EVI->getAggregateOperand()) && + "Expected aggregate value to be loop invariant"); + addToWorklistIfAllowed(EVI); + continue; + } + // If there's no pointer operand, there's nothing to do. auto *Ptr = getLoadStorePointerOperand(&I); if (!Ptr) @@ -5565,13 +5603,8 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { ElementCount LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { - if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { - reportVectorizationInfo( - "Disabling scalable vectorization, because target does not " - "support scalable vectors.", - "ScalableVectorsUnsupported", ORE, TheLoop); + if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) return ElementCount::getScalable(0); - } if (Hints->isScalableVectorizationDisabled()) { reportVectorizationInfo("Scalable vectorization is explicitly disabled", @@ -5579,6 +5612,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { return ElementCount::getScalable(0); } + LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); + auto MaxScalableVF = ElementCount::getScalable( std::numeric_limits<ElementCount::ScalarTy>::max()); @@ -5614,6 +5649,13 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { // Limit MaxScalableVF by the maximum safe dependence distance. Optional<unsigned> MaxVScale = TTI.getMaxVScale(); + if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { + unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange) + .getVScaleRangeArgs() + .second; + if (VScaleMax > 0) + MaxVScale = VScaleMax; + } MaxScalableVF = ElementCount::getScalable( MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); if (!MaxScalableVF) @@ -5681,17 +5723,32 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, return MaxSafeFixedVF; } - LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF - << " is unsafe. Ignoring scalable UserVF.\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "User-specified vectorization factor " - << ore::NV("UserVectorizationFactor", UserVF) - << " is unsafe. Ignoring the hint to let the compiler pick a " - "suitable VF."; - }); + if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is ignored because scalable vectors are not " + "available.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is ignored because the target does not support scalable " + "vectors. 
The compiler will pick a more suitable value."; + }); + } else { + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe. Ignoring scalable UserVF.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is unsafe. Ignoring the hint to let the compiler pick a " + "more suitable value."; + }); + } } LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType @@ -5972,19 +6029,27 @@ bool LoopVectorizationCostModel::isMoreProfitable( return RTCostA < RTCostB; } - // When set to preferred, for now assume vscale may be larger than 1, so - // that scalable vectorization is slightly favorable over fixed-width - // vectorization. + // Improve estimate for the vector width if it is scalable. + unsigned EstimatedWidthA = A.Width.getKnownMinValue(); + unsigned EstimatedWidthB = B.Width.getKnownMinValue(); + if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) { + if (A.Width.isScalable()) + EstimatedWidthA *= VScale.getValue(); + if (B.Width.isScalable()) + EstimatedWidthB *= VScale.getValue(); + } + + // When set to preferred, for now assume vscale may be larger than 1 (or the + // one being tuned for), so that scalable vectorization is slightly favorable + // over fixed-width vectorization. if (Hints->isScalableVectorizationPreferred()) if (A.Width.isScalable() && !B.Width.isScalable()) - return (CostA * B.Width.getKnownMinValue()) <= - (CostB * A.Width.getKnownMinValue()); + return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); // To avoid the need for FP division: // (CostA / A.Width) < (CostB / B.Width) // <=> (CostA * B.Width) < (CostB * A.Width) - return (CostA * B.Width.getKnownMinValue()) < - (CostB * A.Width.getKnownMinValue()); + return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); } VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( @@ -6014,11 +6079,22 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( VectorizationCostTy C = expectedCost(i, &InvalidCosts); VectorizationFactor Candidate(i, C.first); - LLVM_DEBUG( - dbgs() << "LV: Vector loop of width " << i << " costs: " - << (Candidate.Cost / Candidate.Width.getKnownMinValue()) - << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "") - << ".\n"); + +#ifndef NDEBUG + unsigned AssumedMinimumVscale = 1; + if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) + AssumedMinimumVscale = VScale.getValue(); + unsigned Width = + Candidate.Width.isScalable() + ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale + : Candidate.Width.getFixedValue(); + LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i + << " costs: " << (Candidate.Cost / Width)); + if (i.isScalable()) + LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " + << AssumedMinimumVscale << ")"); + LLVM_DEBUG(dbgs() << ".\n"); +#endif if (!C.second && !ForceVectorization) { LLVM_DEBUG( @@ -6182,15 +6258,6 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( return Result; } - // FIXME: This can be fixed for scalable vectors later, because at this stage - // the LoopVectorizer will only consider vectorizing a loop with scalable - // vectors when the loop has a hint to enable vectorization for a given VF. 
- if (MainLoopVF.isScalable()) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " - "yet supported.\n"); - return Result; - } - // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { @@ -6202,9 +6269,9 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( if (EpilogueVectorizationForceVF > 1) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); - if (LVP.hasPlanWithVFs( - {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) - return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; + ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); + if (LVP.hasPlanWithVF(ForcedEC)) + return {ForcedEC, 0}; else { LLVM_DEBUG( dbgs() @@ -6221,14 +6288,24 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( return Result; } - if (!isEpilogueVectorizationProfitable(MainLoopVF)) + auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); + if (MainLoopVF.isScalable()) + LLVM_DEBUG( + dbgs() << "LEV: Epilogue vectorization using scalable vectors not " + "yet supported. Converting to fixed-width (VF=" + << FixedMainLoopVF << ") instead\n"); + + if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " + "this loop\n"); return Result; + } for (auto &NextVF : ProfitableVFs) - if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && + if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) && (Result.Width.getFixedValue() == 1 || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) + LVP.hasPlanWithVF(NextVF.Width)) Result = NextVF; if (Result != VectorizationFactor::Disabled()) @@ -6471,6 +6548,22 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, unsigned StoresIC = IC / (NumStores ? NumStores : 1); unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); + // There is little point in interleaving for reductions containing selects + // and compares when VF=1 since it may just create more overhead than it's + // worth for loops with small trip counts. This is because we still have to + // do the final reduction after the loop. + bool HasSelectCmpReductions = + HasReductions && + any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + return RecurrenceDescriptor::isSelectCmpRecurrenceKind( + RdxDesc.getRecurrenceKind()); + }); + if (HasSelectCmpReductions) { + LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); + return 1; + } + // If we have a scalar reduction (vector reductions are already dealt with // by this point), we can increase the critical path length if the loop // we're interleaving is inside another loop. For tree-wise reductions @@ -6756,7 +6849,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { // determine if it would be better to not if-convert the blocks they are in. // If so, we also record the instructions to scalarize. 
for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockNeedsPredication(BB)) + if (!blockNeedsPredicationForAnyReason(BB)) continue; for (Instruction &I : *BB) if (isScalarWithPredication(&I)) { @@ -6851,7 +6944,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(I->getType(), VF)), - APInt::getAllOnesValue(VF.getFixedValue()), true, false); + APInt::getAllOnes(VF.getFixedValue()), true, false); ScalarCost += VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); @@ -6870,7 +6963,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( else if (needsExtract(J, VF)) { ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(J->getType(), VF)), - APInt::getAllOnesValue(VF.getFixedValue()), false, true); + APInt::getAllOnes(VF.getFixedValue()), false, true); } } @@ -7016,7 +7109,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); Cost += TTI.getScalarizationOverhead( - Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), + Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), /*Insert=*/false, /*Extract=*/true); Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); @@ -7036,7 +7129,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getLoadStoreAddressSpace(I); - int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && @@ -7117,18 +7210,16 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, unsigned InterleaveFactor = Group->getFactor(); auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); - // Holds the indices of existing members in an interleaved load group. - // An interleaved store group doesn't need this as it doesn't allow gaps. + // Holds the indices of existing members in the interleaved group. SmallVector<unsigned, 4> Indices; - if (isa<LoadInst>(I)) { - for (unsigned i = 0; i < InterleaveFactor; i++) - if (Group->getMember(i)) - Indices.push_back(i); - } + for (unsigned IF = 0; IF < InterleaveFactor; IF++) + if (Group->getMember(IF)) + Indices.push_back(IF); // Calculate the cost of the whole interleaved group. 
bool UseMaskForGaps = - Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); + (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || + (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); @@ -7210,8 +7301,41 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); Instruction *Op0, *Op1; - if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && - !TheLoop->isLoopInvariant(RedOp)) { + if (RedOp && + match(RedOp, + m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && + match(Op0, m_ZExtOrSExt(m_Value())) && + Op0->getOpcode() == Op1->getOpcode() && + Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && + !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && + (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { + + // Matched reduce(ext(mul(ext(A), ext(B))) + // Note that the extend opcodes need to all match, or if A==B they will have + // been converted to zext(mul(sext(A), sext(A))) as it is known positive, + // which is equally fine. + bool IsUnsigned = isa<ZExtInst>(Op0); + auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); + auto *MulType = VectorType::get(Op0->getType(), VectorTy); + + InstructionCost ExtCost = + TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, + TTI::CastContextHint::None, CostKind, Op0); + InstructionCost MulCost = + TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); + InstructionCost Ext2Cost = + TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, + TTI::CastContextHint::None, CostKind, RedOp); + + InstructionCost RedCost = TTI.getExtendedAddReductionCost( + /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, + CostKind); + + if (RedCost.isValid() && + RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) + return I == RetI ? RedCost : 0; + } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && + !TheLoop->isLoopInvariant(RedOp)) { // Matched reduce(ext(A)) bool IsUnsigned = isa<ZExtInst>(RedOp); auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); @@ -7245,7 +7369,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) return I == RetI ? 
RedCost : 0; - } else { + } else if (!match(I, m_ZExtOrSExt(m_Value()))) { // Matched reduce(mul()) InstructionCost MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); @@ -7304,9 +7428,14 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, Type *VectorTy; InstructionCost C = getInstructionCost(I, VF, VectorTy); - bool TypeNotScalarized = - VF.isVector() && VectorTy->isVectorTy() && - TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); + bool TypeNotScalarized = false; + if (VF.isVector() && VectorTy->isVectorTy()) { + unsigned NumParts = TTI.getNumberOfParts(VectorTy); + if (NumParts) + TypeNotScalarized = NumParts < VF.getKnownMinValue(); + else + C = InstructionCost::getInvalid(); + } return VectorizationCostTy(C, TypeNotScalarized); } @@ -7327,8 +7456,8 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, if (!RetTy->isVoidTy() && (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead( - cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), - true, false); + cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, + false); // Some targets keep addresses scalar. if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) @@ -7340,7 +7469,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, // Collect operands to consider. CallInst *CI = dyn_cast<CallInst>(I); - Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); + Instruction::op_range Ops = CI ? CI->args() : I->operands(); // Skip operands that do not require extraction/scalarization and do not incur // any overhead. @@ -7391,8 +7520,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { // We assume that widening is the best solution when possible. if (memoryInstructionCanBeWidened(&I, VF)) { InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); - int ConsecutiveStride = - Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); + int ConsecutiveStride = Legal->isConsecutivePtr( + getLoadStoreType(&I), getLoadStorePointerOperand(&I)); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Expected consecutive stride."); InstWidening Decision = @@ -7579,8 +7708,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); return ( TTI.getScalarizationOverhead( - Vec_i1Ty, APInt::getAllOnesValue(VF.getFixedValue()), false, - true) + + Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) // The back-edge branch will remain, as will all scalar branches. @@ -7893,7 +8021,7 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { // Check if the pointer operand of a load or store instruction is // consecutive. if (auto *Ptr = getLoadStorePointerOperand(Inst)) - return Legal->isConsecutivePtr(Ptr); + return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); return false; } @@ -8019,7 +8147,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return None; // Invalidate interleave groups if all blocks of loop will be predicated. 
- if (CM.blockNeedsPredication(OrigLoop->getHeader()) && + if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && !useMaskedInterleavedAccesses(*TTI)) { LLVM_DEBUG( dbgs() @@ -8105,28 +8233,30 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return SelectedVF; } -void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { - LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF - << '\n'); - BestVF = VF; - BestUF = UF; +VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { + assert(count_if(VPlans, + [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == + 1 && + "Best VF has not a single VPlan."); - erase_if(VPlans, [VF](const VPlanPtr &Plan) { - return !Plan->hasVF(VF); - }); - assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); + for (const VPlanPtr &Plan : VPlans) { + if (Plan->hasVF(VF)) + return *Plan.get(); + } + llvm_unreachable("No plan found!"); } -void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, +void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, + VPlan &BestVPlan, + InnerLoopVectorizer &ILV, DominatorTree *DT) { + LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF + << '\n'); + // Perform the actual loop transformation. // 1. Create a new empty loop. Unlink the old loop and connect the new one. - assert(BestVF.hasValue() && "Vectorization Factor is missing"); - assert(VPlans.size() == 1 && "Not a single VPlan to execute."); - - VPTransformState State{ - *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; + VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; @@ -8142,7 +8272,7 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. - VPlans.front()->execute(&State); + BestVPlan.execute(&State); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -8222,21 +8352,19 @@ Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } -Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, +Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx, + Value *Step, Instruction::BinaryOps BinOp) { // When unrolling and the VF is 1, we only need to add a simple scalar. Type *Ty = Val->getType(); assert(!Ty->isVectorTy() && "Val must be a scalar"); if (Ty->isFloatingPointTy()) { - Constant *C = ConstantFP::get(Ty, (double)StartIdx); - // Floating-point operations inherit FMF via the builder's flags. 
- Value *MulOp = Builder.CreateFMul(C, Step); + Value *MulOp = Builder.CreateFMul(StartIdx, Step); return Builder.CreateBinOp(BinOp, Val, MulOp); } - Constant *C = ConstantInt::get(Ty, StartIdx); - return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); + return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction"); } static void AddRuntimeUnrollDisableMetaData(Loop *L) { @@ -8311,7 +8439,9 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { OldInduction = Legal->getPrimaryInduction(); Type *IdxTy = Legal->getWidestInductionType(); Value *StartIdx = ConstantInt::get(IdxTy, 0); - Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); + + IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); + Value *Step = getRuntimeVF(B, IdxTy, VF * UF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); EPI.VectorTripCount = CountRoundDown; Induction = @@ -8329,9 +8459,9 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { LLVM_DEBUG({ dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" - << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() + << "Main Loop VF:" << EPI.MainLoopVF << ", Main Loop UF:" << EPI.MainLoopUF - << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() + << ", Epilogue Loop VF:" << EPI.EpilogueVF << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; }); } @@ -8346,8 +8476,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( Loop *L, BasicBlock *Bypass, bool ForEpilogue) { assert(L && "Expected valid Loop."); assert(Bypass && "Expected valid bypass basic block."); - unsigned VFactor = - ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); + ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; Value *Count = getOrCreateTripCount(L); // Reuse existing vector loop preheader for TC checks. @@ -8361,7 +8490,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; Value *CheckMinIters = Builder.CreateICmp( - P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), + P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), "min.iters.check"); if (!ForEpilogue) @@ -8513,11 +8642,11 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? 
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; - Value *CheckMinIters = Builder.CreateICmp( - P, Count, - ConstantInt::get(Count->getType(), - EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), - "min.epilog.iters.check"); + Value *CheckMinIters = + Builder.CreateICmp(P, Count, + createStepForVF(Builder, Count->getType(), + EPI.EpilogueVF, EPI.EpilogueUF), + "min.epilog.iters.check"); ReplaceInstWithInst( Insert->getTerminator(), @@ -8530,7 +8659,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { LLVM_DEBUG({ dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" - << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() + << "Epilogue Loop VF:" << EPI.EpilogueVF << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; }); } @@ -8628,7 +8757,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { VPValue *BlockMask = nullptr; if (OrigLoop->getHeader() == BB) { - if (!CM.blockNeedsPredication(BB)) + if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. // Create the block in mask as the first non-phi instruction in the block. @@ -8643,9 +8772,9 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { if (Legal->getPrimaryInduction()) IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); else { - auto IVRecipe = new VPWidenCanonicalIVRecipe(); + auto *IVRecipe = new VPWidenCanonicalIVRecipe(); Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); - IV = IVRecipe->getVPSingleValue(); + IV = IVRecipe; } VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); bool TailFolded = !CM.isScalarEpilogueAllowed(); @@ -8708,12 +8837,21 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, if (Legal->isMaskRequired(I)) Mask = createBlockInMask(I->getParent(), Plan); + // Determine if the pointer operand of the access is either consecutive or + // reverse consecutive. + LoopVectorizationCostModel::InstWidening Decision = + CM.getWideningDecision(I, Range.Start); + bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; + bool Consecutive = + Reverse || Decision == LoopVectorizationCostModel::CM_Widen; + if (LoadInst *Load = dyn_cast<LoadInst>(I)) - return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask); + return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, + Consecutive, Reverse); StoreInst *Store = cast<StoreInst>(I); return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], - Mask); + Mask, Consecutive, Reverse); } VPWidenIntOrFpInductionRecipe * @@ -8829,7 +8967,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) return nullptr; - ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands()); + ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); } @@ -8916,6 +9054,37 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); + // Even if the instruction is not marked as uniform, there are certain + // intrinsic calls that can be effectively treated as such, so we check for + // them here. 
Conservatively, we only do this for scalable vectors, since + // for fixed-width VFs we can always fall back on full scalarization. + if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { + switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { + case Intrinsic::assume: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + // For scalable vectors if one of the operands is variant then we still + // want to mark as uniform, which will generate one instruction for just + // the first lane of the vector. We can't scalarize the call in the same + // way as for fixed-width vectors because we don't know how many lanes + // there are. + // + // The reasons for doing it this way for scalable vectors are: + // 1. For the assume intrinsic, generating the instruction for the first + // lane is still better than not generating any at all. For + // example, the input may be a splat across all lanes. + // 2. For the lifetime start/end intrinsics the pointer operand only + // does anything useful when the input comes from a stack object, + // which suggests it should always be uniform. For non-stack objects + // the effect is to poison the object, which still allows us to + // remove the call. + IsUniform = true; + break; + default: + break; + } + } + auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), IsUniform, IsPredicated); setRecipe(I, Recipe); @@ -9137,6 +9306,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RecipeBuilder.recordRecipeOf(R); // For min/max reductions, where we have a pair of icmp/select, we also // need to record the ICmp recipe, so it can be removed later. + assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && + "Only min/max recurrences allowed for inloop reductions"); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); } @@ -9165,22 +9336,27 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - // Create a dummy pre-entry VPBasicBlock to start building the VPlan. auto Plan = std::make_unique<VPlan>(); - VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); - Plan->setEntry(VPBB); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); DFS.perform(LI); + VPBasicBlock *VPBB = nullptr; + VPBasicBlock *HeaderVPBB = nullptr; + SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. unsigned VPBBsForBB = 0; auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); - VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); + if (VPBB) + VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); + else { + Plan->setEntry(FirstVPBBForBB); + HeaderVPBB = FirstVPBBForBB; + } VPBB = FirstVPBBForBB; Builder.setInsertPoint(VPBB); @@ -9222,6 +9398,17 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( Plan->addVPValue(UV, Def); } + if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && + HeaderVPBB->getFirstNonPhi() != VPBB->end()) { + // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section + // of the header block. That can happen for truncates of induction + // variables.
Those recipes are moved to the phi section of the header + // block after applying SinkAfter, which relies on the original + // position of the trunc. + assert(isa<TruncInst>(Instr)); + InductionsToMove.push_back( + cast<VPWidenIntOrFpInductionRecipe>(Recipe)); + } RecipeBuilder.setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); continue; @@ -9239,17 +9426,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } + assert(isa<VPBasicBlock>(Plan->getEntry()) && + !Plan->getEntry()->getEntryBasicBlock()->empty() && + "entry block must be set to a non-empty VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); - // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks - // may also be empty, such as the last one VPBB, reflecting original - // basic-blocks with no recipes. - VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); - assert(PreEntry->empty() && "Expecting empty pre-entry block."); - VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); - VPBlockUtils::disconnectBlocks(PreEntry, Entry); - delete PreEntry; - // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to // bring the VPlan to its final state. @@ -9318,6 +9499,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } + // Now that sink-after is done, move induction recipes for optimized truncates + // to the phi section of the header block. + for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) + Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); + + // Adjust the recipes for any inloop reductions. + adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); + // Introduce a recipe to combine the incoming and previous values of a // first-order recurrence. for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { @@ -9325,16 +9514,20 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( if (!RecurPhi) continue; + VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); + VPBasicBlock *InsertBlock = PrevRecipe->getParent(); + auto *Region = GetReplicateRegion(PrevRecipe); + if (Region) + InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); + if (Region || PrevRecipe->isPhi()) + Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); + else + Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); + auto *RecurSplice = cast<VPInstruction>( Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, {RecurPhi, RecurPhi->getBackedgeValue()})); - VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); - if (auto *Region = GetReplicateRegion(PrevRecipe)) { - VPBasicBlock *Succ = cast<VPBasicBlock>(Region->getSingleSuccessor()); - RecurSplice->moveBefore(*Succ, Succ->getFirstNonPhi()); - } else - RecurSplice->moveAfter(PrevRecipe); RecurPhi->replaceAllUsesWith(RecurSplice); // Set the first operand of RecurSplice to RecurPhi again, after replacing // all users. @@ -9372,22 +9565,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } - // Adjust the recipes for any inloop reductions. - adjustRecipesForInLoopReductions(Plan, RecipeBuilder, Range.Start); - - // Finally, if tail is folded by masking, introduce selects between the phi - // and the live-out instruction of each reduction, at the end of the latch. 
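
The insert-point logic for VPInstruction::FirstOrderRecurrenceSplice above is easier to follow once the splice semantics are spelled out: each lane of the spliced result is the recurrence value one position back, i.e. the last lane of the previous iteration's vector followed by all but the last lane of the current one. A stand-alone sketch, with std::vector standing in for a VF-wide vector (assumes VF >= 1):

    #include <vector>

    // Illustrative analogue of the first-order recurrence splice:
    // { Prev[VF-1], Cur[0], ..., Cur[VF-2] }.
    std::vector<int> firstOrderSplice(const std::vector<int> &Prev,
                                      const std::vector<int> &Cur) {
      std::vector<int> Out;
      Out.reserve(Cur.size());
      Out.push_back(Prev.back());
      Out.insert(Out.end(), Cur.begin(), Cur.end() - 1);
      return Out;
    }
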
- if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { - Builder.setInsertPoint(VPBB); - auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); - for (auto &Reduction : Legal->getReductionVars()) { - if (CM.isInLoopReduction(Reduction.first)) - continue; - VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); - VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); - Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); - } - } + // From this point onwards, VPlan-to-VPlan transformations may change the plan + // in ways that make accessing values through the original IR values incorrect. + Plan->disableValue2VPValue(); VPlanTransforms::sinkScalarOperands(*Plan); VPlanTransforms::mergeReplicateRegions(*Plan); @@ -9405,6 +9585,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RSO.flush(); Plan->setName(PlanName); + assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; } @@ -9443,12 +9624,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { return Plan; } -// Adjust the recipes for any inloop reductions. The chain of instructions -// leading from the loop exit instr to the phi need to be converted to -// reductions, with one operand being vector and the other being the scalar -// reduction chain. -void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( - VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { +// Adjust the recipes for reductions. For in-loop reductions, the chain of +// instructions leading from the loop exit instr to the phi needs to be converted +// to reductions, with one operand being vector and the other being the scalar +// reduction chain. For other reductions, a select is introduced between the phi +// and live-out recipes when folding the tail. +void LoopVectorizationPlanner::adjustRecipesForReductions( + VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, + ElementCount MinVF) { for (auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; @@ -9468,6 +9651,8 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( VPValue *ChainOp = Plan->getVPValue(Chain); unsigned FirstOpId; + assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && + "Only min/max recurrences allowed for inloop reductions"); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { assert(isa<VPWidenSelectRecipe>(WidenRecipe) && "Expected to replace a VPWidenSelectSC"); @@ -9505,6 +9690,21 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( Chain = R; } } + + // If the tail is folded by masking, introduce selects between the phi + // and the live-out instruction of each reduction, at the end of the latch.
+ if (CM.foldTailByMasking()) { + for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { + VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); + if (!PhiR || PhiR->isInLoop()) + continue; + Builder.setInsertPoint(LatchVPBB); + VPValue *Cond = + RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); + VPValue *Red = PhiR->getBackedgeValue(); + Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); + } + } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -9519,9 +9719,22 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, O << ", "; Mask->printAsOperand(O, SlotTracker); } - for (unsigned i = 0; i < IG->getFactor(); ++i) - if (Instruction *I = IG->getMember(i)) - O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; + + unsigned OpIdx = 0; + for (unsigned i = 0; i < IG->getFactor(); ++i) { + if (!IG->getMember(i)) + continue; + if (getNumStoreOperands() > 0) { + O << "\n" << Indent << " store "; + getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); + O << " to index " << i; + } else { + O << "\n" << Indent << " "; + getVPValue(OpIdx)->printAsOperand(O, SlotTracker); + O << " = load from index " << i; + } + ++OpIdx; + } } #endif @@ -9605,17 +9818,20 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { void VPReductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Reduction being replicated."); Value *PrevInChain = State.get(getChainOp(), 0); + RecurKind Kind = RdxDesc->getRecurrenceKind(); + bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); + // Propagate the fast-math flags carried by the underlying instruction. + IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); + State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); for (unsigned Part = 0; Part < State.UF; ++Part) { - RecurKind Kind = RdxDesc->getRecurrenceKind(); - bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); Value *NewVecOp = State.get(getVecOp(), Part); if (VPValue *Cond = getCondOp()) { Value *NewCond = State.get(Cond, Part); VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); - Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( + Value *Iden = RdxDesc->getRecurrenceIdentity( Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); - Constant *IdenVec = - ConstantVector::getSplat(VecTy->getElementCount(), Iden); + Value *IdenVec = + State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); NewVecOp = Select; } @@ -9627,8 +9843,8 @@ void VPReductionRecipe::execute(VPTransformState &State) { PrevInChain); else NewRed = State.Builder.CreateBinOp( - (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), - PrevInChain, NewVecOp); + (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, + NewVecOp); PrevInChain = NewRed; } else { PrevInChain = State.get(getChainOp(), Part); @@ -9640,11 +9856,10 @@ void VPReductionRecipe::execute(VPTransformState &State) { NewRed, PrevInChain); } else if (IsOrdered) NextInChain = NewRed; - else { + else NextInChain = State.Builder.CreateBinOp( - (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, + (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, PrevInChain); - } State.set(this, NextInChain, Part); } } @@ -9757,7 +9972,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { VPValue *StoredValue = isStore() ? 
getStoredValue() : nullptr; State.ILV->vectorizeMemoryInstruction( &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), - StoredValue, getMask()); + StoredValue, getMask(), Consecutive, Reverse); } // Determine how to lower the scalar epilogue, which depends on 1) optimising @@ -9923,7 +10138,7 @@ static bool processLoopInVPlanNativePath( VectorizationFactor::Disabled() == VF) return false; - LVP.setBestPlan(VF.Width, 1); + VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); { GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, @@ -9932,7 +10147,7 @@ static bool processLoopInVPlanNativePath( &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); - LVP.executePlan(LB, DT); + LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); } // Mark the loop as already vectorized to avoid vectorizing again. @@ -10103,7 +10318,13 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - if (!LVL.canVectorizeFPMath(EnableStrictReductions)) { + bool AllowOrderedReductions; + // If the flag is set, use that instead and override the TTI behaviour. + if (ForceOrderedReductions.getNumOccurrences() > 0) + AllowOrderedReductions = ForceOrderedReductions; + else + AllowOrderedReductions = TTI->enableOrderedReductions(); + if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { ORE->emit([&]() { auto *ExactFPMathInst = Requirements.getExactFPInst(); return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", @@ -10248,7 +10469,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { F->getParent()->getDataLayout()); if (!VF.Width.isScalar() || IC > 1) Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); - LVP.setBestPlan(VF.Width, IC); using namespace ore; if (!VectorizeLoop) { @@ -10257,7 +10477,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, BFI, PSI, Checks); - LVP.executePlan(Unroller, DT); + + VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); + LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); ORE->emit([&]() { return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), @@ -10276,14 +10498,13 @@ bool LoopVectorizePass::processLoop(Loop *L) { // The first pass vectorizes the main loop and creates a scalar epilogue // to be vectorized by executing the plan (potentially with a different // factor) again shortly afterwards. - EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, - EpilogueVF.Width.getKnownMinValue(), - 1); + EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks); - LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); - LVP.executePlan(MainILV, DT); + VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); + LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, + DT); ++LoopsVectorized; simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); @@ -10291,13 +10512,15 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Second pass vectorizes the epilogue and adjusts the control flow // edges from the first pass. 
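
Returning to the select that adjustRecipesForReductions now adds at the latch (the foldTailByMasking hunk above): when the tail is folded, some lanes of the last vector iteration are masked off, and the select makes those lanes carry the phi's incoming partial result forward so they cannot perturb the reduction. A plain-C++ sketch of select(Cond, Red, Phi) on one unrolled part (illustrative types):

    #include <cstddef>
    #include <vector>

    // Masked-off lanes keep the previous partial value (Phi); active lanes
    // take the freshly computed one (Red).
    std::vector<int> latchSelect(const std::vector<bool> &Mask,
                                 const std::vector<int> &Red,
                                 const std::vector<int> &Phi) {
      std::vector<int> Out(Red.size());
      for (std::size_t I = 0; I != Red.size(); ++I)
        Out[I] = Mask[I] ? Red[I] : Phi[I];
      return Out;
    }
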
- LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); EPI.MainLoopVF = EPI.EpilogueVF; EPI.MainLoopUF = EPI.EpilogueUF; EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks); - LVP.executePlan(EpilogILV, DT); + + VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); + LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, + DT); ++LoopsEpilogueVectorized; if (!MainILV.areSafetyChecksAdded()) @@ -10305,7 +10528,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { } else { InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL, &CM, BFI, PSI, Checks); - LVP.executePlan(LB, DT); + + VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); + LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there @@ -10423,15 +10648,12 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &AC = AM.getResult<AssumptionAnalysis>(F); auto &DB = AM.getResult<DemandedBitsAnalysis>(F); auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); - MemorySSA *MSSA = EnableMSSALoopDependency - ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() - : nullptr; auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, - TLI, TTI, nullptr, MSSA}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, + TLI, TTI, nullptr, nullptr, nullptr}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }; auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); @@ -10455,3 +10677,14 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, PA.preserveSet<CFGAnalyses>(); return PA; } + +void LoopVectorizePass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( + OS, MapClassName2PassName); + + OS << "<"; + OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; + OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; + OS << ">"; +}
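
For reference, with both constructor flags at their defaults the new printPipeline hook would render the pass roughly as follows (approximate, read off the OS << statements above):

    loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>

This mirrors the parameter syntax the new pass manager accepts for this pass on the opt command line, e.g. -passes='loop-vectorize<interleave-forced-only>' (exact parameter spelling per PassBuilder's option parsing).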