Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1305
1 file changed, 712 insertions, 593 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 684a3098e564..35af8e425778 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -91,7 +91,6 @@ #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -134,9 +133,11 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/InjectTLIMappings.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/SizeOpts.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include <algorithm> @@ -294,15 +295,6 @@ cl::opt<bool> llvm::EnableLoopVectorization( "vectorize-loops", cl::init(true), cl::Hidden, cl::desc("Run the Loop vectorization passes")); -/// A helper function for converting Scalar types to vector types. -/// If the incoming type is void, we return void. If the VF is 1, we return -/// the scalar type. -static Type *ToVectorTy(Type *Scalar, unsigned VF) { - if (Scalar->isVoidTy() || VF == 1) - return Scalar; - return VectorType::get(Scalar, VF); -} - /// A helper function that returns the type of loaded or stored value. static Type *getMemInstValueType(Value *I) { assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && @@ -319,7 +311,7 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { // Determine if an array of VF elements of type Ty is "bitcast compatible" // with a <VF x Ty> vector. if (VF > 1) { - auto *VectorTy = VectorType::get(Ty, VF); + auto *VectorTy = FixedVectorType::get(Ty, VF); return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); } @@ -415,7 +407,16 @@ public: BasicBlock *createVectorizedLoopSkeleton(); /// Widen a single instruction within the innermost loop. - void widenInstruction(Instruction &I); + void widenInstruction(Instruction &I, VPUser &Operands, + VPTransformState &State); + + /// Widen a single call instruction within the innermost loop. + void widenCallInstruction(CallInst &I, VPUser &ArgOperands, + VPTransformState &State); + + /// Widen a single select instruction within the innermost loop. + void widenSelectInstruction(SelectInst &I, VPUser &Operands, + bool InvariantCond, VPTransformState &State); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(); @@ -430,8 +431,9 @@ public: /// Vectorize a single GetElementPtrInst based on information gathered and /// decisions taken during planning. - void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF, - bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant); + void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF, + unsigned VF, bool IsPtrLoopInvariant, + SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. 
It supports both VF = 1 for unrolled loops and @@ -441,9 +443,11 @@ public: /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, - /// inclusive.. - void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance, - bool IfPredicateInstr); + /// inclusive. Uses the VPValue operands from \p Operands instead of \p + /// Instr's operands. + void scalarizeInstruction(Instruction *Instr, VPUser &Operands, + const VPIteration &Instance, bool IfPredicateInstr, + VPTransformState &State); /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to @@ -482,20 +486,21 @@ public: /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); - /// Try to vectorize the interleaved access group that \p Instr belongs to - /// with the base address given in \p Addr, optionally masking the vector - /// operations if \p BlockInMask is non-null. Use \p State to translate given - /// VPValues to IR values in the vectorized loop. - void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State, - VPValue *Addr, VPValue *BlockInMask = nullptr); + /// Try to vectorize interleaved access group \p Group with the base address + /// given in \p Addr, optionally masking the vector operations if \p + /// BlockInMask is non-null. Use \p State to translate given VPValues to IR + /// values in the vectorized loop. + void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, + VPTransformState &State, VPValue *Addr, + VPValue *BlockInMask = nullptr); /// Vectorize Load and Store instructions with the base address given in \p /// Addr, optionally masking the vector operations if \p BlockInMask is /// non-null. Use \p State to translate given VPValues to IR values in the /// vectorized loop. void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, - VPValue *Addr, - VPValue *BlockInMask = nullptr); + VPValue *Addr, VPValue *StoredValue, + VPValue *BlockInMask); /// Set the debug location in the builder using the debug location in /// the instruction. @@ -682,7 +687,7 @@ protected: DominatorTree *DT; /// Alias Analysis. - AliasAnalysis *AA; + AAResults *AA; /// Target Library Info. const TargetLibraryInfo *TLI; @@ -974,7 +979,7 @@ public: /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional<unsigned> computeMaxVF(); + Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC); /// \return True if runtime checks are required for vectorization, and false /// otherwise. @@ -1066,7 +1071,7 @@ public: auto UniformsPerVF = Uniforms.find(VF); assert(UniformsPerVF != Uniforms.end() && "VF not yet analyzed for uniformity"); - return UniformsPerVF->second.find(I) != UniformsPerVF->second.end(); + return UniformsPerVF->second.count(I); } /// Returns true if \p I is known to be scalar after vectorization. 
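The widening and scalarization hooks above no longer read operands off the original IR instruction; they take a VPUser plus a VPTransformState and ask VPlan for the value of each VPValue operand, per unrolled part (or per lane when scalarizing). A condensed restatement of that pattern, assuming the Builder, UF and VectorLoopValueMap members of InnerLoopVectorizer:

for (unsigned Part = 0; Part < UF; ++Part) {
  SmallVector<Value *, 2> Ops;
  for (VPValue *VPOp : User.operands())
    Ops.push_back(State.get(VPOp, Part)); // widened value of this operand for Part
  Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
  VectorLoopValueMap.setVectorValue(&I, Part, V);
}
// scalarizeInstruction resolves a single lane instead, via a VPIteration:
//   Value *NewOp = State.get(User.getOperand(op), Instance); // Instance = {Part, Lane}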
@@ -1082,7 +1087,7 @@ public: auto ScalarsPerVF = Scalars.find(VF); assert(ScalarsPerVF != Scalars.end() && "Scalar values are not calculated for VF"); - return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end(); + return ScalarsPerVF->second.count(I); } /// \returns True if instruction \p I can be truncated to a smaller bitwidth @@ -1200,27 +1205,27 @@ public: /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) { return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType, Alignment); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) { return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType, Alignment); } /// Returns true if the target machine supports masked scatter operation /// for the given \p DataType. - bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) { + bool isLegalMaskedScatter(Type *DataType, Align Alignment) { return TTI.isLegalMaskedScatter(DataType, Alignment); } /// Returns true if the target machine supports masked gather operation /// for the given \p DataType. - bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) { + bool isLegalMaskedGather(Type *DataType, Align Alignment) { return TTI.isLegalMaskedGather(DataType, Alignment); } @@ -1232,7 +1237,7 @@ public: if (!LI && !SI) return false; auto *Ty = getMemInstValueType(V); - MaybeAlign Align = getLoadStoreAlignment(V); + Align Align = getLoadStoreAlignment(V); return (LI && isLegalMaskedGather(Ty, Align)) || (SI && isLegalMaskedScatter(Ty, Align)); } @@ -1309,11 +1314,19 @@ public: /// i.e. either vector version isn't available, or is too expensive. unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); + /// Invalidates decisions already taken by the cost model. + void invalidateCostModelingDecisions() { + WideningDecisions.clear(); + Uniforms.clear(); + Scalars.clear(); + } + private: unsigned NumPredStores = 0; - /// \return An upper bound for the vectorization factor, larger than zero. - /// One is returned if vectorization should best be avoided due to cost. + /// \return An upper bound for the vectorization factor, a power-of-2 larger + /// than zero. One is returned if vectorization should best be avoided due + /// to cost. 
unsigned computeFeasibleMaxVF(unsigned ConstTripCount); /// The vectorization cost is a combination of the cost itself and a boolean @@ -1598,9 +1611,8 @@ struct LoopVectorize : public FunctionPass { explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, bool VectorizeOnlyWhenForced = false) - : FunctionPass(ID) { - Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; - Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; + : FunctionPass(ID), + Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); } @@ -1626,7 +1638,7 @@ struct LoopVectorize : public FunctionPass { [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, - GetLAA, *ORE, PSI); + GetLAA, *ORE, PSI).MadeAnyChange; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -1640,6 +1652,7 @@ struct LoopVectorize : public FunctionPass { AU.addRequired<LoopAccessLegacyAnalysis>(); AU.addRequired<DemandedBitsWrapperPass>(); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + AU.addRequired<InjectTLIMappingsLegacy>(); // We currently do not preserve loopinfo/dominator analyses with outer loop // vectorization. Until this is addressed, mark these analyses as preserved @@ -1724,9 +1737,10 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // FIXME: If the step is non-constant, we create the vector splat with // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. - Value *SplatVF = isa<Constant>(Mul) - ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) - : Builder.CreateVectorSplat(VF, Mul); + Value *SplatVF = + isa<Constant>(Mul) + ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(VF, Mul); Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -1806,57 +1820,37 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); - auto II = Legal->getInductionVars()->find(IV); - assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); + auto II = Legal->getInductionVars().find(IV); + assert(II != Legal->getInductionVars().end() && "IV is not an induction"); auto ID = II->second; assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); - // The scalar value to broadcast. This will be derived from the canonical - // induction variable. - Value *ScalarIV = nullptr; - // The value from the original loop to which we are mapping the new induction // variable. Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; - // True if we have vectorized the induction variable. - auto VectorizedIV = false; - - // Determine if we want a scalar version of the induction variable. This is - // true if the induction variable itself is not widened, or if it has at - // least one user in the loop that is not widened. - auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); + auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); // Generate code for the induction step. 
Note that induction steps are // required to be loop-invariant - assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && - "Induction step should be loop invariant"); - auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); - Value *Step = nullptr; - if (PSE.getSE()->isSCEVable(IV->getType())) { - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), - LoopVectorPreHeader->getTerminator()); - } else { - Step = cast<SCEVUnknown>(ID.getStep())->getValue(); - } - - // Try to create a new independent vector induction variable. If we can't - // create the phi node, we will splat the scalar induction variable in each - // loop iteration. - if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { - createVectorIntOrFpInductionPHI(ID, Step, EntryVal); - VectorizedIV = true; - } + auto CreateStepValue = [&](const SCEV *Step) -> Value * { + assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && + "Induction step should be loop invariant"); + if (PSE.getSE()->isSCEVable(IV->getType())) { + SCEVExpander Exp(*PSE.getSE(), DL, "induction"); + return Exp.expandCodeFor(Step, Step->getType(), + LoopVectorPreHeader->getTerminator()); + } + return cast<SCEVUnknown>(Step)->getValue(); + }; - // If we haven't yet vectorized the induction variable, or if we will create - // a scalar one, we need to define the scalar induction variable and step - // values. If we were given a truncation type, truncate the canonical + // The scalar value to broadcast. This is derived from the canonical + // induction variable. If a truncation type is given, truncate the canonical // induction variable and step. Otherwise, derive these values from the // induction descriptor. - if (!VectorizedIV || NeedsScalarIV) { - ScalarIV = Induction; + auto CreateScalarIV = [&](Value *&Step) -> Value * { + Value *ScalarIV = Induction; if (IV != OldInduction) { ScalarIV = IV->getType()->isIntegerTy() ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) @@ -1872,12 +1866,12 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); Step = Builder.CreateTrunc(Step, TruncType); } - } + return ScalarIV; + }; - // If we haven't yet vectorized the induction variable, splat the scalar - // induction variable, and build the necessary step vectors. - // TODO: Don't do it unless the vectorized IV is really required. - if (!VectorizedIV) { + // Create the vector values from the scalar IV, in the absence of creating a + // vector IV. + auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { Value *EntryPart = @@ -1887,23 +1881,53 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { addMetadata(EntryPart, Trunc); recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); } + }; + + // Now do the actual transformations, and start with creating the step value. + Value *Step = CreateStepValue(ID.getStep()); + if (VF <= 1) { + Value *ScalarIV = CreateScalarIV(Step); + CreateSplatIV(ScalarIV, Step); + return; + } + + // Determine if we want a scalar version of the induction variable. This is + // true if the induction variable itself is not widened, or if it has at + // least one user in the loop that is not widened. 
+ auto NeedsScalarIV = needsScalarInduction(EntryVal); + if (!NeedsScalarIV) { + createVectorIntOrFpInductionPHI(ID, Step, EntryVal); + return; } - // If an induction variable is only used for counting loop iterations or - // calculating addresses, it doesn't need to be widened. Create scalar steps - // that can be used by instructions we will later scalarize. Note that the - // addition of the scalar steps will not increase the number of instructions - // in the loop in the common case prior to InstCombine. We will be trading - // one vector extract for each scalar step. - if (NeedsScalarIV) + // Try to create a new independent vector induction variable. If we can't + // create the phi node, we will splat the scalar induction variable in each + // loop iteration. + if (!shouldScalarizeInstruction(EntryVal)) { + createVectorIntOrFpInductionPHI(ID, Step, EntryVal); + Value *ScalarIV = CreateScalarIV(Step); + // Create scalar steps that can be used by instructions we will later + // scalarize. Note that the addition of the scalar steps will not increase + // the number of instructions in the loop in the common case prior to + // InstCombine. We will be trading one vector extract for each scalar step. buildScalarSteps(ScalarIV, Step, EntryVal, ID); + return; + } + + // All IV users are scalar instructions, so only emit a scalar IV, not a + // vectorised IV. Except when we tail-fold, then the splat IV feeds the + // predicate used by the masked loads/stores. + Value *ScalarIV = CreateScalarIV(Step); + if (!Cost->isScalarEpilogueAllowed()) + CreateSplatIV(ScalarIV, Step); + buildScalarSteps(ScalarIV, Step, EntryVal, ID); } Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, Instruction::BinaryOps BinOp) { // Create and check the types. - assert(Val->getType()->isVectorTy() && "Must be a vector"); - int VLen = Val->getType()->getVectorNumElements(); + auto *ValVTy = cast<VectorType>(Val->getType()); + int VLen = ValVTy->getNumElements(); Type *STy = Val->getType()->getScalarType(); assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && @@ -2052,7 +2076,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { // Initialize packing with insertelements to start from undef. 
- Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); + Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF)); VectorLoopValueMap.setVectorValue(V, Part, Undef); for (unsigned Lane = 0; Lane < VF; ++Lane) packScalarIntoVectorValue(V, {Part, Lane}); @@ -2118,13 +2142,12 @@ void InnerLoopVectorizer::packScalarIntoVectorValue( Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); - SmallVector<Constant *, 8> ShuffleMask; + SmallVector<int, 8> ShuffleMask; for (unsigned i = 0; i < VF; ++i) - ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); + ShuffleMask.push_back(VF - i - 1); return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), - ConstantVector::get(ShuffleMask), - "reverse"); + ShuffleMask, "reverse"); } // Return whether we allow using masked interleave-groups (for dealing with @@ -2166,24 +2189,16 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B -void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, - VPTransformState &State, - VPValue *Addr, - VPValue *BlockInMask) { - const InterleaveGroup<Instruction> *Group = - Cost->getInterleavedAccessGroup(Instr); - assert(Group && "Fail to get an interleaved access group."); - - // Skip if current instruction is not the insert position. - if (Instr != Group->getInsertPos()) - return; - +void InnerLoopVectorizer::vectorizeInterleaveGroup( + const InterleaveGroup<Instruction> *Group, VPTransformState &State, + VPValue *Addr, VPValue *BlockInMask) { + Instruction *Instr = Group->getInsertPos(); const DataLayout &DL = Instr->getModule()->getDataLayout(); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); - Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); + auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF); // Prepare for the new pointers. SmallVector<Value *, 2> AddrParts; @@ -2252,21 +2267,21 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); - auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); + BlockInMaskPart, Undefs, + createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); GroupMask = MaskForGaps ? 
Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) : ShuffledMask; } NewLoad = - Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(), + Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), GroupMask, UndefVec, "wide.masked.vec"); } else NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], - Group->getAlignment(), "wide.vec"); + Group->getAlign(), "wide.vec"); Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); } @@ -2280,14 +2295,14 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, if (!Member) continue; - Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); + auto StrideMask = createStrideMask(I, InterleaveFactor, VF); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( NewLoads[Part], UndefVec, StrideMask, "strided.vec"); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { - VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } @@ -2301,7 +2316,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, } // The sub vector type for current instruction. - VectorType *SubVT = VectorType::get(ScalarTy, VF); + auto *SubVT = FixedVectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. for (unsigned Part = 0; Part < UF; Part++) { @@ -2329,23 +2344,23 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, Value *WideVec = concatenateVectors(Builder, StoredVecs); // Interleave the elements in the wide vector. - Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); - Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, - "interleaved.vec"); + Value *IVec = Builder.CreateShuffleVector( + WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), + "interleaved.vec"); Instruction *NewStoreInstr; if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); - auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); + BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), + "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( - IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask); + IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); } else - NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part], - Group->getAlignment()); + NewStoreInstr = + Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); Group->addMetadata(NewStoreInstr); } @@ -2354,27 +2369,26 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, VPValue *Addr, + VPValue *StoredValue, VPValue *BlockInMask) { // Attempt to issue a wide load. 
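A note on the shuffle masks used in the interleave-group code above: the createReplicatedMask/createStrideMask/createInterleaveMask helpers now return plain integer index lists, and CreateShuffleVector accepts them directly, so the old ConstantVector detour is gone. A minimal sketch with an IRBuilder<> B, using the same argument order as the patch:

// Replicate each mask lane InterleaveFactor times, e.g. for a factor of 3:
// <m0, m1, ...> becomes <m0, m0, m0, m1, m1, m1, ...>.
Value *ShuffledMask = B.CreateShuffleVector(
    BlockInMaskPart, UndefValue::get(BlockInMaskPart->getType()),
    createReplicatedMask(InterleaveFactor, VF), "interleaved.mask");

// De-interleave member I of the wide load: indices I, I + Factor, I + 2*Factor, ...
Value *StridedVec = B.CreateShuffleVector(
    WideLoad, UndefValue::get(WideLoad->getType()),
    createStrideMask(I, InterleaveFactor, VF), "strided.vec");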
LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); assert((LI || SI) && "Invalid Load/Store instruction"); + assert((!SI || StoredValue) && "No stored value provided for widened store"); + assert((!LI || !StoredValue) && "Stored value provided for widened load"); LoopVectorizationCostModel::InstWidening Decision = Cost->getWideningDecision(Instr, VF); - assert(Decision != LoopVectorizationCostModel::CM_Unknown && - "CM decision should be taken at this point"); - if (Decision == LoopVectorizationCostModel::CM_Interleave) - return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask); + assert((Decision == LoopVectorizationCostModel::CM_Widen || + Decision == LoopVectorizationCostModel::CM_Widen_Reverse || + Decision == LoopVectorizationCostModel::CM_GatherScatter) && + "CM decision is not to widen the memory instruction"); Type *ScalarDataTy = getMemInstValueType(Instr); - Type *DataTy = VectorType::get(ScalarDataTy, VF); - // An alignment of 0 means target abi alignment. We need to use the scalar's - // target abi alignment in such a case. - const DataLayout &DL = Instr->getModule()->getDataLayout(); - const Align Alignment = - DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); + auto *DataTy = FixedVectorType::get(ScalarDataTy, VF); + const Align Alignment = getLoadStoreAlignment(Instr); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. @@ -2431,12 +2445,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, for (unsigned Part = 0; Part < UF; ++Part) { Instruction *NewSI = nullptr; - Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); + Value *StoredVal = State.get(StoredValue, Part); if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(Addr, Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, - Alignment.value(), MaskPart); + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, + MaskPart); } else { if (Reverse) { // If we store to reverse consecutive memory locations, then we need @@ -2447,11 +2461,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, } auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) - NewSI = Builder.CreateMaskedStore( - StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]); + NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, + BlockInMaskParts[Part]); else - NewSI = - Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); + NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); } addMetadata(NewSI, SI); } @@ -2466,18 +2479,18 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(Addr, Part); - NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart, + NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, nullptr, "wide.masked.gather"); addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) NewLI = Builder.CreateMaskedLoad( - VecPtr, Alignment.value(), BlockInMaskParts[Part], - UndefValue::get(DataTy), "wide.masked.load"); + VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), + "wide.masked.load"); else - NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(), - "wide.load"); + NewLI = + Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); // Add metadata to the load, but setVectorValue to the reverse shuffle. addMetadata(NewLI, LI); @@ -2488,9 +2501,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, } } -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, const VPIteration &Instance, - bool IfPredicateInstr) { + bool IfPredicateInstr, + VPTransformState &State) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); setDebugLocFromInst(Builder, Instr); @@ -2504,8 +2518,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. - for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); + for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { + auto *NewOp = State.get(User.getOperand(op), Instance); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -2578,7 +2592,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { // compare. The only way that we get a backedge taken count is that the // induction variable was signed and as such will not overflow. In such a case // truncation is legal. - if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > + if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > IdxTy->getPrimitiveSizeInBits()) BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); @@ -2676,7 +2690,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, "Only one type should be a floating point type"); Type *IntTy = IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); - VectorType *VecIntTy = VectorType::get(IntTy, VF); + auto *VecIntTy = FixedVectorType::get(IntTy, VF); Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); return Builder.CreateBitOrPointerCast(CastVal, DstVTy); } @@ -2774,12 +2788,17 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. 
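Alignment in the memory path above is now an llvm::Align rather than a MaybeAlign, so the DataLayout ABI-alignment fallback is no longer needed and the masked-memory builders take the Align directly. A condensed sketch, with VecPtr, VectorGep, DataTy and Mask standing in for the values computed in vectorizeMemoryInstruction:

const Align Alignment = getLoadStoreAlignment(Instr); // always a concrete alignment

Value *WideLoad = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask,
                                           UndefValue::get(DataTy),
                                           "wide.masked.load");
Value *Gather = Builder.CreateMaskedGather(VectorGep, Alignment, Mask,
                                           /*PassThru=*/nullptr,
                                           "wide.masked.gather");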
+ auto *LAI = Legal->getLAI(); + const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); + if (!RtPtrChecking.Need) + return; Instruction *FirstCheckInst; Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); - if (!MemRuntimeCheck) - return; + addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, + RtPtrChecking.getChecks(), RtPtrChecking.getSE()); + assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " + "claimed checks are required"); if (MemCheckBlock->getParent()->hasOptSize()) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && @@ -2858,6 +2877,18 @@ Value *InnerLoopVectorizer::emitTransformedIndex( return B.CreateMul(X, Y); }; + // Get a suitable insert point for SCEV expansion. For blocks in the vector + // loop, choose the end of the vector loop header (=LoopVectorBody), because + // the DomTree is not kept up-to-date for additional blocks generated in the + // vector loop. By using the header as insertion point, we guarantee that the + // expanded instructions dominate all their uses. + auto GetInsertPoint = [this, &B]() { + BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); + if (InsertBB != LoopVectorBody && + LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) + return LoopVectorBody->getTerminator(); + return &*B.GetInsertPoint(); + }; switch (ID.getKind()) { case InductionDescriptor::IK_IntInduction: { assert(Index->getType() == StartValue->getType() && @@ -2865,7 +2896,7 @@ Value *InnerLoopVectorizer::emitTransformedIndex( if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) return B.CreateSub(StartValue, Index); auto *Offset = CreateMul( - Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); + Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); return CreateAdd(StartValue, Offset); } case InductionDescriptor::IK_PtrInduction: { @@ -2873,8 +2904,8 @@ Value *InnerLoopVectorizer::emitTransformedIndex( "Expected constant step for pointer induction"); return B.CreateGEP( StartValue->getType()->getPointerElementType(), StartValue, - CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), - &*B.GetInsertPoint()))); + CreateMul(Index, + Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); } case InductionDescriptor::IK_FpInduction: { assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); @@ -3034,8 +3065,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // This variable saves the new starting index for the scalar loop. It is used // to test if there are any tail iterations left once the vector loop has // completed. 
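The runtime-check emission above now consults the RuntimePointerChecking result up front and calls the free-standing addRuntimeChecks utility only when checks are actually required, rather than relying on a null return to detect the no-check case. Roughly:

const auto &RtPtrChecking = *Legal->getLAI()->getRuntimePointerChecking();
if (!RtPtrChecking.Need)
  return; // no may-alias pointer groups, so no check block is emitted

Instruction *FirstCheckInst, *MemRuntimeCheck;
std::tie(FirstCheckInst, MemRuntimeCheck) =
    addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
                     RtPtrChecking.getChecks(), RtPtrChecking.getSE());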
- LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); - for (auto &InductionEntry : *List) { + for (auto &InductionEntry : Legal->getInductionVars()) { PHINode *OrigPhi = InductionEntry.first; InductionDescriptor II = InductionEntry.second; @@ -3258,7 +3288,6 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize) { Function *F = CI->getCalledFunction(); - StringRef FnName = CI->getCalledFunction()->getName(); Type *ScalarRetTy = CI->getType(); SmallVector<Type *, 4> Tys, ScalarTys; for (auto &ArgOp : CI->arg_operands()) @@ -3268,7 +3297,8 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // to be vectors, so we need to extract individual elements from there, // execute VF scalar calls, and then gather the result into the vector return // value. - unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); + unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, + TTI::TCK_RecipThroughput); if (VF == 1) return ScalarCallCost; @@ -3286,11 +3316,15 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. NeedToScalarize = true; - if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) + VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); + Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); + + if (!TLI || CI->isNoBuiltin() || !VecFunc) return Cost; // If the corresponding vector cost is cheaper, return its cost. - unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); + unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, + TTI::TCK_RecipThroughput); if (VectorCallCost < Cost) { NeedToScalarize = false; return VectorCallCost; @@ -3303,22 +3337,20 @@ unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); - FastMathFlags FMF; - if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) - FMF = FPMO->getFastMathFlags(); - - SmallVector<Value *, 4> Operands(CI->arg_operands()); - return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF); + IntrinsicCostAttributes CostAttrs(ID, *CI, VF); + return TTI.getIntrinsicInstrCost(CostAttrs, + TargetTransformInfo::TCK_RecipThroughput); } static Type *smallestIntegerVectorType(Type *T1, Type *T2) { - auto *I1 = cast<IntegerType>(T1->getVectorElementType()); - auto *I2 = cast<IntegerType>(T2->getVectorElementType()); + auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); + auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; } + static Type *largestIntegerVectorType(Type *T1, Type *T2) { - auto *I1 = cast<IntegerType>(T1->getVectorElementType()); - auto *I2 = cast<IntegerType>(T2->getVectorElementType()); + auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); + auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); return I1->getBitWidth() > I2->getBitWidth() ? 
T1 : T2; } @@ -3335,14 +3367,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { continue; for (unsigned Part = 0; Part < UF; ++Part) { Value *I = getOrCreateVectorValue(KV.first, Part); - if (Erased.find(I) != Erased.end() || I->use_empty() || - !isa<Instruction>(I)) + if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) continue; Type *OriginalTy = I->getType(); Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), KV.second); - Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, - OriginalTy->getVectorNumElements()); + auto *TruncatedTy = FixedVectorType::get( + ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements()); if (TruncatedTy == OriginalTy) continue; @@ -3392,27 +3423,35 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { break; } } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { - auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); + auto Elements0 = + cast<VectorType>(SI->getOperand(0)->getType())->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( - SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); - auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); + SI->getOperand(0), + FixedVectorType::get(ScalarTruncatedTy, Elements0)); + auto Elements1 = + cast<VectorType>(SI->getOperand(1)->getType())->getNumElements(); auto *O1 = B.CreateZExtOrTrunc( - SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); + SI->getOperand(1), + FixedVectorType::get(ScalarTruncatedTy, Elements1)); - NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); + NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { // Don't do anything with the operands, just extend the result. continue; } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { - auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); + auto Elements = + cast<VectorType>(IE->getOperand(0)->getType())->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( - IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); + IE->getOperand(0), + FixedVectorType::get(ScalarTruncatedTy, Elements)); auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { - auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); + auto Elements = + cast<VectorType>(EE->getOperand(0)->getType())->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( - EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); + EE->getOperand(0), + FixedVectorType::get(ScalarTruncatedTy, Elements)); NewI = B.CreateExtractElement(O0, EE->getOperand(2)); } else { // If we don't know what to do, be conservative and don't do anything. @@ -3471,7 +3510,7 @@ void InnerLoopVectorizer::fixVectorizedLoop() { PSE.getSE()->forgetLoop(OrigLoop); // Fix-up external users of the induction variables. - for (auto &Entry : *Legal->getInductionVars()) + for (auto &Entry : Legal->getInductionVars()) fixupIVUsers(Entry.first, Entry.second, getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), IVEndValues[Entry.first], LoopMiddleBlock); @@ -3482,6 +3521,19 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // Remove redundant induction instructions. cse(LoopVectorBody); + + // Set/update profile weights for the vector and remainder loops as original + // loop iterations are now distributed among them. 
Note that original loop + // represented by LoopScalarBody becomes remainder loop after vectorization. + // + // For cases like foldTailByMasking() and requiresScalarEpiloque() we may + // end up getting slightly roughened result but that should be OK since + // profile is not inherently precise anyway. Note also possible bypass of + // vector code caused by legality checks is ignored, assigning all the weight + // to the vector loop, optimistically. + setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), + LI->getLoopFor(LoopVectorBody), + LI->getLoopFor(LoopScalarBody), VF * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs() { @@ -3563,8 +3615,8 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { if (VF > 1) { Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); VectorInit = Builder.CreateInsertElement( - UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, - Builder.getInt32(VF - 1), "vector.recur.init"); + UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)), + VectorInit, Builder.getInt32(VF - 1), "vector.recur.init"); } // We constructed a temporary phi node in the first phase of vectorization. @@ -3605,10 +3657,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. - SmallVector<Constant *, 8> ShuffleMask(VF); - ShuffleMask[0] = Builder.getInt32(VF - 1); + SmallVector<int, 8> ShuffleMask(VF); + ShuffleMask[0] = VF - 1; for (unsigned I = 1; I < VF; ++I) - ShuffleMask[I] = Builder.getInt32(I + VF - 1); + ShuffleMask[I] = I + VF - 1; // The vector from which to take the initial value for the current iteration // (actual or unrolled). Initially, this is the vector phi node. @@ -3618,10 +3670,9 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { for (unsigned Part = 0; Part < UF; ++Part) { Value *PreviousPart = getOrCreateVectorValue(Previous, Part); Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); - auto *Shuffle = - VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, - ConstantVector::get(ShuffleMask)) - : Incoming; + auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, + ShuffleMask) + : Incoming; PhiPart->replaceAllUsesWith(Shuffle); cast<Instruction>(PhiPart)->eraseFromParent(); VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); @@ -3684,7 +3735,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // Get it's reduction variable descriptor. assert(Legal->isReductionVariable(Phi) && "Unable to find the reduction variable"); - RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi]; + RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); @@ -3725,7 +3776,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // incoming scalar reduction. VectorStart = ReductionStartValue; } else { - Identity = ConstantVector::getSplat(VF, Iden); + Identity = ConstantVector::getSplat({VF, false}, Iden); // This vector is the Identity vector where the first element is the // incoming scalar reduction. @@ -3787,7 +3838,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. 
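The first-order recurrence fix-up above now builds its splice mask as plain integers. With VF = 4 the mask is <3, 4, 5, 6>: lane 0 takes the last element of the vector from the previous iteration and lanes 1 to 3 take the first three elements of the current one. A sketch, assuming an IRBuilder<> B and two vectors Prev and Curr of width VF:

SmallVector<int, 8> ShuffleMask(VF);
ShuffleMask[0] = VF - 1;            // last lane of the first operand (Prev)
for (unsigned I = 1; I < VF; ++I)
  ShuffleMask[I] = I + VF - 1;      // lanes 0 .. VF-2 of the second operand (Curr)

// Indices 0..VF-1 select from Prev, VF..2*VF-1 select from Curr.
Value *Spliced = B.CreateShuffleVector(Prev, Curr, ShuffleMask, "vector.recur");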
if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { - Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); + Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF); Builder.SetInsertPoint( LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); VectorParts RdxParts(UF); @@ -4036,9 +4087,11 @@ void InnerLoopVectorizer::fixNonInductionPHIs() { } } -void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, - unsigned VF, bool IsPtrLoopInvariant, - SmallBitVector &IsIndexLoopInvariant) { +void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, + unsigned UF, unsigned VF, + bool IsPtrLoopInvariant, + SmallBitVector &IsIndexLoopInvariant, + VPTransformState &State) { // Construct a vector GEP by widening the operands of the scalar GEP as // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP // results in a vector of pointers when at least one operand of the GEP @@ -4075,19 +4128,18 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, for (unsigned Part = 0; Part < UF; ++Part) { // The pointer operand of the new GEP. If it's loop-invariant, we // won't broadcast it. - auto *Ptr = IsPtrLoopInvariant - ? GEP->getPointerOperand() - : getOrCreateVectorValue(GEP->getPointerOperand(), Part); + auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) + : State.get(Operands.getOperand(0), Part); // Collect all the indices for the new GEP. If any index is // loop-invariant, we won't broadcast it. SmallVector<Value *, 4> Indices; - for (auto Index : enumerate(GEP->indices())) { - Value *User = Index.value().get(); - if (IsIndexLoopInvariant[Index.index()]) - Indices.push_back(User); + for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { + VPValue *Operand = Operands.getOperand(I); + if (IsIndexLoopInvariant[I - 1]) + Indices.push_back(State.get(Operand, {0, 0})); else - Indices.push_back(getOrCreateVectorValue(User, Part)); + Indices.push_back(State.get(Operand, Part)); } // Create the new GEP. Note that this GEP may be a scalar if VF == 1, @@ -4114,7 +4166,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Create a vector phi with no operands - the vector phi operands will be // set at the end of vector code generation. Type *VecTy = - (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); + (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF); Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); VectorLoopValueMap.setVectorValue(P, 0, VecPhi); OrigPHIsToFix.push_back(P); @@ -4133,7 +4185,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, for (unsigned Part = 0; Part < UF; ++Part) { // This is phase one of vectorizing PHIs. Type *VecTy = - (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); + (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF); Value *EntryPart = PHINode::Create( VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); VectorLoopValueMap.setVectorValue(P, Part, EntryPart); @@ -4145,9 +4197,9 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // This PHINode must be an induction variable. // Make sure that we know about it. 
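In widenGEP above, loop-invariant operands are now read from VPlan at the first part and first lane, the canonical location for a uniform value, while varying operands get the per-part vector. A compact restatement with explanatory comments:

// Uniform (loop-invariant) pointer: every part and lane sees the same value,
// so {Part 0, Lane 0} is used; otherwise take the widened value for this Part.
Value *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
                                : State.get(Operands.getOperand(0), Part);

SmallVector<Value *, 4> Indices;
for (unsigned I = 1, E = Operands.getNumOperands(); I < E; ++I)
  Indices.push_back(IsIndexLoopInvariant[I - 1]
                        ? State.get(Operands.getOperand(I), {0, 0})
                        : State.get(Operands.getOperand(I), Part));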
- assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); + assert(Legal->getInductionVars().count(P) && "Not an induction variable"); - InductionDescriptor II = Legal->getInductionVars()->lookup(P); + InductionDescriptor II = Legal->getInductionVars().lookup(P); const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); // FIXME: The newly created binary instructions should contain nsw/nuw flags, @@ -4203,11 +4255,14 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::widenInstruction(Instruction &I) { +void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, + VPTransformState &State) { switch (I.getOpcode()) { + case Instruction::Call: case Instruction::Br: case Instruction::PHI: case Instruction::GetElementPtr: + case Instruction::Select: llvm_unreachable("This instruction is handled by a different recipe."); case Instruction::UDiv: case Instruction::SDiv: @@ -4233,8 +4288,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { for (unsigned Part = 0; Part < UF; ++Part) { SmallVector<Value *, 2> Ops; - for (Value *Op : I.operands()) - Ops.push_back(getOrCreateVectorValue(Op, Part)); + for (VPValue *VPOp : User.operands()) + Ops.push_back(State.get(VPOp, Part)); Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); @@ -4248,35 +4303,6 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { break; } - case Instruction::Select: { - // Widen selects. - // If the selector is loop invariant we can create a select - // instruction with a scalar condition. Otherwise, use vector-select. - auto *SE = PSE.getSE(); - bool InvariantCond = - SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); - setDebugLocFromInst(Builder, &I); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - - auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); - Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); - Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); - Value *Sel = - Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); - VectorLoopValueMap.setVectorValue(&I, Part, Sel); - addMetadata(Sel, &I); - } - - break; - } - case Instruction::ICmp: case Instruction::FCmp: { // Widen compares. Generate vector compares. @@ -4284,8 +4310,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { auto *Cmp = cast<CmpInst>(&I); setDebugLocFromInst(Builder, Cmp); for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); - Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); + Value *A = State.get(User.getOperand(0), Part); + Value *B = State.get(User.getOperand(1), Part); Value *C = nullptr; if (FCmp) { // Propagate fast math flags. @@ -4319,78 +4345,80 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { /// Vectorize casts. Type *DestTy = - (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); + (VF == 1) ? 
CI->getType() : FixedVectorType::get(CI->getType(), VF); for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); + Value *A = State.get(User.getOperand(0), Part); Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); VectorLoopValueMap.setVectorValue(&I, Part, Cast); addMetadata(Cast, &I); } break; } + default: + // This instruction is not vectorized by simple widening. + LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); + } // end of switch. +} - case Instruction::Call: { - // Ignore dbg intrinsics. - if (isa<DbgInfoIntrinsic>(I)) - break; - setDebugLocFromInst(Builder, &I); - - Module *M = I.getParent()->getParent()->getParent(); - auto *CI = cast<CallInst>(&I); +void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, + VPTransformState &State) { + assert(!isa<DbgInfoIntrinsic>(I) && + "DbgInfoIntrinsic should have been dropped during VPlan construction"); + setDebugLocFromInst(Builder, &I); - StringRef FnName = CI->getCalledFunction()->getName(); - Function *F = CI->getCalledFunction(); - Type *RetTy = ToVectorTy(CI->getType(), VF); - SmallVector<Type *, 4> Tys; - for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); + Module *M = I.getParent()->getParent()->getParent(); + auto *CI = cast<CallInst>(&I); - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + SmallVector<Type *, 4> Tys; + for (Value *ArgOperand : CI->arg_operands()) + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? - bool NeedToScalarize; - unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); - bool UseVectorIntrinsic = - ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; - assert((UseVectorIntrinsic || !NeedToScalarize) && - "Instruction should be scalarized elsewhere."); + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Value *, 4> Args; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - Value *Arg = CI->getArgOperand(i); - // Some intrinsics have a scalar argument - don't replace it with a - // vector. - if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) - Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); - Args.push_back(Arg); - } + // The flag shows whether we use Intrinsic or a usual Call for vectorized + // version of the instruction. + // Is it beneficial to perform intrinsic call compared to lib call? + bool NeedToScalarize = false; + unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); + bool UseVectorIntrinsic = + ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; + assert((UseVectorIntrinsic || !NeedToScalarize) && + "Instruction should be scalarized elsewhere."); - Function *VectorF; - if (UseVectorIntrinsic) { - // Use vector version of the intrinsic. - Type *TysForDecl[] = {CI->getType()}; - if (VF > 1) - TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); - VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); - } else { - // Use vector version of the library call. 
- StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); - assert(!VFnName.empty() && "Vector function name is empty."); - VectorF = M->getFunction(VFnName); - if (!VectorF) { - // Generate a declaration - FunctionType *FTy = FunctionType::get(RetTy, Tys, false); - VectorF = - Function::Create(FTy, Function::ExternalLinkage, VFnName, M); - VectorF->copyAttributesFrom(F); - } - } - assert(VectorF && "Can't create vector function."); + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<Value *, 4> Args; + for (auto &I : enumerate(ArgOperands.operands())) { + // Some intrinsics have a scalar argument - don't replace it with a + // vector. + Value *Arg; + if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) + Arg = State.get(I.value(), Part); + else + Arg = State.get(I.value(), {0, 0}); + Args.push_back(Arg); + } + Function *VectorF; + if (UseVectorIntrinsic) { + // Use vector version of the intrinsic. + Type *TysForDecl[] = {CI->getType()}; + if (VF > 1) + TysForDecl[0] = + FixedVectorType::get(CI->getType()->getScalarType(), VF); + VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); + assert(VectorF && "Can't retrieve vector intrinsic."); + } else { + // Use vector version of the function call. + const VFShape Shape = + VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); +#ifndef NDEBUG + assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && + "Can't create vector function."); +#endif + VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); + } SmallVector<OperandBundleDef, 1> OpBundles; CI->getOperandBundlesAsDefs(OpBundles); CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); @@ -4400,16 +4428,31 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { VectorLoopValueMap.setVectorValue(&I, Part, V); addMetadata(V, &I); - } - - break; } +} - default: - // This instruction is not vectorized by simple widening. - LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); - llvm_unreachable("Unhandled instruction!"); - } // end of switch. +void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, + VPUser &Operands, + bool InvariantCond, + VPTransformState &State) { + setDebugLocFromInst(Builder, &I); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + auto *InvarCond = + InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; + + for (unsigned Part = 0; Part < UF; ++Part) { + Value *Cond = + InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); + Value *Op0 = State.get(Operands.getOperand(1), Part); + Value *Op1 = State.get(Operands.getOperand(2), Part); + Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); + VectorLoopValueMap.setVectorValue(&I, Part, Sel); + addMetadata(Sel, &I); + } } void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { @@ -4502,7 +4545,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { } } for (auto *I : ScalarPtrs) - if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { + if (!PossibleNonScalarPtrs.count(I)) { LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); Worklist.insert(I); } @@ -4513,7 +4556,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // TODO: Once we are able to vectorize pointer induction variables we should // no longer insert them into the worklist here. 
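Both getVectorCallCost and widenCallInstruction above now resolve vector variants through the VFDatabase attached to the call site instead of TLI->getVectorizedFunction, which is why the pass now also requires InjectTLIMappings (it records the available mappings as vector-function-abi-variant attributes on the call). A minimal query for a fixed VF, using only the calls visible in the patch:

// Describe the wanted shape: VF lanes, fixed width, no global predicate.
VFShape Shape = VFShape::get(*CI, {VF, /*Scalable=*/false},
                             /*HasGlobalPred=*/false);

// Returns the matching vector Function, or nullptr if no variant exists.
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
bool CanWiden = VecFunc && !CI->isNoBuiltin();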
auto *Latch = TheLoop->getLoopLatch(); - for (auto &Induction : *Legal->getInductionVars()) { + for (auto &Induction : Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) @@ -4556,7 +4599,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // An induction variable will remain scalar if all users of the induction // variable and induction variable update remain scalar. - for (auto &Induction : *Legal->getInductionVars()) { + for (auto &Induction : Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); @@ -4568,6 +4611,11 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) continue; + // If tail-folding is applied, the primary induction variable will be used + // to feed a vector compare. + if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) + continue; + // Determine if all users of the induction variable are scalar after // vectorization. auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { @@ -4618,7 +4666,7 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne "Widening decision should be ready at this moment"); return WideningDecision == CM_Scalarize; } - const MaybeAlign Alignment = getLoadStoreAlignment(I); + const Align Alignment = getLoadStoreAlignment(I); return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty, Alignment)) : !(isLegalMaskedStore(Ty, Ptr, Alignment) || @@ -4665,7 +4713,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, "Masked interleave-groups for predicated accesses are not enabled."); auto *Ty = getMemInstValueType(I); - const MaybeAlign Alignment = getLoadStoreAlignment(I); + const Align Alignment = getLoadStoreAlignment(I); return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) : TTI.isLegalMaskedStore(Ty, Alignment); } @@ -4803,7 +4851,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // Add to the Worklist all consecutive and consecutive-like pointers that // aren't also identified as possibly non-uniform. for (auto *V : ConsecutiveLikePtrs) - if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) + if (!PossibleNonUniformPtrs.count(V)) addToWorklistIfAllowed(V); // Expand Worklist in topological order: whenever a new instruction @@ -4847,7 +4895,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // nodes separately. An induction variable will remain uniform if all users // of the induction variable and induction variable update remain uniform. // The code below handles both pointer and non-pointer induction variables. - for (auto &Induction : *Legal->getInductionVars()) { + for (auto &Induction : Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); @@ -4903,10 +4951,9 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { // FIXME: Avoid specializing for stride==1 instead of bailing out. if (!Legal->getLAI()->getSymbolicStrides().empty()) { - reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", + reportVectorizationFailure("Runtime stride check for small trip count", "runtime stride == 1 checks needed. 
Enable vectorization of "
- "this loop with '#pragma clang loop vectorize(enable)' when "
- "compiling with -Os/-Oz",
+ "this loop without such check by compiling with -Os/-Oz",
"CantVersionLoopWithOptForSize", ORE, TheLoop);
return true;
}
@@ -4914,7 +4961,8 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
return false;
}
-Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
+ unsigned UserIC) {
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
// TODO: It may be useful to do since it's still likely to be dynamically
// uniform if the target can skip.
@@ -4936,7 +4984,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed:
- return computeFeasibleMaxVF(TC);
+ return UserVF ? UserVF : computeFeasibleMaxVF(TC);
case CM_ScalarEpilogueNotNeededUsePredicate:
LLVM_DEBUG(
dbgs() << "LV: vector predicate hint/switch found.\n"
@@ -4964,11 +5012,18 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
// Invalidate interleave groups that require an epilogue if we can't mask
// the interleave-group.
- if (!useMaskedInterleavedAccesses(TTI))
+ if (!useMaskedInterleavedAccesses(TTI)) {
+ assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
+ "No decisions should have been taken at this point");
+ // Note: There is no need to invalidate any cost modeling decisions here, as
+ // none were taken so far.
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+ }
- unsigned MaxVF = computeFeasibleMaxVF(TC);
- if (TC > 0 && TC % MaxVF == 0) {
+ unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
+ assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
+ unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
+ if (TC > 0 && TC % MaxVFtimesIC == 0) {
// Accept MaxVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
return MaxVF;
@@ -5015,7 +5070,9 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
- unsigned MaxVectorSize = WidestRegister / WidestType;
+ // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
+ // Note that both WidestRegister and WidestType may not be powers of 2.
+ unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
<< " / " << WidestType << " bits.\n");
@@ -5140,7 +5197,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
Type *T = I.getType();
// Skip ignored values.
- if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
+ if (ValuesToIgnore.count(&I))
continue;
// Only examine Loads, Stores and PHINodes.
@@ -5152,7 +5209,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
if (auto *PN = dyn_cast<PHINode>(&I)) {
if (!Legal->isReductionVariable(PN))
continue;
- RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
+ RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
T = RdxDesc.getRecurrenceType();
}
@@ -5294,7 +5351,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
// Interleave if we vectorized this loop and there is a reduction that could
// benefit from interleaving.
- if (VF > 1 && !Legal->getReductionVars()->empty()) { + if (VF > 1 && !Legal->getReductionVars().empty()) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -5325,7 +5382,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // by this point), we can increase the critical path length if the loop // we're interleaving is inside another loop. Limit, by default to 2, so the // critical path only gets increased by one reduction operation. - if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) { + if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); SmallIC = std::min(SmallIC, F); StoresIC = std::min(StoresIC, F); @@ -5345,7 +5402,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // Interleave if this is a large loop (small loops are already dealt with by // this point) that could benefit from interleaving. - bool HasReductions = !Legal->getReductionVars()->empty(); + bool HasReductions = !Legal->getReductionVars().empty(); if (TTI.enableAggressiveInterleaving(HasReductions)) { LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); return IC; @@ -5459,11 +5516,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { OpenIntervals.erase(ToRemove); // Ignore instructions that are never used within the loop. - if (Ends.find(I) == Ends.end()) + if (!Ends.count(I)) continue; // Skip ignored values. - if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) + if (ValuesToIgnore.count(I)) continue; // For each VF find the maximum usage of registers. @@ -5483,7 +5540,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { collectUniformsAndScalars(VFs[j]); for (auto Inst : OpenIntervals) { // Skip ignored values for VF > 1. - if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) + if (VecValuesToIgnore.count(Inst)) continue; if (isScalarAfterVectorization(Inst, VFs[j])) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); @@ -5676,9 +5733,11 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { - ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), - true, false); - ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); + ScalarCost += TTI.getScalarizationOverhead( + cast<VectorType>(ToVectorTy(I->getType(), VF)), + APInt::getAllOnesValue(VF), true, false); + ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI, + TTI::TCK_RecipThroughput); } // Compute the scalarization overhead of needed extractelement @@ -5693,7 +5752,8 @@ int LoopVectorizationCostModel::computePredInstDiscount( Worklist.push_back(J); else if (needsExtract(J, VF)) ScalarCost += TTI.getScalarizationOverhead( - ToVectorTy(J->getType(),VF), false, true); + cast<VectorType>(ToVectorTy(J->getType(), VF)), + APInt::getAllOnesValue(VF), false, true); } // Scale the total scalar cost by block probability. @@ -5719,8 +5779,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { // For each instruction in the old loop. for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. 
- if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || - (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) + if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I))) continue; VectorizationCostTy C = getInstructionCost(&I, VF); @@ -5806,9 +5865,10 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. - const MaybeAlign Alignment = getLoadStoreAlignment(I); + const Align Alignment = getLoadStoreAlignment(I); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment, AS); + Alignment, AS, + TTI::TCK_RecipThroughput); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. @@ -5832,20 +5892,22 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); - Type *VectorTy = ToVectorTy(ValTy, VF); + auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getLoadStoreAddressSpace(I); int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Stride should be 1 or -1 for consecutive memory access"); - const MaybeAlign Alignment = getLoadStoreAlignment(I); + const Align Alignment = getLoadStoreAlignment(I); unsigned Cost = 0; if (Legal->isMaskRequired(I)) - Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, - Alignment ? Alignment->value() : 0, AS); + Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, + CostKind); else - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, + CostKind, I); bool Reverse = ConsecutiveStride < 0; if (Reverse) @@ -5856,19 +5918,22 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); - Type *VectorTy = ToVectorTy(ValTy, VF); - const MaybeAlign Alignment = getLoadStoreAlignment(I); + auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); + const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); + enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; if (isa<LoadInst>(I)) { return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + + TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, + CostKind) + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); } StoreInst *SI = cast<StoreInst>(I); bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, + CostKind) + (isLoopInvariantStoreValue ? 
0 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, @@ -5878,27 +5943,27 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); - Type *VectorTy = ToVectorTy(ValTy, VF); - const MaybeAlign Alignment = getLoadStoreAlignment(I); - Value *Ptr = getLoadStorePointerOperand(I); + auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); + const Align Alignment = getLoadStoreAlignment(I); + const Value *Ptr = getLoadStorePointerOperand(I); return TTI.getAddressComputationCost(VectorTy) + - TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, - Legal->isMaskRequired(I), - Alignment ? Alignment->value() : 0); + TTI.getGatherScatterOpCost( + I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, + TargetTransformInfo::TCK_RecipThroughput, I); } unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); - Type *VectorTy = ToVectorTy(ValTy, VF); + auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); auto Group = getInterleavedAccessGroup(I); assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); - Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); + auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor); // Holds the indices of existing members in an interleaved load group. // An interleaved store group doesn't need this as it doesn't allow gaps. @@ -5913,8 +5978,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, bool UseMaskForGaps = Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); unsigned Cost = TTI.getInterleavedMemoryOpCost( - I->getOpcode(), WideVecTy, Group->getFactor(), Indices, - Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); + I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), + AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. @@ -5932,11 +5997,12 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, // moment. 
if (VF == 1) { Type *ValTy = getMemInstValueType(I); - const MaybeAlign Alignment = getLoadStoreAlignment(I); + const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); + TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, + TTI::TCK_RecipThroughput, I); } return getWideningCost(I, VF); } @@ -5955,7 +6021,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { auto ForcedScalar = ForcedScalars.find(VF); if (VF > 1 && ForcedScalar != ForcedScalars.end()) { auto InstSet = ForcedScalar->second; - if (InstSet.find(I) != InstSet.end()) + if (InstSet.count(I)) return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); } @@ -5977,7 +6043,8 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, Type *RetTy = ToVectorTy(I->getType(), VF); if (!RetTy->isVoidTy() && (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) - Cost += TTI.getScalarizationOverhead(RetTy, true, false); + Cost += TTI.getScalarizationOverhead( + cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false); // Some targets keep addresses scalar. if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) @@ -6157,6 +6224,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); auto SE = PSE.getSE(); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { @@ -6173,21 +6241,20 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, bool ScalarPredicatedBB = false; BranchInst *BI = cast<BranchInst>(I); if (VF > 1 && BI->isConditional() && - (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != - PredicatedBBsAfterVectorization.end() || - PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != - PredicatedBBsAfterVectorization.end())) + (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || + PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { // Return cost for branches around scalarized and predicated blocks. - Type *Vec_i1Ty = - VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); - return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + - (TTI.getCFInstrCost(Instruction::Br) * VF)); + auto *Vec_i1Ty = + FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); + return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), + false, true) + + (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF)); } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) // The back-edge branch will remain, as will all scalar branches. - return TTI.getCFInstrCost(Instruction::Br); + return TTI.getCFInstrCost(Instruction::Br, CostKind); else // This branch will be eliminated by if-conversion. return 0; @@ -6202,7 +6269,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 
if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - VectorTy, VF - 1, VectorType::get(RetTy, 1)); + cast<VectorType>(VectorTy), VF - 1, + FixedVectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi @@ -6211,9 +6279,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( Instruction::Select, ToVectorTy(Phi->getType(), VF), - ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); + ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), + CostKind); - return TTI.getCFInstrCost(Instruction::PHI); + return TTI.getCFInstrCost(Instruction::PHI, CostKind); } case Instruction::UDiv: case Instruction::SDiv: @@ -6230,10 +6299,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // that we will create. This cost is likely to be zero. The phi node // cost, if any, should be scaled by the block probability because it // models a copy at the end of each predicated block. - Cost += VF * TTI.getCFInstrCost(Instruction::PHI); + Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind); // The cost of the non-predicated instruction. - Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); + Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); // The cost of insertelement and extractelement instructions needed for // scalarization. @@ -6274,13 +6343,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, SmallVector<const Value *, 4> Operands(I->operand_values()); unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; return N * TTI.getArithmeticInstrCost( - I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, + I->getOpcode(), VectorTy, CostKind, + TargetTransformInfo::OK_AnyValue, Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; return N * TTI.getArithmeticInstrCost( - I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, + I->getOpcode(), VectorTy, CostKind, + TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, I->getOperand(0), I); @@ -6291,9 +6362,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) - CondTy = VectorType::get(CondTy, VF); + CondTy = FixedVectorType::get(CondTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, + CostKind, I); } case Instruction::ICmp: case Instruction::FCmp: { @@ -6302,7 +6374,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, + I); } case Instruction::Store: case Instruction::Load: { @@ -6335,7 +6408,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (isOptimizableIVTruncate(I, VF)) { auto *Trunc = cast<TruncInst>(I); return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), - Trunc->getSrcTy(), Trunc); + Trunc->getSrcTy(), CostKind, Trunc); } Type *SrcScalarTy = I->getOperand(0)->getType(); @@ -6361,7 +6434,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; - return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); + return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, + CostKind, I); } case Instruction::Call: { bool NeedToScalarize; @@ -6374,7 +6448,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + + return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, + CostKind) + getScalarizationOverhead(I, VF); } // end of switch. } @@ -6397,6 +6472,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { @@ -6424,14 +6500,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore type-promoting instructions we identified during reduction // detection. - for (auto &Reduction : *Legal->getReductionVars()) { + for (auto &Reduction : Legal->getReductionVars()) { RecurrenceDescriptor &RedDes = Reduction.second; SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } // Ignore type-casting instructions we identified during induction // detection. 
- for (auto &Induction : *Legal->getInductionVars()) { + for (auto &Induction : Legal->getInductionVars()) { InductionDescriptor &IndDes = Induction.second; const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); @@ -6490,9 +6566,10 @@ LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { return VectorizationFactor::Disabled(); } -Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { +Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF, + unsigned UserIC) { assert(OrigLoop->empty() && "Inner loop expected."); - Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); + Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -6503,7 +6580,11 @@ Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { dbgs() << "LV: Invalidate all interleaved groups due to fold-tail by masking " "which requires masked-interleaved support.\n"); - CM.InterleaveInfo.reset(); + if (CM.InterleaveInfo.invalidateGroups()) + // Invalidating interleave groups also requires invalidating all decisions + // based on them, which includes widening decisions and uniform and scalar + // values. + CM.invalidateCostModelingDecisions(); } if (UserVF) { @@ -6563,6 +6644,7 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, &ILV, CallbackILV}; State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); + State.CanonicalIV = ILV.Induction; //===------------------------------------------------===// // @@ -6595,12 +6677,11 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions( // We create new "steps" for induction variable updates to which the original // induction variables map. An original update instruction will be dead if // all its users except the induction variable are dead. - for (auto &Induction : *Legal->getInductionVars()) { + for (auto &Induction : Legal->getInductionVars()) { PHINode *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { - return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != - DeadInstructions.end(); + return U == Ind || DeadInstructions.count(cast<Instruction>(U)); })) DeadInstructions.insert(IndUpdate); @@ -6716,7 +6797,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); assert(BI && "Unexpected terminator found"); - if (!BI->isConditional()) + if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) return EdgeMaskCache[Edge] = SrcMask; VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); @@ -6749,9 +6830,21 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. - VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); + // Start by constructing the desired canonical IV. 
+ VPValue *IV = nullptr; + if (Legal->getPrimaryInduction()) + IV = Plan->getVPValue(Legal->getPrimaryInduction()); + else { + auto IVRecipe = new VPWidenCanonicalIVRecipe(); + Builder.getInsertBlock()->appendRecipe(IVRecipe); + IV = IVRecipe->getVPValue(); + } VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); + bool TailFolded = !CM.isScalarEpilogueAllowed(); + if (TailFolded && CM.TTI.emitGetActiveLaneMask()) + BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC}); + else + BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); return BlockMaskCache[BB] = BlockMask; } @@ -6775,8 +6868,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { VPWidenMemoryInstructionRecipe * VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan) { - if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) - return nullptr; + assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && + "Must be called with either a load or store"); auto willWiden = [&](unsigned VF) -> bool { if (VF == 1) @@ -6801,22 +6894,29 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, Mask = createBlockInMask(I->getParent(), Plan); VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); - return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask); + if (LoadInst *Load = dyn_cast<LoadInst>(I)) + return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); + + StoreInst *Store = cast<StoreInst>(I); + VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); + return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); } VPWidenIntOrFpInductionRecipe * -VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { - if (PHINode *Phi = dyn_cast<PHINode>(I)) { - // Check if this is an integer or fp induction. If so, build the recipe that - // produces its scalar and vector values. - InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); - if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) - return new VPWidenIntOrFpInductionRecipe(Phi); +VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { + // Check if this is an integer or fp induction. If so, build the recipe that + // produces its scalar and vector values. + InductionDescriptor II = Legal->getInductionVars().lookup(Phi); + if (II.getKind() == InductionDescriptor::IK_IntInduction || + II.getKind() == InductionDescriptor::IK_FpInduction) + return new VPWidenIntOrFpInductionRecipe(Phi); - return nullptr; - } + return nullptr; +} +VPWidenIntOrFpInductionRecipe * +VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, + VFRange &Range) const { // Optimize the special case where the source is a constant integer // induction variable. 
Notice that we can only optimize the 'trunc' case // because (a) FP conversions lose precision, (b) sext/zext may wrap, and @@ -6830,54 +6930,89 @@ VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; }; - if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( - isOptimizableIVTruncate(I), Range)) + if (LoopVectorizationPlanner::getDecisionAndClampRange( + isOptimizableIVTruncate(I), Range)) return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), - cast<TruncInst>(I)); + I); return nullptr; } -VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { - PHINode *Phi = dyn_cast<PHINode>(I); - if (!Phi || Phi->getParent() == OrigLoop->getHeader()) - return nullptr; - +VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { // We know that all PHIs in non-header blocks are converted into selects, so // we don't have to worry about the insertion order and we can just use the // builder. At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. - SmallVector<VPValue *, 2> Masks; + SmallVector<VPValue *, 2> Operands; unsigned NumIncoming = Phi->getNumIncomingValues(); for (unsigned In = 0; In < NumIncoming; In++) { VPValue *EdgeMask = createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); assert((EdgeMask || NumIncoming == 1) && "Multiple predecessors with one having a full mask"); + Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); if (EdgeMask) - Masks.push_back(EdgeMask); + Operands.push_back(EdgeMask); } - return new VPBlendRecipe(Phi, Masks); + return new VPBlendRecipe(Phi, Operands); } -bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, - VFRange &Range) { +VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, + VPlan &Plan) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); + [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, + Range); if (IsPredicated) - return false; + return nullptr; + + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || + ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) + return nullptr; + + auto willWiden = [&](unsigned VF) -> bool { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + // The following case may be scalarized depending on the VF. + // The flag shows whether we use Intrinsic or a usual Call for vectorized + // version of the instruction. + // Is it beneficial to perform intrinsic call compared to lib call? 
+ bool NeedToScalarize = false; + unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); + bool UseVectorIntrinsic = + ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; + return UseVectorIntrinsic || !NeedToScalarize; + }; + + if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) + return nullptr; + + return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); +} +bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { + assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && + !isa<StoreInst>(I) && "Instruction should have been handled earlier"); + // Instruction should be widened, unless it is scalar after vectorization, + // scalarization is profitable or it is predicated. + auto WillScalarize = [this, I](unsigned VF) -> bool { + return CM.isScalarAfterVectorization(I, VF) || + CM.isProfitableToScalarize(I, VF) || + CM.isScalarWithPredication(I, VF); + }; + return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, + Range); +} + +VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { auto IsVectorizableOpcode = [](unsigned Opcode) { switch (Opcode) { case Instruction::Add: case Instruction::And: case Instruction::AShr: case Instruction::BitCast: - case Instruction::Br: - case Instruction::Call: case Instruction::FAdd: case Instruction::FCmp: case Instruction::FDiv: @@ -6891,11 +7026,9 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, case Instruction::FSub: case Instruction::ICmp: case Instruction::IntToPtr: - case Instruction::Load: case Instruction::LShr: case Instruction::Mul: case Instruction::Or: - case Instruction::PHI: case Instruction::PtrToInt: case Instruction::SDiv: case Instruction::Select: @@ -6903,7 +7036,6 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, case Instruction::Shl: case Instruction::SIToFP: case Instruction::SRem: - case Instruction::Store: case Instruction::Sub: case Instruction::Trunc: case Instruction::UDiv: @@ -6917,60 +7049,10 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, }; if (!IsVectorizableOpcode(I->getOpcode())) - return false; - - if (CallInst *CI = dyn_cast<CallInst>(I)) { - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || - ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) - return false; - } - - auto willWiden = [&](unsigned VF) -> bool { - if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF))) - return false; - if (CallInst *CI = dyn_cast<CallInst>(I)) { - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - // The following case may be scalarized depending on the VF. - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? 
- bool NeedToScalarize; - unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); - bool UseVectorIntrinsic = - ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; - return UseVectorIntrinsic || !NeedToScalarize; - } - if (isa<LoadInst>(I) || isa<StoreInst>(I)) { - assert(CM.getWideningDecision(I, VF) == - LoopVectorizationCostModel::CM_Scalarize && - "Memory widening decisions should have been taken care by now"); - return false; - } - return true; - }; - - if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) - return false; - // If this ingredient's recipe is to be recorded, keep its recipe a singleton - // to avoid having to split recipes later. - bool IsSingleton = Ingredient2Recipe.count(I); + return nullptr; // Success: widen this instruction. - - // Use the default widening recipe. We optimize the common case where - // consecutive instructions can be represented by a single recipe. - if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && - LastExtensibleRecipe->appendInstruction(I)) - return true; - - VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); - if (!IsSingleton) - LastExtensibleRecipe = WidenRecipe; - setRecipe(I, WidenRecipe); - VPBB->appendRecipe(WidenRecipe); - return true; + return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); } VPBasicBlock *VPRecipeBuilder::handleReplication( @@ -6984,7 +7066,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); - auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); + auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), + IsUniform, IsPredicated); setRecipe(I, Recipe); // Find if I uses a predicated instruction. If so, it will use its scalar @@ -7041,43 +7124,45 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, return Region; } -bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, - VPlanPtr &Plan, VPBasicBlock *VPBB) { - VPRecipeBase *Recipe = nullptr; - - // First, check for specific widening recipes that deal with memory +VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, + VFRange &Range, + VPlanPtr &Plan) { + // First, check for specific widening recipes that deal with calls, memory // operations, inductions and Phi nodes. - if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || - (Recipe = tryToOptimizeInduction(Instr, Range)) || - (Recipe = tryToBlend(Instr, Plan)) || - (isa<PHINode>(Instr) && - (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { - setRecipe(Instr, Recipe); - VPBB->appendRecipe(Recipe); - return true; - } + if (auto *CI = dyn_cast<CallInst>(Instr)) + return tryToWidenCall(CI, Range, *Plan); - // Handle GEP widening. 
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
- auto Scalarize = [&](unsigned VF) {
- return CM.isScalarWithPredication(Instr, VF) ||
- CM.isScalarAfterVectorization(Instr, VF) ||
- CM.isProfitableToScalarize(Instr, VF);
- };
- if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
- return false;
- VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
- setRecipe(Instr, Recipe);
- VPBB->appendRecipe(Recipe);
- return true;
+ if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
+ return tryToWidenMemory(Instr, Range, Plan);
+
+ VPRecipeBase *Recipe;
+ if (auto Phi = dyn_cast<PHINode>(Instr)) {
+ if (Phi->getParent() != OrigLoop->getHeader())
+ return tryToBlend(Phi, Plan);
+ if ((Recipe = tryToOptimizeInductionPHI(Phi)))
+ return Recipe;
+ return new VPWidenPHIRecipe(Phi);
}
- // Check if Instr is to be widened by a general VPWidenRecipe, after
- // having first checked for specific widening recipes.
- if (tryToWiden(Instr, VPBB, Range))
- return true;
+ if (isa<TruncInst>(Instr) &&
+ (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
+ return Recipe;
- return false;
+ if (!shouldWiden(Instr, Range))
+ return nullptr;
+
+ if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
+ return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
+ OrigLoop);
+
+ if (auto *SI = dyn_cast<SelectInst>(Instr)) {
+ bool InvariantCond =
+ PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
+ return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
+ InvariantCond);
+ }
+
+ return tryToWiden(Instr, *Plan);
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
@@ -7097,13 +7182,14 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
NeedDef.insert(Branch->getCondition());
}
- // If the tail is to be folded by masking, the primary induction variable
- // needs to be represented in VPlan for it to model early-exit masking.
+ // If the tail is to be folded by masking, the primary induction variable, if
+ // it exists, needs to be represented in VPlan for it to model early-exit masking.
// Also, both the Phi and the live-out instruction of each reduction are
// required in order to introduce a select between them in VPlan.
if (CM.foldTailByMasking()) {
- NeedDef.insert(Legal->getPrimaryInduction());
- for (auto &Reduction : *Legal->getReductionVars()) {
+ if (Legal->getPrimaryInduction())
+ NeedDef.insert(Legal->getPrimaryInduction());
+ for (auto &Reduction : Legal->getReductionVars()) {
NeedDef.insert(Reduction.first);
NeedDef.insert(Reduction.second.getLoopExitInstr());
}
@@ -7118,28 +7204,39 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
SmallPtrSet<Instruction *, 4> DeadInstructions;
collectTriviallyDeadInstructions(DeadInstructions);
+ // Add assume instructions we need to drop to DeadInstructions, to prevent
+ // them from being added to the VPlan.
+ // TODO: We only need to drop assumes in blocks that get flattened. If the
+ // control flow is preserved, we should keep them.
+ auto &ConditionalAssumes = Legal->getConditionalAssumes();
+ DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
+
+ DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+ // Dead instructions do not need sinking. Remove them from SinkAfter.
+ for (Instruction *I : DeadInstructions) + SinkAfter.erase(I); + for (unsigned VF = MinVF; VF < MaxVF + 1;) { VFRange SubRange = {VF, MaxVF + 1}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions)); + VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, + DeadInstructions, SinkAfter)); VF = SubRange.End; } } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, - SmallPtrSetImpl<Instruction *> &DeadInstructions) { + SmallPtrSetImpl<Instruction *> &DeadInstructions, + const DenseMap<Instruction *, Instruction *> &SinkAfter) { // Hold a mapping from predicated instructions to their recipes, in order to // fix their AlsoPack behavior if a user is determined to replicate and use a // scalar instead of vector value. DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; - DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); - SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; - VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); + VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); // --------------------------------------------------------------------------- // Pre-construction: record ingredients whose recipes we'll need to further @@ -7177,8 +7274,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // --------------------------------------------------------------------------- // Create a dummy pre-entry VPBasicBlock to start building the VPlan. + auto Plan = std::make_unique<VPlan>(); VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); - auto Plan = std::make_unique<VPlan>(VPBB); + Plan->setEntry(VPBB); // Represent values that will have defs inside VPlan. for (Value *V : NeedDef) @@ -7199,17 +7297,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. + // TODO: Model and preserve debug instrinsics in VPlan. for (Instruction &I : BB->instructionsWithoutDebug()) { Instruction *Instr = &I; // First filter out irrelevant instructions, to ensure no recipes are // built for them. - if (isa<BranchInst>(Instr) || - DeadInstructions.find(Instr) != DeadInstructions.end()) + if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) continue; - if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) + if (auto Recipe = + RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { + RecipeBuilder.setRecipe(Instr, Recipe); + VPBB->appendRecipe(Recipe); continue; + } // Otherwise, if all widening options failed, Instruction is to be // replicated. This may create a successor for VPBB. 
@@ -7264,7 +7366,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( if (CM.foldTailByMasking()) { Builder.setInsertPoint(VPBB); auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); - for (auto &Reduction : *Legal->getReductionVars()) { + for (auto &Reduction : Legal->getReductionVars()) { VPValue *Phi = Plan->getVPValue(Reduction.first); VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); @@ -7330,32 +7432,37 @@ Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( return ILV.getOrCreateScalarValue(V, Instance); } -void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { - O << " +\n" - << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; +void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); O << ", "; - getAddr()->printAsOperand(O); + getAddr()->printAsOperand(O, SlotTracker); VPValue *Mask = getMask(); if (Mask) { O << ", "; - Mask->printAsOperand(O); + Mask->printAsOperand(O, SlotTracker); } - O << "\\l\""; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) - O << " +\n" - << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; + O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; +} + +void VPWidenCallRecipe::execute(VPTransformState &State) { + State.ILV->widenCallInstruction(Ingredient, User, State); +} + +void VPWidenSelectRecipe::execute(VPTransformState &State) { + State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State); } void VPWidenRecipe::execute(VPTransformState &State) { - for (auto &Instr : make_range(Begin, End)) - State.ILV->widenInstruction(Instr); + State.ILV->widenInstruction(Ingredient, User, State); } void VPWidenGEPRecipe::execute(VPTransformState &State) { - State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, - IsIndexLoopInvariant); + State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant, + IsIndexLoopInvariant, State); } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { @@ -7376,27 +7483,27 @@ void VPBlendRecipe::execute(VPTransformState &State) { // duplications since this is a simple recursive scan, but future // optimizations will clean it up. - unsigned NumIncoming = Phi->getNumIncomingValues(); + unsigned NumIncoming = getNumIncomingValues(); - assert((User || NumIncoming == 1) && - "Multiple predecessors with predecessors having a full mask"); // Generate a sequence of selects of the form: // SELECT(Mask3, In3, - // SELECT(Mask2, In2, - // ( ...))) + // SELECT(Mask2, In2, + // SELECT(Mask1, In1, + // In0))) + // Note that Mask0 is never used: lanes for which no path reaches this phi and + // are essentially undef are taken from In0. InnerLoopVectorizer::VectorParts Entry(State.UF); for (unsigned In = 0; In < NumIncoming; ++In) { for (unsigned Part = 0; Part < State.UF; ++Part) { // We might have single edge PHIs (blocks) - use an identity // 'select' for the first PHI operand. - Value *In0 = - State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); + Value *In0 = State.get(getIncomingValue(In), Part); if (In == 0) Entry[Part] = In0; // Initialize with the first incoming value. 
else { // Select between the current value and the previous incoming edge // based on the incoming mask. - Value *Cond = State.get(User->getOperand(In), Part); + Value *Cond = State.get(getMask(In), Part); Entry[Part] = State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); } @@ -7408,19 +7515,19 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(), - getMask()); + State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); } void VPReplicateRecipe::execute(VPTransformState &State) { if (State.Instance) { // Generate a single instance. - State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated); + State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, + IsPredicated, State); // Insert scalar instance packing it into a vector. if (AlsoPack && State.VF > 1) { // If we're constructing lane 0, initialize to start from undef. if (State.Instance->Lane == 0) { - Value *Undef = - UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); + Value *Undef = UndefValue::get( + FixedVectorType::get(Ingredient->getType(), State.VF)); State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); } State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); @@ -7434,7 +7541,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) { unsigned EndLane = IsUniform ? 1 : State.VF; for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated); + State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane}, + IsPredicated, State); } void VPBranchOnMaskRecipe::execute(VPTransformState &State) { @@ -7444,15 +7552,14 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) { unsigned Lane = State.Instance->Lane; Value *ConditionBit = nullptr; - if (!User) // Block in mask is all-one. - ConditionBit = State.Builder.getTrue(); - else { - VPValue *BlockInMask = User->getOperand(0); + VPValue *BlockInMask = getMask(); + if (BlockInMask) { ConditionBit = State.get(BlockInMask, Part); if (ConditionBit->getType()->isVectorTy()) ConditionBit = State.Builder.CreateExtractElement( ConditionBit, State.Builder.getInt32(Lane)); - } + } else // Block in mask is all-one. + ConditionBit = State.Builder.getTrue(); // Replace the temporary unreachable terminator with a new conditional branch, // whose two destinations will be set later when they are created. @@ -7496,7 +7603,9 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask()); + VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; + State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, + getMask()); } // Determine how to lower the scalar epilogue, which depends on 1) optimising @@ -7513,16 +7622,15 @@ static ScalarEpilogueLowering getScalarEpilogueLowering( PGSOQueryType::IRPass); // 1) OptSize takes precedence over all other options, i.e. if this is set, // don't look at hints or options, and don't request a scalar epilogue. 
- if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) + if (OptSize) return CM_ScalarEpilogueNotAllowedOptSize; bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && !PreferPredicateOverEpilog; // 2) Next, if disabling predication is requested on the command line, honour - // this and request a scalar epilogue. Also do this if we don't have a - // primary induction variable, which is required for predication. - if (PredicateOptDisabled || !LVL.getPrimaryInduction()) + // this and request a scalar epilogue. + if (PredicateOptDisabled) return CM_ScalarEpilogueAllowed; // 3) and 4) look if enabling predication is requested on the command line, @@ -7549,6 +7657,10 @@ static bool processLoopInVPlanNativePath( OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { + if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) { + LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); + return false; + } assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); @@ -7561,7 +7673,7 @@ static bool processLoopInVPlanNativePath( // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); // Get user vectorization factor. const unsigned UserVF = Hints.getWidth(); @@ -7587,10 +7699,16 @@ static bool processLoopInVPlanNativePath( // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(); - LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); + assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } +LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) + : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || + !EnableLoopInterleaving), + VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || + !EnableLoopVectorization) {} + bool LoopVectorizePass::processLoop(Loop *L) { assert((EnableVPlanNativePath || L->empty()) && "VPlan-native path is not enabled. Only process inner loops."); @@ -7720,17 +7838,17 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); - // Get user vectorization factor. + // Get user vectorization factor and interleave count. unsigned UserVF = Hints.getWidth(); + unsigned UserIC = Hints.getInterleave(); // Plan how to best vectorize, return the best VF and its cost. 
- Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); + Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; - unsigned UserIC = Hints.getInterleave(); if (MaybeVF) { VF = *MaybeVF; @@ -7883,14 +8001,14 @@ bool LoopVectorizePass::processLoop(Loop *L) { Hints.setAlreadyVectorized(); } - LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); + assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } -bool LoopVectorizePass::runImpl( +LoopVectorizeResult LoopVectorizePass::runImpl( Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, - DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, + DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { SE = &SE_; @@ -7915,9 +8033,9 @@ bool LoopVectorizePass::runImpl( // interleaving. if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && TTI->getMaxInterleaveFactor(1) < 2) - return false; + return LoopVectorizeResult(false, false); - bool Changed = false; + bool Changed = false, CFGChanged = false; // The vectorizer requires loops to be in simplified form. // Since simplification may add new inner loops, it has to run before the @@ -7925,7 +8043,7 @@ bool LoopVectorizePass::runImpl( // will simplify all loops, regardless of whether anything end up being // vectorized. for (auto &L : *LI) - Changed |= + Changed |= CFGChanged |= simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); // Build up a worklist of inner-loops to vectorize. This is necessary as @@ -7946,11 +8064,11 @@ bool LoopVectorizePass::runImpl( // transform. Changed |= formLCSSARecursively(*L, *DT, LI, SE); - Changed |= processLoop(L); + Changed |= CFGChanged |= processLoop(L); } // Process each loop nest in the function. - return Changed; + return LoopVectorizeResult(Changed, CFGChanged); } PreservedAnalyses LoopVectorizePass::run(Function &F, @@ -7975,13 +8093,12 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }; - const ModuleAnalysisManager &MAM = - AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); + auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); ProfileSummaryInfo *PSI = - MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); - bool Changed = + MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); + LoopVectorizeResult Result = runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); - if (!Changed) + if (!Result.MadeAnyChange) return PreservedAnalyses::all(); PreservedAnalyses PA; @@ -7995,5 +8112,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, } PA.preserve<BasicAA>(); PA.preserve<GlobalsAA>(); + if (!Result.MadeCFGChange) + PA.preserveSet<CFGAnalyses>(); return PA; } |
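A note on the computeFeasibleMaxVF hunk above: the patch clamps WidestRegister / WidestType with PowerOf2Floor because the safe register width derived from the dependence distance need not be a power of two. The following is a small standalone C++ sketch of that arithmetic, not code from the patch; powerOf2Floor mirrors llvm::PowerOf2Floor, and the sample widths are made up.

#include <cstdio>

// Mirrors llvm::PowerOf2Floor from Support/MathExtras.h.
unsigned powerOf2Floor(unsigned X) {
  if (X == 0)
    return 0;
  while (X & (X - 1))     // clear low set bits until a single bit remains
    X &= X - 1;
  return X;
}

int main() {
  unsigned WidestRegister = 128;       // target vector register width, in bits
  unsigned MaxSafeRegisterWidth = 96;  // e.g. a dependence distance of three i32s
  unsigned WidestType = 32;            // widest scalar type used in the loop
  if (MaxSafeRegisterWidth < WidestRegister)
    WidestRegister = MaxSafeRegisterWidth;
  unsigned MaxVectorSize = powerOf2Floor(WidestRegister / WidestType);
  std::printf("MaxVectorSize = %u\n", MaxVectorSize);  // 96 / 32 = 3, clamped to 2
}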
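The createBlockInMask hunk forms the header block mask by comparing the (possibly newly created canonical) induction variable against the backedge-taken count, emitting either ICmpULE(IV, BTC) or VPInstruction::ActiveLaneMask when the target prefers one; IV <= BTC is used rather than IV < TC because TC = BTC + 1 may wrap while BTC cannot. Below is a standalone C++ sketch of the per-lane effect under tail folding; laneActive and the example trip count are illustrative, not from the patch.

#include <cstdio>

// Lane L of the vector iteration that starts at scalar index IV is active
// iff IV + L <= BTC, where BTC = TripCount - 1 is the backedge-taken count.
bool laneActive(unsigned IV, unsigned Lane, unsigned BTC) {
  return IV + Lane <= BTC;                 // the per-lane ICmpULE(IV, BTC)
}

int main() {
  const unsigned VF = 4;
  const unsigned TC = 10;                  // trip count, not a multiple of VF
  for (unsigned IV = 0; IV < TC; IV += VF) {
    std::printf("IV=%u mask=", IV);
    for (unsigned L = 0; L < VF; ++L)
      std::printf("%d", laneActive(IV, L, TC - 1) ? 1 : 0);
    std::printf("\n");                     // the final iteration prints 1100
  }
}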
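In the tryToWidenCall hunk, a call is kept as a widened recipe when either the vector intrinsic costs no more than the best vectorized library-call lowering, or the library call does not need to be scalarized at all. A sketch of that predicate with made-up cost numbers follows; the CallCosts struct and willWidenCall are illustrative names, not part of the patch.

#include <cstdio>

struct CallCosts {
  bool HasVectorIntrinsic;   // an intrinsic ID was found for the call
  unsigned IntrinsicCost;    // cost of the vector intrinsic at this VF
  unsigned CallCost;         // cost of the best vector library-call lowering
  bool NeedToScalarize;      // no vector variant: the call would be replicated
};

bool willWidenCall(const CallCosts &C) {
  bool UseVectorIntrinsic =
      C.HasVectorIntrinsic && C.IntrinsicCost <= C.CallCost;
  return UseVectorIntrinsic || !C.NeedToScalarize;
}

int main() {
  CallCosts Sin = {true, 4, 10, true};     // cheap intrinsic -> widen as intrinsic
  CallCosts Lib = {false, 0, 6, false};    // vector library variant -> widen as call
  CallCosts None = {false, 0, 40, true};   // nothing vector -> scalarize
  std::printf("%d %d %d\n", willWidenCall(Sin), willWidenCall(Lib),
              willWidenCall(None));        // prints: 1 1 0
}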
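VPBlendRecipe::execute lowers a non-header phi into the nested selects shown in its comment: the result starts as the first incoming value and each further (mask, value) pair is folded on top, so the first mask is never consulted. A standalone C++ sketch of that fold over plain per-lane arrays; blend and the sample inputs are illustrative.

#include <cstdio>
#include <vector>

// Fold incoming (value, mask) pairs into nested selects, lane by lane.
std::vector<int> blend(const std::vector<std::vector<int>> &In,
                       const std::vector<std::vector<bool>> &Mask) {
  std::vector<int> Entry = In[0];            // start from In0; Mask[0] is never used
  for (size_t I = 1; I < In.size(); ++I)     // Entry = SELECT(Mask_I, In_I, Entry)
    for (size_t Lane = 0; Lane < Entry.size(); ++Lane)
      Entry[Lane] = Mask[I][Lane] ? In[I][Lane] : Entry[Lane];
  return Entry;
}

int main() {
  std::vector<std::vector<int>> In = {{1, 1, 1, 1}, {2, 2, 2, 2}, {3, 3, 3, 3}};
  std::vector<std::vector<bool>> Mask = {{},
                                         {false, true, true, false},
                                         {false, false, false, true}};
  for (int V : blend(In, Mask))
    std::printf("%d ", V);                   // prints: 1 2 2 3
  std::printf("\n");
}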