author     Dimitry Andric <dim@FreeBSD.org>  2020-01-17 20:45:01 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2020-01-17 20:45:01 +0000
commit     706b4fc47bbc608932d3b491ae19a3b9cde9497b (patch)
tree       4adf86a776049cbf7f69a1929c4babcbbef925eb /llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
parent     7cc9cf2bf09f069cb2dd947ead05d0b54301fb71 (diff)
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  1011
1 file changed, 548 insertions(+), 463 deletions(-)
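Two of the behavioral changes in this patch are easy to see in isolation: the tiny-trip-count interleave threshold becomes a cl::opt and is now compared against a known or estimated trip count (via getSmallBestKnownTC), and the interleave count is clamped to trip-count / VF whenever such an estimate exists. The standalone C++ sketch below mirrors that clamping logic with plain types; the function name, parameter names, and the inlined default threshold of 128 (taken from the cl::init value in the diff) are illustrative stand-ins, not the actual LoopVectorizationCostModel interface.

```cpp
// Standalone sketch (not LLVM code): mirrors the new interleave-count
// clamping in selectInterleaveCount() using plain types. All names and
// values here are illustrative assumptions.
#include <algorithm>
#include <cstdio>
#include <optional>

static unsigned clampInterleaveCount(std::optional<unsigned> bestKnownTripCount,
                                     unsigned VF, unsigned maxInterleave,
                                     unsigned tinyTripCountThreshold = 128) {
  // Do not interleave loops with a relatively small known or estimated trip
  // count (previously only a known-constant trip count was checked).
  if (bestKnownTripCount && *bestKnownTripCount < tinyTripCountThreshold)
    return 1;
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count to trip-count / VF.
  if (bestKnownTripCount)
    maxInterleave = std::min(*bestKnownTripCount / VF, maxInterleave);
  return maxInterleave;
}

int main() {
  std::printf("%u\n", clampInterleaveCount(std::nullopt, 4, 8)); // 8: no estimate, no clamp
  std::printf("%u\n", clampInterleaveCount(64u, 4, 8));          // 1: below tiny-TC threshold
  std::printf("%u\n", clampInterleaveCount(160u, 16, 16));       // 10: clamped to 160 / 16
}
```

Under the old code only a known constant trip count triggered the early return (TC > 1 && TC < TinyTripCountInterleaveThreshold); with getSmallBestKnownTC a profile-estimated trip count now has the same effect, and the TC / VF clamp applies to estimates as well.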
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8f0bf70f873c..684a3098e564 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -58,8 +58,8 @@ #include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanHCFGBuilder.h" -#include "VPlanHCFGTransforms.h" #include "VPlanPredicator.h" +#include "VPlanTransforms.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -124,6 +124,7 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -149,7 +150,6 @@ #include <string> #include <tuple> #include <utility> -#include <vector> using namespace llvm; @@ -200,9 +200,10 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses( "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); -/// We don't interleave loops with a known constant trip count below this -/// number. -static const unsigned TinyTripCountInterleaveThreshold = 128; +static cl::opt<unsigned> TinyTripCountInterleaveThreshold( + "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, + cl::desc("We don't interleave loops with a estimated constant trip count " + "below this number")); static cl::opt<unsigned> ForceTargetNumScalarRegs( "force-target-num-scalar-regs", cl::init(0), cl::Hidden, @@ -427,6 +428,11 @@ public: /// new unrolled loop, where UF is the unroll factor. using VectorParts = SmallVector<Value *, 2>; + /// Vectorize a single GetElementPtrInst based on information gathered and + /// decisions taken during planning. + void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF, + bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant); + /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. @@ -476,15 +482,20 @@ public: /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); - /// Try to vectorize the interleaved access group that \p Instr belongs to, - /// optionally masking the vector operations if \p BlockInMask is non-null. - void vectorizeInterleaveGroup(Instruction *Instr, - VectorParts *BlockInMask = nullptr); - - /// Vectorize Load and Store instructions, optionally masking the vector - /// operations if \p BlockInMask is non-null. - void vectorizeMemoryInstruction(Instruction *Instr, - VectorParts *BlockInMask = nullptr); + /// Try to vectorize the interleaved access group that \p Instr belongs to + /// with the base address given in \p Addr, optionally masking the vector + /// operations if \p BlockInMask is non-null. Use \p State to translate given + /// VPValues to IR values in the vectorized loop. + void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State, + VPValue *Addr, VPValue *BlockInMask = nullptr); + + /// Vectorize Load and Store instructions with the base address given in \p + /// Addr, optionally masking the vector operations if \p BlockInMask is + /// non-null. Use \p State to translate given VPValues to IR values in the + /// vectorized loop. 
+ void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, + VPValue *Addr, + VPValue *BlockInMask = nullptr); /// Set the debug location in the builder using the debug location in /// the instruction. @@ -525,6 +536,9 @@ protected: /// vectorizing this phi node. void fixReduction(PHINode *Phi); + /// Clear NSW/NUW flags from reduction instructions if necessary. + void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); + /// The Loop exit block may have single value PHI nodes with some /// incoming value. While vectorizing we only handled real values /// that were defined inside the loop and we should have one value for @@ -539,10 +553,6 @@ protected: /// represented as. void truncateToMinimalBitwidths(); - /// Insert the new loop to the loop hierarchy and pass manager - /// and update the analysis passes. - void updateAnalysis(); - /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction /// value. If this is the induction variable then we extend it to N, N+1, ... @@ -1204,14 +1214,14 @@ public: /// Returns true if the target machine supports masked scatter operation /// for the given \p DataType. - bool isLegalMaskedScatter(Type *DataType) { - return TTI.isLegalMaskedScatter(DataType); + bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) { + return TTI.isLegalMaskedScatter(DataType, Alignment); } /// Returns true if the target machine supports masked gather operation /// for the given \p DataType. - bool isLegalMaskedGather(Type *DataType) { - return TTI.isLegalMaskedGather(DataType); + bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) { + return TTI.isLegalMaskedGather(DataType, Alignment); } /// Returns true if the target machine can represent \p V as a masked gather @@ -1222,7 +1232,9 @@ public: if (!LI && !SI) return false; auto *Ty = getMemInstValueType(V); - return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty)); + MaybeAlign Align = getLoadStoreAlignment(V); + return (LI && isLegalMaskedGather(Ty, Align)) || + (SI && isLegalMaskedScatter(Ty, Align)); } /// Returns true if \p I is an instruction that will be scalarized with @@ -2155,7 +2167,9 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, - VectorParts *BlockInMask) { + VPTransformState &State, + VPValue *Addr, + VPValue *BlockInMask) { const InterleaveGroup<Instruction> *Group = Cost->getInterleavedAccessGroup(Instr); assert(Group && "Fail to get an interleaved access group."); @@ -2165,27 +2179,19 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, return; const DataLayout &DL = Instr->getModule()->getDataLayout(); - Value *Ptr = getLoadStorePointerOperand(Instr); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); - Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr)); // Prepare for the new pointers. 
- setDebugLocFromInst(Builder, Ptr); - SmallVector<Value *, 2> NewPtrs; + SmallVector<Value *, 2> AddrParts; unsigned Index = Group->getIndex(Instr); - VectorParts Mask; - bool IsMaskForCondRequired = BlockInMask; - if (IsMaskForCondRequired) { - Mask = *BlockInMask; - // TODO: extend the masked interleaved-group support to reversed access. - assert(!Group->isReverse() && "Reversed masked interleave-group " - "not supported."); - } + // TODO: extend the masked interleaved-group support to reversed access. + assert((!BlockInMask || !Group->isReverse()) && + "Reversed masked interleave-group not supported."); // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, @@ -2196,12 +2202,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, if (Group->isReverse()) Index += (VF - 1) * Group->getFactor(); - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) - InBounds = gep->isInBounds(); - for (unsigned Part = 0; Part < UF; Part++) { - Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); + Value *AddrPart = State.get(Addr, {Part, 0}); + setDebugLocFromInst(Builder, AddrPart); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -2214,12 +2217,17 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, // A[i] = b; // Member of index 0 // A[i+2] = c; // Member of index 2 (Current instruction) // Current pointer is pointed to A[i+2], adjust it to A[i]. - NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index)); - if (InBounds) - cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true); + + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) + InBounds = gep->isInBounds(); + AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); + cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); // Cast to the vector pointer type. - NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); + unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); + Type *PtrTy = VecTy->getPointerTo(AddressSpace); + AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); } setDebugLocFromInst(Builder, Instr); @@ -2237,26 +2245,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, SmallVector<Value *, 2> NewLoads; for (unsigned Part = 0; Part < UF; Part++) { Instruction *NewLoad; - if (IsMaskForCondRequired || MaskForGaps) { + if (BlockInMask || MaskForGaps) { assert(useMaskedInterleavedAccesses(*TTI) && "masked interleaved groups are not allowed."); Value *GroupMask = MaskForGaps; - if (IsMaskForCondRequired) { - auto *Undefs = UndefValue::get(Mask[Part]->getType()); + if (BlockInMask) { + Value *BlockInMaskPart = State.get(BlockInMask, Part); + auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); Value *ShuffledMask = Builder.CreateShuffleVector( - Mask[Part], Undefs, RepMask, "interleaved.mask"); + BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); GroupMask = MaskForGaps ? 
Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) : ShuffledMask; } NewLoad = - Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), + Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(), GroupMask, UndefVec, "wide.masked.vec"); } else - NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part], + NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], Group->getAlignment(), "wide.vec"); Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); @@ -2325,24 +2334,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, "interleaved.vec"); Instruction *NewStoreInstr; - if (IsMaskForCondRequired) { - auto *Undefs = UndefValue::get(Mask[Part]->getType()); + if (BlockInMask) { + Value *BlockInMaskPart = State.get(BlockInMask, Part); + auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); Value *ShuffledMask = Builder.CreateShuffleVector( - Mask[Part], Undefs, RepMask, "interleaved.mask"); + BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( - IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); + IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask); } else - NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], - Group->getAlignment()); + NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part], + Group->getAlignment()); Group->addMetadata(NewStoreInstr); } } void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, - VectorParts *BlockInMask) { + VPTransformState &State, + VPValue *Addr, + VPValue *BlockInMask) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); @@ -2354,17 +2366,15 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, assert(Decision != LoopVectorizationCostModel::CM_Unknown && "CM decision should be taken at this point"); if (Decision == LoopVectorizationCostModel::CM_Interleave) - return vectorizeInterleaveGroup(Instr); + return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask); Type *ScalarDataTy = getMemInstValueType(Instr); Type *DataTy = VectorType::get(ScalarDataTy, VF); - Value *Ptr = getLoadStorePointerOperand(Instr); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. const DataLayout &DL = Instr->getModule()->getDataLayout(); const Align Alignment = DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); - unsigned AddressSpace = getLoadStoreAddressSpace(Instr); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. @@ -2378,25 +2388,22 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // gather/scatter. Otherwise Decision should have been to Scalarize. assert((ConsecutiveStride || CreateGatherScatter) && "The instruction should be scalarized"); + (void)ConsecutiveStride; - // Handle consecutive loads/stores. 
- if (ConsecutiveStride) - Ptr = getOrCreateScalarValue(Ptr, {0, 0}); - - VectorParts Mask; + VectorParts BlockInMaskParts(UF); bool isMaskRequired = BlockInMask; if (isMaskRequired) - Mask = *BlockInMask; - - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>( - getLoadStorePointerOperand(Instr)->stripPointerCasts())) - InBounds = gep->isInBounds(); + for (unsigned Part = 0; Part < UF; ++Part) + BlockInMaskParts[Part] = State.get(BlockInMask, Part); const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { // Calculate the pointer for the specific unroll-part. GetElementPtrInst *PartPtr = nullptr; + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) + InBounds = gep->isInBounds(); + if (Reverse) { // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. @@ -2407,13 +2414,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. - Mask[Part] = reverseVector(Mask[Part]); + BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { PartPtr = cast<GetElementPtrInst>( Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); PartPtr->setIsInBounds(InBounds); } + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); }; @@ -2425,8 +2433,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Instruction *NewSI = nullptr; Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; - Value *VectorGep = getOrCreateVectorValue(Ptr, Part); + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(Addr, Part); NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment.value(), MaskPart); } else { @@ -2437,10 +2445,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // We don't want to update the value in the map as it might be used in // another expression. So don't call resetVectorValue(StoredVal). } - auto *VecPtr = CreateVecPtr(Part, Ptr); + auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, - Alignment.value(), Mask[Part]); + NewSI = Builder.CreateMaskedStore( + StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]); else NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); @@ -2456,17 +2464,17 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, for (unsigned Part = 0; Part < UF; ++Part) { Value *NewLI; if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; - Value *VectorGep = getOrCreateVectorValue(Ptr, Part); + Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(Addr, Part); NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart, nullptr, "wide.masked.gather"); addMetadata(NewLI, LI); } else { - auto *VecPtr = CreateVecPtr(Part, Ptr); + auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part], - UndefValue::get(DataTy), - "wide.masked.load"); + NewLI = Builder.CreateMaskedLoad( + VecPtr, Alignment.value(), BlockInMaskParts[Part], + UndefValue::get(DataTy), "wide.masked.load"); else NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(), "wide.load"); @@ -2676,8 +2684,10 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass) { Value *Count = getOrCreateTripCount(L); - BasicBlock *BB = L->getLoopPreheader(); - IRBuilder<> Builder(BB->getTerminator()); + // Reuse existing vector loop preheader for TC checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + IRBuilder<> Builder(TCCheckBlock->getTerminator()); // Generate code to check if the loop's trip count is less than VF * UF, or // equal to it in case a scalar epilogue is required; this implies that the @@ -2694,48 +2704,61 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); - BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, CheckMinIters)); - LoopBypassBlocks.push_back(BB); + // Create new preheader for vector loop. + LoopVectorPreHeader = + SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, + "vector.ph"); + + assert(DT->properlyDominates(DT->getNode(TCCheckBlock), + DT->getNode(Bypass)->getIDom()) && + "TC check is expected to dominate Bypass"); + + // Update dominator for Bypass & LoopExit. + DT->changeImmediateDominator(Bypass, TCCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); + + ReplaceInstWithInst( + TCCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + LoopBypassBlocks.push_back(TCCheckBlock); } void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { - BasicBlock *BB = L->getLoopPreheader(); + // Reuse existing vector loop preheader for SCEV checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; // Generate the code to check that the SCEV assumptions that we made. // We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. 
SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), "scev.check"); - Value *SCEVCheck = - Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); + Value *SCEVCheck = Exp.expandCodeForPredicate( + &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) if (C->isZero()) return; - assert(!BB->getParent()->hasOptSize() && + assert(!SCEVCheckBlock->getParent()->hasOptSize() && "Cannot SCEV check stride or overflow when optimizing for size"); - // Create a new block containing the stride check. - BB->setName("vector.scevcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, SCEVCheck)); - LoopBypassBlocks.push_back(BB); + SCEVCheckBlock->setName("vector.scevcheck"); + // Create new preheader for vector loop. + LoopVectorPreHeader = + SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, + nullptr, "vector.ph"); + + // Update dominator only if this is first RT check. + if (LoopBypassBlocks.empty()) { + DT->changeImmediateDominator(Bypass, SCEVCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); + } + + ReplaceInstWithInst( + SCEVCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); + LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; } @@ -2744,7 +2767,9 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { if (EnableVPlanNativePath) return; - BasicBlock *BB = L->getLoopPreheader(); + // Reuse existing vector loop preheader for runtime memory checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const MemCheckBlock = L->getLoopPreheader(); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements @@ -2752,11 +2777,11 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { Instruction *FirstCheckInst; Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); + Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); if (!MemRuntimeCheck) return; - if (BB->getParent()->hasOptSize()) { + if (MemCheckBlock->getParent()->hasOptSize()) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced " "to vectorize."); @@ -2770,24 +2795,28 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { }); } - // Create a new block containing the memory check. - BB->setName("vector.memcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. 
- DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); - LoopBypassBlocks.push_back(BB); + MemCheckBlock->setName("vector.memcheck"); + // Create new preheader for vector loop. + LoopVectorPreHeader = + SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, + "vector.ph"); + + // Update dominator only if this is first RT check. + if (LoopBypassBlocks.empty()) { + DT->changeImmediateDominator(Bypass, MemCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); + } + + ReplaceInstWithInst( + MemCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); + LoopBypassBlocks.push_back(MemCheckBlock); AddedSafetyChecks = true; // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, - PSE.getSE()); + PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -2912,12 +2941,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { ... */ - BasicBlock *OldBasicBlock = OrigLoop->getHeader(); - BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); - BasicBlock *ExitBlock = OrigLoop->getExitBlock(); MDNode *OrigLoopID = OrigLoop->getLoopID(); - assert(VectorPH && "Invalid loop structure"); - assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer @@ -2934,12 +2958,27 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { Type *IdxTy = Legal->getWidestInductionType(); // Split the single block loop into the two loop structure described above. - BasicBlock *VecBody = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); - BasicBlock *MiddleBlock = - VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); - BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); + LoopScalarBody = OrigLoop->getHeader(); + LoopVectorPreHeader = OrigLoop->getLoopPreheader(); + LoopExitBlock = OrigLoop->getExitBlock(); + assert(LoopExitBlock && "Must have an exit block"); + assert(LoopVectorPreHeader && "Invalid loop structure"); + + LoopMiddleBlock = + SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, + LI, nullptr, "middle.block"); + LoopScalarPreHeader = + SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, + nullptr, "scalar.ph"); + // We intentionally don't let SplitBlock to update LoopInfo since + // LoopVectorBody should belong to another loop than LoopVectorPreHeader. + // LoopVectorBody is explicitly added to the correct place few lines later. + LoopVectorBody = + SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, + nullptr, nullptr, "vector.body"); + + // Update dominator for loop exit. + DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); // Create and register the new vector loop. Loop *Lp = LI->AllocateLoop(); @@ -2949,12 +2988,10 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // before calling any utilities such as SCEV that require valid LoopInfo. 
if (ParentLoop) { ParentLoop->addChildLoop(Lp); - ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); - ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); } else { LI->addTopLevelLoop(Lp); } - Lp->addBasicBlockToLoop(VecBody, *LI); + Lp->addBasicBlockToLoop(LoopVectorBody, *LI); // Find the loop boundaries. Value *Count = getOrCreateTripCount(Lp); @@ -2966,16 +3003,16 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // backedge-taken count is uint##_max: adding one to it will overflow leading // to an incorrect trip count of zero. In this (rare) case we will also jump // to the scalar loop. - emitMinimumIterationCountCheck(Lp, ScalarPH); + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); // Generate the code to check any assumptions that we've made for SCEV // expressions. - emitSCEVChecks(Lp, ScalarPH); + emitSCEVChecks(Lp, LoopScalarPreHeader); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - emitMemRuntimeChecks(Lp, ScalarPH); + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); // Generate the induction variable. // The loop step is equal to the vectorization factor (num of SIMD elements) @@ -3003,8 +3040,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { InductionDescriptor II = InductionEntry.second; // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = PHINode::Create( - OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); + PHINode *BCResumeVal = + PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", + LoopScalarPreHeader->getTerminator()); // Copy original phi DL over to the new one. BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; @@ -3015,23 +3053,23 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = - CastInst::getCastOpcode(CountRoundDown, true, StepType, true); + CastInst::getCastOpcode(CountRoundDown, true, StepType, true); Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); - const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); } // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - BCResumeVal->addIncoming(EndValue, MiddleBlock); + BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); // Fix the scalar body counter (PHI node). // The old induction's phi node in the scalar body needs the truncated // value. for (BasicBlock *BB : LoopBypassBlocks) BCResumeVal->addIncoming(II.getStartValue(), BB); - OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); + OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } // We need the OrigLoop (scalar loop part) latch terminator to help @@ -3049,9 +3087,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // If tail is to be folded, we know we don't need to run the remainder. 
Value *CmpN = Builder.getTrue(); if (!Cost->foldTailByMasking()) { - CmpN = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, - CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); + CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", + LoopMiddleBlock->getTerminator()); // Here we use the same DebugLoc as the scalar loop latch branch instead // of the corresponding compare because they may have ended up with @@ -3060,20 +3098,15 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); } - BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); + BranchInst *BrInst = + BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); - ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); + ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); - - // Save the state. - LoopVectorPreHeader = Lp->getLoopPreheader(); - LoopScalarPreHeader = ScalarPH; - LoopMiddleBlock = MiddleBlock; - LoopExitBlock = ExitBlock; - LoopVectorBody = VecBody; - LoopScalarBody = OldBasicBlock; + assert(LoopVectorPreHeader == Lp->getLoopPreheader() && + "Inconsistent vector loop preheader"); + Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); Optional<MDNode *> VectorizedLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, @@ -3094,6 +3127,11 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { LoopVectorizeHints Hints(Lp, true, *ORE); Hints.setAlreadyVectorized(); +#ifdef EXPENSIVE_CHECKS + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); + LI->verify(*DT); +#endif + return LoopVectorPreHeader; } @@ -3429,15 +3467,8 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // This is the second stage of vectorizing recurrences. fixCrossIterationPHIs(); - // Update the dominator tree. - // - // FIXME: After creating the structure of the new loop, the dominator tree is - // no longer up-to-date, and it remains that way until we update it - // here. An out-of-date dominator tree is problematic for SCEV, - // because SCEVExpander uses it to guide code generation. The - // vectorizer use SCEVExpanders in several places. Instead, we should - // keep the dominator tree up-to-date as we go. - updateAnalysis(); + // Forget the original basic block. + PSE.getSE()->forgetLoop(OrigLoop); // Fix-up external users of the induction variables. for (auto &Entry : *Legal->getInductionVars()) @@ -3550,17 +3581,27 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // among all unrolled iterations, due to the order of their construction. Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); - // Set the insertion point after the previous value if it is an instruction. + // Find and set the insertion point after the previous value if it is an + // instruction. + BasicBlock::iterator InsertPt; // Note that the previous value may have been constant-folded so it is not - // guaranteed to be an instruction in the vector loop. Also, if the previous - // value is a phi node, we should insert after all the phi nodes to avoid - // breaking basic block verification. 
- if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) || - isa<PHINode>(PreviousLastPart)) - Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); - else - Builder.SetInsertPoint( - &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart))); + // guaranteed to be an instruction in the vector loop. + // FIXME: Loop invariant values do not form recurrences. We should deal with + // them earlier. + if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) + InsertPt = LoopVectorBody->getFirstInsertionPt(); + else { + Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); + if (isa<PHINode>(PreviousLastPart)) + // If the previous value is a phi node, we should insert after all the phi + // nodes in the block containing the PHI to avoid breaking basic block + // verification. Note that the basic block may be different to + // LoopVectorBody, in case we predicate the loop. + InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); + else + InsertPt = ++PreviousInst->getIterator(); + } + Builder.SetInsertPoint(&*InsertPt); // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. @@ -3693,16 +3734,20 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } } + // Wrap flags are in general invalid after vectorization, clear them. + clearReductionWrapFlags(RdxDesc); + // Fix the vector-loop phi. // Reductions do not have to start at zero. They can start with // any loop invariant values. BasicBlock *Latch = OrigLoop->getLoopLatch(); Value *LoopVal = Phi->getIncomingValueForBlock(Latch); + for (unsigned Part = 0; Part < UF; ++Part) { Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); Value *Val = getOrCreateVectorValue(LoopVal, Part); - // Make sure to add the reduction stat value only to the + // Make sure to add the reduction start value only to the // first unroll part. Value *StartVal = (Part == 0) ? 
VectorStart : Identity; cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); @@ -3839,6 +3884,37 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } +void InnerLoopVectorizer::clearReductionWrapFlags( + RecurrenceDescriptor &RdxDesc) { + RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); + if (RK != RecurrenceDescriptor::RK_IntegerAdd && + RK != RecurrenceDescriptor::RK_IntegerMult) + return; + + Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); + assert(LoopExitInstr && "null loop exit instruction"); + SmallVector<Instruction *, 8> Worklist; + SmallPtrSet<Instruction *, 8> Visited; + Worklist.push_back(LoopExitInstr); + Visited.insert(LoopExitInstr); + + while (!Worklist.empty()) { + Instruction *Cur = Worklist.pop_back_val(); + if (isa<OverflowingBinaryOperator>(Cur)) + for (unsigned Part = 0; Part < UF; ++Part) { + Value *V = getOrCreateVectorValue(Cur, Part); + cast<Instruction>(V)->dropPoisonGeneratingFlags(); + } + + for (User *U : Cur->users()) { + Instruction *UI = cast<Instruction>(U); + if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && + Visited.insert(UI).second) + Worklist.push_back(UI); + } + } +} + void InnerLoopVectorizer::fixLCSSAPHIs() { for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { if (LCSSAPhi.getNumIncomingValues() == 1) { @@ -3960,6 +4036,75 @@ void InnerLoopVectorizer::fixNonInductionPHIs() { } } +void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, + unsigned VF, bool IsPtrLoopInvariant, + SmallBitVector &IsIndexLoopInvariant) { + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + + if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); + VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); + addMetadata(EntryPart, GEP); + } + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < UF; ++Part) { + // The pointer operand of the new GEP. 
If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = IsPtrLoopInvariant + ? GEP->getPointerOperand() + : getOrCreateVectorValue(GEP->getPointerOperand(), Part); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector<Value *, 4> Indices; + for (auto Index : enumerate(GEP->indices())) { + Value *User = Index.value().get(); + if (IsIndexLoopInvariant[Index.index()]) + Indices.push_back(User); + else + Indices.push_back(getOrCreateVectorValue(User, Part)); + } + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = + GEP->isInBounds() + ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, + Indices) + : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); + assert((VF == 1 || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); + addMetadata(NewGEP, GEP); + } + } +} + void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF) { PHINode *P = cast<PHINode>(PN); @@ -4062,76 +4207,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { switch (I.getOpcode()) { case Instruction::Br: case Instruction::PHI: + case Instruction::GetElementPtr: llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::GetElementPtr: { - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - auto *GEP = cast<GetElementPtrInst>(&I); - - if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. - auto *Clone = Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - VectorLoopValueMap.setVectorValue(&I, Part, EntryPart); - addMetadata(EntryPart, GEP); - } - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers. But if we are only unrolling, we want - // to produce a scalar GEP for each unroll part. Thus, the GEP we - // produce with the code below will be scalar (if VF == 1) or vector - // (otherwise). Note that for the unroll-only case, we still maintain - // values in the vector mapping with initVector, as we do for other - // instructions. - for (unsigned Part = 0; Part < UF; ++Part) { - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = - OrigLoop->isLoopInvariant(GEP->getPointerOperand()) - ? 
GEP->getPointerOperand() - : getOrCreateVectorValue(GEP->getPointerOperand(), Part); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector<Value *, 4> Indices; - for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { - if (OrigLoop->isLoopInvariant(U.get())) - Indices.push_back(U.get()); - else - Indices.push_back(getOrCreateVectorValue(U.get(), Part)); - } - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = - GEP->isInBounds() - ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, - Indices) - : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); - assert((VF == 1 || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - VectorLoopValueMap.setVectorValue(&I, Part, NewGEP); - addMetadata(NewGEP, GEP); - } - } - - break; - } case Instruction::UDiv: case Instruction::SDiv: case Instruction::SRem: @@ -4335,26 +4412,6 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { } // end of switch. } -void InnerLoopVectorizer::updateAnalysis() { - // Forget the original basic block. - PSE.getSE()->forgetLoop(OrigLoop); - - // DT is not kept up-to-date for outer loop vectorization - if (EnableVPlanNativePath) - return; - - // Update the dominator tree information. - assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && - "Entry does not dominate exit."); - - DT->addNewBlock(LoopMiddleBlock, - LI->getLoopFor(LoopVectorBody)->getLoopLatch()); - DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); - DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); - DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); - assert(DT->verify(DominatorTree::VerificationLevel::Fast)); -} - void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does @@ -4562,9 +4619,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne return WideningDecision == CM_Scalarize; } const MaybeAlign Alignment = getLoadStoreAlignment(I); - return isa<LoadInst>(I) ? - !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty)) - : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty)); + return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || + isLegalMaskedGather(Ty, Alignment)) + : !(isLegalMaskedStore(Ty, Ptr, Alignment) || + isLegalMaskedScatter(Ty, Alignment)); } case Instruction::UDiv: case Instruction::SDiv: @@ -4667,14 +4725,26 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { SetVector<Instruction *> Worklist; BasicBlock *Latch = TheLoop->getLoopLatch(); + // Instructions that are scalar with predication must not be considered + // uniform after vectorization, because that would create an erroneous + // replicating region where only a single instance out of VF should be formed. + // TODO: optimize such seldom cases if found important, see PR40816. + auto addToWorklistIfAllowed = [&](Instruction *I) -> void { + if (isScalarWithPredication(I, VF)) { + LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " + << *I << "\n"); + return; + } + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); + Worklist.insert(I); + }; + // Start with the conditional branch. 
If the branch condition is an // instruction contained in the loop that is only used by the branch, it is // uniform. auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); - if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) { - Worklist.insert(Cmp); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); - } + if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) + addToWorklistIfAllowed(Cmp); // Holds consecutive and consecutive-like pointers. Consecutive-like pointers // are pointers that are treated like consecutive pointers during @@ -4733,10 +4803,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // Add to the Worklist all consecutive and consecutive-like pointers that // aren't also identified as possibly non-uniform. for (auto *V : ConsecutiveLikePtrs) - if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) { - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); - Worklist.insert(V); - } + if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) + addToWorklistIfAllowed(V); // Expand Worklist in topological order: whenever a new instruction // is added , its users should be already inside Worklist. It ensures @@ -4762,10 +4830,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { return Worklist.count(J) || (OI == getLoadStorePointerOperand(J) && isUniformDecision(J, VF)); - })) { - Worklist.insert(OI); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); - } + })) + addToWorklistIfAllowed(OI); } } @@ -4807,11 +4873,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { continue; // The induction variable and its update instruction will remain uniform. - Worklist.insert(Ind); - Worklist.insert(IndUpdate); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate - << "\n"); + addToWorklistIfAllowed(Ind); + addToWorklistIfAllowed(IndUpdate); } Uniforms[VF].insert(Worklist.begin(), Worklist.end()); @@ -5143,9 +5206,10 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, if (Legal->getMaxSafeDepDistBytes() != -1U) return 1; - // Do not interleave loops with a relatively small trip count. - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (TC > 1 && TC < TinyTripCountInterleaveThreshold) + // Do not interleave loops with a relatively small known or estimated trip + // count. + auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); + if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) return 1; RegisterUsage R = calculateRegisterUsage({VF})[0]; @@ -5208,12 +5272,10 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } - // If the trip count is constant, limit the interleave count to be less than - // the trip count divided by VF. - if (TC > 0) { - assert(TC >= VF && "VF exceeds trip count?"); - if ((TC / VF) < MaxInterleaveCount) - MaxInterleaveCount = (TC / VF); + // If trip count is known or estimated compile time constant, limit the + // interleave count to be less than the trip count divided by VF. 
+ if (BestKnownTC) { + MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); } // If we did not calculate the cost for VF (because the user selected the VF) @@ -5746,7 +5808,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // vectorized loop where the user of it is a vectorized instruction. const MaybeAlign Alignment = getLoadStoreAlignment(I); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment ? Alignment->value() : 0, AS); + Alignment, AS); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. @@ -5783,8 +5845,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment ? Alignment->value() : 0, AS); else - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, - Alignment ? Alignment->value() : 0, AS, I); + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); bool Reverse = ConsecutiveStride < 0; if (Reverse) @@ -5800,16 +5861,14 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, unsigned AS = getLoadStoreAddressSpace(I); if (isa<LoadInst>(I)) { return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Load, ValTy, - Alignment ? Alignment->value() : 0, AS) + + TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); } StoreInst *SI = cast<StoreInst>(I); bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Store, ValTy, - Alignment ? Alignment->value() : 0, AS) + + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, @@ -5877,8 +5936,7 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, unsigned AS = getLoadStoreAddressSpace(I); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(I->getOpcode(), ValTy, - Alignment ? Alignment->value() : 0, AS, I); + TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); } return getWideningCost(I, VF); } @@ -6217,7 +6275,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, - Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands); + Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; @@ -6225,7 +6283,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, - I->getOperand(0)); + I->getOperand(0), I); } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); @@ -6714,37 +6772,6 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { return BlockMaskCache[BB] = BlockMask; } -VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, - VFRange &Range, - VPlanPtr &Plan) { - const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I); - if (!IG) - return nullptr; - - // Now check if IG is relevant for VF's in the given range. 
- auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> { - return [=](unsigned VF) -> bool { - return (VF >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(I, VF) == - LoopVectorizationCostModel::CM_Interleave); - }; - }; - if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range)) - return nullptr; - - // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) - // range. If it's the primary member of the IG construct a VPInterleaveRecipe. - // Otherwise, it's an adjunct member of the IG, do not construct any Recipe. - assert(I == IG->getInsertPos() && - "Generating a recipe for an adjunct member of an interleave group"); - - VPValue *Mask = nullptr; - if (Legal->isMaskRequired(I)) - Mask = createBlockInMask(I->getParent(), Plan); - - return new VPInterleaveRecipe(IG, Mask); -} - VPWidenMemoryInstructionRecipe * VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan) { @@ -6754,15 +6781,15 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, auto willWiden = [&](unsigned VF) -> bool { if (VF == 1) return false; - if (CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF)) - return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); assert(Decision != LoopVectorizationCostModel::CM_Unknown && "CM decision should be taken at this point."); - assert(Decision != LoopVectorizationCostModel::CM_Interleave && - "Interleave memory opportunity should be caught earlier."); + if (Decision == LoopVectorizationCostModel::CM_Interleave) + return true; + if (CM.isScalarAfterVectorization(I, VF) || + CM.isProfitableToScalarize(I, VF)) + return false; return Decision != LoopVectorizationCostModel::CM_Scalarize; }; @@ -6773,7 +6800,8 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, if (Legal->isMaskRequired(I)) Mask = createBlockInMask(I->getParent(), Plan); - return new VPWidenMemoryInstructionRecipe(*I, Mask); + VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); + return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask); } VPWidenIntOrFpInductionRecipe * @@ -6861,7 +6889,6 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, case Instruction::FPTrunc: case Instruction::FRem: case Instruction::FSub: - case Instruction::GetElementPtr: case Instruction::ICmp: case Instruction::IntToPtr: case Instruction::Load: @@ -6926,16 +6953,23 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) return false; + // If this ingredient's recipe is to be recorded, keep its recipe a singleton + // to avoid having to split recipes later. + bool IsSingleton = Ingredient2Recipe.count(I); + + // Success: widen this instruction. - // Success: widen this instruction. We optimize the common case where + // Use the default widening recipe. We optimize the common case where // consecutive instructions can be represented by a single recipe. 
- if (!VPBB->empty()) { - VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back()); - if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) - return true; - } + if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && + LastExtensibleRecipe->appendInstruction(I)) + return true; - VPBB->appendRecipe(new VPWidenRecipe(I)); + VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); + if (!IsSingleton) + LastExtensibleRecipe = WidenRecipe; + setRecipe(I, WidenRecipe); + VPBB->appendRecipe(WidenRecipe); return true; } @@ -6951,6 +6985,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); + setRecipe(I, Recipe); // Find if I uses a predicated instruction. If so, it will use its scalar // value. Avoid hoisting the insert-element which packs the scalar value into @@ -7009,36 +7044,36 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan, VPBasicBlock *VPBB) { VPRecipeBase *Recipe = nullptr; - // Check if Instr should belong to an interleave memory recipe, or already - // does. In the latter case Instr is irrelevant. - if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { - VPBB->appendRecipe(Recipe); - return true; - } - // Check if Instr is a memory operation that should be widened. - if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { + // First, check for specific widening recipes that deal with memory + // operations, inductions and Phi nodes. + if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || + (Recipe = tryToOptimizeInduction(Instr, Range)) || + (Recipe = tryToBlend(Instr, Plan)) || + (isa<PHINode>(Instr) && + (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { + setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); return true; } - // Check if Instr should form some PHI recipe. - if ((Recipe = tryToOptimizeInduction(Instr, Range))) { - VPBB->appendRecipe(Recipe); - return true; - } - if ((Recipe = tryToBlend(Instr, Plan))) { + // Handle GEP widening. + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { + auto Scalarize = [&](unsigned VF) { + return CM.isScalarWithPredication(Instr, VF) || + CM.isScalarAfterVectorization(Instr, VF) || + CM.isProfitableToScalarize(Instr, VF); + }; + if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) + return false; + VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); + setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); return true; } - if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { - VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); - return true; - } // Check if Instr is to be widened by a general VPWidenRecipe, after - // having first checked for specific widening recipes that deal with - // Interleave Groups, Inductions and Phi nodes. + // having first checked for specific widening recipes. if (tryToWiden(Instr, VPBB, Range)) return true; @@ -7094,19 +7129,57 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, SmallPtrSetImpl<Instruction *> &DeadInstructions) { + // Hold a mapping from predicated instructions to their recipes, in order to // fix their AlsoPack behavior if a user is determined to replicate and use a // scalar instead of vector value. 
DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); - DenseMap<Instruction *, Instruction *> SinkAfterInverse; + + SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; + + VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); + + // --------------------------------------------------------------------------- + // Pre-construction: record ingredients whose recipes we'll need to further + // process after constructing the initial VPlan. + // --------------------------------------------------------------------------- + + // Mark instructions we'll need to sink later and their targets as + // ingredients whose recipe we'll need to record. + for (auto &Entry : SinkAfter) { + RecipeBuilder.recordRecipeOf(Entry.first); + RecipeBuilder.recordRecipeOf(Entry.second); + } + + // For each interleave group which is relevant for this (possibly trimmed) + // Range, add it to the set of groups to be later applied to the VPlan and add + // placeholders for its members' Recipes which we'll be replacing with a + // single VPInterleaveRecipe. + for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { + auto applyIG = [IG, this](unsigned VF) -> bool { + return (VF >= 2 && // Query is illegal for VF == 1 + CM.getWideningDecision(IG->getInsertPos(), VF) == + LoopVectorizationCostModel::CM_Interleave); + }; + if (!getDecisionAndClampRange(applyIG, Range)) + continue; + InterleaveGroups.insert(IG); + for (unsigned i = 0; i < IG->getFactor(); i++) + if (Instruction *Member = IG->getMember(i)) + RecipeBuilder.recordRecipeOf(Member); + }; + + // --------------------------------------------------------------------------- + // Build initial VPlan: Scan the body of the loop in a topological order to + // visit each basic block after having visited its predecessor basic blocks. + // --------------------------------------------------------------------------- // Create a dummy pre-entry VPBasicBlock to start building the VPlan. VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); auto Plan = std::make_unique<VPlan>(VPBB); - VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); // Represent values that will have defs inside VPlan. for (Value *V : NeedDef) Plan->addVPValue(V); @@ -7125,10 +7198,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBB = FirstVPBBForBB; Builder.setInsertPoint(VPBB); - std::vector<Instruction *> Ingredients; - - // Organize the ingredients to vectorize from current basic block in the - // right order. + // Introduce each ingredient into VPlan. for (Instruction &I : BB->instructionsWithoutDebug()) { Instruction *Instr = &I; @@ -7138,43 +7208,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( DeadInstructions.find(Instr) != DeadInstructions.end()) continue; - // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct - // member of the IG, do not construct any Recipe for it. 
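// For illustration only: a self-contained sketch of the record/set/get
// contract used by the pre-construction step above. RecipeTracker and the
// string keys are assumptions, not the real VPRecipeBuilder interface:
// ingredients whose recipes must be post-processed are recorded up front,
// every created recipe is registered, and a lookup asserts that a recorded
// ingredient really did receive a recipe.
#include <cassert>
#include <map>
#include <set>
#include <string>

struct RecipeTracker {
  std::set<std::string> Recorded;                // Ingredients to post-process.
  std::map<std::string, int> IngredientToRecipe; // Ingredient -> recipe id.

  void recordRecipeOf(const std::string &Ingredient) {
    Recorded.insert(Ingredient);
  }
  void setRecipe(const std::string &Ingredient, int RecipeId) {
    assert(!IngredientToRecipe.count(Ingredient) &&
           "ingredient already has a recipe");
    IngredientToRecipe[Ingredient] = RecipeId;
  }
  int getRecipe(const std::string &Ingredient) const {
    auto It = IngredientToRecipe.find(Ingredient);
    assert(It != IngredientToRecipe.end() &&
           "recorded ingredient never received a recipe");
    return It->second;
  }
};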
- const InterleaveGroup<Instruction> *IG = - CM.getInterleavedAccessGroup(Instr); - if (IG && Instr != IG->getInsertPos() && - Range.Start >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(Instr, Range.Start) == - LoopVectorizationCostModel::CM_Interleave) { - auto SinkCandidate = SinkAfterInverse.find(Instr); - if (SinkCandidate != SinkAfterInverse.end()) - Ingredients.push_back(SinkCandidate->second); - continue; - } - - // Move instructions to handle first-order recurrences, step 1: avoid - // handling this instruction until after we've handled the instruction it - // should follow. - auto SAIt = SinkAfter.find(Instr); - if (SAIt != SinkAfter.end()) { - LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" - << *SAIt->second - << " to vectorize a 1st order recurrence.\n"); - SinkAfterInverse[SAIt->second] = Instr; - continue; - } - - Ingredients.push_back(Instr); - - // Move instructions to handle first-order recurrences, step 2: push the - // instruction to be sunk at its insertion point. - auto SAInvIt = SinkAfterInverse.find(Instr); - if (SAInvIt != SinkAfterInverse.end()) - Ingredients.push_back(SAInvIt->second); - } - - // Introduce each ingredient into VPlan. - for (Instruction *Instr : Ingredients) { if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) continue; @@ -7199,6 +7232,33 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(PreEntry, Entry); delete PreEntry; + // --------------------------------------------------------------------------- + // Transform initial VPlan: Apply previously taken decisions, in order, to + // bring the VPlan to its final state. + // --------------------------------------------------------------------------- + + // Apply Sink-After legal constraints. + for (auto &Entry : SinkAfter) { + VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); + VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); + Sink->moveAfter(Target); + } + + // Interleave memory: for each Interleave Group we marked earlier as relevant + // for this VPlan, replace the Recipes widening its memory instructions with a + // single VPInterleaveRecipe at its insertion point. + for (auto IG : InterleaveGroups) { + auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( + RecipeBuilder.getRecipe(IG->getInsertPos())); + (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) + ->insertBefore(Recipe); + + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (Instruction *Member = IG->getMember(i)) { + RecipeBuilder.getRecipe(Member)->eraseFromParent(); + } + } + // Finally, if tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the end of the latch. 
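// For illustration only: a compact standalone model of the interleave-group
// rewrite performed above. A std::list of names stands in for the recipe list
// and the function name is an assumption; the shape mirrors inserting one
// combined VPInterleaveRecipe before the group's insert position and then
// erasing each member's standalone memory recipe. It assumes the combined
// recipe's name differs from the member names.
#include <algorithm>
#include <list>
#include <string>
#include <vector>

static void
replaceGroupWithCombinedRecipe(std::list<std::string> &Recipes,
                               const std::vector<std::string> &Members,
                               const std::string &InsertPos,
                               const std::string &Combined) {
  auto It = std::find(Recipes.begin(), Recipes.end(), InsertPos);
  if (It == Recipes.end())
    return;                     // Insert position not present; nothing to do.
  Recipes.insert(It, Combined); // Combined recipe lands right before InsertPos.
  for (const std::string &M : Members)
    Recipes.remove(M);          // Erase every member's standalone recipe.
}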
if (CM.foldTailByMasking()) { @@ -7255,9 +7315,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { } SmallPtrSet<Instruction *, 1> DeadInstructions; - VPlanHCFGTransforms::VPInstructionsToVPRecipes( - Plan, Legal->getInductionVars(), DeadInstructions); - + VPlanTransforms::VPInstructionsToVPRecipes( + OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); return Plan; } @@ -7266,13 +7325,21 @@ getOrCreateVectorValues(Value *V, unsigned Part) { return ILV.getOrCreateVectorValue(V, Part); } +Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( + Value *V, const VPIteration &Instance) { + return ILV.getOrCreateScalarValue(V, Instance); +} + void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); - if (User) { + O << ", "; + getAddr()->printAsOperand(O); + VPValue *Mask = getMask(); + if (Mask) { O << ", "; - User->getOperand(0)->printAsOperand(O); + Mask->printAsOperand(O); } O << "\\l\""; for (unsigned i = 0; i < IG->getFactor(); ++i) @@ -7286,6 +7353,11 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.ILV->widenInstruction(Instr); } +void VPWidenGEPRecipe::execute(VPTransformState &State) { + State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, + IsIndexLoopInvariant); +} + void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); State.ILV->widenIntOrFpInduction(IV, Trunc); @@ -7336,15 +7408,8 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - if (!User) - return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); - - // Last (and currently only) operand is a mask. - InnerLoopVectorizer::VectorParts MaskValues(State.UF); - VPValue *Mask = User->getOperand(User->getNumOperands() - 1); - for (unsigned Part = 0; Part < State.UF; ++Part) - MaskValues[Part] = State.get(Mask, Part); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(), + getMask()); } void VPReplicateRecipe::execute(VPTransformState &State) { @@ -7431,29 +7496,46 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - if (!User) - return State.ILV->vectorizeMemoryInstruction(&Instr); - - // Last (and currently only) operand is a mask. 
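// For illustration only: a rough, self-contained picture of the UF-by-VF shape
// that the execute() methods above hand to the InnerLoopVectorizer, with plain
// vectors standing in for IR vector values (all names here are assumptions).
// Each recipe produces UF "parts", and each part is a VF-wide value whose
// lanes correspond to consecutive scalar iterations.
#include <vector>

using Lane = long long;
using Part = std::vector<Lane>;

// Widen a scalar function of the iteration index into UF parts of VF lanes.
template <typename Fn>
static std::vector<Part> widenPerPart(unsigned UF, unsigned VF, Fn ScalarOp) {
  std::vector<Part> Parts(UF);
  for (unsigned P = 0; P < UF; ++P) {
    Parts[P].reserve(VF);
    for (unsigned L = 0; L < VF; ++L)
      Parts[P].push_back(ScalarOp(P * VF + L)); // Lane L of part P.
  }
  return Parts;
}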
- InnerLoopVectorizer::VectorParts MaskValues(State.UF); - VPValue *Mask = User->getOperand(User->getNumOperands() - 1); - for (unsigned Part = 0; Part < State.UF; ++Part) - MaskValues[Part] = State.get(Mask, Part); - State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); -} - -static ScalarEpilogueLowering -getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, - ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { - ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; - if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) - SEL = CM_ScalarEpilogueNotAllowedOptSize; - else if (PreferPredicateOverEpilog || Hints.getPredicate()) - SEL = CM_ScalarEpilogueNotNeededUsePredicate; - - return SEL; + State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask()); +} + +// Determine how to lower the scalar epilogue, which depends on 1) optimising +// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing +// predication, and 4) a TTI hook that analyses whether the loop is suitable +// for predication. +static ScalarEpilogueLowering getScalarEpilogueLowering( + Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, + AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + LoopVectorizationLegality &LVL) { + bool OptSize = + F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); + // 1) OptSize takes precedence over all other options, i.e. if this is set, + // don't look at hints or options, and don't request a scalar epilogue. + if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) + return CM_ScalarEpilogueNotAllowedOptSize; + + bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && + !PreferPredicateOverEpilog; + + // 2) Next, if disabling predication is requested on the command line, honour + // this and request a scalar epilogue. Also do this if we don't have a + // primary induction variable, which is required for predication. + if (PredicateOptDisabled || !LVL.getPrimaryInduction()) + return CM_ScalarEpilogueAllowed; + + // 3) and 4) look if enabling predication is requested on the command line, + // with a loop hint, or if the TTI hook indicates this is profitable, request + // predication . + if (PreferPredicateOverEpilog || + Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || + (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, + LVL.getLAI()) && + Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) + return CM_ScalarEpilogueNotNeededUsePredicate; + + return CM_ScalarEpilogueAllowed; } // Process the loop in the VPlan-native vectorization path. This path builds @@ -7470,14 +7552,16 @@ static bool processLoopInVPlanNativePath( assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. 
Turn CM into an
  // optional argument if we don't need it in the future.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
+  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();
@@ -7562,7 +7646,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
-  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
+  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
@@ -7635,7 +7720,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
  CM.collectValuesToIgnore();
  // Use the planner for vectorization.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
+  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
  // Get user vectorization factor.
  unsigned UserVF = Hints.getWidth();
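// For illustration only: a minimal sketch of the decision order implemented by
// the reworked getScalarEpilogueLowering earlier in this diff, with the LLVM
// analyses replaced by plain booleans (all parameter names are assumptions):
// opt-for-size wins unless vectorization is forced, an explicit request to
// avoid predication (or a missing primary induction) keeps the scalar
// epilogue, and otherwise an explicit or target-driven preference for
// predication folds the tail by masking.
enum class EpilogueLowering { Allowed, NotAllowedOptSize, UsePredicate };

static EpilogueLowering chooseEpilogueLowering(
    bool OptForSize, bool ForceVectorize, bool PredicationExplicitlyDisabled,
    bool HasPrimaryInduction, bool PredicationRequested,
    bool TTIPrefersPredication, bool PredicationHintDisabled) {
  // 1) Code size takes precedence: no scalar epilogue allowed.
  if (OptForSize && !ForceVectorize)
    return EpilogueLowering::NotAllowedOptSize;
  // 2) Honour an explicit "no predication" request, and fall back to a scalar
  //    epilogue when there is no primary induction to predicate on.
  if (PredicationExplicitlyDisabled || !HasPrimaryInduction)
    return EpilogueLowering::Allowed;
  // 3) + 4) Predication requested via option or hint, or deemed profitable by
  //    the target and not explicitly disabled by a hint.
  if (PredicationRequested ||
      (TTIPrefersPredication && !PredicationHintDisabled))
    return EpilogueLowering::UsePredicate;
  return EpilogueLowering::Allowed;
}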