Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 144
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 74
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h | 23
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1305
-rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 730
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 95
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.cpp | 301
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.h | 372
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h | 3
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 31
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 6
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanValue.h | 55
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 1
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanVerifier.h | 8
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 699
-rw-r--r-- | llvm/lib/Transforms/Vectorize/Vectorize.cpp | 4
16 files changed, 2661 insertions, 1190 deletions
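Two API migrations recur throughout the hunks below: OrderedBasicBlock::dominates checks are replaced by Instruction::comesBefore, and fixed-width VectorType::get calls become FixedVectorType::get. As a rough, hypothetical sketch (the helper names are illustrative only and not part of this patch), the replacement calls look like this:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include <cassert>

// Sketch only: order two instructions within one basic block, as the hunks
// below do when they swap OrderedBasicBlock::dominates(A, B) for
// A->comesBefore(B).
static bool precedesInBlock(const llvm::Instruction *A,
                            const llvm::Instruction *B) {
  assert(A->getParent() == B->getParent() && "expected a common parent block");
  return A->comesBefore(B);
}

// Sketch only: build a <VF x ScalarTy> type, as the hunks below do when they
// swap VectorType::get(Ty, VF) for FixedVectorType::get(Ty, VF).
static llvm::VectorType *getWidenedType(llvm::Type *ScalarTy, unsigned VF) {
  return llvm::FixedVectorType::get(ScalarTy, VF);
}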
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 7478daa2a0a52..9b81afbb4b6cb 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -50,7 +50,6 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" -#include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -129,22 +128,6 @@ public: private: unsigned getPointerAddressSpace(Value *I); - unsigned getAlignment(LoadInst *LI) const { - unsigned Align = LI->getAlignment(); - if (Align != 0) - return Align; - - return DL.getABITypeAlignment(LI->getType()); - } - - unsigned getAlignment(StoreInst *SI) const { - unsigned Align = SI->getAlignment(); - if (Align != 0) - return Align; - - return DL.getABITypeAlignment(SI->getValueOperand()->getType()); - } - static const unsigned MaxDepth = 3; bool isConsecutiveAccess(Value *A, Value *B); @@ -447,20 +430,78 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, // Now we need to prove that adding IdxDiff to ValA won't overflow. bool Safe = false; + auto CheckFlags = [](Instruction *I, bool Signed) { + BinaryOperator *BinOpI = cast<BinaryOperator>(I); + return (Signed && BinOpI->hasNoSignedWrap()) || + (!Signed && BinOpI->hasNoUnsignedWrap()); + }; + // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to // ValA, we're okay. if (OpB->getOpcode() == Instruction::Add && isa<ConstantInt>(OpB->getOperand(1)) && - IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) { - if (Signed) - Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap(); - else - Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap(); + IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) && + CheckFlags(OpB, Signed)) + Safe = true; + + // Second attempt: If both OpA and OpB is an add with NSW/NUW and with + // the same LHS operand, we can guarantee that the transformation is safe + // if we can prove that OpA won't overflow when IdxDiff added to the RHS + // of OpA. + // For example: + // %tmp7 = add nsw i32 %tmp2, %v0 + // %tmp8 = sext i32 %tmp7 to i64 + // ... + // %tmp11 = add nsw i32 %v0, 1 + // %tmp12 = add nsw i32 %tmp2, %tmp11 + // %tmp13 = sext i32 %tmp12 to i64 + // + // Both %tmp7 and %tmp2 has the nsw flag and the first operand + // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow + // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 has the + // nsw flag. + OpA = dyn_cast<Instruction>(ValA); + if (!Safe && OpA && OpA->getOpcode() == Instruction::Add && + OpB->getOpcode() == Instruction::Add && + OpA->getOperand(0) == OpB->getOperand(0) && CheckFlags(OpA, Signed) && + CheckFlags(OpB, Signed)) { + Value *RHSA = OpA->getOperand(1); + Value *RHSB = OpB->getOperand(1); + Instruction *OpRHSA = dyn_cast<Instruction>(RHSA); + Instruction *OpRHSB = dyn_cast<Instruction>(RHSB); + // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`. + if (OpRHSB && OpRHSB->getOpcode() == Instruction::Add && + CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSB->getOperand(1))) { + int64_t CstVal = cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue(); + if (OpRHSB->getOperand(0) == RHSA && IdxDiff.getSExtValue() == CstVal) + Safe = true; + } + // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`. 
+ if (OpRHSA && OpRHSA->getOpcode() == Instruction::Add && + CheckFlags(OpRHSA, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1))) { + int64_t CstVal = cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue(); + if (OpRHSA->getOperand(0) == RHSB && IdxDiff.getSExtValue() == -CstVal) + Safe = true; + } + // Match `x +nsw/nuw (y +nsw/nuw c)` and + // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`. + if (OpRHSA && OpRHSB && OpRHSA->getOpcode() == Instruction::Add && + OpRHSB->getOpcode() == Instruction::Add && CheckFlags(OpRHSA, Signed) && + CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1)) && + isa<ConstantInt>(OpRHSB->getOperand(1))) { + int64_t CstValA = + cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue(); + int64_t CstValB = + cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue(); + if (OpRHSA->getOperand(0) == OpRHSB->getOperand(0) && + IdxDiff.getSExtValue() == (CstValB - CstValA)) + Safe = true; + } } unsigned BitWidth = ValA->getType()->getScalarSizeInBits(); - // Second attempt: + // Third attempt: // If all set bits of IdxDiff or any higher order bit other than the sign bit // are known to be zero in ValA, we can add Diff to it while guaranteeing no // overflow of any sort. @@ -503,7 +544,6 @@ bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB, } void Vectorizer::reorder(Instruction *I) { - OrderedBasicBlock OBB(I->getParent()); SmallPtrSet<Instruction *, 16> InstructionsToMove; SmallVector<Instruction *, 16> Worklist; @@ -521,7 +561,7 @@ void Vectorizer::reorder(Instruction *I) { if (IM->getParent() != I->getParent()) continue; - if (!OBB.dominates(IM, I)) { + if (!IM->comesBefore(I)) { InstructionsToMove.insert(IM); Worklist.push_back(IM); } @@ -637,8 +677,6 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { } } - OrderedBasicBlock OBB(Chain[0]->getParent()); - // Loop until we find an instruction in ChainInstrs that we can't vectorize. unsigned ChainInstrIdx = 0; Instruction *BarrierMemoryInstr = nullptr; @@ -648,14 +686,14 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { // If a barrier memory instruction was found, chain instructions that follow // will not be added to the valid prefix. - if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr)) + if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr)) break; // Check (in BB order) if any instruction prevents ChainInstr from being // vectorized. Find and store the first such "conflicting" instruction. for (Instruction *MemInstr : MemoryInstrs) { // If a barrier memory instruction was found, do not check past it. - if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr)) + if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr)) break; auto *MemLoad = dyn_cast<LoadInst>(MemInstr); @@ -674,12 +712,12 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { // vectorize it (the vectorized load is inserted at the location of the // first load in the chain). if (isa<StoreInst>(MemInstr) && ChainLoad && - (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr))) + (IsInvariantLoad(ChainLoad) || ChainLoad->comesBefore(MemInstr))) continue; // Same case, but in reverse. 
if (MemLoad && isa<StoreInst>(ChainInstr) && - (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr))) + (IsInvariantLoad(MemLoad) || MemLoad->comesBefore(ChainInstr))) continue; if (!AA.isNoAlias(MemoryLocation::get(MemInstr), @@ -705,7 +743,7 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { // the basic block. if (IsLoadChain && BarrierMemoryInstr) { // The BarrierMemoryInstr is a store that precedes ChainInstr. - assert(OBB.dominates(BarrierMemoryInstr, ChainInstr)); + assert(BarrierMemoryInstr->comesBefore(ChainInstr)); break; } } @@ -961,7 +999,7 @@ bool Vectorizer::vectorizeStoreChain( unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); unsigned VF = VecRegSize / Sz; unsigned ChainSize = Chain.size(); - unsigned Alignment = getAlignment(S0); + Align Alignment = S0->getAlign(); if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { InstructionsProcessed->insert(Chain.begin(), Chain.end()); @@ -992,10 +1030,10 @@ bool Vectorizer::vectorizeStoreChain( VectorType *VecTy; VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy); if (VecStoreTy) - VecTy = VectorType::get(StoreTy->getScalarType(), - Chain.size() * VecStoreTy->getNumElements()); + VecTy = FixedVectorType::get(StoreTy->getScalarType(), + Chain.size() * VecStoreTy->getNumElements()); else - VecTy = VectorType::get(StoreTy, Chain.size()); + VecTy = FixedVectorType::get(StoreTy, Chain.size()); // If it's more than the max vector size or the target has a better // vector factor, break it into two pieces. @@ -1019,18 +1057,20 @@ bool Vectorizer::vectorizeStoreChain( InstructionsProcessed->insert(Chain.begin(), Chain.end()); // If the store is going to be misaligned, don't vectorize it. - if (accessIsMisaligned(SzInBytes, AS, Alignment)) { + if (accessIsMisaligned(SzInBytes, AS, Alignment.value())) { if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { auto Chains = splitOddVectorElts(Chain, Sz); return vectorizeStoreChain(Chains.first, InstructionsProcessed) | vectorizeStoreChain(Chains.second, InstructionsProcessed); } - unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(), - StackAdjustedAlignment, - DL, S0, nullptr, &DT); - if (NewAlign != 0) + Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(), + Align(StackAdjustedAlignment), + DL, S0, nullptr, &DT); + if (NewAlign >= Alignment) Alignment = NewAlign; + else + return false; } if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) { @@ -1112,7 +1152,7 @@ bool Vectorizer::vectorizeLoadChain( unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); unsigned VF = VecRegSize / Sz; unsigned ChainSize = Chain.size(); - unsigned Alignment = getAlignment(L0); + Align Alignment = L0->getAlign(); if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { InstructionsProcessed->insert(Chain.begin(), Chain.end()); @@ -1142,10 +1182,10 @@ bool Vectorizer::vectorizeLoadChain( VectorType *VecTy; VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy); if (VecLoadTy) - VecTy = VectorType::get(LoadTy->getScalarType(), - Chain.size() * VecLoadTy->getNumElements()); + VecTy = FixedVectorType::get(LoadTy->getScalarType(), + Chain.size() * VecLoadTy->getNumElements()); else - VecTy = VectorType::get(LoadTy, Chain.size()); + VecTy = FixedVectorType::get(LoadTy, Chain.size()); // If it's more than the max vector size or the target has a better // vector factor, break it into two pieces. 
@@ -1162,15 +1202,20 @@ bool Vectorizer::vectorizeLoadChain( InstructionsProcessed->insert(Chain.begin(), Chain.end()); // If the load is going to be misaligned, don't vectorize it. - if (accessIsMisaligned(SzInBytes, AS, Alignment)) { + if (accessIsMisaligned(SzInBytes, AS, Alignment.value())) { if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { auto Chains = splitOddVectorElts(Chain, Sz); return vectorizeLoadChain(Chains.first, InstructionsProcessed) | vectorizeLoadChain(Chains.second, InstructionsProcessed); } - Alignment = getOrEnforceKnownAlignment( - L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT); + Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(), + Align(StackAdjustedAlignment), + DL, L0, nullptr, &DT); + if (NewAlign >= Alignment) + Alignment = NewAlign; + else + return false; } if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) { @@ -1194,7 +1239,8 @@ bool Vectorizer::vectorizeLoadChain( Value *Bitcast = Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS)); - LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment); + LoadInst *LI = + Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment)); propagateMetadata(LI, Chain); if (VecLoadTy) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 3f943f4c0688e..23613775d896d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -13,14 +13,17 @@ // pass. It should be easy to create an analysis pass around it if there // is a need (but D45420 needs to happen first). // -#include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Transforms/Vectorize/LoopVectorize.h" using namespace llvm; +using namespace PatternMatch; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME @@ -566,6 +569,28 @@ bool LoopVectorizationLegality::setupOuterLoopInductions() { return false; } +/// Checks if a function is scalarizable according to the TLI, in +/// the sense that it should be vectorized and then expanded in +/// multiple scalarcalls. This is represented in the +/// TLI via mappings that do not specify a vector name, as in the +/// following example: +/// +/// const VecDesc VecIntrinsics[] = { +/// {"llvm.phx.abs.i32", "", 4} +/// }; +static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) { + const StringRef ScalarName = CI.getCalledFunction()->getName(); + bool Scalarize = TLI.isFunctionVectorizable(ScalarName); + // Check that all known VFs are not associated to a vector + // function, i.e. the vector name is emty. 
+ if (Scalarize) + for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName); + VF <= WidestVF; VF *= 2) { + Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF); + } + return Scalarize; +} + bool LoopVectorizationLegality::canVectorizeInstrs() { BasicBlock *Header = TheLoop->getHeader(); @@ -644,6 +669,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, SinkAfter, DT)) { + AllowedExit.insert(Phi); FirstOrderRecurrences.insert(Phi); continue; } @@ -667,10 +693,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // * Have a mapping to an IR intrinsic. // * Have a vector version available. auto *CI = dyn_cast<CallInst>(&I); + if (CI && !getVectorIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) && !(CI->getCalledFunction() && TLI && - TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { + (!VFDatabase::getMappings(*CI).empty() || + isTLIScalarize(*TLI, *CI)))) { // If the call is a recognized math libary call, it is likely that // we can vectorize it given loosened floating-point constraints. LibFunc Func; @@ -685,7 +713,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // but it's hard to provide meaningful yet generic advice. // Also, should this be guarded by allowExtraAnalysis() and/or be part // of the returned info from isFunctionVectorizable()? - reportVectorizationFailure("Found a non-intrinsic callsite", + reportVectorizationFailure( + "Found a non-intrinsic callsite", "library call cannot be vectorized. " "Try compiling with -fno-math-errno, -ffast-math, " "or similar flags", @@ -739,11 +768,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // supported on the target. if (ST->getMetadata(LLVMContext::MD_nontemporal)) { // Arbitrarily try a vector of 2 elements. - Type *VecTy = VectorType::get(T, /*NumElements=*/2); + auto *VecTy = FixedVectorType::get(T, /*NumElements=*/2); assert(VecTy && "did not find vectorized version of stored type"); - const MaybeAlign Alignment = getLoadStoreAlignment(ST); - assert(Alignment && "Alignment should be set"); - if (!TTI->isLegalNTStore(VecTy, *Alignment)) { + if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) { reportVectorizationFailure( "nontemporal store instruction cannot be vectorized", "nontemporal store instruction cannot be vectorized", @@ -756,11 +783,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (LD->getMetadata(LLVMContext::MD_nontemporal)) { // For nontemporal loads, check that a nontemporal vector version is // supported on the target (arbitrarily try a vector of 2 elements). - Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2); + auto *VecTy = FixedVectorType::get(I.getType(), /*NumElements=*/2); assert(VecTy && "did not find vectorized version of load type"); - const MaybeAlign Alignment = getLoadStoreAlignment(LD); - assert(Alignment && "Alignment should be set"); - if (!TTI->isLegalNTLoad(VecTy, *Alignment)) { + if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) { reportVectorizationFailure( "nontemporal load instruction cannot be vectorized", "nontemporal load instruction cannot be vectorized", @@ -897,6 +922,14 @@ bool LoopVectorizationLegality::blockCanBePredicated( if (C->canTrap()) return false; } + + // We can predicate blocks with calls to assume, as long as we drop them in + // case we flatten the CFG via predication. + if (match(&I, m_Intrinsic<Intrinsic::assume>())) { + ConditionalAssumes.insert(&I); + continue; + } + // We might be able to hoist the load. 
if (I.mayReadFromMemory()) { auto *LI = dyn_cast<LoadInst>(&I); @@ -947,14 +980,14 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // the memory pointed to can be dereferenced (with the access size implied by // the value's type) unconditionally within the loop header without // introducing a new fault. - SmallPtrSet<Value *, 8> SafePointes; + SmallPtrSet<Value *, 8> SafePointers; // Collect safe addresses. for (BasicBlock *BB : TheLoop->blocks()) { if (!blockNeedsPredication(BB)) { for (Instruction &I : *BB) if (auto *Ptr = getLoadStorePointerOperand(&I)) - SafePointes.insert(Ptr); + SafePointers.insert(Ptr); continue; } @@ -968,7 +1001,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { LoadInst *LI = dyn_cast<LoadInst>(&I); if (LI && !mustSuppressSpeculation(*LI) && isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT)) - SafePointes.insert(LI->getPointerOperand()); + SafePointers.insert(LI->getPointerOperand()); } } @@ -986,7 +1019,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB)) { - if (!blockCanBePredicated(BB, SafePointes)) { + if (!blockCanBePredicated(BB, SafePointers)) { reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", @@ -1198,18 +1231,9 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n"); - if (!PrimaryInduction) { - reportVectorizationFailure( - "No primary induction, cannot fold tail by masking", - "Missing a primary induction variable in the loop, which is " - "needed in order to fold tail by masking as required.", - "NoPrimaryInduction", ORE, TheLoop); - return false; - } - SmallPtrSet<const Value *, 8> ReductionLiveOuts; - for (auto &Reduction : *getReductionVars()) + for (auto &Reduction : getReductionVars()) ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr()); // TODO: handle non-reduction outside users when tail is folded by masking. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index c3ca43fcd4927..8dd06983cd84d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -31,9 +31,12 @@ namespace llvm { +class LoopVectorizationLegality; +class LoopVectorizationCostModel; +class PredicatedScalarEvolution; + /// VPlan-based builder utility analogous to IRBuilder. class VPBuilder { -private: VPBasicBlock *BB = nullptr; VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator(); @@ -204,6 +207,8 @@ class LoopVectorizationPlanner { /// The interleaved access analysis. InterleavedAccessInfo &IAI; + PredicatedScalarEvolution &PSE; + SmallVector<VPlanPtr, 4> VPlans; /// This class is used to enable the VPlan to invoke a method of ILV. This is @@ -229,13 +234,14 @@ public: const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, - InterleavedAccessInfo &IAI) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), - IAI(IAI) {} + InterleavedAccessInfo &IAI, + PredicatedScalarEvolution &PSE) + : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI), + PSE(PSE) {} /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. 
- Optional<VectorizationFactor> plan(unsigned UserVF); + Optional<VectorizationFactor> plan(unsigned UserVF, unsigned UserIC); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. @@ -279,9 +285,10 @@ private: /// Build a VPlan using VPRecipes according to the information gather by /// Legal. This method is only used for the legacy inner loop vectorizer. - VPlanPtr - buildVPlanWithVPRecipes(VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, - SmallPtrSetImpl<Instruction *> &DeadInstructions); + VPlanPtr buildVPlanWithVPRecipes( + VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, + SmallPtrSetImpl<Instruction *> &DeadInstructions, + const DenseMap<Instruction *, Instruction *> &SinkAfter); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 684a3098e5645..35af8e4257789 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -91,7 +91,6 @@ #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -134,9 +133,11 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/InjectTLIMappings.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/SizeOpts.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include <algorithm> @@ -294,15 +295,6 @@ cl::opt<bool> llvm::EnableLoopVectorization( "vectorize-loops", cl::init(true), cl::Hidden, cl::desc("Run the Loop vectorization passes")); -/// A helper function for converting Scalar types to vector types. -/// If the incoming type is void, we return void. If the VF is 1, we return -/// the scalar type. -static Type *ToVectorTy(Type *Scalar, unsigned VF) { - if (Scalar->isVoidTy() || VF == 1) - return Scalar; - return VectorType::get(Scalar, VF); -} - /// A helper function that returns the type of loaded or stored value. static Type *getMemInstValueType(Value *I) { assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && @@ -319,7 +311,7 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { // Determine if an array of VF elements of type Ty is "bitcast compatible" // with a <VF x Ty> vector. if (VF > 1) { - auto *VectorTy = VectorType::get(Ty, VF); + auto *VectorTy = FixedVectorType::get(Ty, VF); return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); } @@ -415,7 +407,16 @@ public: BasicBlock *createVectorizedLoopSkeleton(); /// Widen a single instruction within the innermost loop. - void widenInstruction(Instruction &I); + void widenInstruction(Instruction &I, VPUser &Operands, + VPTransformState &State); + + /// Widen a single call instruction within the innermost loop. 
+ void widenCallInstruction(CallInst &I, VPUser &ArgOperands, + VPTransformState &State); + + /// Widen a single select instruction within the innermost loop. + void widenSelectInstruction(SelectInst &I, VPUser &Operands, + bool InvariantCond, VPTransformState &State); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(); @@ -430,8 +431,9 @@ public: /// Vectorize a single GetElementPtrInst based on information gathered and /// decisions taken during planning. - void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF, - bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant); + void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF, + unsigned VF, bool IsPtrLoopInvariant, + SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and @@ -441,9 +443,11 @@ public: /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, - /// inclusive.. - void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance, - bool IfPredicateInstr); + /// inclusive. Uses the VPValue operands from \p Operands instead of \p + /// Instr's operands. + void scalarizeInstruction(Instruction *Instr, VPUser &Operands, + const VPIteration &Instance, bool IfPredicateInstr, + VPTransformState &State); /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to @@ -482,20 +486,21 @@ public: /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); - /// Try to vectorize the interleaved access group that \p Instr belongs to - /// with the base address given in \p Addr, optionally masking the vector - /// operations if \p BlockInMask is non-null. Use \p State to translate given - /// VPValues to IR values in the vectorized loop. - void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State, - VPValue *Addr, VPValue *BlockInMask = nullptr); + /// Try to vectorize interleaved access group \p Group with the base address + /// given in \p Addr, optionally masking the vector operations if \p + /// BlockInMask is non-null. Use \p State to translate given VPValues to IR + /// values in the vectorized loop. + void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, + VPTransformState &State, VPValue *Addr, + VPValue *BlockInMask = nullptr); /// Vectorize Load and Store instructions with the base address given in \p /// Addr, optionally masking the vector operations if \p BlockInMask is /// non-null. Use \p State to translate given VPValues to IR values in the /// vectorized loop. void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, - VPValue *Addr, - VPValue *BlockInMask = nullptr); + VPValue *Addr, VPValue *StoredValue, + VPValue *BlockInMask); /// Set the debug location in the builder using the debug location in /// the instruction. @@ -682,7 +687,7 @@ protected: DominatorTree *DT; /// Alias Analysis. - AliasAnalysis *AA; + AAResults *AA; /// Target Library Info. 
const TargetLibraryInfo *TLI; @@ -974,7 +979,7 @@ public: /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional<unsigned> computeMaxVF(); + Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC); /// \return True if runtime checks are required for vectorization, and false /// otherwise. @@ -1066,7 +1071,7 @@ public: auto UniformsPerVF = Uniforms.find(VF); assert(UniformsPerVF != Uniforms.end() && "VF not yet analyzed for uniformity"); - return UniformsPerVF->second.find(I) != UniformsPerVF->second.end(); + return UniformsPerVF->second.count(I); } /// Returns true if \p I is known to be scalar after vectorization. @@ -1082,7 +1087,7 @@ public: auto ScalarsPerVF = Scalars.find(VF); assert(ScalarsPerVF != Scalars.end() && "Scalar values are not calculated for VF"); - return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end(); + return ScalarsPerVF->second.count(I); } /// \returns True if instruction \p I can be truncated to a smaller bitwidth @@ -1200,27 +1205,27 @@ public: /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) { return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType, Alignment); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) { return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType, Alignment); } /// Returns true if the target machine supports masked scatter operation /// for the given \p DataType. - bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) { + bool isLegalMaskedScatter(Type *DataType, Align Alignment) { return TTI.isLegalMaskedScatter(DataType, Alignment); } /// Returns true if the target machine supports masked gather operation /// for the given \p DataType. - bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) { + bool isLegalMaskedGather(Type *DataType, Align Alignment) { return TTI.isLegalMaskedGather(DataType, Alignment); } @@ -1232,7 +1237,7 @@ public: if (!LI && !SI) return false; auto *Ty = getMemInstValueType(V); - MaybeAlign Align = getLoadStoreAlignment(V); + Align Align = getLoadStoreAlignment(V); return (LI && isLegalMaskedGather(Ty, Align)) || (SI && isLegalMaskedScatter(Ty, Align)); } @@ -1309,11 +1314,19 @@ public: /// i.e. either vector version isn't available, or is too expensive. unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); + /// Invalidates decisions already taken by the cost model. + void invalidateCostModelingDecisions() { + WideningDecisions.clear(); + Uniforms.clear(); + Scalars.clear(); + } + private: unsigned NumPredStores = 0; - /// \return An upper bound for the vectorization factor, larger than zero. - /// One is returned if vectorization should best be avoided due to cost. + /// \return An upper bound for the vectorization factor, a power-of-2 larger + /// than zero. One is returned if vectorization should best be avoided due + /// to cost. 
unsigned computeFeasibleMaxVF(unsigned ConstTripCount); /// The vectorization cost is a combination of the cost itself and a boolean @@ -1598,9 +1611,8 @@ struct LoopVectorize : public FunctionPass { explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, bool VectorizeOnlyWhenForced = false) - : FunctionPass(ID) { - Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; - Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; + : FunctionPass(ID), + Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); } @@ -1626,7 +1638,7 @@ struct LoopVectorize : public FunctionPass { [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, - GetLAA, *ORE, PSI); + GetLAA, *ORE, PSI).MadeAnyChange; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -1640,6 +1652,7 @@ struct LoopVectorize : public FunctionPass { AU.addRequired<LoopAccessLegacyAnalysis>(); AU.addRequired<DemandedBitsWrapperPass>(); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + AU.addRequired<InjectTLIMappingsLegacy>(); // We currently do not preserve loopinfo/dominator analyses with outer loop // vectorization. Until this is addressed, mark these analyses as preserved @@ -1724,9 +1737,10 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // FIXME: If the step is non-constant, we create the vector splat with // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. - Value *SplatVF = isa<Constant>(Mul) - ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) - : Builder.CreateVectorSplat(VF, Mul); + Value *SplatVF = + isa<Constant>(Mul) + ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(VF, Mul); Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -1806,57 +1820,37 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); - auto II = Legal->getInductionVars()->find(IV); - assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); + auto II = Legal->getInductionVars().find(IV); + assert(II != Legal->getInductionVars().end() && "IV is not an induction"); auto ID = II->second; assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); - // The scalar value to broadcast. This will be derived from the canonical - // induction variable. - Value *ScalarIV = nullptr; - // The value from the original loop to which we are mapping the new induction // variable. Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; - // True if we have vectorized the induction variable. - auto VectorizedIV = false; - - // Determine if we want a scalar version of the induction variable. This is - // true if the induction variable itself is not widened, or if it has at - // least one user in the loop that is not widened. - auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); + auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); // Generate code for the induction step. 
Note that induction steps are // required to be loop-invariant - assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && - "Induction step should be loop invariant"); - auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); - Value *Step = nullptr; - if (PSE.getSE()->isSCEVable(IV->getType())) { - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), - LoopVectorPreHeader->getTerminator()); - } else { - Step = cast<SCEVUnknown>(ID.getStep())->getValue(); - } - - // Try to create a new independent vector induction variable. If we can't - // create the phi node, we will splat the scalar induction variable in each - // loop iteration. - if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { - createVectorIntOrFpInductionPHI(ID, Step, EntryVal); - VectorizedIV = true; - } + auto CreateStepValue = [&](const SCEV *Step) -> Value * { + assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && + "Induction step should be loop invariant"); + if (PSE.getSE()->isSCEVable(IV->getType())) { + SCEVExpander Exp(*PSE.getSE(), DL, "induction"); + return Exp.expandCodeFor(Step, Step->getType(), + LoopVectorPreHeader->getTerminator()); + } + return cast<SCEVUnknown>(Step)->getValue(); + }; - // If we haven't yet vectorized the induction variable, or if we will create - // a scalar one, we need to define the scalar induction variable and step - // values. If we were given a truncation type, truncate the canonical + // The scalar value to broadcast. This is derived from the canonical + // induction variable. If a truncation type is given, truncate the canonical // induction variable and step. Otherwise, derive these values from the // induction descriptor. - if (!VectorizedIV || NeedsScalarIV) { - ScalarIV = Induction; + auto CreateScalarIV = [&](Value *&Step) -> Value * { + Value *ScalarIV = Induction; if (IV != OldInduction) { ScalarIV = IV->getType()->isIntegerTy() ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) @@ -1872,12 +1866,12 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); Step = Builder.CreateTrunc(Step, TruncType); } - } + return ScalarIV; + }; - // If we haven't yet vectorized the induction variable, splat the scalar - // induction variable, and build the necessary step vectors. - // TODO: Don't do it unless the vectorized IV is really required. - if (!VectorizedIV) { + // Create the vector values from the scalar IV, in the absence of creating a + // vector IV. + auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { Value *EntryPart = @@ -1887,23 +1881,53 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { addMetadata(EntryPart, Trunc); recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); } + }; + + // Now do the actual transformations, and start with creating the step value. + Value *Step = CreateStepValue(ID.getStep()); + if (VF <= 1) { + Value *ScalarIV = CreateScalarIV(Step); + CreateSplatIV(ScalarIV, Step); + return; + } + + // Determine if we want a scalar version of the induction variable. This is + // true if the induction variable itself is not widened, or if it has at + // least one user in the loop that is not widened. 
+ auto NeedsScalarIV = needsScalarInduction(EntryVal); + if (!NeedsScalarIV) { + createVectorIntOrFpInductionPHI(ID, Step, EntryVal); + return; } - // If an induction variable is only used for counting loop iterations or - // calculating addresses, it doesn't need to be widened. Create scalar steps - // that can be used by instructions we will later scalarize. Note that the - // addition of the scalar steps will not increase the number of instructions - // in the loop in the common case prior to InstCombine. We will be trading - // one vector extract for each scalar step. - if (NeedsScalarIV) + // Try to create a new independent vector induction variable. If we can't + // create the phi node, we will splat the scalar induction variable in each + // loop iteration. + if (!shouldScalarizeInstruction(EntryVal)) { + createVectorIntOrFpInductionPHI(ID, Step, EntryVal); + Value *ScalarIV = CreateScalarIV(Step); + // Create scalar steps that can be used by instructions we will later + // scalarize. Note that the addition of the scalar steps will not increase + // the number of instructions in the loop in the common case prior to + // InstCombine. We will be trading one vector extract for each scalar step. buildScalarSteps(ScalarIV, Step, EntryVal, ID); + return; + } + + // All IV users are scalar instructions, so only emit a scalar IV, not a + // vectorised IV. Except when we tail-fold, then the splat IV feeds the + // predicate used by the masked loads/stores. + Value *ScalarIV = CreateScalarIV(Step); + if (!Cost->isScalarEpilogueAllowed()) + CreateSplatIV(ScalarIV, Step); + buildScalarSteps(ScalarIV, Step, EntryVal, ID); } Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, Instruction::BinaryOps BinOp) { // Create and check the types. - assert(Val->getType()->isVectorTy() && "Must be a vector"); - int VLen = Val->getType()->getVectorNumElements(); + auto *ValVTy = cast<VectorType>(Val->getType()); + int VLen = ValVTy->getNumElements(); Type *STy = Val->getType()->getScalarType(); assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && @@ -2052,7 +2076,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { // Initialize packing with insertelements to start from undef. 
- Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); + Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF)); VectorLoopValueMap.setVectorValue(V, Part, Undef); for (unsigned Lane = 0; Lane < VF; ++Lane) packScalarIntoVectorValue(V, {Part, Lane}); @@ -2118,13 +2142,12 @@ void InnerLoopVectorizer::packScalarIntoVectorValue( Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); - SmallVector<Constant *, 8> ShuffleMask; + SmallVector<int, 8> ShuffleMask; for (unsigned i = 0; i < VF; ++i) - ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); + ShuffleMask.push_back(VF - i - 1); return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), - ConstantVector::get(ShuffleMask), - "reverse"); + ShuffleMask, "reverse"); } // Return whether we allow using masked interleave-groups (for dealing with @@ -2166,24 +2189,16 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B -void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, - VPTransformState &State, - VPValue *Addr, - VPValue *BlockInMask) { - const InterleaveGroup<Instruction> *Group = - Cost->getInterleavedAccessGroup(Instr); - assert(Group && "Fail to get an interleaved access group."); - - // Skip if current instruction is not the insert position. - if (Instr != Group->getInsertPos()) - return; - +void InnerLoopVectorizer::vectorizeInterleaveGroup( + const InterleaveGroup<Instruction> *Group, VPTransformState &State, + VPValue *Addr, VPValue *BlockInMask) { + Instruction *Instr = Group->getInsertPos(); const DataLayout &DL = Instr->getModule()->getDataLayout(); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); - Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); + auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF); // Prepare for the new pointers. SmallVector<Value *, 2> AddrParts; @@ -2252,21 +2267,21 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); - auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); + BlockInMaskPart, Undefs, + createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); GroupMask = MaskForGaps ? 
Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) : ShuffledMask; } NewLoad = - Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(), + Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), GroupMask, UndefVec, "wide.masked.vec"); } else NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], - Group->getAlignment(), "wide.vec"); + Group->getAlign(), "wide.vec"); Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); } @@ -2280,14 +2295,14 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, if (!Member) continue; - Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); + auto StrideMask = createStrideMask(I, InterleaveFactor, VF); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( NewLoads[Part], UndefVec, StrideMask, "strided.vec"); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { - VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } @@ -2301,7 +2316,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, } // The sub vector type for current instruction. - VectorType *SubVT = VectorType::get(ScalarTy, VF); + auto *SubVT = FixedVectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. for (unsigned Part = 0; Part < UF; Part++) { @@ -2329,23 +2344,23 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, Value *WideVec = concatenateVectors(Builder, StoredVecs); // Interleave the elements in the wide vector. - Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); - Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, - "interleaved.vec"); + Value *IVec = Builder.CreateShuffleVector( + WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), + "interleaved.vec"); Instruction *NewStoreInstr; if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); - auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); + BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), + "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( - IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask); + IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); } else - NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part], - Group->getAlignment()); + NewStoreInstr = + Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); Group->addMetadata(NewStoreInstr); } @@ -2354,27 +2369,26 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, VPValue *Addr, + VPValue *StoredValue, VPValue *BlockInMask) { // Attempt to issue a wide load. 
LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); assert((LI || SI) && "Invalid Load/Store instruction"); + assert((!SI || StoredValue) && "No stored value provided for widened store"); + assert((!LI || !StoredValue) && "Stored value provided for widened load"); LoopVectorizationCostModel::InstWidening Decision = Cost->getWideningDecision(Instr, VF); - assert(Decision != LoopVectorizationCostModel::CM_Unknown && - "CM decision should be taken at this point"); - if (Decision == LoopVectorizationCostModel::CM_Interleave) - return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask); + assert((Decision == LoopVectorizationCostModel::CM_Widen || + Decision == LoopVectorizationCostModel::CM_Widen_Reverse || + Decision == LoopVectorizationCostModel::CM_GatherScatter) && + "CM decision is not to widen the memory instruction"); Type *ScalarDataTy = getMemInstValueType(Instr); - Type *DataTy = VectorType::get(ScalarDataTy, VF); - // An alignment of 0 means target abi alignment. We need to use the scalar's - // target abi alignment in such a case. - const DataLayout &DL = Instr->getModule()->getDataLayout(); - const Align Alignment = - DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); + auto *DataTy = FixedVectorType::get(ScalarDataTy, VF); + const Align Alignment = getLoadStoreAlignment(Instr); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. @@ -2431,12 +2445,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, for (unsigned Part = 0; Part < UF; ++Part) { Instruction *NewSI = nullptr; - Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); + Value *StoredVal = State.get(StoredValue, Part); if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(Addr, Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, - Alignment.value(), MaskPart); + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, + MaskPart); } else { if (Reverse) { // If we store to reverse consecutive memory locations, then we need @@ -2447,11 +2461,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, } auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) - NewSI = Builder.CreateMaskedStore( - StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]); + NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, + BlockInMaskParts[Part]); else - NewSI = - Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); + NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); } addMetadata(NewSI, SI); } @@ -2466,18 +2479,18 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(Addr, Part); - NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart, + NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, nullptr, "wide.masked.gather"); addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) NewLI = Builder.CreateMaskedLoad( - VecPtr, Alignment.value(), BlockInMaskParts[Part], - UndefValue::get(DataTy), "wide.masked.load"); + VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), + "wide.masked.load"); else - NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(), - "wide.load"); + NewLI = + Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); // Add metadata to the load, but setVectorValue to the reverse shuffle. addMetadata(NewLI, LI); @@ -2488,9 +2501,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, } } -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, const VPIteration &Instance, - bool IfPredicateInstr) { + bool IfPredicateInstr, + VPTransformState &State) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); setDebugLocFromInst(Builder, Instr); @@ -2504,8 +2518,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. - for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); + for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { + auto *NewOp = State.get(User.getOperand(op), Instance); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -2578,7 +2592,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { // compare. The only way that we get a backedge taken count is that the // induction variable was signed and as such will not overflow. In such a case // truncation is legal. - if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > + if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > IdxTy->getPrimitiveSizeInBits()) BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); @@ -2676,7 +2690,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, "Only one type should be a floating point type"); Type *IntTy = IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); - VectorType *VecIntTy = VectorType::get(IntTy, VF); + auto *VecIntTy = FixedVectorType::get(IntTy, VF); Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); return Builder.CreateBitOrPointerCast(CastVal, DstVTy); } @@ -2774,12 +2788,17 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. 
+ auto *LAI = Legal->getLAI(); + const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); + if (!RtPtrChecking.Need) + return; Instruction *FirstCheckInst; Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); - if (!MemRuntimeCheck) - return; + addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, + RtPtrChecking.getChecks(), RtPtrChecking.getSE()); + assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " + "claimed checks are required"); if (MemCheckBlock->getParent()->hasOptSize()) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && @@ -2858,6 +2877,18 @@ Value *InnerLoopVectorizer::emitTransformedIndex( return B.CreateMul(X, Y); }; + // Get a suitable insert point for SCEV expansion. For blocks in the vector + // loop, choose the end of the vector loop header (=LoopVectorBody), because + // the DomTree is not kept up-to-date for additional blocks generated in the + // vector loop. By using the header as insertion point, we guarantee that the + // expanded instructions dominate all their uses. + auto GetInsertPoint = [this, &B]() { + BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); + if (InsertBB != LoopVectorBody && + LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) + return LoopVectorBody->getTerminator(); + return &*B.GetInsertPoint(); + }; switch (ID.getKind()) { case InductionDescriptor::IK_IntInduction: { assert(Index->getType() == StartValue->getType() && @@ -2865,7 +2896,7 @@ Value *InnerLoopVectorizer::emitTransformedIndex( if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) return B.CreateSub(StartValue, Index); auto *Offset = CreateMul( - Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); + Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); return CreateAdd(StartValue, Offset); } case InductionDescriptor::IK_PtrInduction: { @@ -2873,8 +2904,8 @@ Value *InnerLoopVectorizer::emitTransformedIndex( "Expected constant step for pointer induction"); return B.CreateGEP( StartValue->getType()->getPointerElementType(), StartValue, - CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), - &*B.GetInsertPoint()))); + CreateMul(Index, + Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); } case InductionDescriptor::IK_FpInduction: { assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); @@ -3034,8 +3065,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // This variable saves the new starting index for the scalar loop. It is used // to test if there are any tail iterations left once the vector loop has // completed. 
- LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); - for (auto &InductionEntry : *List) { + for (auto &InductionEntry : Legal->getInductionVars()) { PHINode *OrigPhi = InductionEntry.first; InductionDescriptor II = InductionEntry.second; @@ -3258,7 +3288,6 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize) { Function *F = CI->getCalledFunction(); - StringRef FnName = CI->getCalledFunction()->getName(); Type *ScalarRetTy = CI->getType(); SmallVector<Type *, 4> Tys, ScalarTys; for (auto &ArgOp : CI->arg_operands()) @@ -3268,7 +3297,8 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // to be vectors, so we need to extract individual elements from there, // execute VF scalar calls, and then gather the result into the vector return // value. - unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); + unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, + TTI::TCK_RecipThroughput); if (VF == 1) return ScalarCallCost; @@ -3286,11 +3316,15 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. NeedToScalarize = true; - if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) + VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); + Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); + + if (!TLI || CI->isNoBuiltin() || !VecFunc) return Cost; // If the corresponding vector cost is cheaper, return its cost. - unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); + unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, + TTI::TCK_RecipThroughput); if (VectorCallCost < Cost) { NeedToScalarize = false; return VectorCallCost; @@ -3303,22 +3337,20 @@ unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); - FastMathFlags FMF; - if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) - FMF = FPMO->getFastMathFlags(); - - SmallVector<Value *, 4> Operands(CI->arg_operands()); - return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF); + IntrinsicCostAttributes CostAttrs(ID, *CI, VF); + return TTI.getIntrinsicInstrCost(CostAttrs, + TargetTransformInfo::TCK_RecipThroughput); } static Type *smallestIntegerVectorType(Type *T1, Type *T2) { - auto *I1 = cast<IntegerType>(T1->getVectorElementType()); - auto *I2 = cast<IntegerType>(T2->getVectorElementType()); + auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); + auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; } + static Type *largestIntegerVectorType(Type *T1, Type *T2) { - auto *I1 = cast<IntegerType>(T1->getVectorElementType()); - auto *I2 = cast<IntegerType>(T2->getVectorElementType()); + auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); + auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); return I1->getBitWidth() > I2->getBitWidth() ? 
T1 : T2; } @@ -3335,14 +3367,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { continue; for (unsigned Part = 0; Part < UF; ++Part) { Value *I = getOrCreateVectorValue(KV.first, Part); - if (Erased.find(I) != Erased.end() || I->use_empty() || - !isa<Instruction>(I)) + if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) continue; Type *OriginalTy = I->getType(); Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), KV.second); - Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, - OriginalTy->getVectorNumElements()); + auto *TruncatedTy = FixedVectorType::get( + ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements()); if (TruncatedTy == OriginalTy) continue; @@ -3392,27 +3423,35 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { break; } } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { - auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); + auto Elements0 = + cast<VectorType>(SI->getOperand(0)->getType())->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( - SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); - auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); + SI->getOperand(0), + FixedVectorType::get(ScalarTruncatedTy, Elements0)); + auto Elements1 = + cast<VectorType>(SI->getOperand(1)->getType())->getNumElements(); auto *O1 = B.CreateZExtOrTrunc( - SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); + SI->getOperand(1), + FixedVectorType::get(ScalarTruncatedTy, Elements1)); - NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); + NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { // Don't do anything with the operands, just extend the result. continue; } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { - auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); + auto Elements = + cast<VectorType>(IE->getOperand(0)->getType())->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( - IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); + IE->getOperand(0), + FixedVectorType::get(ScalarTruncatedTy, Elements)); auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { - auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); + auto Elements = + cast<VectorType>(EE->getOperand(0)->getType())->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( - EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); + EE->getOperand(0), + FixedVectorType::get(ScalarTruncatedTy, Elements)); NewI = B.CreateExtractElement(O0, EE->getOperand(2)); } else { // If we don't know what to do, be conservative and don't do anything. @@ -3471,7 +3510,7 @@ void InnerLoopVectorizer::fixVectorizedLoop() { PSE.getSE()->forgetLoop(OrigLoop); // Fix-up external users of the induction variables. - for (auto &Entry : *Legal->getInductionVars()) + for (auto &Entry : Legal->getInductionVars()) fixupIVUsers(Entry.first, Entry.second, getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), IVEndValues[Entry.first], LoopMiddleBlock); @@ -3482,6 +3521,19 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // Remove redundant induction instructions. cse(LoopVectorBody); + + // Set/update profile weights for the vector and remainder loops as original + // loop iterations are now distributed among them. 
Note that the original loop, + // represented by LoopScalarBody, becomes the remainder loop after vectorization. + // + // For cases like foldTailByMasking() and requiresScalarEpilogue() we may + // end up with a slightly less precise result, but that should be OK since + // profile is not inherently precise anyway. Note also that a possible bypass of + // vector code caused by legality checks is ignored, assigning all the weight + // to the vector loop, optimistically. + setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), + LI->getLoopFor(LoopVectorBody), + LI->getLoopFor(LoopScalarBody), VF * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs() { @@ -3563,8 +3615,8 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { if (VF > 1) { Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); VectorInit = Builder.CreateInsertElement( - UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, - Builder.getInt32(VF - 1), "vector.recur.init"); + UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)), + VectorInit, Builder.getInt32(VF - 1), "vector.recur.init"); } // We constructed a temporary phi node in the first phase of vectorization. @@ -3605,10 +3657,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. - SmallVector<Constant *, 8> ShuffleMask(VF); - ShuffleMask[0] = Builder.getInt32(VF - 1); + SmallVector<int, 8> ShuffleMask(VF); + ShuffleMask[0] = VF - 1; for (unsigned I = 1; I < VF; ++I) - ShuffleMask[I] = Builder.getInt32(I + VF - 1); + ShuffleMask[I] = I + VF - 1; // The vector from which to take the initial value for the current iteration // (actual or unrolled). Initially, this is the vector phi node. @@ -3618,10 +3670,9 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { for (unsigned Part = 0; Part < UF; ++Part) { Value *PreviousPart = getOrCreateVectorValue(Previous, Part); Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); - auto *Shuffle = - VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, - ConstantVector::get(ShuffleMask)) - : Incoming; + auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, + ShuffleMask) + : Incoming; PhiPart->replaceAllUsesWith(Shuffle); cast<Instruction>(PhiPart)->eraseFromParent(); VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); @@ -3684,7 +3735,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // Get its reduction variable descriptor. assert(Legal->isReductionVariable(Phi) && "Unable to find the reduction variable"); - RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi]; + RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); @@ -3725,7 +3776,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // incoming scalar reduction. VectorStart = ReductionStartValue; } else { - Identity = ConstantVector::getSplat(VF, Iden); + Identity = ConstantVector::getSplat({VF, false}, Iden); // This vector is the Identity vector where the first element is the // incoming scalar reduction. @@ -3787,7 +3838,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type.
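As an aside on the recurrence splice built in fixFirstOrderRecurrence above, here is a minimal standalone sketch (all lane values are hypothetical) of the mask {VF-1, VF, ..., 2*VF-2} and of what the resulting shufflevector selects for VF = 4:

  #include <cstdio>
  #include <vector>

  int main() {
    const unsigned VF = 4;
    std::vector<int> Mask(VF);
    Mask[0] = VF - 1;           // last lane of the vector carried in by the phi
    for (unsigned I = 1; I < VF; ++I)
      Mask[I] = I + VF - 1;     // lanes 0..VF-2 of the current part
    // Mask is {3, 4, 5, 6}.

    // shufflevector semantics: indices 0..VF-1 pick from the first operand,
    // VF..2*VF-1 pick from the second operand.
    int Incoming[VF] = {10, 11, 12, 13}; // recurrence values of the previous part
    int Previous[VF] = {20, 21, 22, 23}; // recurrence source of the current part
    int Result[VF];
    for (unsigned I = 0; I < VF; ++I)
      Result[I] = Mask[I] < (int)VF ? Incoming[Mask[I]] : Previous[Mask[I] - VF];
    // Result is {13, 20, 21, 22}: each lane observes the value produced one
    // scalar iteration earlier, which is exactly the first-order recurrence.
    for (unsigned I = 0; I < VF; ++I)
      printf("%d ", Result[I]);
    printf("\n");
  }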
if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { - Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); + Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF); Builder.SetInsertPoint( LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); VectorParts RdxParts(UF); @@ -4036,9 +4087,11 @@ void InnerLoopVectorizer::fixNonInductionPHIs() { } } -void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, - unsigned VF, bool IsPtrLoopInvariant, - SmallBitVector &IsIndexLoopInvariant) { +void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, + unsigned UF, unsigned VF, + bool IsPtrLoopInvariant, + SmallBitVector &IsIndexLoopInvariant, + VPTransformState &State) { // Construct a vector GEP by widening the operands of the scalar GEP as // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP // results in a vector of pointers when at least one operand of the GEP @@ -4075,19 +4128,18 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, for (unsigned Part = 0; Part < UF; ++Part) { // The pointer operand of the new GEP. If it's loop-invariant, we // won't broadcast it. - auto *Ptr = IsPtrLoopInvariant - ? GEP->getPointerOperand() - : getOrCreateVectorValue(GEP->getPointerOperand(), Part); + auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) + : State.get(Operands.getOperand(0), Part); // Collect all the indices for the new GEP. If any index is // loop-invariant, we won't broadcast it. SmallVector<Value *, 4> Indices; - for (auto Index : enumerate(GEP->indices())) { - Value *User = Index.value().get(); - if (IsIndexLoopInvariant[Index.index()]) - Indices.push_back(User); + for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { + VPValue *Operand = Operands.getOperand(I); + if (IsIndexLoopInvariant[I - 1]) + Indices.push_back(State.get(Operand, {0, 0})); else - Indices.push_back(getOrCreateVectorValue(User, Part)); + Indices.push_back(State.get(Operand, Part)); } // Create the new GEP. Note that this GEP may be a scalar if VF == 1, @@ -4114,7 +4166,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Create a vector phi with no operands - the vector phi operands will be // set at the end of vector code generation. Type *VecTy = - (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); + (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF); Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); VectorLoopValueMap.setVectorValue(P, 0, VecPhi); OrigPHIsToFix.push_back(P); @@ -4133,7 +4185,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, for (unsigned Part = 0; Part < UF; ++Part) { // This is phase one of vectorizing PHIs. Type *VecTy = - (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); + (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF); Value *EntryPart = PHINode::Create( VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); VectorLoopValueMap.setVectorValue(P, Part, EntryPart); @@ -4145,9 +4197,9 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // This PHINode must be an induction variable. // Make sure that we know about it. 
- assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); + assert(Legal->getInductionVars().count(P) && "Not an induction variable"); - InductionDescriptor II = Legal->getInductionVars()->lookup(P); + InductionDescriptor II = Legal->getInductionVars().lookup(P); const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); // FIXME: The newly created binary instructions should contain nsw/nuw flags, @@ -4203,11 +4255,14 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::widenInstruction(Instruction &I) { +void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, + VPTransformState &State) { switch (I.getOpcode()) { + case Instruction::Call: case Instruction::Br: case Instruction::PHI: case Instruction::GetElementPtr: + case Instruction::Select: llvm_unreachable("This instruction is handled by a different recipe."); case Instruction::UDiv: case Instruction::SDiv: @@ -4233,8 +4288,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { for (unsigned Part = 0; Part < UF; ++Part) { SmallVector<Value *, 2> Ops; - for (Value *Op : I.operands()) - Ops.push_back(getOrCreateVectorValue(Op, Part)); + for (VPValue *VPOp : User.operands()) + Ops.push_back(State.get(VPOp, Part)); Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); @@ -4248,35 +4303,6 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { break; } - case Instruction::Select: { - // Widen selects. - // If the selector is loop invariant we can create a select - // instruction with a scalar condition. Otherwise, use vector-select. - auto *SE = PSE.getSE(); - bool InvariantCond = - SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); - setDebugLocFromInst(Builder, &I); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - - auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); - Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); - Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); - Value *Sel = - Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); - VectorLoopValueMap.setVectorValue(&I, Part, Sel); - addMetadata(Sel, &I); - } - - break; - } - case Instruction::ICmp: case Instruction::FCmp: { // Widen compares. Generate vector compares. @@ -4284,8 +4310,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { auto *Cmp = cast<CmpInst>(&I); setDebugLocFromInst(Builder, Cmp); for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); - Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); + Value *A = State.get(User.getOperand(0), Part); + Value *B = State.get(User.getOperand(1), Part); Value *C = nullptr; if (FCmp) { // Propagate fast math flags. @@ -4319,78 +4345,80 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { /// Vectorize casts. Type *DestTy = - (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); + (VF == 1) ? 
CI->getType() : FixedVectorType::get(CI->getType(), VF); for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); + Value *A = State.get(User.getOperand(0), Part); Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); VectorLoopValueMap.setVectorValue(&I, Part, Cast); addMetadata(Cast, &I); } break; } + default: + // This instruction is not vectorized by simple widening. + LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); + } // end of switch. +} - case Instruction::Call: { - // Ignore dbg intrinsics. - if (isa<DbgInfoIntrinsic>(I)) - break; - setDebugLocFromInst(Builder, &I); - - Module *M = I.getParent()->getParent()->getParent(); - auto *CI = cast<CallInst>(&I); +void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, + VPTransformState &State) { + assert(!isa<DbgInfoIntrinsic>(I) && + "DbgInfoIntrinsic should have been dropped during VPlan construction"); + setDebugLocFromInst(Builder, &I); - StringRef FnName = CI->getCalledFunction()->getName(); - Function *F = CI->getCalledFunction(); - Type *RetTy = ToVectorTy(CI->getType(), VF); - SmallVector<Type *, 4> Tys; - for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); + Module *M = I.getParent()->getParent()->getParent(); + auto *CI = cast<CallInst>(&I); - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + SmallVector<Type *, 4> Tys; + for (Value *ArgOperand : CI->arg_operands()) + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? - bool NeedToScalarize; - unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); - bool UseVectorIntrinsic = - ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; - assert((UseVectorIntrinsic || !NeedToScalarize) && - "Instruction should be scalarized elsewhere."); + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Value *, 4> Args; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - Value *Arg = CI->getArgOperand(i); - // Some intrinsics have a scalar argument - don't replace it with a - // vector. - if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) - Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); - Args.push_back(Arg); - } + // The flag shows whether we use Intrinsic or a usual Call for vectorized + // version of the instruction. + // Is it beneficial to perform intrinsic call compared to lib call? + bool NeedToScalarize = false; + unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); + bool UseVectorIntrinsic = + ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; + assert((UseVectorIntrinsic || !NeedToScalarize) && + "Instruction should be scalarized elsewhere."); - Function *VectorF; - if (UseVectorIntrinsic) { - // Use vector version of the intrinsic. - Type *TysForDecl[] = {CI->getType()}; - if (VF > 1) - TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); - VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); - } else { - // Use vector version of the library call. 
- StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); - assert(!VFnName.empty() && "Vector function name is empty."); - VectorF = M->getFunction(VFnName); - if (!VectorF) { - // Generate a declaration - FunctionType *FTy = FunctionType::get(RetTy, Tys, false); - VectorF = - Function::Create(FTy, Function::ExternalLinkage, VFnName, M); - VectorF->copyAttributesFrom(F); - } - } - assert(VectorF && "Can't create vector function."); + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<Value *, 4> Args; + for (auto &I : enumerate(ArgOperands.operands())) { + // Some intrinsics have a scalar argument - don't replace it with a + // vector. + Value *Arg; + if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) + Arg = State.get(I.value(), Part); + else + Arg = State.get(I.value(), {0, 0}); + Args.push_back(Arg); + } + Function *VectorF; + if (UseVectorIntrinsic) { + // Use vector version of the intrinsic. + Type *TysForDecl[] = {CI->getType()}; + if (VF > 1) + TysForDecl[0] = + FixedVectorType::get(CI->getType()->getScalarType(), VF); + VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); + assert(VectorF && "Can't retrieve vector intrinsic."); + } else { + // Use vector version of the function call. + const VFShape Shape = + VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); +#ifndef NDEBUG + assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && + "Can't create vector function."); +#endif + VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); + } SmallVector<OperandBundleDef, 1> OpBundles; CI->getOperandBundlesAsDefs(OpBundles); CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); @@ -4400,16 +4428,31 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { VectorLoopValueMap.setVectorValue(&I, Part, V); addMetadata(V, &I); - } - - break; } +} - default: - // This instruction is not vectorized by simple widening. - LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); - llvm_unreachable("Unhandled instruction!"); - } // end of switch. +void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, + VPUser &Operands, + bool InvariantCond, + VPTransformState &State) { + setDebugLocFromInst(Builder, &I); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + auto *InvarCond = + InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; + + for (unsigned Part = 0; Part < UF; ++Part) { + Value *Cond = + InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); + Value *Op0 = State.get(Operands.getOperand(1), Part); + Value *Op1 = State.get(Operands.getOperand(2), Part); + Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); + VectorLoopValueMap.setVectorValue(&I, Part, Sel); + addMetadata(Sel, &I); + } } void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { @@ -4502,7 +4545,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { } } for (auto *I : ScalarPtrs) - if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { + if (!PossibleNonScalarPtrs.count(I)) { LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); Worklist.insert(I); } @@ -4513,7 +4556,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // TODO: Once we are able to vectorize pointer induction variables we should // no longer insert them into the worklist here. 
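Referring back to the call-widening path above, the lookup of a vector library variant now goes through the VFABI query shown in the patch. A minimal sketch, assuming the LLVM headers of this revision; the helper name findVectorVariant is made up:

  #include "llvm/Analysis/VectorUtils.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Returns the vector variant of CI for a fixed vectorization factor VF, or
  // nullptr if neither the TLI mappings nor a vector-function-abi-variant
  // attribute provide one.
  static Function *findVectorVariant(CallInst &CI, unsigned VF) {
    VFShape Shape = VFShape::get(CI, /*EC=*/{VF, false},
                                 /*HasGlobalPred=*/false);
    return VFDatabase(CI).getVectorizedFunction(Shape);
  }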
auto *Latch = TheLoop->getLoopLatch(); - for (auto &Induction : *Legal->getInductionVars()) { + for (auto &Induction : Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) @@ -4556,7 +4599,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // An induction variable will remain scalar if all users of the induction // variable and induction variable update remain scalar. - for (auto &Induction : *Legal->getInductionVars()) { + for (auto &Induction : Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); @@ -4568,6 +4611,11 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) continue; + // If tail-folding is applied, the primary induction variable will be used + // to feed a vector compare. + if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) + continue; + // Determine if all users of the induction variable are scalar after // vectorization. auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { @@ -4618,7 +4666,7 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne "Widening decision should be ready at this moment"); return WideningDecision == CM_Scalarize; } - const MaybeAlign Alignment = getLoadStoreAlignment(I); + const Align Alignment = getLoadStoreAlignment(I); return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty, Alignment)) : !(isLegalMaskedStore(Ty, Ptr, Alignment) || @@ -4665,7 +4713,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, "Masked interleave-groups for predicated accesses are not enabled."); auto *Ty = getMemInstValueType(I); - const MaybeAlign Alignment = getLoadStoreAlignment(I); + const Align Alignment = getLoadStoreAlignment(I); return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) : TTI.isLegalMaskedStore(Ty, Alignment); } @@ -4803,7 +4851,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // Add to the Worklist all consecutive and consecutive-like pointers that // aren't also identified as possibly non-uniform. for (auto *V : ConsecutiveLikePtrs) - if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) + if (!PossibleNonUniformPtrs.count(V)) addToWorklistIfAllowed(V); // Expand Worklist in topological order: whenever a new instruction @@ -4847,7 +4895,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // nodes separately. An induction variable will remain uniform if all users // of the induction variable and induction variable update remain uniform. // The code below handles both pointer and non-pointer induction variables. - for (auto &Induction : *Legal->getInductionVars()) { + for (auto &Induction : Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); @@ -4903,10 +4951,9 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { // FIXME: Avoid specializing for stride==1 instead of bailing out. if (!Legal->getLAI()->getSymbolicStrides().empty()) { - reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", + reportVectorizationFailure("Runtime stride check for small trip count", "runtime stride == 1 checks needed. 
Enable vectorization of " - "this loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os/-Oz", + "this loop without such check by compiling with -Os/-Oz", "CantVersionLoopWithOptForSize", ORE, TheLoop); return true; } @@ -4914,7 +4961,8 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { return false; } -Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { +Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, + unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may be useful to do so, since it's still likely to be dynamically // uniform if the target can skip. @@ -4936,7 +4984,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return computeFeasibleMaxVF(TC); + return UserVF ? UserVF : computeFeasibleMaxVF(TC); case CM_ScalarEpilogueNotNeededUsePredicate: LLVM_DEBUG( dbgs() << "LV: vector predicate hint/switch found.\n" @@ -4964,11 +5012,18 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { // Invalidate interleave groups that require an epilogue if we can't mask // the interleave-group. - if (!useMaskedInterleavedAccesses(TTI)) + if (!useMaskedInterleavedAccesses(TTI)) { + assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && + "No decisions should have been taken at this point"); + // Note: There is no need to invalidate any cost modeling decisions here, as + // none were taken so far. InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); + } - unsigned MaxVF = computeFeasibleMaxVF(TC); - if (TC > 0 && TC % MaxVF == 0) { + unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC); + assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2"); + unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; + if (TC > 0 && TC % MaxVFtimesIC == 0) { // Accept MaxVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); return MaxVF; @@ -5015,7 +5070,9 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); - unsigned MaxVectorSize = WidestRegister / WidestType; + // Ensure MaxVF is a power of 2; the dependence distance bound may not be. + // Note that both WidestRegister and WidestType may not be powers of 2. + unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " << WidestType << " bits.\n"); @@ -5140,7 +5197,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { Type *T = I.getType(); // Skip ignored values. - if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) + if (ValuesToIgnore.count(&I)) continue; // Only examine Loads, Stores and PHINodes. @@ -5152,7 +5209,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { if (auto *PN = dyn_cast<PHINode>(&I)) { if (!Legal->isReductionVariable(PN)) continue; - RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; + RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; T = RdxDesc.getRecurrenceType(); } @@ -5294,7 +5351,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving.
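A standalone arithmetic sketch of the MaxVF clamping and tail check above; every number is hypothetical (a 128-bit register, a 96-bit dependence-distance limit, an 8-bit widest element type, a user interleave count of 2 and a constant trip count of 96):

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  static unsigned powerOf2Floor(unsigned V) { // same result as llvm::PowerOf2Floor
    unsigned P = 1;
    while (P * 2 <= V)
      P *= 2;
    return P;
  }

  int main() {
    unsigned WidestRegister = 128;      // target register width in bits
    unsigned MaxSafeRegisterWidth = 96; // from the dependence distance bound
    unsigned WidestType = 8;            // widest scalar type in the loop, in bits
    unsigned UserIC = 2;                // interleave count forced by the user
    uint64_t TC = 96;                   // constant trip count

    WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
    // 96 / 8 = 12 is not a power of 2, so it is rounded down to 8.
    unsigned MaxVF = powerOf2Floor(WidestRegister / WidestType);
    unsigned MaxVFtimesIC = MaxVF * UserIC; // 16; 96 % 16 == 0, so no tail
    printf("MaxVF=%u, tail loop %s\n", MaxVF,
           TC % MaxVFtimesIC == 0 ? "not needed" : "needed");
  }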
- if (VF > 1 && !Legal->getReductionVars()->empty()) { + if (VF > 1 && !Legal->getReductionVars().empty()) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -5325,7 +5382,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // by this point), we can increase the critical path length if the loop // we're interleaving is inside another loop. Limit, by default to 2, so the // critical path only gets increased by one reduction operation. - if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) { + if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); SmallIC = std::min(SmallIC, F); StoresIC = std::min(StoresIC, F); @@ -5345,7 +5402,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // Interleave if this is a large loop (small loops are already dealt with by // this point) that could benefit from interleaving. - bool HasReductions = !Legal->getReductionVars()->empty(); + bool HasReductions = !Legal->getReductionVars().empty(); if (TTI.enableAggressiveInterleaving(HasReductions)) { LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); return IC; @@ -5459,11 +5516,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { OpenIntervals.erase(ToRemove); // Ignore instructions that are never used within the loop. - if (Ends.find(I) == Ends.end()) + if (!Ends.count(I)) continue; // Skip ignored values. - if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) + if (ValuesToIgnore.count(I)) continue; // For each VF find the maximum usage of registers. @@ -5483,7 +5540,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { collectUniformsAndScalars(VFs[j]); for (auto Inst : OpenIntervals) { // Skip ignored values for VF > 1. - if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) + if (VecValuesToIgnore.count(Inst)) continue; if (isScalarAfterVectorization(Inst, VFs[j])) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); @@ -5676,9 +5733,11 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { - ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), - true, false); - ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); + ScalarCost += TTI.getScalarizationOverhead( + cast<VectorType>(ToVectorTy(I->getType(), VF)), + APInt::getAllOnesValue(VF), true, false); + ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI, + TTI::TCK_RecipThroughput); } // Compute the scalarization overhead of needed extractelement @@ -5693,7 +5752,8 @@ int LoopVectorizationCostModel::computePredInstDiscount( Worklist.push_back(J); else if (needsExtract(J, VF)) ScalarCost += TTI.getScalarizationOverhead( - ToVectorTy(J->getType(),VF), false, true); + cast<VectorType>(ToVectorTy(J->getType(), VF)), + APInt::getAllOnesValue(VF), false, true); } // Scale the total scalar cost by block probability. @@ -5719,8 +5779,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { // For each instruction in the old loop. for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. 
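To make the scalarization-discount arithmetic of computePredInstDiscount above concrete, a small sketch with invented costs: a predicated chain whose widened form costs 24, whose scalarized form costs 36, and the default reciprocal block probability of 2:

  #include <cstdio>

  int main() {
    unsigned VectorCost = 24; // cost of keeping the chain widened and predicated
    unsigned ScalarCost = 36; // cost of VF scalar copies plus insert/extract/phi overhead
    unsigned ReciprocalPredBlockProb = 2; // the predicated block runs ~50% of the time
    // The scalar copies only execute when the predicate is true, so their cost
    // is scaled down before the comparison.
    ScalarCost /= ReciprocalPredBlockProb;            // 18
    int Discount = (int)VectorCost - (int)ScalarCost; // 6
    printf("discount=%d -> %s\n", Discount,
           Discount >= 0 ? "scalarize the chain" : "keep it widened");
  }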
- if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || - (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) + if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I))) continue; VectorizationCostTy C = getInstructionCost(&I, VF); @@ -5806,9 +5865,10 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. - const MaybeAlign Alignment = getLoadStoreAlignment(I); + const Align Alignment = getLoadStoreAlignment(I); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment, AS); + Alignment, AS, + TTI::TCK_RecipThroughput); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. @@ -5832,20 +5892,22 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); - Type *VectorTy = ToVectorTy(ValTy, VF); + auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getLoadStoreAddressSpace(I); int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Stride should be 1 or -1 for consecutive memory access"); - const MaybeAlign Alignment = getLoadStoreAlignment(I); + const Align Alignment = getLoadStoreAlignment(I); unsigned Cost = 0; if (Legal->isMaskRequired(I)) - Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, - Alignment ? Alignment->value() : 0, AS); + Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, + CostKind); else - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, + CostKind, I); bool Reverse = ConsecutiveStride < 0; if (Reverse) @@ -5856,19 +5918,22 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); - Type *VectorTy = ToVectorTy(ValTy, VF); - const MaybeAlign Alignment = getLoadStoreAlignment(I); + auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); + const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); + enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; if (isa<LoadInst>(I)) { return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + + TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, + CostKind) + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); } StoreInst *SI = cast<StoreInst>(I); bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, + CostKind) + (isLoopInvariantStoreValue ? 
0 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, @@ -5878,27 +5943,27 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); - Type *VectorTy = ToVectorTy(ValTy, VF); - const MaybeAlign Alignment = getLoadStoreAlignment(I); - Value *Ptr = getLoadStorePointerOperand(I); + auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); + const Align Alignment = getLoadStoreAlignment(I); + const Value *Ptr = getLoadStorePointerOperand(I); return TTI.getAddressComputationCost(VectorTy) + - TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, - Legal->isMaskRequired(I), - Alignment ? Alignment->value() : 0); + TTI.getGatherScatterOpCost( + I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, + TargetTransformInfo::TCK_RecipThroughput, I); } unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); - Type *VectorTy = ToVectorTy(ValTy, VF); + auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); auto Group = getInterleavedAccessGroup(I); assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); - Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); + auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor); // Holds the indices of existing members in an interleaved load group. // An interleaved store group doesn't need this as it doesn't allow gaps. @@ -5913,8 +5978,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, bool UseMaskForGaps = Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); unsigned Cost = TTI.getInterleavedMemoryOpCost( - I->getOpcode(), WideVecTy, Group->getFactor(), Indices, - Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); + I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), + AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. @@ -5932,11 +5997,12 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, // moment. 
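A brief sketch of the wide type the interleave-group cost above is computed on, assuming the LLVM headers of this revision and an LLVMContext named Ctx; with VF = 4 and an interleave factor of 2 the whole group is priced as a single <8 x i32> access:

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;

  static VectorType *wideInterleaveType(LLVMContext &Ctx) {
    Type *ValTy = Type::getInt32Ty(Ctx);
    unsigned VF = 4, InterleaveFactor = 2;
    // One wide access covering all members of the group.
    return FixedVectorType::get(ValTy, VF * InterleaveFactor); // <8 x i32>
  }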
if (VF == 1) { Type *ValTy = getMemInstValueType(I); - const MaybeAlign Alignment = getLoadStoreAlignment(I); + const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); + TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, + TTI::TCK_RecipThroughput, I); } return getWideningCost(I, VF); } @@ -5955,7 +6021,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { auto ForcedScalar = ForcedScalars.find(VF); if (VF > 1 && ForcedScalar != ForcedScalars.end()) { auto InstSet = ForcedScalar->second; - if (InstSet.find(I) != InstSet.end()) + if (InstSet.count(I)) return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); } @@ -5977,7 +6043,8 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, Type *RetTy = ToVectorTy(I->getType(), VF); if (!RetTy->isVoidTy() && (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) - Cost += TTI.getScalarizationOverhead(RetTy, true, false); + Cost += TTI.getScalarizationOverhead( + cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false); // Some targets keep addresses scalar. if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) @@ -6157,6 +6224,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); auto SE = PSE.getSE(); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { @@ -6173,21 +6241,20 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, bool ScalarPredicatedBB = false; BranchInst *BI = cast<BranchInst>(I); if (VF > 1 && BI->isConditional() && - (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != - PredicatedBBsAfterVectorization.end() || - PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != - PredicatedBBsAfterVectorization.end())) + (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || + PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { // Return cost for branches around scalarized and predicated blocks. - Type *Vec_i1Ty = - VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); - return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + - (TTI.getCFInstrCost(Instruction::Br) * VF)); + auto *Vec_i1Ty = + FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); + return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), + false, true) + + (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF)); } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) // The back-edge branch will remain, as will all scalar branches. - return TTI.getCFInstrCost(Instruction::Br); + return TTI.getCFInstrCost(Instruction::Br, CostKind); else // This branch will be eliminated by if-conversion. return 0; @@ -6202,7 +6269,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 
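Most TTI queries touched by this patch now pass an explicit cost kind; the vectorizer consistently asks for reciprocal throughput rather than latency or code size. A minimal sketch, assuming the LLVM headers of this revision; the helper is illustrative only:

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  static unsigned branchAndPhiCost(const TargetTransformInfo &TTI) {
    // Other kinds include TCK_Latency and TCK_CodeSize.
    auto Kind = TargetTransformInfo::TCK_RecipThroughput;
    return TTI.getCFInstrCost(Instruction::Br, Kind) +
           TTI.getCFInstrCost(Instruction::PHI, Kind);
  }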
if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - VectorTy, VF - 1, VectorType::get(RetTy, 1)); + cast<VectorType>(VectorTy), VF - 1, + FixedVectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi @@ -6211,9 +6279,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( Instruction::Select, ToVectorTy(Phi->getType(), VF), - ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); + ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), + CostKind); - return TTI.getCFInstrCost(Instruction::PHI); + return TTI.getCFInstrCost(Instruction::PHI, CostKind); } case Instruction::UDiv: case Instruction::SDiv: @@ -6230,10 +6299,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // that we will create. This cost is likely to be zero. The phi node // cost, if any, should be scaled by the block probability because it // models a copy at the end of each predicated block. - Cost += VF * TTI.getCFInstrCost(Instruction::PHI); + Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind); // The cost of the non-predicated instruction. - Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); + Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); // The cost of insertelement and extractelement instructions needed for // scalarization. @@ -6274,13 +6343,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, SmallVector<const Value *, 4> Operands(I->operand_values()); unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; return N * TTI.getArithmeticInstrCost( - I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, + I->getOpcode(), VectorTy, CostKind, + TargetTransformInfo::OK_AnyValue, Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; return N * TTI.getArithmeticInstrCost( - I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, + I->getOpcode(), VectorTy, CostKind, + TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, I->getOperand(0), I); @@ -6291,9 +6362,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) - CondTy = VectorType::get(CondTy, VF); + CondTy = FixedVectorType::get(CondTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, + CostKind, I); } case Instruction::ICmp: case Instruction::FCmp: { @@ -6302,7 +6374,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, + I); } case Instruction::Store: case Instruction::Load: { @@ -6335,7 +6408,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (isOptimizableIVTruncate(I, VF)) { auto *Trunc = cast<TruncInst>(I); return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), - Trunc->getSrcTy(), Trunc); + Trunc->getSrcTy(), CostKind, Trunc); } Type *SrcScalarTy = I->getOperand(0)->getType(); @@ -6361,7 +6434,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; - return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); + return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, + CostKind, I); } case Instruction::Call: { bool NeedToScalarize; @@ -6374,7 +6448,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + + return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, + CostKind) + getScalarizationOverhead(I, VF); } // end of switch. } @@ -6397,6 +6472,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { @@ -6424,14 +6500,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore type-promoting instructions we identified during reduction // detection. - for (auto &Reduction : *Legal->getReductionVars()) { + for (auto &Reduction : Legal->getReductionVars()) { RecurrenceDescriptor &RedDes = Reduction.second; SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } // Ignore type-casting instructions we identified during induction // detection. 
- for (auto &Induction : *Legal->getInductionVars()) { + for (auto &Induction : Legal->getInductionVars()) { InductionDescriptor &IndDes = Induction.second; const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); @@ -6490,9 +6566,10 @@ LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { return VectorizationFactor::Disabled(); } -Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { +Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF, + unsigned UserIC) { assert(OrigLoop->empty() && "Inner loop expected."); - Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); + Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -6503,7 +6580,11 @@ Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { dbgs() << "LV: Invalidate all interleaved groups due to fold-tail by masking " "which requires masked-interleaved support.\n"); - CM.InterleaveInfo.reset(); + if (CM.InterleaveInfo.invalidateGroups()) + // Invalidating interleave groups also requires invalidating all decisions + // based on them, which includes widening decisions and uniform and scalar + // values. + CM.invalidateCostModelingDecisions(); } if (UserVF) { @@ -6563,6 +6644,7 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, &ILV, CallbackILV}; State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); + State.CanonicalIV = ILV.Induction; //===------------------------------------------------===// // @@ -6595,12 +6677,11 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions( // We create new "steps" for induction variable updates to which the original // induction variables map. An original update instruction will be dead if // all its users except the induction variable are dead. - for (auto &Induction : *Legal->getInductionVars()) { + for (auto &Induction : Legal->getInductionVars()) { PHINode *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { - return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != - DeadInstructions.end(); + return U == Ind || DeadInstructions.count(cast<Instruction>(U)); })) DeadInstructions.insert(IndUpdate); @@ -6716,7 +6797,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); assert(BI && "Unexpected terminator found"); - if (!BI->isConditional()) + if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) return EdgeMaskCache[Edge] = SrcMask; VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); @@ -6749,9 +6830,21 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. - VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); + // Start by constructing the desired canonical IV. 
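A standalone illustration of the wrap-around point above, i.e. why the header mask compares IV <= BTC rather than IV < TC, using a deliberately narrow 8-bit induction type (the concrete trip count is made up):

  #include <cstdint>
  #include <cstdio>

  int main() {
    // A loop of 256 iterations with an i8 induction: the trip count wraps to 0
    // in 8 bits, but the backedge-taken count (255) does not.
    uint8_t TC = 0;    // 256 truncated to 8 bits
    uint8_t BTC = 255; // trip count - 1
    for (unsigned IV = 0; IV < 4; ++IV) {
      bool ActiveWithTC = (uint8_t)IV < TC;    // always false: wrong
      bool ActiveWithBTC = (uint8_t)IV <= BTC; // true: correct
      printf("lane %u: IV<TC=%d IV<=BTC=%d\n", IV, ActiveWithTC, ActiveWithBTC);
    }
  }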
+ VPValue *IV = nullptr; + if (Legal->getPrimaryInduction()) + IV = Plan->getVPValue(Legal->getPrimaryInduction()); + else { + auto IVRecipe = new VPWidenCanonicalIVRecipe(); + Builder.getInsertBlock()->appendRecipe(IVRecipe); + IV = IVRecipe->getVPValue(); + } VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); + bool TailFolded = !CM.isScalarEpilogueAllowed(); + if (TailFolded && CM.TTI.emitGetActiveLaneMask()) + BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC}); + else + BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); return BlockMaskCache[BB] = BlockMask; } @@ -6775,8 +6868,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { VPWidenMemoryInstructionRecipe * VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan) { - if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) - return nullptr; + assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && + "Must be called with either a load or store"); auto willWiden = [&](unsigned VF) -> bool { if (VF == 1) @@ -6801,22 +6894,29 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, Mask = createBlockInMask(I->getParent(), Plan); VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); - return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask); + if (LoadInst *Load = dyn_cast<LoadInst>(I)) + return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); + + StoreInst *Store = cast<StoreInst>(I); + VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); + return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); } VPWidenIntOrFpInductionRecipe * -VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { - if (PHINode *Phi = dyn_cast<PHINode>(I)) { - // Check if this is an integer or fp induction. If so, build the recipe that - // produces its scalar and vector values. - InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); - if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) - return new VPWidenIntOrFpInductionRecipe(Phi); +VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { + // Check if this is an integer or fp induction. If so, build the recipe that + // produces its scalar and vector values. + InductionDescriptor II = Legal->getInductionVars().lookup(Phi); + if (II.getKind() == InductionDescriptor::IK_IntInduction || + II.getKind() == InductionDescriptor::IK_FpInduction) + return new VPWidenIntOrFpInductionRecipe(Phi); - return nullptr; - } + return nullptr; +} +VPWidenIntOrFpInductionRecipe * +VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, + VFRange &Range) const { // Optimize the special case where the source is a constant integer // induction variable. 
Notice that we can only optimize the 'trunc' case // because (a) FP conversions lose precision, (b) sext/zext may wrap, and @@ -6830,54 +6930,89 @@ VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; }; - if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( - isOptimizableIVTruncate(I), Range)) + if (LoopVectorizationPlanner::getDecisionAndClampRange( + isOptimizableIVTruncate(I), Range)) return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), - cast<TruncInst>(I)); + I); return nullptr; } -VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { - PHINode *Phi = dyn_cast<PHINode>(I); - if (!Phi || Phi->getParent() == OrigLoop->getHeader()) - return nullptr; - +VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { // We know that all PHIs in non-header blocks are converted into selects, so // we don't have to worry about the insertion order and we can just use the // builder. At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. - SmallVector<VPValue *, 2> Masks; + SmallVector<VPValue *, 2> Operands; unsigned NumIncoming = Phi->getNumIncomingValues(); for (unsigned In = 0; In < NumIncoming; In++) { VPValue *EdgeMask = createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); assert((EdgeMask || NumIncoming == 1) && "Multiple predecessors with one having a full mask"); + Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); if (EdgeMask) - Masks.push_back(EdgeMask); + Operands.push_back(EdgeMask); } - return new VPBlendRecipe(Phi, Masks); + return new VPBlendRecipe(Phi, Operands); } -bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, - VFRange &Range) { +VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, + VPlan &Plan) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); + [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, + Range); if (IsPredicated) - return false; + return nullptr; + + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || + ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) + return nullptr; + + auto willWiden = [&](unsigned VF) -> bool { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + // The following case may be scalarized depending on the VF. + // The flag shows whether we use Intrinsic or a usual Call for vectorized + // version of the instruction. + // Is it beneficial to perform intrinsic call compared to lib call? 
+ bool NeedToScalarize = false; + unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); + bool UseVectorIntrinsic = + ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; + return UseVectorIntrinsic || !NeedToScalarize; + }; + + if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) + return nullptr; + + return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); +} +bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { + assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && + !isa<StoreInst>(I) && "Instruction should have been handled earlier"); + // Instruction should be widened, unless it is scalar after vectorization, + // scalarization is profitable or it is predicated. + auto WillScalarize = [this, I](unsigned VF) -> bool { + return CM.isScalarAfterVectorization(I, VF) || + CM.isProfitableToScalarize(I, VF) || + CM.isScalarWithPredication(I, VF); + }; + return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, + Range); +} + +VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { auto IsVectorizableOpcode = [](unsigned Opcode) { switch (Opcode) { case Instruction::Add: case Instruction::And: case Instruction::AShr: case Instruction::BitCast: - case Instruction::Br: - case Instruction::Call: case Instruction::FAdd: case Instruction::FCmp: case Instruction::FDiv: @@ -6891,11 +7026,9 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, case Instruction::FSub: case Instruction::ICmp: case Instruction::IntToPtr: - case Instruction::Load: case Instruction::LShr: case Instruction::Mul: case Instruction::Or: - case Instruction::PHI: case Instruction::PtrToInt: case Instruction::SDiv: case Instruction::Select: @@ -6903,7 +7036,6 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, case Instruction::Shl: case Instruction::SIToFP: case Instruction::SRem: - case Instruction::Store: case Instruction::Sub: case Instruction::Trunc: case Instruction::UDiv: @@ -6917,60 +7049,10 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, }; if (!IsVectorizableOpcode(I->getOpcode())) - return false; - - if (CallInst *CI = dyn_cast<CallInst>(I)) { - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || - ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) - return false; - } - - auto willWiden = [&](unsigned VF) -> bool { - if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF))) - return false; - if (CallInst *CI = dyn_cast<CallInst>(I)) { - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - // The following case may be scalarized depending on the VF. - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? 
- bool NeedToScalarize; - unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); - bool UseVectorIntrinsic = - ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; - return UseVectorIntrinsic || !NeedToScalarize; - } - if (isa<LoadInst>(I) || isa<StoreInst>(I)) { - assert(CM.getWideningDecision(I, VF) == - LoopVectorizationCostModel::CM_Scalarize && - "Memory widening decisions should have been taken care by now"); - return false; - } - return true; - }; - - if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) - return false; - // If this ingredient's recipe is to be recorded, keep its recipe a singleton - // to avoid having to split recipes later. - bool IsSingleton = Ingredient2Recipe.count(I); + return nullptr; // Success: widen this instruction. - - // Use the default widening recipe. We optimize the common case where - // consecutive instructions can be represented by a single recipe. - if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && - LastExtensibleRecipe->appendInstruction(I)) - return true; - - VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); - if (!IsSingleton) - LastExtensibleRecipe = WidenRecipe; - setRecipe(I, WidenRecipe); - VPBB->appendRecipe(WidenRecipe); - return true; + return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); } VPBasicBlock *VPRecipeBuilder::handleReplication( @@ -6984,7 +7066,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); - auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); + auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), + IsUniform, IsPredicated); setRecipe(I, Recipe); // Find if I uses a predicated instruction. If so, it will use its scalar @@ -7041,43 +7124,45 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, return Region; } -bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, - VPlanPtr &Plan, VPBasicBlock *VPBB) { - VPRecipeBase *Recipe = nullptr; - - // First, check for specific widening recipes that deal with memory +VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, + VFRange &Range, + VPlanPtr &Plan) { + // First, check for specific widening recipes that deal with calls, memory // operations, inductions and Phi nodes. - if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || - (Recipe = tryToOptimizeInduction(Instr, Range)) || - (Recipe = tryToBlend(Instr, Plan)) || - (isa<PHINode>(Instr) && - (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { - setRecipe(Instr, Recipe); - VPBB->appendRecipe(Recipe); - return true; - } + if (auto *CI = dyn_cast<CallInst>(Instr)) + return tryToWidenCall(CI, Range, *Plan); - // Handle GEP widening. 
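// shouldWiden and the other helpers above all funnel their per-VF predicates
// through LoopVectorizationPlanner::getDecisionAndClampRange, which takes one
// decision per VF sub-range. A standalone sketch of that clamping idea
// (simplified range type, power-of-two VFs assumed; illustrative, not the
// LLVM implementation):
#include <functional>
struct SimpleVFRange { unsigned Start, End; }; // half-open [Start, End)
static bool decideAndClamp(const std::function<bool(unsigned)> &Predicate,
                           SimpleVFRange &Range) {
  bool Decision = Predicate(Range.Start);
  // Shrink the range so the decision taken at Start holds for every VF in it.
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != Decision) {
      Range.End = VF;
      break;
    }
  return Decision;
}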
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { - auto Scalarize = [&](unsigned VF) { - return CM.isScalarWithPredication(Instr, VF) || - CM.isScalarAfterVectorization(Instr, VF) || - CM.isProfitableToScalarize(Instr, VF); - }; - if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) - return false; - VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); - setRecipe(Instr, Recipe); - VPBB->appendRecipe(Recipe); - return true; + if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) + return tryToWidenMemory(Instr, Range, Plan); + + VPRecipeBase *Recipe; + if (auto Phi = dyn_cast<PHINode>(Instr)) { + if (Phi->getParent() != OrigLoop->getHeader()) + return tryToBlend(Phi, Plan); + if ((Recipe = tryToOptimizeInductionPHI(Phi))) + return Recipe; + return new VPWidenPHIRecipe(Phi); } - // Check if Instr is to be widened by a general VPWidenRecipe, after - // having first checked for specific widening recipes. - if (tryToWiden(Instr, VPBB, Range)) - return true; + if (isa<TruncInst>(Instr) && + (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) + return Recipe; - return false; + if (!shouldWiden(Instr, Range)) + return nullptr; + + if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) + return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), + OrigLoop); + + if (auto *SI = dyn_cast<SelectInst>(Instr)) { + bool InvariantCond = + PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); + return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), + InvariantCond); + } + + return tryToWiden(Instr, *Plan); } void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, @@ -7097,13 +7182,14 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, NeedDef.insert(Branch->getCondition()); } - // If the tail is to be folded by masking, the primary induction variable - // needs to be represented in VPlan for it to model early-exit masking. + // If the tail is to be folded by masking, the primary induction variable, if + // exists needs to be represented in VPlan for it to model early-exit masking. // Also, both the Phi and the live-out instruction of each reduction are // required in order to introduce a select between them in VPlan. if (CM.foldTailByMasking()) { - NeedDef.insert(Legal->getPrimaryInduction()); - for (auto &Reduction : *Legal->getReductionVars()) { + if (Legal->getPrimaryInduction()) + NeedDef.insert(Legal->getPrimaryInduction()); + for (auto &Reduction : Legal->getReductionVars()) { NeedDef.insert(Reduction.first); NeedDef.insert(Reduction.second.getLoopExitInstr()); } @@ -7118,28 +7204,39 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, SmallPtrSet<Instruction *, 4> DeadInstructions; collectTriviallyDeadInstructions(DeadInstructions); + // Add assume instructions we need to drop to DeadInstructions, to prevent + // them from being added to the VPlan. + // TODO: We only need to drop assumes in blocks that get flattend. If the + // control flow is preserved, we should keep them. + auto &ConditionalAssumes = Legal->getConditionalAssumes(); + DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); + + DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); + // Dead instructions do not need sinking. Remove them from SinkAfter. 
+ for (Instruction *I : DeadInstructions) + SinkAfter.erase(I); + for (unsigned VF = MinVF; VF < MaxVF + 1;) { VFRange SubRange = {VF, MaxVF + 1}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions)); + VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, + DeadInstructions, SinkAfter)); VF = SubRange.End; } } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, - SmallPtrSetImpl<Instruction *> &DeadInstructions) { + SmallPtrSetImpl<Instruction *> &DeadInstructions, + const DenseMap<Instruction *, Instruction *> &SinkAfter) { // Hold a mapping from predicated instructions to their recipes, in order to // fix their AlsoPack behavior if a user is determined to replicate and use a // scalar instead of vector value. DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; - DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); - SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; - VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); + VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); // --------------------------------------------------------------------------- // Pre-construction: record ingredients whose recipes we'll need to further @@ -7177,8 +7274,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // --------------------------------------------------------------------------- // Create a dummy pre-entry VPBasicBlock to start building the VPlan. + auto Plan = std::make_unique<VPlan>(); VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); - auto Plan = std::make_unique<VPlan>(VPBB); + Plan->setEntry(VPBB); // Represent values that will have defs inside VPlan. for (Value *V : NeedDef) @@ -7199,17 +7297,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. + // TODO: Model and preserve debug instrinsics in VPlan. for (Instruction &I : BB->instructionsWithoutDebug()) { Instruction *Instr = &I; // First filter out irrelevant instructions, to ensure no recipes are // built for them. - if (isa<BranchInst>(Instr) || - DeadInstructions.find(Instr) != DeadInstructions.end()) + if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) continue; - if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) + if (auto Recipe = + RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { + RecipeBuilder.setRecipe(Instr, Recipe); + VPBB->appendRecipe(Recipe); continue; + } // Otherwise, if all widening options failed, Instruction is to be // replicated. This may create a successor for VPBB. 
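// The VF loop above builds one VPlan per sub-range: each call to
// buildVPlanWithVPRecipes may shrink SubRange.End while making recipe
// decisions, and the next iteration resumes from that clamped end. A
// standalone sketch of that outer walk (illustrative; clampEnd stands in for
// the plan construction):
#include <vector>
struct PlanRange { unsigned Start, End; }; // half-open [Start, End)
static std::vector<PlanRange> planAllRanges(unsigned MinVF, unsigned MaxVF,
                                            unsigned (*clampEnd)(PlanRange)) {
  std::vector<PlanRange> Plans;
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    PlanRange SubRange = {VF, MaxVF + 1};
    SubRange.End = clampEnd(SubRange); // recipe decisions may shrink the range
    Plans.push_back(SubRange);
    VF = SubRange.End;                 // continue after the covered sub-range
  }
  return Plans;
}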
@@ -7264,7 +7366,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( if (CM.foldTailByMasking()) { Builder.setInsertPoint(VPBB); auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); - for (auto &Reduction : *Legal->getReductionVars()) { + for (auto &Reduction : Legal->getReductionVars()) { VPValue *Phi = Plan->getVPValue(Reduction.first); VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); @@ -7330,32 +7432,37 @@ Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( return ILV.getOrCreateScalarValue(V, Instance); } -void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { - O << " +\n" - << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; +void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); O << ", "; - getAddr()->printAsOperand(O); + getAddr()->printAsOperand(O, SlotTracker); VPValue *Mask = getMask(); if (Mask) { O << ", "; - Mask->printAsOperand(O); + Mask->printAsOperand(O, SlotTracker); } - O << "\\l\""; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) - O << " +\n" - << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; + O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; +} + +void VPWidenCallRecipe::execute(VPTransformState &State) { + State.ILV->widenCallInstruction(Ingredient, User, State); +} + +void VPWidenSelectRecipe::execute(VPTransformState &State) { + State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State); } void VPWidenRecipe::execute(VPTransformState &State) { - for (auto &Instr : make_range(Begin, End)) - State.ILV->widenInstruction(Instr); + State.ILV->widenInstruction(Ingredient, User, State); } void VPWidenGEPRecipe::execute(VPTransformState &State) { - State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, - IsIndexLoopInvariant); + State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant, + IsIndexLoopInvariant, State); } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { @@ -7376,27 +7483,27 @@ void VPBlendRecipe::execute(VPTransformState &State) { // duplications since this is a simple recursive scan, but future // optimizations will clean it up. - unsigned NumIncoming = Phi->getNumIncomingValues(); + unsigned NumIncoming = getNumIncomingValues(); - assert((User || NumIncoming == 1) && - "Multiple predecessors with predecessors having a full mask"); // Generate a sequence of selects of the form: // SELECT(Mask3, In3, - // SELECT(Mask2, In2, - // ( ...))) + // SELECT(Mask2, In2, + // SELECT(Mask1, In1, + // In0))) + // Note that Mask0 is never used: lanes for which no path reaches this phi and + // are essentially undef are taken from In0. InnerLoopVectorizer::VectorParts Entry(State.UF); for (unsigned In = 0; In < NumIncoming; ++In) { for (unsigned Part = 0; Part < State.UF; ++Part) { // We might have single edge PHIs (blocks) - use an identity // 'select' for the first PHI operand. - Value *In0 = - State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); + Value *In0 = State.get(getIncomingValue(In), Part); if (In == 0) Entry[Part] = In0; // Initialize with the first incoming value. 
else { // Select between the current value and the previous incoming edge // based on the incoming mask. - Value *Cond = State.get(User->getOperand(In), Part); + Value *Cond = State.get(getMask(In), Part); Entry[Part] = State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); } @@ -7408,19 +7515,19 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(), - getMask()); + State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); } void VPReplicateRecipe::execute(VPTransformState &State) { if (State.Instance) { // Generate a single instance. - State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated); + State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, + IsPredicated, State); // Insert scalar instance packing it into a vector. if (AlsoPack && State.VF > 1) { // If we're constructing lane 0, initialize to start from undef. if (State.Instance->Lane == 0) { - Value *Undef = - UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); + Value *Undef = UndefValue::get( + FixedVectorType::get(Ingredient->getType(), State.VF)); State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); } State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); @@ -7434,7 +7541,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) { unsigned EndLane = IsUniform ? 1 : State.VF; for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated); + State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane}, + IsPredicated, State); } void VPBranchOnMaskRecipe::execute(VPTransformState &State) { @@ -7444,15 +7552,14 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) { unsigned Lane = State.Instance->Lane; Value *ConditionBit = nullptr; - if (!User) // Block in mask is all-one. - ConditionBit = State.Builder.getTrue(); - else { - VPValue *BlockInMask = User->getOperand(0); + VPValue *BlockInMask = getMask(); + if (BlockInMask) { ConditionBit = State.get(BlockInMask, Part); if (ConditionBit->getType()->isVectorTy()) ConditionBit = State.Builder.CreateExtractElement( ConditionBit, State.Builder.getInt32(Lane)); - } + } else // Block in mask is all-one. + ConditionBit = State.Builder.getTrue(); // Replace the temporary unreachable terminator with a new conditional branch, // whose two destinations will be set later when they are created. @@ -7496,7 +7603,9 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask()); + VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; + State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, + getMask()); } // Determine how to lower the scalar epilogue, which depends on 1) optimising @@ -7513,16 +7622,15 @@ static ScalarEpilogueLowering getScalarEpilogueLowering( PGSOQueryType::IRPass); // 1) OptSize takes precedence over all other options, i.e. if this is set, // don't look at hints or options, and don't request a scalar epilogue. 
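// VPBlendRecipe::execute above folds a multi-predecessor phi into the chain
// SELECT(Mask3, In3, SELECT(Mask2, In2, SELECT(Mask1, In1, In0))), where the
// mask of the first edge is never consulted. A scalar standalone sketch of
// that fold (illustrative; the real code builds vector selects per unroll
// part):
#include <vector>
static int blendIncomingValues(const std::vector<int> &Incoming,
                               const std::vector<bool> &Mask) {
  // Lanes reached by no edge keep the value of the first incoming operand.
  int Entry = Incoming[0];
  for (size_t In = 1; In < Incoming.size(); ++In)
    Entry = Mask[In] ? Incoming[In] : Entry; // "select" between edges
  return Entry;
}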
- if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) + if (OptSize) return CM_ScalarEpilogueNotAllowedOptSize; bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && !PreferPredicateOverEpilog; // 2) Next, if disabling predication is requested on the command line, honour - // this and request a scalar epilogue. Also do this if we don't have a - // primary induction variable, which is required for predication. - if (PredicateOptDisabled || !LVL.getPrimaryInduction()) + // this and request a scalar epilogue. + if (PredicateOptDisabled) return CM_ScalarEpilogueAllowed; // 3) and 4) look if enabling predication is requested on the command line, @@ -7549,6 +7657,10 @@ static bool processLoopInVPlanNativePath( OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { + if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) { + LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); + return false; + } assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); @@ -7561,7 +7673,7 @@ static bool processLoopInVPlanNativePath( // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); // Get user vectorization factor. const unsigned UserVF = Hints.getWidth(); @@ -7587,10 +7699,16 @@ static bool processLoopInVPlanNativePath( // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(); - LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); + assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } +LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) + : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || + !EnableLoopInterleaving), + VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || + !EnableLoopVectorization) {} + bool LoopVectorizePass::processLoop(Loop *L) { assert((EnableVPlanNativePath || L->empty()) && "VPlan-native path is not enabled. Only process inner loops."); @@ -7720,17 +7838,17 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); - // Get user vectorization factor. + // Get user vectorization factor and interleave count. unsigned UserVF = Hints.getWidth(); + unsigned UserIC = Hints.getInterleave(); // Plan how to best vectorize, return the best VF and its cost. 
- Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); + Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; - unsigned UserIC = Hints.getInterleave(); if (MaybeVF) { VF = *MaybeVF; @@ -7883,14 +8001,14 @@ bool LoopVectorizePass::processLoop(Loop *L) { Hints.setAlreadyVectorized(); } - LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); + assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } -bool LoopVectorizePass::runImpl( +LoopVectorizeResult LoopVectorizePass::runImpl( Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, - DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, + DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { SE = &SE_; @@ -7915,9 +8033,9 @@ bool LoopVectorizePass::runImpl( // interleaving. if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && TTI->getMaxInterleaveFactor(1) < 2) - return false; + return LoopVectorizeResult(false, false); - bool Changed = false; + bool Changed = false, CFGChanged = false; // The vectorizer requires loops to be in simplified form. // Since simplification may add new inner loops, it has to run before the @@ -7925,7 +8043,7 @@ bool LoopVectorizePass::runImpl( // will simplify all loops, regardless of whether anything end up being // vectorized. for (auto &L : *LI) - Changed |= + Changed |= CFGChanged |= simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); // Build up a worklist of inner-loops to vectorize. This is necessary as @@ -7946,11 +8064,11 @@ bool LoopVectorizePass::runImpl( // transform. Changed |= formLCSSARecursively(*L, *DT, LI, SE); - Changed |= processLoop(L); + Changed |= CFGChanged |= processLoop(L); } // Process each loop nest in the function. 
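// runImpl above now tracks two bits: whether anything changed at all and
// whether the CFG changed. The chained form `Changed |= CFGChanged |= f()`
// evaluates f() once, ORs it into the CFG flag, and ORs the updated CFG flag
// into the overall flag as well. A standalone sketch of the result type and
// that accumulation (illustrative names, not the LLVM types):
struct SimpleVectorizeResult {
  bool MadeAnyChange;
  bool MadeCFGChange;
};
static SimpleVectorizeResult runTwoSteps(bool (*cfgAffectingStep)(),
                                         bool (*nonCFGStep)()) {
  bool Changed = false, CFGChanged = false;
  Changed |= CFGChanged |= cfgAffectingStep(); // sets both flags if it fires
  Changed |= nonCFGStep();                     // sets only the overall flag
  return {Changed, CFGChanged};
}
// A caller can then keep CFG analyses alive whenever MadeCFGChange is false.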
- return Changed; + return LoopVectorizeResult(Changed, CFGChanged); } PreservedAnalyses LoopVectorizePass::run(Function &F, @@ -7975,13 +8093,12 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }; - const ModuleAnalysisManager &MAM = - AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); + auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); ProfileSummaryInfo *PSI = - MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); - bool Changed = + MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); + LoopVectorizeResult Result = runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); - if (!Changed) + if (!Result.MadeAnyChange) return PreservedAnalyses::all(); PreservedAnalyses PA; @@ -7995,5 +8112,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, } PA.preserve<BasicAA>(); PA.preserve<GlobalsAA>(); + if (!Result.MadeCFGChange) + PA.preserveSet<CFGAnalyses>(); return PA; } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index aabd974cd73e4..5bc35aa4695f8 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -47,6 +47,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -85,6 +86,7 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/InjectTLIMappings.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> @@ -107,9 +109,8 @@ using namespace slpvectorizer; STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); -cl::opt<bool> - llvm::RunSLPVectorization("vectorize-slp", cl::init(false), cl::Hidden, - cl::desc("Run the SLP vectorization passes")); +cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, + cl::desc("Run the SLP vectorization passes")); static cl::opt<int> SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, @@ -284,7 +285,7 @@ static bool isCommutative(Instruction *I) { static Optional<TargetTransformInfo::ShuffleKind> isShuffle(ArrayRef<Value *> VL) { auto *EI0 = cast<ExtractElementInst>(VL[0]); - unsigned Size = EI0->getVectorOperandType()->getVectorNumElements(); + unsigned Size = EI0->getVectorOperandType()->getNumElements(); Value *Vec1 = nullptr; Value *Vec2 = nullptr; enum ShuffleMode { Unknown, Select, Permute }; @@ -293,7 +294,7 @@ isShuffle(ArrayRef<Value *> VL) { auto *EI = cast<ExtractElementInst>(VL[I]); auto *Vec = EI->getVectorOperand(); // All vector operands must have the same number of vector elements. - if (Vec->getType()->getVectorNumElements() != Size) + if (cast<VectorType>(Vec->getType())->getNumElements() != Size) return None; auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); if (!Idx) @@ -377,6 +378,18 @@ static Value *isOneOf(const InstructionsState &S, Value *Op) { return S.OpValue; } +/// \returns true if \p Opcode is allowed as part of of the main/alternate +/// instruction for SLP vectorization. 
+/// +/// Example of unsupported opcode is SDIV that can potentially cause UB if the +/// "shuffled out" lane would result in division by zero. +static bool isValidForAlternation(unsigned Opcode) { + if (Instruction::isIntDivRem(Opcode)) + return false; + + return true; +} + /// \returns analysis of the Instructions in \p VL described in /// InstructionsState, the Opcode that we suppose the whole list /// could be vectorized even if its structure is diverse. @@ -399,7 +412,8 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; - if (Opcode == AltOpcode) { + if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) && + isValidForAlternation(Opcode)) { AltOpcode = InstOpcode; AltIndex = Cnt; continue; @@ -411,6 +425,9 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; if (Opcode == AltOpcode) { + assert(isValidForAlternation(Opcode) && + isValidForAlternation(InstOpcode) && + "Cast isn't safe for alternation, logic needs to be updated!"); AltOpcode = InstOpcode; AltIndex = Cnt; continue; @@ -613,7 +630,7 @@ public: /// the stored value. Otherwise, the size is the width of the largest loaded /// value reaching V. This method is used by the vectorizer to calculate /// vectorization factors. - unsigned getVectorElementSize(Value *V) const; + unsigned getVectorElementSize(Value *V); /// Compute the minimum type sizes required to represent the entries in a /// vectorizable tree. @@ -650,6 +667,15 @@ public: /// may not be necessary. bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const; + /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values + /// can be load combined in the backend. Load combining may not be allowed in + /// the IR optimizer, so we do not want to alter the pattern. For example, + /// partially transforming a scalar bswap() pattern into vector code is + /// effectively impossible for the backend to undo. + /// TODO: If load combining is allowed in the IR optimizer, this analysis + /// may not be necessary. + bool isLoadCombineCandidate() const; + OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -816,13 +842,12 @@ public: // Extracts from consecutive indexes of the same vector better score as // the extracts could be optimized away. 
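// The scoring rule described in the comment above rewards two extractelements
// that read consecutive lanes of the same source vector, since such a pair
// can usually be folded away after vectorization. A standalone sketch of the
// check (illustrative struct; the real code pattern-matches the IR below):
struct ExtractLane {
  const void *SourceVector; // identity of the vector operand
  unsigned Index;           // constant lane index
};
static bool isConsecutiveExtractPair(const ExtractLane &A,
                                     const ExtractLane &B) {
  return A.SourceVector == B.SourceVector && A.Index + 1 == B.Index;
}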
- auto *Ex1 = dyn_cast<ExtractElementInst>(V1); - auto *Ex2 = dyn_cast<ExtractElementInst>(V2); - if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() && - cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 == - cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) { + Value *EV; + ConstantInt *Ex1Idx, *Ex2Idx; + if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) && + match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) && + Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue()) return VLOperands::ScoreConsecutiveExtracts; - } auto *I1 = dyn_cast<Instruction>(V1); auto *I2 = dyn_cast<Instruction>(V2); @@ -852,7 +877,7 @@ public: int getExternalUsesCost(const std::pair<Value *, int> &LHS, const std::pair<Value *, int> &RHS) { int Cost = 0; - SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS}; + std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}}; for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { Value *V = Values[Idx].first; // Calculate the absolute lane, using the minimum relative lane of LHS @@ -1385,7 +1410,8 @@ private: /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. - int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices) const; + int getGatherCost(VectorType *Ty, + const DenseSet<unsigned> &ShuffledIndices) const; /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -1422,7 +1448,7 @@ private: return VL.size() == ReuseShuffleIndices.size() && std::equal( VL.begin(), VL.end(), ReuseShuffleIndices.begin(), - [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; }); + [this](Value *V, int Idx) { return V == Scalars[Idx]; }); } /// A vector of scalars. @@ -1436,7 +1462,7 @@ private: EntryState State; /// Does this sequence require some shuffling? - SmallVector<unsigned, 4> ReuseShuffleIndices; + SmallVector<int, 4> ReuseShuffleIndices; /// Does this entry require reordering? ArrayRef<unsigned> ReorderIndices; @@ -1690,6 +1716,9 @@ private: /// Maps a specific scalar to its tree entry. SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry; + /// Maps a value to the proposed vectorizable size. + SmallDenseMap<Value *, unsigned> InstrElementSize; + /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -2001,6 +2030,20 @@ private: if (TreeEntry *TE = BundleMember->TE) { int Lane = BundleMember->Lane; assert(Lane >= 0 && "Lane not set"); + + // Since vectorization tree is being built recursively this assertion + // ensures that the tree entry has all operands set before reaching + // this code. Couple of exceptions known at the moment are extracts + // where their second (immediate) operand is not added. Since + // immediates do not affect scheduler behavior this is considered + // okay. 
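// A tree entry that carries reuse-shuffle indices represents its scalars
// through an index vector: a candidate list VL matches the entry iff
// VL[i] == Scalars[ReuseShuffleIndices[i]] for every position i, which is
// what the std::equal-based check earlier in this file expresses. Standalone
// sketch over plain integers (illustrative only):
#include <vector>
static bool matchesUnderReuseIndices(const std::vector<int> &VL,
                                     const std::vector<int> &Scalars,
                                     const std::vector<int> &ReuseIndices) {
  if (VL.size() != ReuseIndices.size())
    return false;
  for (size_t I = 0; I < VL.size(); ++I)
    if (VL[I] != Scalars[ReuseIndices[I]])
      return false;
  return true;
}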
+ auto *In = TE->getMainOp(); + assert(In && + (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) || + In->getNumOperands() == TE->getNumOperands()) && + "Missed TreeEntry operands?"); + (void)In; // fake use to avoid build failure when assertions disabled + for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); OpIdx != NumOperands; ++OpIdx) if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane])) @@ -2323,6 +2366,7 @@ BoUpSLP::~BoUpSLP() { "trying to erase instruction with users."); Pair.getFirst()->eraseFromParent(); } + assert(!verifyFunction(*F, &dbgs())); } void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { @@ -2978,19 +3022,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } case Instruction::Call: { - // Check if the calls are all to the same vectorizable intrinsic. + // Check if the calls are all to the same vectorizable intrinsic or + // library function. CallInst *CI = cast<CallInst>(VL0); - // Check if this is an Intrinsic call or something that can be - // represented by an intrinsic call Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (!isTriviallyVectorizable(ID)) { + + VFShape Shape = VFShape::get( + *CI, {static_cast<unsigned int>(VL.size()), false /*Scalable*/}, + false /*HasGlobalPred*/); + Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); + + if (!VecFunc && !isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } - Function *Int = CI->getCalledFunction(); + Function *F = CI->getCalledFunction(); unsigned NumArgs = CI->getNumArgOperands(); SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr); for (unsigned j = 0; j != NumArgs; ++j) @@ -2998,8 +3047,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, ScalarArgs[j] = CI->getArgOperand(j); for (Value *V : VL) { CallInst *CI2 = dyn_cast<CallInst>(V); - if (!CI2 || CI2->getCalledFunction() != Int || + if (!CI2 || CI2->getCalledFunction() != F || getVectorIntrinsicIDForCall(CI2, TLI) != ID || + (VecFunc && + VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, @@ -3101,7 +3152,8 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { unsigned N = 1; Type *EltTy = T; - while (isa<CompositeType>(EltTy)) { + while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) || + isa<VectorType>(EltTy)) { if (auto *ST = dyn_cast<StructType>(EltTy)) { // Check that struct is homogeneous. 
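// canMapToVector above flattens nested aggregates by multiplying element
// counts until a scalar element type is reached, e.g. [4 x <2 x i32>] maps to
// N = 8 elements of i32. A standalone sketch with a toy type model
// (illustrative; the homogeneity and legality checks of the real code are
// omitted):
struct ToyType {
  unsigned NumElements;   // 0 marks a scalar leaf
  const ToyType *Element; // element type when NumElements != 0
};
static unsigned flattenedElementCount(const ToyType *T) {
  unsigned N = 1;
  while (T->NumElements != 0) { // struct, array or vector level
    N *= T->NumElements;
    T = T->Element;
  }
  return N;
}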
for (const auto *Ty : ST->elements()) @@ -3109,16 +3161,19 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { return 0; N *= ST->getNumElements(); EltTy = *ST->element_begin(); + } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) { + N *= AT->getNumElements(); + EltTy = AT->getElementType(); } else { - auto *SeqT = cast<SequentialType>(EltTy); - N *= SeqT->getNumElements(); - EltTy = SeqT->getElementType(); + auto *VT = cast<VectorType>(EltTy); + N *= VT->getNumElements(); + EltTy = VT->getElementType(); } } if (!isValidElementType(EltTy)) return 0; - uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N)); + uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N)); if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T)) return 0; return N; @@ -3148,7 +3203,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) return false; } else { - NElts = Vec->getType()->getVectorNumElements(); + NElts = cast<VectorType>(Vec->getType())->getNumElements(); } if (NElts != VL.size()) @@ -3198,6 +3253,35 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { }); } +static std::pair<unsigned, unsigned> +getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI, + TargetLibraryInfo *TLI) { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + + // Calculate the cost of the scalar and vector calls. + IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getNumElements()); + int IntrinsicCost = + TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput); + + auto Shape = + VFShape::get(*CI, {static_cast<unsigned>(VecTy->getNumElements()), false}, + false /*HasGlobalPred*/); + Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); + int LibCost = IntrinsicCost; + if (!CI->isNoBuiltin() && VecFunc) { + // Calculate the cost of the vector library call. + SmallVector<Type *, 4> VecTys; + for (Use &Arg : CI->args()) + VecTys.push_back( + FixedVectorType::get(Arg->getType(), VecTy->getNumElements())); + + // If the corresponding vector call is cheaper, return its cost. + LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys, + TTI::TCK_RecipThroughput); + } + return {IntrinsicCost, LibCost}; +} + int BoUpSLP::getEntryCost(TreeEntry *E) { ArrayRef<Value*> VL = E->Scalars; @@ -3206,12 +3290,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ScalarTy = SI->getValueOperand()->getType(); else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0])) ScalarTy = CI->getOperand(0)->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; // If we have computed a smaller type for the expression, update VecTy so // that the costs will be accurate. 
if (MinBWs.count(VL[0])) - VecTy = VectorType::get( + VecTy = FixedVectorType::get( IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); @@ -3251,6 +3336,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return ReuseShuffleCost + getGatherCost(VL); } + assert(E->State == TreeEntry::Vectorize && "Unhandled state"); assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = @@ -3260,7 +3346,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return 0; case Instruction::ExtractValue: - case Instruction::ExtractElement: + case Instruction::ExtractElement: { if (NeedToShuffleReuses) { unsigned Idx = 0; for (unsigned I : E->ReuseShuffleIndices) { @@ -3289,43 +3375,41 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } - if (E->State == TreeEntry::Vectorize) { - int DeadCost = ReuseShuffleCost; - if (!E->ReorderIndices.empty()) { - // TODO: Merge this shuffle with the ReuseShuffleCost. - DeadCost += TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, VecTy); - } - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *E = cast<Instruction>(VL[i]); - // If all users are going to be vectorized, instruction can be - // considered as dead. - // The same, if have only one user, it will be vectorized for sure. - if (areAllUsersVectorized(E)) { - // Take credit for instruction that will become dead. - if (E->hasOneUse()) { - Instruction *Ext = E->user_back(); - if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && - all_of(Ext->users(), - [](User *U) { return isa<GetElementPtrInst>(U); })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - DeadCost -= TTI->getExtractWithExtendCost( - Ext->getOpcode(), Ext->getType(), VecTy, i); - // Add back the cost of s|zext which is subtracted separately. - DeadCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), E->getType(), Ext); - continue; - } + int DeadCost = ReuseShuffleCost; + if (!E->ReorderIndices.empty()) { + // TODO: Merge this shuffle with the ReuseShuffleCost. + DeadCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Instruction *E = cast<Instruction>(VL[i]); + // If all users are going to be vectorized, instruction can be + // considered as dead. + // The same, if have only one user, it will be vectorized for sure. + if (areAllUsersVectorized(E)) { + // Take credit for instruction that will become dead. + if (E->hasOneUse()) { + Instruction *Ext = E->user_back(); + if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && + all_of(Ext->users(), + [](User *U) { return isa<GetElementPtrInst>(U); })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + DeadCost -= TTI->getExtractWithExtendCost( + Ext->getOpcode(), Ext->getType(), VecTy, i); + // Add back the cost of s|zext which is subtracted separately. 
+ DeadCost += TTI->getCastInstrCost( + Ext->getOpcode(), Ext->getType(), E->getType(), CostKind, + Ext); + continue; } - DeadCost -= - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); } + DeadCost -= + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); } - return DeadCost; } - return ReuseShuffleCost + getGatherCost(VL); - + return DeadCost; + } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -3340,7 +3424,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, CostKind, + VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } @@ -3348,12 +3433,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Calculate the cost of this instruction. int ScalarCost = VL.size() * ScalarEltCost; - VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); int VecCost = 0; // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, + CostKind, VL0); } return VecCost - ScalarCost; } @@ -3362,13 +3448,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::Select: { // Calculate the cost of this instruction. int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, - Builder.getInt1Ty(), VL0); + Builder.getInt1Ty(), + CostKind, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } - VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); + auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, + CostKind, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::FNeg: @@ -3429,13 +3517,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { SmallVector<const Value *, 4> Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); + E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, + Operands, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; int VecCost = TTI->getArithmeticInstrCost( - E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); + E->getOpcode(), VecTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, + Operands, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -3445,26 +3535,30 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TargetTransformInfo::OK_UniformConstantValue; int ScalarEltCost = - TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); + TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind, + Op1VK, Op2VK); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; int VecCost = - TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); + 
TTI->getArithmeticInstrCost(Instruction::Add, VecTy, CostKind, + Op1VK, Op2VK); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Load: { // Cost of wide load - cost of scalar loads. - MaybeAlign alignment(cast<LoadInst>(VL0)->getAlignment()); + Align alignment = cast<LoadInst>(VL0)->getAlign(); int ScalarEltCost = - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, + CostKind, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; int VecLdCost = - TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0); + TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, + CostKind, VL0); if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecLdCost += TTI->getShuffleCost( @@ -3477,14 +3571,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { bool IsReorder = !E->ReorderIndices.empty(); auto *SI = cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); - MaybeAlign Alignment(SI->getAlignment()); + Align Alignment = SI->getAlign(); int ScalarEltCost = - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0); + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, + CostKind, VL0); if (NeedToShuffleReuses) ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; int VecStCost = TTI->getMemoryOpCost(Instruction::Store, - VecTy, Alignment, 0, VL0); + VecTy, Alignment, 0, CostKind, VL0); if (IsReorder) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecStCost += TTI->getShuffleCost( @@ -3497,24 +3592,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. 
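// The call entry cost computed below is a delta: the cheaper of the vector
// intrinsic and the vector library call, minus the summed scalar calls it
// replaces, plus any reuse-shuffle adjustment. Standalone sketch of that
// arithmetic (illustrative names only):
#include <algorithm>
static int vectorCallCostDelta(int ReuseShuffleCost, int ScalarEltCost,
                               int NumElements, int IntrinsicCost,
                               int LibCallCost) {
  int ScalarCallCost = NumElements * ScalarEltCost;
  int VecCallCost = std::min(IntrinsicCost, LibCallCost);
  return ReuseShuffleCost + VecCallCost - ScalarCallCost;
}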
- SmallVector<Type *, 4> ScalarTys; - for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) - ScalarTys.push_back(CI->getArgOperand(op)->getType()); - - FastMathFlags FMF; - if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) - FMF = FPMO->getFastMathFlags(); - - int ScalarEltCost = - TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); + IntrinsicCostAttributes CostAttrs(ID, *CI, 1, 1); + int ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; - SmallVector<Value *, 4> Args(CI->arg_operands()); - int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF, - VecTy->getNumElements()); + auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); + int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second); LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost << " (" << VecCallCost << "-" << ScalarCallCost << ")" @@ -3533,34 +3619,34 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { if (NeedToShuffleReuses) { for (unsigned Idx : E->ReuseShuffleIndices) { Instruction *I = cast<Instruction>(VL[Idx]); - ReuseShuffleCost -= TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); + ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind); } for (Value *V : VL) { Instruction *I = cast<Instruction>(V); - ReuseShuffleCost += TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); + ReuseShuffleCost += TTI->getInstructionCost(I, CostKind); } } for (Value *V : VL) { Instruction *I = cast<Instruction>(V); assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - ScalarCost += TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); + ScalarCost += TTI->getInstructionCost(I, CostKind); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. int VecCost = 0; if (Instruction::isBinaryOp(E->getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy); + VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); + VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, + CostKind); } else { Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType(); Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); - VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); - VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty); + auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size()); + auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size()); + VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty, + CostKind); + VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, + CostKind); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -3596,24 +3682,20 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { return true; } -bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const { - if (RdxOpcode != Instruction::Or) - return false; - - unsigned NumElts = VectorizableTree[0]->Scalars.size(); - Value *FirstReduced = VectorizableTree[0]->Scalars[0]; - - // Look past the reduction to find a source value. 
Arbitrarily follow the +static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, + TargetTransformInfo *TTI) { + // Look past the root to find a source value. Arbitrarily follow the // path through operand 0 of any 'or'. Also, peek through optional // shift-left-by-constant. - Value *ZextLoad = FirstReduced; - while (match(ZextLoad, m_Or(m_Value(), m_Value())) || - match(ZextLoad, m_Shl(m_Value(), m_Constant()))) + Value *ZextLoad = Root; + while (!isa<ConstantExpr>(ZextLoad) && + (match(ZextLoad, m_Or(m_Value(), m_Value())) || + match(ZextLoad, m_Shl(m_Value(), m_Constant())))) ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0); - // Check if the input to the reduction is an extended load. + // Check if the input is an extended load of the required or/shift expression. Value *LoadPtr; - if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) return false; // Require that the total load bit width is a legal integer type. @@ -3621,15 +3703,36 @@ bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const { // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. Type *SrcTy = LoadPtr->getType()->getPointerElementType(); unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; - LLVMContext &Context = FirstReduced->getContext(); - if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth))) + if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth))) return false; // Everything matched - assume that we can fold the whole sequence using // load combining. - LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of " - << *(cast<Instruction>(FirstReduced)) << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at " + << *(cast<Instruction>(Root)) << "\n"); + + return true; +} + +bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const { + if (RdxOpcode != Instruction::Or) + return false; + unsigned NumElts = VectorizableTree[0]->Scalars.size(); + Value *FirstReduced = VectorizableTree[0]->Scalars[0]; + return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI); +} + +bool BoUpSLP::isLoadCombineCandidate() const { + // Peek through a final sequence of stores and check if all operations are + // likely to be load-combined. + unsigned NumElts = VectorizableTree[0]->Scalars.size(); + for (Value *Scalar : VectorizableTree[0]->Scalars) { + Value *X; + if (!match(Scalar, m_Store(m_Value(X), m_Value())) || + !isLoadCombineCandidateImpl(X, NumElts, TTI)) + return false; + } return true; } @@ -3712,7 +3815,7 @@ int BoUpSLP::getSpillCost() const { if (NumCalls) { SmallVector<Type*, 4> V; for (auto *II : LiveValues) - V.push_back(VectorType::get(II->getType(), BundleWidth)); + V.push_back(FixedVectorType::get(II->getType(), BundleWidth)); Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V); } @@ -3776,13 +3879,13 @@ int BoUpSLP::getTreeCost() { // If we plan to rewrite the tree in a smaller type, we will need to sign // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. - auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); + auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; if (MinBWs.count(ScalarRoot)) { auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto Extend = MinBWs[ScalarRoot].second ? 
Instruction::SExt : Instruction::ZExt; - VecTy = VectorType::get(MinTy, BundleWidth); + VecTy = FixedVectorType::get(MinTy, BundleWidth); ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), VecTy, EU.Lane); } else { @@ -3809,12 +3912,15 @@ int BoUpSLP::getTreeCost() { return Cost; } -int BoUpSLP::getGatherCost(Type *Ty, +int BoUpSLP::getGatherCost(VectorType *Ty, const DenseSet<unsigned> &ShuffledIndices) const { - int Cost = 0; - for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) + unsigned NumElts = Ty->getNumElements(); + APInt DemandedElts = APInt::getNullValue(NumElts); + for (unsigned i = 0; i < NumElts; ++i) if (!ShuffledIndices.count(i)) - Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + DemandedElts.setBit(i); + int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, + /*Extract*/ false); if (!ShuffledIndices.empty()) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; @@ -3825,7 +3931,7 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); // Find the cost of inserting/extracting values from the vector. // Check if the same elements are inserted several times and count them as // shuffle candidates. @@ -3965,9 +4071,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { V = SV->getOperand(0); } else { // Reshuffle to get only unique values. - SmallVector<unsigned, 4> UniqueIdxs; - SmallSet<unsigned, 4> UsedIdxs; - for(unsigned Idx : E->ReuseShuffleIndices) + SmallVector<int, 4> UniqueIdxs; + SmallSet<int, 4> UsedIdxs; + for (int Idx : E->ReuseShuffleIndices) if (UsedIdxs.insert(Idx).second) UniqueIdxs.emplace_back(Idx); V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), @@ -3984,7 +4090,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { ScalarTy = SI->getValueOperand()->getType(); // Check that every instruction appears once in this bundle. - SmallVector<unsigned, 4> ReuseShuffleIndicies; + SmallVector<int, 4> ReuseShuffleIndicies; SmallVector<Value *, 4> UniqueValues; if (VL.size() > 2) { DenseMap<Value *, unsigned> UniquePositions; @@ -4002,7 +4108,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { else VL = UniqueValues; } - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); Value *V = Gather(VL, VecTy); if (!ReuseShuffleIndicies.empty()) { @@ -4017,7 +4123,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { } static void inversePermutation(ArrayRef<unsigned> Indices, - SmallVectorImpl<unsigned> &Mask) { + SmallVectorImpl<int> &Mask) { Mask.clear(); const unsigned E = Indices.size(); Mask.resize(E); @@ -4037,7 +4143,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); + auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); @@ -4056,6 +4162,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } + assert(E->State == TreeEntry::Vectorize && "Unhandled state"); unsigned ShuffleOrOp = E->isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : E->getOpcode(); switch (ShuffleOrOp) { @@ -4096,72 +4203,45 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ExtractElement: { - if (E->State == TreeEntry::Vectorize) { - Value *V = E->getSingleOperand(0); - if (!E->ReorderIndices.empty()) { - OrdersType Mask; - inversePermutation(E->ReorderIndices, Mask); - Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask, - "reorder_shuffle"); - } - if (NeedToShuffleReuses) { - // TODO: Merge this shuffle with the ReorderShuffleMask. - if (E->ReorderIndices.empty()) - Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } - E->VectorizedValue = V; - return V; + Value *V = E->getSingleOperand(0); + if (!E->ReorderIndices.empty()) { + SmallVector<int, 4> Mask; + inversePermutation(E->ReorderIndices, Mask); + Builder.SetInsertPoint(VL0); + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask, + "reorder_shuffle"); } - setInsertPointAfterBundle(E); - auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { + // TODO: Merge this shuffle with the ReorderShuffleMask. + if (E->ReorderIndices.empty()) + Builder.SetInsertPoint(VL0); V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast<Instruction>(V)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } E->VectorizedValue = V; return V; } case Instruction::ExtractValue: { - if (E->State == TreeEntry::Vectorize) { - LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0)); - Builder.SetInsertPoint(LI); - PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); - Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); - LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment()); - Value *NewV = propagateMetadata(V, E->Scalars); - if (!E->ReorderIndices.empty()) { - OrdersType Mask; - inversePermutation(E->ReorderIndices, Mask); - NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask, - "reorder_shuffle"); - } - if (NeedToShuffleReuses) { - // TODO: Merge this shuffle with the ReorderShuffleMask. - NewV = Builder.CreateShuffleVector( - NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); - } - E->VectorizedValue = NewV; - return NewV; + LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0)); + Builder.SetInsertPoint(LI); + PointerType *PtrTy = + PointerType::get(VecTy, LI->getPointerAddressSpace()); + Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); + LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); + Value *NewV = propagateMetadata(V, E->Scalars); + if (!E->ReorderIndices.empty()) { + SmallVector<int, 4> Mask; + inversePermutation(E->ReorderIndices, Mask); + NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask, + "reorder_shuffle"); } - setInsertPointAfterBundle(E); - auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast<Instruction>(V)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } + // TODO: Merge this shuffle with the ReorderShuffleMask. 
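// The reorder shuffles above are driven by inversePermutation: given the
// order in which the scalars were consumed, it computes the mask that inverts
// that reorder (Mask[Indices[I]] = I). Standalone sketch over std::vector;
// the loop body is the usual inverse-permutation definition and is assumed
// here, since the excerpt only shows the function's setup. Indices is assumed
// to be a permutation of 0..E-1.
#include <vector>
static std::vector<int>
inversePermutationSketch(const std::vector<unsigned> &Indices) {
  std::vector<int> Mask(Indices.size());
  for (unsigned I = 0, E = Indices.size(); I < E; ++I)
    Mask[Indices[I]] = static_cast<int>(I);
  return Mask;
}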
+ NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); } - E->VectorizedValue = V; - return V; + E->VectorizedValue = NewV; + return NewV; } case Instruction::ZExt: case Instruction::SExt: @@ -4207,12 +4287,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); - Value *V; - if (E->getOpcode() == Instruction::FCmp) - V = Builder.CreateFCmp(P0, L, R); - else - V = Builder.CreateICmp(P0, L, R); - + Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -4321,7 +4396,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { setInsertPointAfterBundle(E); LoadInst *LI = cast<LoadInst>(VL0); - Type *ScalarLoadTy = LI->getType(); unsigned AS = LI->getPointerAddressSpace(); Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), @@ -4334,14 +4408,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (getTreeEntry(PO)) ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); - MaybeAlign Alignment = MaybeAlign(LI->getAlignment()); - LI = Builder.CreateLoad(VecTy, VecPtr); - if (!Alignment) - Alignment = MaybeAlign(DL->getABITypeAlignment(ScalarLoadTy)); - LI->setAlignment(Alignment); + LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); Value *V = propagateMetadata(LI, E->Scalars); if (IsReorder) { - OrdersType Mask; + SmallVector<int, 4> Mask; inversePermutation(E->ReorderIndices, Mask); V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask, "reorder_shuffle"); @@ -4359,23 +4429,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool IsReorder = !E->ReorderIndices.empty(); auto *SI = cast<StoreInst>( IsReorder ? 
E->Scalars[E->ReorderIndices.front()] : VL0); - unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); setInsertPointAfterBundle(E); Value *VecValue = vectorizeTree(E->getOperand(0)); if (IsReorder) { - OrdersType Mask; - inversePermutation(E->ReorderIndices, Mask); + SmallVector<int, 4> Mask(E->ReorderIndices.begin(), + E->ReorderIndices.end()); VecValue = Builder.CreateShuffleVector( - VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices, + VecValue, UndefValue::get(VecValue->getType()), Mask, "reorder_shuffle"); } Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast( ScalarPtr, VecValue->getType()->getPointerTo(AS)); - StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); + StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr, + SI->getAlign()); // The pointer operand uses an in-tree scalar, so add the new BitCast to // ExternalUses to make sure that an extract will be generated in the @@ -4383,10 +4453,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (getTreeEntry(ScalarPtr)) ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0)); - if (!Alignment) - Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); - - ST->setAlignment(Align(Alignment)); Value *V = propagateMetadata(ST, E->Scalars); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -4445,13 +4511,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (Function *FI = CI->getCalledFunction()) IID = FI->getIntrinsicID(); + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + + auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); + bool UseIntrinsic = ID != Intrinsic::not_intrinsic && + VecCallCosts.first <= VecCallCosts.second; + Value *ScalarArg = nullptr; std::vector<Value *> OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. - if (hasVectorInstrinsicScalarOpd(IID, j)) { + if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) { CallInst *CEI = cast<CallInst>(VL0); ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); @@ -4463,10 +4535,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { OpVecs.push_back(OpVec); } - Module *M = F->getParent(); - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; - Function *CF = Intrinsic::getDeclaration(M, ID, Tys); + Function *CF; + if (!UseIntrinsic) { + VFShape Shape = VFShape::get( + *CI, {static_cast<unsigned>(VecTy->getNumElements()), false}, + false /*HasGlobalPred*/); + CF = VFDatabase(*CI).getVectorizedFunction(Shape); + } else { + Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())}; + CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + } + SmallVector<OperandBundleDef, 1> OpBundles; CI->getOperandBundlesAsDefs(OpBundles); Value *V = Builder.CreateCall(CF, OpVecs, OpBundles); @@ -4527,24 +4606,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // each vector operation. 
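// The alternate-opcode emission that follows blends the two vectorized
// halves with an integer mask: lane i takes value i (first shuffle operand,
// the main-opcode vector V0) or e + i (second operand, the alternate-opcode
// vector V1). A small standalone sketch; the IsAltOp vector is an
// illustrative stand-in for the per-scalar opcode test, not an SLP API:
#include <cassert>
#include <vector>

static std::vector<int> buildAltShuffleMask(const std::vector<bool> &IsAltOp) {
  const int E = static_cast<int>(IsAltOp.size());
  std::vector<int> Mask(E);
  for (int I = 0; I < E; ++I)
    Mask[I] = IsAltOp[I] ? E + I : I; // e + i selects from the alternate vector.
  return Mask;
}

int main() {
  // For example, a 4-wide add/sub bundle alternating add, sub, add, sub
  // yields <0, 5, 2, 7>: lanes 0 and 2 come from the "add" vector V0,
  // lanes 1 and 3 from the "sub" vector V1.
  std::vector<int> Mask = buildAltShuffleMask({false, true, false, true});
  assert(Mask == (std::vector<int>{0, 5, 2, 7}));
  return 0;
}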
ValueList OpScalars, AltScalars; unsigned e = E->Scalars.size(); - SmallVector<Constant *, 8> Mask(e); + SmallVector<int, 8> Mask(e); for (unsigned i = 0; i < e; ++i) { auto *OpInst = cast<Instruction>(E->Scalars[i]); assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); if (OpInst->getOpcode() == E->getAltOpcode()) { - Mask[i] = Builder.getInt32(e + i); + Mask[i] = e + i; AltScalars.push_back(E->Scalars[i]); } else { - Mask[i] = Builder.getInt32(i); + Mask[i] = i; OpScalars.push_back(E->Scalars[i]); } } - Value *ShuffleMask = ConstantVector::get(Mask); propagateIRFlags(V0, OpScalars); propagateIRFlags(V1, AltScalars); - Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); + Value *V = Builder.CreateShuffleVector(V0, V1, Mask); if (Instruction *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); if (NeedToShuffleReuses) { @@ -4586,7 +4664,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); auto BundleWidth = VectorizableTree[0]->Scalars.size(); auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto *VecTy = VectorType::get(MinTy, BundleWidth); + auto *VecTy = FixedVectorType::get(MinTy, BundleWidth); auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); VectorizableTree[0]->VectorizedValue = Trunc; } @@ -4715,6 +4793,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } Builder.ClearInsertionPoint(); + InstrElementSize.clear(); return VectorizableTree[0]->VectorizedValue; } @@ -5251,20 +5330,26 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { BS->ScheduleStart = nullptr; } -unsigned BoUpSLP::getVectorElementSize(Value *V) const { +unsigned BoUpSLP::getVectorElementSize(Value *V) { // If V is a store, just return the width of the stored value without // traversing the expression tree. This is the common case. if (auto *Store = dyn_cast<StoreInst>(V)) return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); + auto E = InstrElementSize.find(V); + if (E != InstrElementSize.end()) + return E->second; + // If V is not a store, we can traverse the expression tree to find loads // that feed it. The type of the loaded value may indicate a more suitable // width than V's type. We want to base the vector element size on the width // of memory operations where possible. SmallVector<Instruction *, 16> Worklist; SmallPtrSet<Instruction *, 16> Visited; - if (auto *I = dyn_cast<Instruction>(V)) + if (auto *I = dyn_cast<Instruction>(V)) { Worklist.push_back(I); + Visited.insert(I); + } // Traverse the expression tree in bottom-up order looking for loads. If we // encounter an instruction we don't yet handle, we give up. @@ -5272,7 +5357,6 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const { auto FoundUnknownInst = false; while (!Worklist.empty() && !FoundUnknownInst) { auto *I = Worklist.pop_back_val(); - Visited.insert(I); // We should only be looking at scalar instructions here. If the current // instruction has a vector type, give up. 
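// getVectorElementSize now memoizes its result in InstrElementSize and marks
// nodes Visited when they are pushed rather than when popped, so each
// instruction is queued at most once and every node reached by the walk gets
// the cached width. A minimal standalone sketch of that pattern over a toy
// expression node; Node, LoadWidthInBits and Operands are illustrative
// stand-ins, not SLP data structures, and the real routine also gives up on
// instructions it does not recognize (omitted here):
#include <algorithm>
#include <cstdio>
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct Node {
  unsigned LoadWidthInBits = 0; // Non-zero if this node is a "load".
  std::vector<Node *> Operands; // Nodes feeding this one.
};

static unsigned elementSize(Node *Root, unsigned FallbackWidth,
                            std::unordered_map<Node *, unsigned> &Cache) {
  auto It = Cache.find(Root);
  if (It != Cache.end())
    return It->second; // Cached from an earlier query.

  unsigned MaxWidth = 0;
  std::vector<Node *> Worklist{Root};
  std::unordered_set<Node *> Visited{Root};
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    if (N->LoadWidthInBits) // Loads decide the element width.
      MaxWidth = std::max(MaxWidth, N->LoadWidthInBits);
    for (Node *Op : N->Operands)
      if (Visited.insert(Op).second) // Insert-on-push, as in the patch.
        Worklist.push_back(Op);
  }

  unsigned Width = MaxWidth ? MaxWidth : FallbackWidth;
  for (Node *N : Visited) // Memoize for every node we touched.
    Cache[N] = Width;
  return Width;
}

int main() {
  Node Load;
  Load.LoadWidthInBits = 8;
  Node Add;
  Add.Operands = {&Load};
  std::unordered_map<Node *, unsigned> Cache;
  std::printf("%u\n", elementSize(&Add, /*FallbackWidth=*/32, Cache)); // 8
  std::printf("%zu\n", Cache.size());                                  // 2
  return 0;
}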
@@ -5292,7 +5376,7 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const { isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) { for (Use &U : I->operands()) if (auto *J = dyn_cast<Instruction>(U.get())) - if (!Visited.count(J)) + if (Visited.insert(J).second) Worklist.push_back(J); } @@ -5301,13 +5385,17 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const { FoundUnknownInst = true; } + int Width = MaxWidth; // If we didn't encounter a memory access in the expression tree, or if we - // gave up for some reason, just return the width of V. + // gave up for some reason, just return the width of V. Otherwise, return the + // maximum width we found. if (!MaxWidth || FoundUnknownInst) - return DL->getTypeSizeInBits(V->getType()); + Width = DL->getTypeSizeInBits(V->getType()); - // Otherwise, return the maximum width we found. - return MaxWidth; + for (Instruction *I : Visited) + InstrElementSize[I] = Width; + + return Width; } // Determine if a value V in a vectorizable expression Expr can be demoted to a @@ -5560,6 +5648,7 @@ struct SLPVectorizer : public FunctionPass { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<DemandedBitsWrapperPass>(); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + AU.addRequired<InjectTLIMappingsLegacy>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<AAResultsWrapperPass>(); @@ -5598,6 +5687,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_) { + if (!RunSLPVectorization) + return false; SE = SE_; TTI = TTI_; TLI = TLI_; @@ -5657,7 +5748,6 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, if (Changed) { R.optimizeGatherSequence(); LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); - LLVM_DEBUG(verifyFunction(F)); } return Changed; } @@ -5688,6 +5778,8 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, } if (R.isTreeTinyAndNotFullyVectorizable()) return false; + if (R.isLoadCombineCandidate()) + return false; R.computeMinimumValueSizes(); @@ -5841,37 +5933,28 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { if (!A || !B) return false; - Value *VL[] = { A, B }; - return tryToVectorizeList(VL, R, /*UserCost=*/0, true); + Value *VL[] = {A, B}; + return tryToVectorizeList(VL, R, /*AllowReorder=*/true); } bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, - int UserCost, bool AllowReorder) { + bool AllowReorder, + ArrayRef<Value *> InsertUses) { if (VL.size() < 2) return false; LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size() << ".\n"); - // Check that all of the parts are scalar instructions of the same type, + // Check that all of the parts are instructions of the same type, // we permit an alternate opcode via InstructionsState. 
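// A worked example of the vectorization-factor selection performed further
// down in this function (the patch only moves it below the element-type
// checks): MinVF follows from the minimum vector register size and the
// element width, MaxVF from the bundle size, and candidate VFs are tried
// from MaxVF down, halving each time. Standalone sketch with plain integers
// standing in for the SLP queries; the 128-bit register and 32-bit element
// are assumed values for illustration:
#include <algorithm>
#include <cstdio>

static unsigned powerOf2Floor(unsigned X) {
  unsigned P = 1;
  while (P * 2 <= X)
    P *= 2;
  return P;
}

int main() {
  const unsigned MinVecRegSize = 128; // Assumed target minimum register, in bits.
  const unsigned Sz = 32;             // Assumed element width of the bundle, in bits.
  const unsigned BundleSize = 6;      // Scalars handed to the list vectorizer.

  unsigned MinVF = std::max(2u, MinVecRegSize / Sz);          // max(2, 4) = 4
  unsigned MaxVF = std::max(powerOf2Floor(BundleSize), MinVF); // max(4, 4) = 4

  // Candidate VFs tried by the loop in tryToVectorizeList: only 4 here,
  // since halving to 2 drops below MinVF.
  for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2)
    std::printf("try VF = %u\n", VF);
  return 0;
}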
InstructionsState S = getSameOpcode(VL); if (!S.getOpcode()) return false; Instruction *I0 = cast<Instruction>(S.OpValue); - unsigned Sz = R.getVectorElementSize(I0); - unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); - unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); - if (MaxVF < 2) { - R.getORE()->emit([&]() { - return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) - << "Cannot SLP vectorize list: vectorization factor " - << "less than 2 is not supported"; - }); - return false; - } - + // Make sure invalid types (including vector type) are rejected before + // determining vectorization factor for scalar instructions. for (Value *V : VL) { Type *Ty = V->getType(); if (!isValidElementType(Ty)) { @@ -5889,16 +5972,35 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, } } + unsigned Sz = R.getVectorElementSize(I0); + unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); + unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); + if (MaxVF < 2) { + R.getORE()->emit([&]() { + return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) + << "Cannot SLP vectorize list: vectorization factor " + << "less than 2 is not supported"; + }); + return false; + } + bool Changed = false; bool CandidateFound = false; int MinCost = SLPCostThreshold; + bool CompensateUseCost = + !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) { + return V && isa<InsertElementInst>(V); + }); + assert((!CompensateUseCost || InsertUses.size() == VL.size()) && + "Each scalar expected to have an associated InsertElement user."); + unsigned NextInst = 0, MaxInst = VL.size(); for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { // No actual vectorization should happen, if number of parts is the same as // provided vectorization factor (i.e. the scalar type is used for vector // code during codegen). - auto *VecTy = VectorType::get(VL[0]->getType(), VF); + auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF); if (TTI->getNumberOfParts(VecTy) == VF) continue; for (unsigned I = NextInst; I < MaxInst; ++I) { @@ -5940,8 +6042,48 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, continue; R.computeMinimumValueSizes(); - int Cost = R.getTreeCost() - UserCost; + int Cost = R.getTreeCost(); CandidateFound = true; + if (CompensateUseCost) { + // TODO: Use TTI's getScalarizationOverhead for sequence of inserts + // rather than sum of single inserts as the latter may overestimate + // cost. This work should imply improving cost estimation for extracts + // that added in for external (for vectorization tree) users,i.e. that + // part should also switch to same interface. + // For example, the following case is projected code after SLP: + // %4 = extractelement <4 x i64> %3, i32 0 + // %v0 = insertelement <4 x i64> undef, i64 %4, i32 0 + // %5 = extractelement <4 x i64> %3, i32 1 + // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1 + // %6 = extractelement <4 x i64> %3, i32 2 + // %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2 + // %7 = extractelement <4 x i64> %3, i32 3 + // %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3 + // + // Extracts here added by SLP in order to feed users (the inserts) of + // original scalars and contribute to "ExtractCost" at cost evaluation. + // The inserts in turn form sequence to build an aggregate that + // detected by findBuildAggregate routine. 
+ // SLP makes an assumption that such sequence will be optimized away + // later (instcombine) so it tries to compensate ExctractCost with + // cost of insert sequence. + // Current per element cost calculation approach is not quite accurate + // and tends to create bias toward favoring vectorization. + // Switching to the TTI interface might help a bit. + // Alternative solution could be pattern-match to detect a no-op or + // shuffle. + unsigned UserCost = 0; + for (unsigned Lane = 0; Lane < OpsWidth; Lane++) { + auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]); + if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) + UserCost += TTI->getVectorInstrCost( + Instruction::InsertElement, IE->getType(), CI->getZExtValue()); + } + LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost + << ".\n"); + Cost -= UserCost; + } + MinCost = std::min(MinCost, Cost); if (Cost < -SLPCostThreshold) { @@ -6031,24 +6173,23 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { /// <0,2,...> or <1,3,..> while a splitting reduction will generate /// <2,3, undef,undef> for a vector of 4 and NumElts = 2. /// \param IsLeft True will generate a mask of even elements, odd otherwise. -static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, - bool IsPairwise, bool IsLeft, - IRBuilder<> &Builder) { +static SmallVector<int, 32> createRdxShuffleMask(unsigned VecLen, + unsigned NumEltsToRdx, + bool IsPairwise, bool IsLeft) { assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask"); - SmallVector<Constant *, 32> ShuffleMask( - VecLen, UndefValue::get(Builder.getInt32Ty())); + SmallVector<int, 32> ShuffleMask(VecLen, -1); if (IsPairwise) // Build a mask of 0, 2, ... (left) or 1, 3, ... (right). for (unsigned i = 0; i != NumEltsToRdx; ++i) - ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft); + ShuffleMask[i] = 2 * i + !IsLeft; else // Move the upper half of the vector to the lower half. for (unsigned i = 0; i != NumEltsToRdx; ++i) - ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i); + ShuffleMask[i] = NumEltsToRdx + i; - return ConstantVector::get(ShuffleMask); + return ShuffleMask; } namespace { @@ -6840,7 +6981,7 @@ private: int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal, unsigned ReduxWidth) { Type *ScalarTy = FirstReducedVal->getType(); - Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); + auto *VecTy = FixedVectorType::get(ScalarTy, ReduxWidth); int PairwiseRdxCost; int SplittingRdxCost; @@ -6857,7 +6998,7 @@ private: case RK_Max: case RK_UMin: case RK_UMax: { - Type *VecCondTy = CmpInst::makeCmpResultType(VecTy); + auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VecTy)); bool IsUnsigned = ReductionData.getKind() == RK_UMin || ReductionData.getKind() == RK_UMax; PairwiseRdxCost = @@ -6922,10 +7063,8 @@ private: Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { - Value *LeftMask = - createRdxShuffleMask(ReduxWidth, i, true, true, Builder); - Value *RightMask = - createRdxShuffleMask(ReduxWidth, i, true, false, Builder); + auto LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true); + auto RightMask = createRdxShuffleMask(ReduxWidth, i, true, false); Value *LeftShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l"); @@ -6960,20 +7099,16 @@ private: /// \return true if it matches. 
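// createRdxShuffleMask above now returns an integer mask directly: for a
// pairwise reduction it selects the even (<0, 2, ...>) or odd (<1, 3, ...>)
// lanes, and for a splitting reduction it moves the upper half down, e.g.
// <2, 3, undef, undef> for a 4-wide vector with NumEltsToRdx = 2. Standalone
// sketch of the same mask computation, with -1 standing in for undef lanes
// as in the new int-based masks:
#include <cassert>
#include <vector>

static std::vector<int> rdxMaskSketch(unsigned VecLen, unsigned NumEltsToRdx,
                                      bool IsPairwise, bool IsLeft) {
  std::vector<int> Mask(VecLen, -1); // -1 == undef lane.
  for (unsigned I = 0; I != NumEltsToRdx; ++I)
    Mask[I] = IsPairwise ? int(2 * I + !IsLeft)   // even (left) or odd (right) lanes
                         : int(NumEltsToRdx + I); // upper half moved down
  return Mask;
}

int main() {
  // Pairwise step reducing 2 elements of a 4-wide vector:
  assert(rdxMaskSketch(4, 2, /*IsPairwise=*/true, /*IsLeft=*/true) ==
         (std::vector<int>{0, 2, -1, -1}));
  assert(rdxMaskSketch(4, 2, /*IsPairwise=*/true, /*IsLeft=*/false) ==
         (std::vector<int>{1, 3, -1, -1}));
  // Splitting step on the same vector: <2, 3, undef, undef>.
  assert(rdxMaskSketch(4, 2, /*IsPairwise=*/false, /*IsLeft=*/false) ==
         (std::vector<int>{2, 3, -1, -1}));
  return 0;
}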
static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl<Value *> &BuildVectorOpds, - int &UserCost) { + SmallVectorImpl<Value *> &InsertElts) { assert((isa<InsertElementInst>(LastInsertInst) || isa<InsertValueInst>(LastInsertInst)) && "Expected insertelement or insertvalue instruction!"); - UserCost = 0; do { Value *InsertedOperand; - if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) { + auto *IE = dyn_cast<InsertElementInst>(LastInsertInst); + if (IE) { InsertedOperand = IE->getOperand(1); LastInsertInst = IE->getOperand(0); - if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { - UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, - IE->getType(), CI->getZExtValue()); - } } else { auto *IV = cast<InsertValueInst>(LastInsertInst); InsertedOperand = IV->getInsertedValueOperand(); @@ -6981,16 +7116,17 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI, } if (isa<InsertElementInst>(InsertedOperand) || isa<InsertValueInst>(InsertedOperand)) { - int TmpUserCost; SmallVector<Value *, 8> TmpBuildVectorOpds; + SmallVector<Value *, 8> TmpInsertElts; if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds, - TmpUserCost)) + TmpInsertElts)) return false; BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(), TmpBuildVectorOpds.rend()); - UserCost += TmpUserCost; + InsertElts.append(TmpInsertElts.rbegin(), TmpInsertElts.rend()); } else { BuildVectorOpds.push_back(InsertedOperand); + InsertElts.push_back(IE); } if (isa<UndefValue>(LastInsertInst)) break; @@ -7000,6 +7136,7 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI, return false; } while (true); std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); + std::reverse(InsertElts.begin(), InsertElts.end()); return true; } @@ -7164,26 +7301,29 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, BoUpSLP &R) { - int UserCost = 0; const DataLayout &DL = BB->getModule()->getDataLayout(); if (!R.canMapToVector(IVI->getType(), DL)) return false; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost)) + SmallVector<Value *, 16> BuildVectorInsts; + if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts) || + BuildVectorOpds.size() < 2) return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register, we need to // extract scalars into scalar registers, so NeedExtraction is set true. - return tryToVectorizeList(BuildVectorOpds, R, UserCost); + return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false, + BuildVectorInsts); } bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, BoUpSLP &R) { - int UserCost; + SmallVector<Value *, 16> BuildVectorInsts; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) || + if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || + BuildVectorOpds.size() < 2 || (llvm::all_of(BuildVectorOpds, [](Value *V) { return isa<ExtractElementInst>(V); }) && isShuffle(BuildVectorOpds))) @@ -7191,7 +7331,8 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, // Vectorize starting with the build vector operands ignoring the BuildVector // instructions for the purpose of scheduling and user extraction. 
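// findBuildAggregate above walks the insert chain from the last insert
// towards the initial undef, collecting the inserted scalars
// (BuildVectorOpds) and, with this patch, the insert instructions themselves
// (InsertElts) so callers can later compensate the cost of the inserts.
// A greatly simplified standalone sketch of a single flat insertelement
// chain; the real routine also recurses into nested inserts, handles
// insertvalue, and bails out on anything else:
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

struct InsertNode {
  InsertNode *Chain;  // The aggregate operand (nullptr stands for undef).
  std::string Scalar; // The value being inserted.
};

static bool findBuildAggregateSketch(InsertNode *Last,
                                     std::vector<std::string> &Opds,
                                     std::vector<InsertNode *> &Inserts) {
  // Walk from the final insert upwards, then reverse into lane order, as the
  // real routine does with its append-then-reverse scheme.
  for (InsertNode *I = Last; I; I = I->Chain) {
    Opds.push_back(I->Scalar);
    Inserts.push_back(I);
  }
  std::reverse(Opds.begin(), Opds.end());
  std::reverse(Inserts.begin(), Inserts.end());
  return Opds.size() >= 2; // Callers now also require at least two operands.
}

int main() {
  // %v0 = insertelement undef, %a, 0
  // %v1 = insertelement %v0,   %b, 1
  // %v2 = insertelement %v1,   %c, 2
  InsertNode V0{nullptr, "%a"}, V1{&V0, "%b"}, V2{&V1, "%c"};
  std::vector<std::string> Opds;
  std::vector<InsertNode *> Inserts;
  assert(findBuildAggregateSketch(&V2, Opds, Inserts));
  assert((Opds == std::vector<std::string>{"%a", "%b", "%c"}));
  assert(Inserts.front() == &V0 && Inserts.back() == &V2);
  return 0;
}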
- return tryToVectorizeList(BuildVectorOpds, R, UserCost); + return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false, + BuildVectorInsts); } bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, @@ -7228,6 +7369,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector<Value *, 4> Incoming; SmallPtrSet<Value *, 16> VisitedInstrs; + unsigned MaxVecRegSize = R.getMaxVecRegSize(); bool HaveVectorizedPhiNodes = true; while (HaveVectorizedPhiNodes) { @@ -7254,8 +7396,18 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Look for the next elements with the same type. SmallVector<Value *, 4>::iterator SameTypeIt = IncIt; + Type *EltTy = (*IncIt)->getType(); + unsigned EltSize = EltTy->isSized() ? DL->getTypeSizeInBits(EltTy) + : MaxVecRegSize; + unsigned MaxNumElts = MaxVecRegSize / EltSize; + if (MaxNumElts < 2) { + ++IncIt; + continue; + } + while (SameTypeIt != E && - (*SameTypeIt)->getType() == (*IncIt)->getType()) { + (*SameTypeIt)->getType() == EltTy && + (SameTypeIt - IncIt) < MaxNumElts) { VisitedInstrs.insert(*SameTypeIt); ++SameTypeIt; } @@ -7269,8 +7421,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // is done when there are exactly two elements since tryToVectorizeList // asserts that there are only two values when AllowReorder is true. bool AllowReorder = NumElts == 2; - if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, - /*UserCost=*/0, AllowReorder)) { + if (NumElts > 1 && + tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) { // Success start over because instructions might have been changed. HaveVectorizedPhiNodes = true; Changed = true; @@ -7370,9 +7522,12 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { << Entry.second.size() << ".\n"); // Process the GEP list in chunks suitable for the target's supported - // vector size. If a vector register can't hold 1 element, we are done. + // vector size. If a vector register can't hold 1 element, we are done. We + // are trying to vectorize the index computations, so the maximum number of + // elements is based on the size of the index expression, rather than the + // size of the GEP itself (the target's pointer size). unsigned MaxVecRegSize = R.getMaxVecRegSize(); - unsigned EltSize = R.getVectorElementSize(Entry.second[0]); + unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin()); if (MaxVecRegSize < EltSize) continue; @@ -7475,6 +7630,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); } diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 598fb00e956ea..6f055ca80ff29 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -18,7 +18,6 @@ namespace llvm { class LoopVectorizationLegality; class LoopVectorizationCostModel; -class TargetTransformInfo; class TargetLibraryInfo; /// Helper class to create VPRecipies from IR instructions. @@ -35,6 +34,8 @@ class VPRecipeBuilder { /// The profitablity analysis. 
LoopVectorizationCostModel &CM; + PredicatedScalarEvolution &PSE; + VPBuilder &Builder; /// When we if-convert we need to create edge masks. We have to cache values @@ -49,11 +50,57 @@ class VPRecipeBuilder { // VPlan-VPlan transformations support: Hold a mapping from ingredients to // their recipe. To save on memory, only do so for selected ingredients, - // marked by having a nullptr entry in this map. If those ingredients get a - // VPWidenRecipe, also avoid compressing other ingredients into it to avoid - // having to split such recipes later. + // marked by having a nullptr entry in this map. DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe; - VPWidenRecipe *LastExtensibleRecipe = nullptr; + + /// Check if \p I can be widened at the start of \p Range and possibly + /// decrease the range such that the returned value holds for the entire \p + /// Range. The function should not be called for memory instructions or calls. + bool shouldWiden(Instruction *I, VFRange &Range) const; + + /// Check if the load or store instruction \p I should widened for \p + /// Range.Start and potentially masked. Such instructions are handled by a + /// recipe that takes an additional VPInstruction for the mask. + VPWidenMemoryInstructionRecipe * + tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan); + + /// Check if an induction recipe should be constructed for \I. If so build and + /// return it. If not, return null. + VPWidenIntOrFpInductionRecipe *tryToOptimizeInductionPHI(PHINode *Phi) const; + + /// Optimize the special case where the operand of \p I is a constant integer + /// induction variable. + VPWidenIntOrFpInductionRecipe * + tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range) const; + + /// Handle non-loop phi nodes. Currently all such phi nodes are turned into + /// a sequence of select instructions as the vectorizer currently performs + /// full if-conversion. + VPBlendRecipe *tryToBlend(PHINode *Phi, VPlanPtr &Plan); + + /// Handle call instructions. If \p CI can be widened for \p Range.Start, + /// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same + /// decision from \p Range.Start to \p Range.End. + VPWidenCallRecipe *tryToWidenCall(CallInst *CI, VFRange &Range, + VPlan &Plan) const; + + /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe + /// if it can. The function should only be called if the cost-model indicates + /// that widening should be performed. + VPWidenRecipe *tryToWiden(Instruction *I, VPlan &Plan) const; + +public: + VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI, + LoopVectorizationLegality *Legal, + LoopVectorizationCostModel &CM, + PredicatedScalarEvolution &PSE, VPBuilder &Builder) + : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), PSE(PSE), + Builder(Builder) {} + + /// Check if a recipe can be create for \p I withing the given VF \p Range. + /// If a recipe can be created, return it. Otherwise return nullptr. + VPRecipeBase *tryToCreateWidenRecipe(Instruction *Instr, VFRange &Range, + VPlanPtr &Plan); /// Set the recipe created for given ingredient. This operation is a no-op for /// ingredients that were not marked using a nullptr entry in the map. @@ -65,7 +112,6 @@ class VPRecipeBuilder { Ingredient2Recipe[I] = R; } -public: /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True. It returns the *entry* /// mask for the block BB. 
@@ -92,48 +138,11 @@ public: return Ingredient2Recipe[I]; } - /// Check if \I is a memory instruction to be widened for \p Range.Start and - /// potentially masked. Such instructions are handled by a recipe that takes - /// an additional VPInstruction for the mask. - VPWidenMemoryInstructionRecipe * - tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan); - - /// Check if an induction recipe should be constructed for \I within the given - /// VF \p Range. If so build and return it. If not, return null. \p Range.End - /// may be decreased to ensure same decision from \p Range.Start to - /// \p Range.End. - VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I, - VFRange &Range); - - /// Handle non-loop phi nodes. Currently all such phi nodes are turned into - /// a sequence of select instructions as the vectorizer currently performs - /// full if-conversion. - VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan); - - /// Check if \p I can be widened within the given VF \p Range. If \p I can be - /// widened for \p Range.Start, check if the last recipe of \p VPBB can be - /// extended to include \p I or else build a new VPWidenRecipe for it and - /// append it to \p VPBB. Return true if \p I can be widened for Range.Start, - /// false otherwise. Range.End may be decreased to ensure same decision from - /// \p Range.Start to \p Range.End. - bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range); - /// Create a replicating region for instruction \p I that requires /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I. VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe, VPlanPtr &Plan); -public: - VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI, - LoopVectorizationLegality *Legal, - LoopVectorizationCostModel &CM, VPBuilder &Builder) - : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), Builder(Builder) {} - - /// Check if a recipe can be create for \p I withing the given VF \p Range. - /// If a recipe can be created, it adds it to \p VPBB. - bool tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan, - VPBasicBlock *VPBB); - /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it /// is predicated. \return \p VPBB augmented with this new recipe if \p I is /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index f1c708720ccf4..f5f28a3bffa18 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -49,13 +49,46 @@ extern cl::opt<bool> EnableVPlanNativePath; #define DEBUG_TYPE "vplan" raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) { - if (const VPInstruction *Instr = dyn_cast<VPInstruction>(&V)) - Instr->print(OS); - else - V.printAsOperand(OS); + const VPInstruction *Instr = dyn_cast<VPInstruction>(&V); + VPSlotTracker SlotTracker( + (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); + V.print(OS, SlotTracker); return OS; } +void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const { + if (const VPInstruction *Instr = dyn_cast<VPInstruction>(this)) + Instr->print(OS, SlotTracker); + else + printAsOperand(OS, SlotTracker); +} + +// Get the top-most entry block of \p Start. This is the entry block of the +// containing VPlan. 
This function is templated to support both const and non-const blocks +template <typename T> static T *getPlanEntry(T *Start) { + T *Next = Start; + T *Current = Start; + while ((Next = Next->getParent())) + Current = Next; + + SmallSetVector<T *, 8> WorkList; + WorkList.insert(Current); + + for (unsigned i = 0; i < WorkList.size(); i++) { + T *Current = WorkList[i]; + if (Current->getNumPredecessors() == 0) + return Current; + auto &Predecessors = Current->getPredecessors(); + WorkList.insert(Predecessors.begin(), Predecessors.end()); + } + + llvm_unreachable("VPlan without any entry node without predecessors"); +} + +VPlan *VPBlockBase::getPlan() { return getPlanEntry(this)->Plan; } + +const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(this)->Plan; } + /// \return the VPBasicBlock that is the entry of Block, possibly indirectly. const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const { const VPBlockBase *Block = this; @@ -71,6 +104,12 @@ VPBasicBlock *VPBlockBase::getEntryBasicBlock() { return cast<VPBasicBlock>(Block); } +void VPBlockBase::setPlan(VPlan *ParentPlan) { + assert(ParentPlan->getEntry() == this && + "Can only set plan on its entry block."); + Plan = ParentPlan; +} + /// \return the VPBasicBlock that is the exit of Block, possibly indirectly. const VPBasicBlock *VPBlockBase::getExitBasicBlock() const { const VPBlockBase *Block = this; @@ -341,6 +380,20 @@ void VPInstruction::generateInstruction(VPTransformState &State, State.set(this, V, Part); break; } + case VPInstruction::ActiveLaneMask: { + // Get first lane of vector induction variable. + Value *VIVElem0 = State.get(getOperand(0), {Part, 0}); + // Get first lane of backedge-taken-count. + Value *ScalarBTC = State.get(getOperand(1), {Part, 0}); + + auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); + auto *PredTy = FixedVectorType::get(Int1Ty, State.VF); + Instruction *Call = Builder.CreateIntrinsic( + Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()}, + {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask"); + State.set(this, Call, Part); + break; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -352,15 +405,22 @@ void VPInstruction::execute(VPTransformState &State) { generateInstruction(State, Part); } -void VPInstruction::print(raw_ostream &O, const Twine &Indent) const { - O << " +\n" << Indent << "\"EMIT "; - print(O); - O << "\\l\""; +void VPInstruction::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "\"EMIT "; + print(O, SlotTracker); } void VPInstruction::print(raw_ostream &O) const { - printAsOperand(O); - O << " = "; + VPSlotTracker SlotTracker(getParent()->getPlan()); + print(O, SlotTracker); +} + +void VPInstruction::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { + if (hasResult()) { + printAsOperand(O, SlotTracker); + O << " = "; + } switch (getOpcode()) { case VPInstruction::Not: @@ -375,13 +435,17 @@ void VPInstruction::print(raw_ostream &O) const { case VPInstruction::SLPStore: O << "combined store"; break; + case VPInstruction::ActiveLaneMask: + O << "active lane mask"; + break; + default: O << Instruction::getOpcodeName(getOpcode()); } for (const VPValue *Operand : operands()) { O << " "; - Operand->printAsOperand(O); + Operand->printAsOperand(O, SlotTracker); } } @@ -395,7 +459,11 @@ void VPlan::execute(VPTransformState *State) { IRBuilder<> Builder(State->CFG.PrevBB->getTerminator()); auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1), "trip.count.minus.1"); - 
Value2VPValue[TCMO] = BackedgeTakenCount; + auto VF = State->VF; + Value *VTCMO = + VF == 1 ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast"); + for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) + State->set(BackedgeTakenCount, VTCMO, Part); } // 0. Set the reverse mapping from VPValues to Values for code generation. @@ -533,15 +601,10 @@ void VPlanPrinter::dump() { OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan"; if (!Plan.getName().empty()) OS << "\\n" << DOT::EscapeString(Plan.getName()); - if (!Plan.Value2VPValue.empty() || Plan.BackedgeTakenCount) { - OS << ", where:"; - if (Plan.BackedgeTakenCount) - OS << "\\n" << *Plan.BackedgeTakenCount << " := BackedgeTakenCount"; - for (auto Entry : Plan.Value2VPValue) { - OS << "\\n" << *Entry.second; - OS << DOT::EscapeString(" := "); - Entry.first->printAsOperand(OS, false); - } + if (Plan.BackedgeTakenCount) { + OS << ", where:\\n"; + Plan.BackedgeTakenCount->print(OS, SlotTracker); + OS << " := BackedgeTakenCount"; } OS << "\"]\n"; OS << "node [shape=rect, fontname=Courier, fontsize=30]\n"; @@ -605,25 +668,28 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { if (Pred) { OS << " +\n" << Indent << " \"BlockPredicate: "; if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) { - PredI->printAsOperand(OS); + PredI->printAsOperand(OS, SlotTracker); OS << " (" << DOT::EscapeString(PredI->getParent()->getName()) << ")\\l\""; } else - Pred->printAsOperand(OS); + Pred->printAsOperand(OS, SlotTracker); } - for (const VPRecipeBase &Recipe : *BasicBlock) - Recipe.print(OS, Indent); + for (const VPRecipeBase &Recipe : *BasicBlock) { + OS << " +\n" << Indent; + Recipe.print(OS, Indent, SlotTracker); + OS << "\\l\""; + } // Dump the condition bit. const VPValue *CBV = BasicBlock->getCondBit(); if (CBV) { OS << " +\n" << Indent << " \"CondBit: "; if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) { - CBI->printAsOperand(OS); + CBI->printAsOperand(OS, SlotTracker); OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\""; } else { - CBV->printAsOperand(OS); + CBV->printAsOperand(OS, SlotTracker); OS << "\""; } } @@ -670,83 +736,121 @@ void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) { O << DOT::EscapeString(IngredientString); } -void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent) const { - O << " +\n" << Indent << "\"WIDEN\\l\""; - for (auto &Instr : make_range(Begin, End)) - O << " +\n" << Indent << "\" " << VPlanIngredient(&Instr) << "\\l\""; +void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "\"WIDEN-CALL " << VPlanIngredient(&Ingredient); +} + +void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "\"WIDEN-SELECT" << VPlanIngredient(&Ingredient) + << (InvariantCond ? 
" (condition is loop invariant)" : ""); } -void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, - const Twine &Indent) const { - O << " +\n" << Indent << "\"WIDEN-INDUCTION"; +void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "\"WIDEN\\l\""; + O << "\" " << VPlanIngredient(&Ingredient); +} + +void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "\"WIDEN-INDUCTION"; if (Trunc) { O << "\\l\""; O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; - O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc) << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc); } else - O << " " << VPlanIngredient(IV) << "\\l\""; + O << " " << VPlanIngredient(IV); } -void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent) const { - O << " +\n" << Indent << "\"WIDEN-GEP "; +void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "\"WIDEN-GEP "; O << (IsPtrLoopInvariant ? "Inv" : "Var"); size_t IndicesNumber = IsIndexLoopInvariant.size(); for (size_t I = 0; I < IndicesNumber; ++I) O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; O << "\\l\""; - O << " +\n" << Indent << "\" " << VPlanIngredient(GEP) << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(GEP); } -void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { - O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\""; +void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "\"WIDEN-PHI " << VPlanIngredient(Phi); } -void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent) const { - O << " +\n" << Indent << "\"BLEND "; +void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "\"BLEND "; Phi->printAsOperand(O, false); O << " ="; - if (!User) { + if (getNumIncomingValues() == 1) { // Not a User of any mask: not really blending, this is a // single-predecessor phi. O << " "; - Phi->getIncomingValue(0)->printAsOperand(O, false); + getIncomingValue(0)->printAsOperand(O, SlotTracker); } else { - for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) { + for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { O << " "; - Phi->getIncomingValue(I)->printAsOperand(O, false); + getIncomingValue(I)->printAsOperand(O, SlotTracker); O << "/"; - User->getOperand(I)->printAsOperand(O); + getMask(I)->printAsOperand(O, SlotTracker); } } - O << "\\l\""; } -void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent) const { - O << " +\n" - << Indent << "\"" << (IsUniform ? "CLONE " : "REPLICATE ") +void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "\"" << (IsUniform ? 
"CLONE " : "REPLICATE ") << VPlanIngredient(Ingredient); if (AlsoPack) O << " (S->V)"; - O << "\\l\""; } -void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { - O << " +\n" - << Indent << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst) - << "\\l\""; +void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst); } -void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, - const Twine &Indent) const { - O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr); +void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "\"WIDEN " << VPlanIngredient(&Instr); O << ", "; - getAddr()->printAsOperand(O); + getAddr()->printAsOperand(O, SlotTracker); VPValue *Mask = getMask(); if (Mask) { O << ", "; - Mask->printAsOperand(O); + Mask->printAsOperand(O, SlotTracker); } - O << "\\l\""; +} + +void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { + Value *CanonicalIV = State.CanonicalIV; + Type *STy = CanonicalIV->getType(); + IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); + auto VF = State.VF; + Value *VStart = VF == 1 + ? CanonicalIV + : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + SmallVector<Constant *, 8> Indices; + for (unsigned Lane = 0; Lane < VF; ++Lane) + Indices.push_back(ConstantInt::get(STy, Part * VF + Lane)); + // If VF == 1, there is only one iteration in the loop above, thus the + // element pushed back into Indices is ConstantInt::get(STy, Part) + Constant *VStep = VF == 1 ? Indices.back() : ConstantVector::get(Indices); + // Add the consecutive indices to the vector value. 
+ Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); + State.set(getVPValue(), CanonicalVectorIV, Part); + } +} + +void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "\"EMIT "; + getVPValue()->printAsOperand(O, SlotTracker); + O << " = WIDEN-CANONICAL-INDUCTION"; } template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT); @@ -758,6 +862,21 @@ void VPValue::replaceAllUsesWith(VPValue *New) { User->setOperand(I, New); } +void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const { + if (const Value *UV = getUnderlyingValue()) { + OS << "ir<"; + UV->printAsOperand(OS, false); + OS << ">"; + return; + } + + unsigned Slot = Tracker.getSlot(this); + if (Slot == unsigned(-1)) + OS << "<badref>"; + else + OS << "vp<%" << Tracker.getSlot(this) << ">"; +} + void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New, InterleavedAccessInfo &IAI) { @@ -781,7 +900,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, auto NewIGIter = Old2New.find(IG); if (NewIGIter == Old2New.end()) Old2New[IG] = new InterleaveGroup<VPInstruction>( - IG->getFactor(), IG->isReverse(), Align(IG->getAlignment())); + IG->getFactor(), IG->isReverse(), IG->getAlign()); if (Inst == IG->getInsertPos()) Old2New[IG]->setInsertPos(VPInst); @@ -803,3 +922,57 @@ VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan, Old2NewTy Old2New; visitRegion(cast<VPRegionBlock>(Plan.getEntry()), Old2New, IAI); } + +void VPSlotTracker::assignSlot(const VPValue *V) { + assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!"); + const Value *UV = V->getUnderlyingValue(); + if (UV) + return; + const auto *VPI = dyn_cast<VPInstruction>(V); + if (VPI && !VPI->hasResult()) + return; + + Slots[V] = NextSlot++; +} + +void VPSlotTracker::assignSlots(const VPBlockBase *VPBB) { + if (auto *Region = dyn_cast<VPRegionBlock>(VPBB)) + assignSlots(Region); + else + assignSlots(cast<VPBasicBlock>(VPBB)); +} + +void VPSlotTracker::assignSlots(const VPRegionBlock *Region) { + ReversePostOrderTraversal<const VPBlockBase *> RPOT(Region->getEntry()); + for (const VPBlockBase *Block : RPOT) + assignSlots(Block); +} + +void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) { + for (const VPRecipeBase &Recipe : *VPBB) { + if (const auto *VPI = dyn_cast<VPInstruction>(&Recipe)) + assignSlot(VPI); + else if (const auto *VPIV = dyn_cast<VPWidenCanonicalIVRecipe>(&Recipe)) + assignSlot(VPIV->getVPValue()); + } +} + +void VPSlotTracker::assignSlots(const VPlan &Plan) { + + for (const VPValue *V : Plan.VPExternalDefs) + assignSlot(V); + + for (auto &E : Plan.Value2VPValue) + if (!isa<VPInstruction>(E.second)) + assignSlot(E.second); + + for (const VPValue *V : Plan.VPCBVs) + assignSlot(V); + + if (Plan.BackedgeTakenCount) + assignSlot(Plan.BackedgeTakenCount); + + ReversePostOrderTraversal<const VPBlockBase *> RPOT(Plan.getEntry()); + for (const VPBlockBase *Block : RPOT) + assignSlots(Block); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c65abc3639d73..f07c94e7a3c7d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -48,8 +48,6 @@ namespace llvm { -class LoopVectorizationLegality; -class LoopVectorizationCostModel; class BasicBlock; class DominatorTree; class InnerLoopVectorizer; @@ -59,6 +57,7 @@ class raw_ostream; class Value; class VPBasicBlock; class VPRegionBlock; 
+class VPSlotTracker; class VPlan; class VPlanSlp; @@ -271,10 +270,20 @@ struct VPTransformState { return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part); } - /// Get the generated Value for a given VPValue and given Part and Lane. Note - /// that as per-lane Defs are still created by ILV and managed in its ValueMap - /// this method currently just delegates the call to ILV. + /// Get the generated Value for a given VPValue and given Part and Lane. Value *get(VPValue *Def, const VPIteration &Instance) { + // If the Def is managed directly by VPTransformState, extract the lane from + // the relevant part. Note that currently only VPInstructions and external + // defs are managed by VPTransformState. Other Defs are still created by ILV + // and managed in its ValueMap. For those this method currently just + // delegates the call to ILV below. + if (Data.PerPartOutput.count(Def)) { + auto *VecPart = Data.PerPartOutput[Def][Instance.Part]; + // TODO: Cache created scalar values. + return Builder.CreateExtractElement(VecPart, + Builder.getInt32(Instance.Lane)); + } + return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance); } @@ -329,6 +338,9 @@ struct VPTransformState { /// Values they correspond to. VPValue2ValueTy VPValue2Value; + /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF). + Value *CanonicalIV = nullptr; + /// Hold the trip count of the scalar loop. Value *TripCount = nullptr; @@ -343,7 +355,6 @@ struct VPTransformState { class VPBlockBase { friend class VPBlockUtils; -private: const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). /// An optional name for the block. @@ -365,6 +376,10 @@ private: /// Current block predicate - null if the block does not need a predicate. VPValue *Predicate = nullptr; + /// VPlan containing the block. Can only be set on the entry block of the + /// plan. + VPlan *Plan = nullptr; + /// Add \p Successor as the last successor to this block. void appendSuccessor(VPBlockBase *Successor) { assert(Successor && "Cannot add nullptr successor!"); @@ -418,6 +433,14 @@ public: VPRegionBlock *getParent() { return Parent; } const VPRegionBlock *getParent() const { return Parent; } + /// \return A pointer to the plan containing the current block. + VPlan *getPlan(); + const VPlan *getPlan() const; + + /// Sets the pointer of the plan containing the block. The block must be the + /// entry block into the VPlan. + void setPlan(VPlan *ParentPlan); + void setParent(VPRegionBlock *P) { Parent = P; } /// \return the VPBasicBlock that is the entry of this VPBlockBase, @@ -579,7 +602,6 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> { friend VPBasicBlock; friend class VPBlockUtils; -private: const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). /// Each VPRecipe belongs to a single VPBasicBlock. @@ -597,11 +619,14 @@ public: VPInterleaveSC, VPPredInstPHISC, VPReplicateSC, + VPWidenCallSC, + VPWidenCanonicalIVSC, VPWidenGEPSC, VPWidenIntOrFpInductionSC, VPWidenMemoryInstructionSC, VPWidenPHISC, VPWidenSC, + VPWidenSelectSC }; VPRecipeBase(const unsigned char SC) : SubclassID(SC) {} @@ -621,7 +646,8 @@ public: virtual void execute(struct VPTransformState &State) = 0; /// Each recipe prints itself. 
- virtual void print(raw_ostream &O, const Twine &Indent) const = 0; + virtual void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const = 0; /// Insert an unlinked recipe into a basic block immediately before /// the specified recipe. @@ -659,6 +685,7 @@ public: ICmpULE, SLPLoad, SLPStore, + ActiveLaneMask, }; private: @@ -707,10 +734,12 @@ public: void execute(VPTransformState &State) override; /// Print the Recipe. - void print(raw_ostream &O, const Twine &Indent) const override; + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; /// Print the VPInstruction. void print(raw_ostream &O) const; + void print(raw_ostream &O, VPSlotTracker &SlotTracker) const; /// Return true if this instruction may modify memory. bool mayWriteToMemory() const { @@ -719,23 +748,42 @@ public: return Opcode == Instruction::Store || Opcode == Instruction::Call || Opcode == Instruction::Invoke || Opcode == SLPStore; } + + bool hasResult() const { + // CallInst may or may not have a result, depending on the called function. + // Conservatively return calls have results for now. + switch (getOpcode()) { + case Instruction::Ret: + case Instruction::Br: + case Instruction::Store: + case Instruction::Switch: + case Instruction::IndirectBr: + case Instruction::Resume: + case Instruction::CatchRet: + case Instruction::Unreachable: + case Instruction::Fence: + case Instruction::AtomicRMW: + return false; + default: + return true; + } + } }; -/// VPWidenRecipe is a recipe for producing a copy of vector type for each -/// Instruction in its ingredients independently, in order. This recipe covers -/// most of the traditional vectorization cases where each ingredient transforms -/// into a vectorized version of itself. +/// VPWidenRecipe is a recipe for producing a copy of vector type its +/// ingredient. This recipe covers most of the traditional vectorization cases +/// where each ingredient transforms into a vectorized version of itself. class VPWidenRecipe : public VPRecipeBase { -private: - /// Hold the ingredients by pointing to their original BasicBlock location. - BasicBlock::iterator Begin; - BasicBlock::iterator End; + /// Hold the instruction to be widened. + Instruction &Ingredient; + + /// Hold VPValues for the operands of the ingredient. + VPUser User; public: - VPWidenRecipe(Instruction *I) : VPRecipeBase(VPWidenSC) { - End = I->getIterator(); - Begin = End++; - } + template <typename IterT> + VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands) + : VPRecipeBase(VPWidenSC), Ingredient(I), User(Operands) {} ~VPWidenRecipe() override = default; @@ -747,28 +795,88 @@ public: /// Produce widened copies of all Ingredients. void execute(VPTransformState &State) override; - /// Augment the recipe to include Instr, if it lies at its End. - bool appendInstruction(Instruction *Instr) { - if (End != Instr->getIterator()) - return false; - End++; - return true; + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +}; + +/// A recipe for widening Call instructions. +class VPWidenCallRecipe : public VPRecipeBase { + /// Hold the call to be widened. + CallInst &Ingredient; + + /// Hold VPValues for the arguments of the call. 
+ VPUser User; + +public: + template <typename IterT> + VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments) + : VPRecipeBase(VPWidenCallSC), Ingredient(I), User(CallArguments) {} + + ~VPWidenCallRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenCallSC; } + /// Produce a widened version of the call instruction. + void execute(VPTransformState &State) override; + /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override; + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +}; + +/// A recipe for widening select instructions. +class VPWidenSelectRecipe : public VPRecipeBase { +private: + /// Hold the select to be widened. + SelectInst &Ingredient; + + /// Hold VPValues for the operands of the select. + VPUser User; + + /// Is the condition of the select loop invariant? + bool InvariantCond; + +public: + template <typename IterT> + VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands, + bool InvariantCond) + : VPRecipeBase(VPWidenSelectSC), Ingredient(I), User(Operands), + InvariantCond(InvariantCond) {} + + ~VPWidenSelectRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC; + } + + /// Produce a widened version of the select instruction. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; }; /// A recipe for handling GEP instructions. class VPWidenGEPRecipe : public VPRecipeBase { -private: GetElementPtrInst *GEP; + + /// Hold VPValues for the base and indices of the GEP. + VPUser User; + bool IsPtrLoopInvariant; SmallBitVector IsIndexLoopInvariant; public: - VPWidenGEPRecipe(GetElementPtrInst *GEP, Loop *OrigLoop) - : VPRecipeBase(VPWidenGEPSC), GEP(GEP), + template <typename IterT> + VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands, + Loop *OrigLoop) + : VPRecipeBase(VPWidenGEPSC), GEP(GEP), User(Operands), IsIndexLoopInvariant(GEP->getNumIndices(), false) { IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand()); for (auto Index : enumerate(GEP->indices())) @@ -786,13 +894,13 @@ public: void execute(VPTransformState &State) override; /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override; + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; }; /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { -private: PHINode *IV; TruncInst *Trunc; @@ -811,12 +919,12 @@ public: void execute(VPTransformState &State) override; /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override; + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; }; /// A recipe for handling all phi nodes except for integer and FP inductions. class VPWidenPHIRecipe : public VPRecipeBase { -private: PHINode *Phi; public: @@ -832,26 +940,27 @@ public: void execute(VPTransformState &State) override; /// Print the recipe. 
- void print(raw_ostream &O, const Twine &Indent) const override; + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; }; /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. class VPBlendRecipe : public VPRecipeBase { -private: PHINode *Phi; - /// The blend operation is a User of a mask, if not null. - std::unique_ptr<VPUser> User; + /// The blend operation is a User of the incoming values and of their + /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value + /// might be incoming with a full mask for which there is no VPValue. + VPUser User; public: - VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Masks) - : VPRecipeBase(VPBlendSC), Phi(Phi) { - assert((Phi->getNumIncomingValues() == 1 || - Phi->getNumIncomingValues() == Masks.size()) && - "Expected the same number of incoming values and masks"); - if (!Masks.empty()) - User.reset(new VPUser(Masks)); + VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands) + : VPRecipeBase(VPBlendSC), Phi(Phi), User(Operands) { + assert(Operands.size() > 0 && + ((Operands.size() == 1) || (Operands.size() % 2 == 0)) && + "Expected either a single incoming value or a positive even number " + "of operands"); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -859,17 +968,31 @@ public: return V->getVPRecipeID() == VPRecipeBase::VPBlendSC; } + /// Return the number of incoming values, taking into account that a single + /// incoming value has no mask. + unsigned getNumIncomingValues() const { + return (User.getNumOperands() + 1) / 2; + } + + /// Return incoming value number \p Idx. + VPValue *getIncomingValue(unsigned Idx) const { + return User.getOperand(Idx * 2); + } + + /// Return mask number \p Idx. + VPValue *getMask(unsigned Idx) const { return User.getOperand(Idx * 2 + 1); } + /// Generate the phi/select nodes. void execute(VPTransformState &State) override; /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override; + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; }; /// VPInterleaveRecipe is a recipe for transforming an interleave group of load /// or stores into one wide load/store and shuffles. class VPInterleaveRecipe : public VPRecipeBase { -private: const InterleaveGroup<Instruction> *IG; VPUser User; @@ -903,7 +1026,8 @@ public: void execute(VPTransformState &State) override; /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override; + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; } }; @@ -913,10 +1037,12 @@ public: /// single copy of widened type for all lanes. If the instruction is known to be /// uniform only one copy, per lane zero, will be generated. class VPReplicateRecipe : public VPRecipeBase { -private: /// The instruction being replicated. Instruction *Ingredient; + /// Hold VPValues for the operands of the ingredient. + VPUser User; + /// Indicator if only a single replica per lane is needed. 
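// VPBlendRecipe above keeps its operands interleaved as [I0, M0, I1, M1, ...]
// (or a single unmasked incoming value), so the accessors use simple index
// arithmetic: incoming value Idx is operand 2*Idx and its mask is operand
// 2*Idx + 1. A standalone sketch of that layout; strings stand in for
// VPValues and are not the VPlan types:
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

struct BlendSketch {
  std::vector<std::string> Operands; // [I0, M0, I1, M1, ...] or [I0]

  std::size_t getNumIncomingValues() const { return (Operands.size() + 1) / 2; }
  std::string getIncomingValue(std::size_t Idx) const { return Operands[Idx * 2]; }
  std::string getMask(std::size_t Idx) const { return Operands[Idx * 2 + 1]; }
};

int main() {
  BlendSketch B{{"%a", "%mask.a", "%b", "%mask.b"}};
  assert(B.getNumIncomingValues() == 2);
  assert(B.getIncomingValue(1) == "%b" && B.getMask(1) == "%mask.b");

  // A single incoming value carries no mask at all.
  BlendSketch Single{{"%only"}};
  assert(Single.getNumIncomingValues() == 1 && Single.getIncomingValue(0) == "%only");
  return 0;
}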
bool IsUniform; @@ -927,9 +1053,11 @@ private: bool AlsoPack; public: - VPReplicateRecipe(Instruction *I, bool IsUniform, bool IsPredicated = false) - : VPRecipeBase(VPReplicateSC), Ingredient(I), IsUniform(IsUniform), - IsPredicated(IsPredicated) { + template <typename IterT> + VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands, + bool IsUniform, bool IsPredicated = false) + : VPRecipeBase(VPReplicateSC), Ingredient(I), User(Operands), + IsUniform(IsUniform), IsPredicated(IsPredicated) { // Retain the previous behavior of predicateInstructions(), where an // insert-element of a predicated instruction got hoisted into the // predicated basic block iff it was its only user. This is achieved by @@ -953,18 +1081,18 @@ public: void setAlsoPack(bool Pack) { AlsoPack = Pack; } /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override; + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; }; /// A recipe for generating conditional branches on the bits of a mask. class VPBranchOnMaskRecipe : public VPRecipeBase { -private: - std::unique_ptr<VPUser> User; + VPUser User; public: VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) { if (BlockInMask) // nullptr means all-one mask. - User.reset(new VPUser({BlockInMask})); + User.addOperand(BlockInMask); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -977,14 +1105,23 @@ public: void execute(VPTransformState &State) override; /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override { + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override { O << " +\n" << Indent << "\"BRANCH-ON-MASK "; - if (User) - O << *User->getOperand(0); + if (VPValue *Mask = getMask()) + Mask->print(O, SlotTracker); else O << " All-One"; O << "\\l\""; } + + /// Return the mask used by this recipe. Note that a full mask is represented + /// by a nullptr. + VPValue *getMask() const { + assert(User.getNumOperands() <= 1 && "should have either 0 or 1 operands"); + // Mask is optional. + return User.getNumOperands() == 1 ? User.getOperand(0) : nullptr; + } }; /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when @@ -993,7 +1130,6 @@ public: /// The phi nodes can be scalar or vector depending on the users of the value. /// This recipe works in concert with VPBranchOnMaskRecipe. class VPPredInstPHIRecipe : public VPRecipeBase { -private: Instruction *PredInst; public: @@ -1012,23 +1148,42 @@ public: void execute(VPTransformState &State) override; /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override; + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; }; /// A Recipe for widening load/store operations. +/// The recipe uses the following VPValues: +/// - For load: Address, optional mask +/// - For store: Address, stored value, optional mask /// TODO: We currently execute only per-part unless a specific instance is /// provided. 
 class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
-private:
   Instruction &Instr;
   VPUser User;
 
+  void setMask(VPValue *Mask) {
+    if (!Mask)
+      return;
+    User.addOperand(Mask);
+  }
+
+  bool isMasked() const {
+    return (isa<LoadInst>(Instr) && User.getNumOperands() == 2) ||
+           (isa<StoreInst>(Instr) && User.getNumOperands() == 3);
+  }
+
 public:
-  VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Addr,
-                                 VPValue *Mask)
-      : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr), User({Addr}) {
-    if (Mask)
-      User.addOperand(Mask);
+  VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask)
+      : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Load), User({Addr}) {
+    setMask(Mask);
+  }
+
+  VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
+                                 VPValue *StoredValue, VPValue *Mask)
+      : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Store),
+        User({Addr, StoredValue}) {
+    setMask(Mask);
   }
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -1044,15 +1199,52 @@ public:
   /// Return the mask used by this recipe. Note that a full mask is represented
   /// by a nullptr.
   VPValue *getMask() const {
-    // Mask is optional and therefore the last, currently 2nd operand.
-    return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr;
+    // Mask is optional and therefore the last operand.
+    return isMasked() ? User.getOperand(User.getNumOperands() - 1) : nullptr;
+  }
+
+  /// Return the value stored by this recipe.
+  VPValue *getStoredValue() const {
+    assert(isa<StoreInst>(Instr) &&
+           "Stored value only available for store instructions");
+    return User.getOperand(1); // Stored value is the 2nd, mandatory operand.
   }
 
   /// Generate the wide load/store.
   void execute(VPTransformState &State) override;
 
   /// Print the recipe.
-  void print(raw_ostream &O, const Twine &Indent) const override;
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+};
+
+/// A Recipe for widening the canonical induction variable of the vector loop.
+class VPWidenCanonicalIVRecipe : public VPRecipeBase {
+  /// A VPValue representing the canonical vector IV.
+  VPValue Val;
+
+public:
+  VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) {}
+  ~VPWidenCanonicalIVRecipe() override = default;
+
+  /// Return the VPValue representing the canonical vector induction variable of
+  /// the vector loop.
+  const VPValue *getVPValue() const { return &Val; }
+  VPValue *getVPValue() { return &Val; }
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPWidenCanonicalIVSC;
+  }
+
+  /// Generate a canonical vector induction variable of the vector loop, with
+  ///   start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
+  ///   step = <VF*UF, VF*UF, ..., VF*UF>.
+  void execute(VPTransformState &State) override;
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
 };
 
 /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
@@ -1144,7 +1336,6 @@ private:
 /// candidate VF's. The actual replication takes place only once the desired VF
 /// and UF have been determined.
 class VPRegionBlock : public VPBlockBase {
-private:
   /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
   VPBlockBase *Entry;
 
@@ -1347,8 +1538,8 @@ struct GraphTraits<Inverse<VPRegionBlock *>>
 /// VPBlock.
 class VPlan {
   friend class VPlanPrinter;
+  friend class VPSlotTracker;
 
-private:
   /// Hold the single entry to the Hierarchical CFG of the VPlan.
   VPBlockBase *Entry;
 
@@ -1380,16 +1571,18 @@ private:
   SmallVector<VPValue *, 4> VPCBVs;
 
 public:
-  VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {}
+  VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
+    if (Entry)
+      Entry->setPlan(this);
+  }
 
   ~VPlan() {
     if (Entry)
       VPBlockBase::deleteCFG(Entry);
     for (auto &MapEntry : Value2VPValue)
-      if (MapEntry.second != BackedgeTakenCount)
-        delete MapEntry.second;
+      delete MapEntry.second;
     if (BackedgeTakenCount)
-      delete BackedgeTakenCount; // Delete once, if in Value2VPValue or not.
+      delete BackedgeTakenCount;
     for (VPValue *Def : VPExternalDefs)
       delete Def;
     for (VPValue *CBV : VPCBVs)
@@ -1402,7 +1595,11 @@ public:
   VPBlockBase *getEntry() { return Entry; }
   const VPBlockBase *getEntry() const { return Entry; }
 
-  VPBlockBase *setEntry(VPBlockBase *Block) { return Entry = Block; }
+  VPBlockBase *setEntry(VPBlockBase *Block) {
+    Entry = Block;
+    Block->setPlan(this);
+    return Entry;
+  }
 
   /// The backedge taken count of the original loop.
   VPValue *getOrCreateBackedgeTakenCount() {
@@ -1433,7 +1630,7 @@ public:
   void addVPValue(Value *V) {
     assert(V && "Trying to add a null Value to VPlan");
     assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
-    Value2VPValue[V] = new VPValue();
+    Value2VPValue[V] = new VPValue(V);
   }
 
   VPValue *getVPValue(Value *V) {
@@ -1456,6 +1653,16 @@ public:
   /// Dump the plan to stderr (for debugging).
   void dump() const;
 
+  /// Returns a range mapping the values in the range \p Operands to their
+  /// corresponding VPValues.
+  iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
+  mapToVPValues(User::op_range Operands) {
+    std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
+      return getOrAddVPValue(Op);
+    };
+    return map_range(Operands, Fn);
+  }
+
 private:
   /// Add to the given dominator tree the header block and every new basic block
   /// that was created between it and the latch block, inclusive.
@@ -1480,7 +1687,10 @@ private:
   unsigned BID = 0;
   SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
 
-  VPlanPrinter(raw_ostream &O, const VPlan &P) : OS(O), Plan(P) {}
+  VPSlotTracker SlotTracker;
+
+  VPlanPrinter(raw_ostream &O, const VPlan &P)
+      : OS(O), Plan(P), SlotTracker(&P) {}
 
   /// Handle indentation.
   void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
@@ -1635,7 +1845,6 @@ public:
 };
 
 class VPInterleavedAccessInfo {
-private:
   DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
       InterleaveGroupMap;
 
@@ -1679,7 +1888,6 @@ public:
 /// Class that maps (parts of) an existing VPlan to trees of combined
 /// VPInstructions.
 class VPlanSlp {
-private:
   enum class OpMode { Failed, Load, Opcode };
 
   /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
diff --git a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
index 19f5d2c00c604..a42ebc9ee955f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
@@ -30,7 +30,8 @@ using VPDomTreeNode = DomTreeNodeBase<VPBlockBase>;
 
 /// Template specializations of GraphTraits for VPDomTreeNode.
template <> struct GraphTraits<VPDomTreeNode *> - : public DomTreeGraphTraitsBase<VPDomTreeNode, VPDomTreeNode::iterator> {}; + : public DomTreeGraphTraitsBase<VPDomTreeNode, + VPDomTreeNode::const_iterator> {}; template <> struct GraphTraits<const VPDomTreeNode *> diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 3f6a2efd55ccb..3a4872a721221 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -18,7 +18,7 @@ using namespace llvm; void VPlanTransforms::VPInstructionsToVPRecipes( Loop *OrigLoop, VPlanPtr &Plan, - LoopVectorizationLegality::InductionList *Inductions, + LoopVectorizationLegality::InductionList &Inductions, SmallPtrSetImpl<Instruction *> &DeadInstructions) { auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry()); @@ -41,7 +41,6 @@ void VPlanTransforms::VPInstructionsToVPRecipes( continue; VPBasicBlock *VPBB = Base->getEntryBasicBlock(); - VPRecipeBase *LastRecipe = nullptr; // Introduce each ingredient into VPlan. for (auto I = VPBB->begin(), E = VPBB->end(); I != E;) { VPRecipeBase *Ingredient = &*I++; @@ -55,33 +54,29 @@ void VPlanTransforms::VPInstructionsToVPRecipes( VPRecipeBase *NewRecipe = nullptr; // Create VPWidenMemoryInstructionRecipe for loads and stores. - if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst)) + if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) NewRecipe = new VPWidenMemoryInstructionRecipe( - *Inst, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)), + *Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)), nullptr /*Mask*/); + else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) + NewRecipe = new VPWidenMemoryInstructionRecipe( + *Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)), + Plan->getOrAddVPValue(Store->getValueOperand()), nullptr /*Mask*/); else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) { - InductionDescriptor II = Inductions->lookup(Phi); + InductionDescriptor II = Inductions.lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction || II.getKind() == InductionDescriptor::IK_FpInduction) { NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi); } else NewRecipe = new VPWidenPHIRecipe(Phi); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { - NewRecipe = new VPWidenGEPRecipe(GEP, OrigLoop); - } else { - // If the last recipe is a VPWidenRecipe, add Inst to it instead of - // creating a new recipe. - if (VPWidenRecipe *WidenRecipe = - dyn_cast_or_null<VPWidenRecipe>(LastRecipe)) { - WidenRecipe->appendInstruction(Inst); - Ingredient->eraseFromParent(); - continue; - } - NewRecipe = new VPWidenRecipe(Inst); - } + NewRecipe = new VPWidenGEPRecipe( + GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop); + } else + NewRecipe = + new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands())); NewRecipe->insertBefore(Ingredient); - LastRecipe = NewRecipe; Ingredient->eraseFromParent(); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 0d3bd7da09a70..4b20e8b4e3b31 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -19,14 +19,12 @@ namespace llvm { -class VPlanTransforms { - -public: +struct VPlanTransforms { /// Replaces the VPInstructions in \p Plan with corresponding /// widen recipes. 
static void VPInstructionsToVPRecipes( Loop *OrigLoop, VPlanPtr &Plan, - LoopVectorizationLegality::InductionList *Inductions, + LoopVectorizationLegality::InductionList &Inductions, SmallPtrSetImpl<Instruction *> &DeadInstructions); }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 464498c29d89e..f73505d0279af 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -22,13 +22,14 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Value.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/iterator_range.h" namespace llvm { // Forward declarations. +class raw_ostream; +class Value; +class VPSlotTracker; class VPUser; // This is the base class of the VPlan Def/Use graph, used for modeling the data @@ -37,11 +38,11 @@ class VPUser; // and live-outs which the VPlan will need to fix accordingly. class VPValue { friend class VPBuilder; - friend class VPlanTransforms; + friend struct VPlanTransforms; friend class VPBasicBlock; friend class VPInterleavedAccessInfo; + friend class VPSlotTracker; -private: const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). SmallVector<VPUser *, 1> Users; @@ -62,6 +63,7 @@ protected: /// Return the underlying Value attached to this VPValue. Value *getUnderlyingValue() { return UnderlyingVal; } + const Value *getUnderlyingValue() const { return UnderlyingVal; } // Set \p Val as the underlying Value of this VPValue. void setUnderlyingValue(Value *Val) { @@ -85,9 +87,8 @@ public: /// for any other purpose, as the values may change as LLVM evolves. unsigned getVPValueID() const { return SubclassID; } - void printAsOperand(raw_ostream &OS) const { - OS << "%vp" << (unsigned short)(unsigned long long)this; - } + void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const; + void print(raw_ostream &OS, VPSlotTracker &Tracker) const; unsigned getNumUsers() const { return Users.size(); } void addUser(VPUser &User) { Users.push_back(&User); } @@ -129,7 +130,6 @@ raw_ostream &operator<<(raw_ostream &OS, const VPValue &V); /// This class augments VPValue with operands which provide the inverse def-use /// edges from VPValue's users to their defs. class VPUser : public VPValue { -private: SmallVector<VPValue *, 2> Operands; protected: @@ -144,6 +144,12 @@ public: VPUser(ArrayRef<VPValue *> Operands) : VPUser(VPValue::VPUserSC, Operands) {} VPUser(std::initializer_list<VPValue *> Operands) : VPUser(ArrayRef<VPValue *>(Operands)) {} + template <typename IterT> + VPUser(iterator_range<IterT> Operands) : VPValue(VPValue::VPUserSC) { + for (VPValue *Operand : Operands) + addOperand(Operand); + } + VPUser(const VPUser &) = delete; VPUser &operator=(const VPUser &) = delete; @@ -180,6 +186,37 @@ public: return const_operand_range(op_begin(), op_end()); } }; +class VPlan; +class VPBasicBlock; +class VPRegionBlock; + +/// This class can be used to assign consecutive numbers to all VPValues in a +/// VPlan and allows querying the numbering for printing, similar to the +/// ModuleSlotTracker for IR values. 
+class VPSlotTracker { + DenseMap<const VPValue *, unsigned> Slots; + unsigned NextSlot = 0; + + void assignSlots(const VPBlockBase *VPBB); + void assignSlots(const VPRegionBlock *Region); + void assignSlots(const VPBasicBlock *VPBB); + void assignSlot(const VPValue *V); + + void assignSlots(const VPlan &Plan); + +public: + VPSlotTracker(const VPlan *Plan) { + if (Plan) + assignSlots(*Plan); + } + + unsigned getSlot(const VPValue *V) const { + auto I = Slots.find(V); + if (I == Slots.end()) + return -1; + return I->second; + } +}; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index ab3e7e2282e77..b384c94121e9b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "VPlanVerifier.h" +#include "VPlan.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/Support/CommandLine.h" diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.h b/llvm/lib/Transforms/Vectorize/VPlanVerifier.h index 7d2b262521723..8e8de441648ad 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.h +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.h @@ -24,14 +24,12 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H #define LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H -#include "VPlan.h" - namespace llvm { +class VPRegionBlock; -/// Class with utility functions that can be used to check the consistency and +/// Struct with utility functions that can be used to check the consistency and /// invariants of a VPlan, including the components of its H-CFG. -class VPlanVerifier { -public: +struct VPlanVerifier { /// Verify the invariants of the H-CFG starting from \p TopRegion. The /// verification process comprises the following steps: /// 1. Region/Block verification: Check the Region/Block verification diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp new file mode 100644 index 0000000000000..64b41bf9cefa8 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -0,0 +1,699 @@ +//===------- VectorCombine.cpp - Optimize partial vector operations -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass optimizes scalar/vector interactions using target cost models. The +// transforms implemented here may not fit in traditional loop-based or SLP +// vectorization passes. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Vectorize.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +#define DEBUG_TYPE "vector-combine" +STATISTIC(NumVecCmp, "Number of vector compares formed"); +STATISTIC(NumVecBO, "Number of vector binops formed"); +STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed"); +STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast"); +STATISTIC(NumScalarBO, "Number of scalar binops formed"); +STATISTIC(NumScalarCmp, "Number of scalar compares formed"); + +static cl::opt<bool> DisableVectorCombine( + "disable-vector-combine", cl::init(false), cl::Hidden, + cl::desc("Disable all vector combine transforms")); + +static cl::opt<bool> DisableBinopExtractShuffle( + "disable-binop-extract-shuffle", cl::init(false), cl::Hidden, + cl::desc("Disable binop extract to shuffle transforms")); + +static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max(); + +namespace { +class VectorCombine { +public: + VectorCombine(Function &F, const TargetTransformInfo &TTI, + const DominatorTree &DT) + : F(F), Builder(F.getContext()), TTI(TTI), DT(DT) {} + + bool run(); + +private: + Function &F; + IRBuilder<> Builder; + const TargetTransformInfo &TTI; + const DominatorTree &DT; + + ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0, + ExtractElementInst *Ext1, + unsigned PreferredExtractIndex) const; + bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1, + unsigned Opcode, + ExtractElementInst *&ConvertToShuffle, + unsigned PreferredExtractIndex); + void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1, + Instruction &I); + void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1, + Instruction &I); + bool foldExtractExtract(Instruction &I); + bool foldBitcastShuf(Instruction &I); + bool scalarizeBinopOrCmp(Instruction &I); + bool foldExtractedCmps(Instruction &I); +}; +} // namespace + +static void replaceValue(Value &Old, Value &New) { + Old.replaceAllUsesWith(&New); + New.takeName(&Old); +} + +/// Determine which, if any, of the inputs should be replaced by a shuffle +/// followed by extract from a different index. +ExtractElementInst *VectorCombine::getShuffleExtract( + ExtractElementInst *Ext0, ExtractElementInst *Ext1, + unsigned PreferredExtractIndex = InvalidIndex) const { + assert(isa<ConstantInt>(Ext0->getIndexOperand()) && + isa<ConstantInt>(Ext1->getIndexOperand()) && + "Expected constant extract indexes"); + + unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue(); + unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue(); + + // If the extract indexes are identical, no shuffle is needed. 
+ if (Index0 == Index1) + return nullptr; + + Type *VecTy = Ext0->getVectorOperand()->getType(); + assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types"); + int Cost0 = TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); + int Cost1 = TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1); + + // We are extracting from 2 different indexes, so one operand must be shuffled + // before performing a vector operation and/or extract. The more expensive + // extract will be replaced by a shuffle. + if (Cost0 > Cost1) + return Ext0; + if (Cost1 > Cost0) + return Ext1; + + // If the costs are equal and there is a preferred extract index, shuffle the + // opposite operand. + if (PreferredExtractIndex == Index0) + return Ext1; + if (PreferredExtractIndex == Index1) + return Ext0; + + // Otherwise, replace the extract with the higher index. + return Index0 > Index1 ? Ext0 : Ext1; +} + +/// Compare the relative costs of 2 extracts followed by scalar operation vs. +/// vector operation(s) followed by extract. Return true if the existing +/// instructions are cheaper than a vector alternative. Otherwise, return false +/// and if one of the extracts should be transformed to a shufflevector, set +/// \p ConvertToShuffle to that extract instruction. +bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, + ExtractElementInst *Ext1, + unsigned Opcode, + ExtractElementInst *&ConvertToShuffle, + unsigned PreferredExtractIndex) { + assert(isa<ConstantInt>(Ext0->getOperand(1)) && + isa<ConstantInt>(Ext1->getOperand(1)) && + "Expected constant extract indexes"); + Type *ScalarTy = Ext0->getType(); + auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType()); + int ScalarOpCost, VectorOpCost; + + // Get cost estimates for scalar and vector versions of the operation. + bool IsBinOp = Instruction::isBinaryOp(Opcode); + if (IsBinOp) { + ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy); + VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy); + } else { + assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && + "Expected a compare"); + ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy, + CmpInst::makeCmpResultType(ScalarTy)); + VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy, + CmpInst::makeCmpResultType(VecTy)); + } + + // Get cost estimates for the extract elements. These costs will factor into + // both sequences. + unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue(); + unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue(); + + int Extract0Cost = + TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index); + int Extract1Cost = + TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index); + + // A more expensive extract will always be replaced by a splat shuffle. + // For example, if Ext0 is more expensive: + // opcode (extelt V0, Ext0), (ext V1, Ext1) --> + // extelt (opcode (splat V0, Ext0), V1), Ext1 + // TODO: Evaluate whether that always results in lowest cost. Alternatively, + // check the cost of creating a broadcast shuffle and shuffling both + // operands to element 0. + int CheapExtractCost = std::min(Extract0Cost, Extract1Cost); + + // Extra uses of the extracts mean that we include those costs in the + // vector total because those instructions will not be eliminated. + int OldCost, NewCost; + if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) { + // Handle a special case. 
If the 2 extracts are identical, adjust the + // formulas to account for that. The extra use charge allows for either the + // CSE'd pattern or an unoptimized form with identical values: + // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C + bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2) + : !Ext0->hasOneUse() || !Ext1->hasOneUse(); + OldCost = CheapExtractCost + ScalarOpCost; + NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost; + } else { + // Handle the general case. Each extract is actually a different value: + // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C + OldCost = Extract0Cost + Extract1Cost + ScalarOpCost; + NewCost = VectorOpCost + CheapExtractCost + + !Ext0->hasOneUse() * Extract0Cost + + !Ext1->hasOneUse() * Extract1Cost; + } + + ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex); + if (ConvertToShuffle) { + if (IsBinOp && DisableBinopExtractShuffle) + return true; + + // If we are extracting from 2 different indexes, then one operand must be + // shuffled before performing the vector operation. The shuffle mask is + // undefined except for 1 lane that is being translated to the remaining + // extraction lane. Therefore, it is a splat shuffle. Ex: + // ShufMask = { undef, undef, 0, undef } + // TODO: The cost model has an option for a "broadcast" shuffle + // (splat-from-element-0), but no option for a more general splat. + NewCost += + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } + + // Aggressively form a vector op if the cost is equal because the transform + // may enable further optimization. + // Codegen can reverse this transform (scalarize) if it was not profitable. + return OldCost < NewCost; +} + +/// Create a shuffle that translates (shifts) 1 element from the input vector +/// to a new element location. +static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, + unsigned NewIndex, IRBuilder<> &Builder) { + // The shuffle mask is undefined except for 1 lane that is being translated + // to the new element index. Example for OldIndex == 2 and NewIndex == 0: + // ShufMask = { 2, undef, undef, undef } + auto *VecTy = cast<FixedVectorType>(Vec->getType()); + SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem); + ShufMask[NewIndex] = OldIndex; + Value *Undef = UndefValue::get(VecTy); + return Builder.CreateShuffleVector(Vec, Undef, ShufMask, "shift"); +} + +/// Given an extract element instruction with constant index operand, shuffle +/// the source vector (shift the scalar element) to a NewIndex for extraction. +/// Return null if the input can be constant folded, so that we are not creating +/// unnecessary instructions. +static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, + unsigned NewIndex, + IRBuilder<> &Builder) { + // If the extract can be constant-folded, this code is unsimplified. Defer + // to other passes to handle that. + Value *X = ExtElt->getVectorOperand(); + Value *C = ExtElt->getIndexOperand(); + assert(isa<ConstantInt>(C) && "Expected a constant index operand"); + if (isa<Constant>(X)) + return nullptr; + + Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(), + NewIndex, Builder); + return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex)); +} + +/// Try to reduce extract element costs by converting scalar compares to vector +/// compares followed by extract. 
+/// cmp (ext0 V0, C), (ext1 V1, C) +void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0, + ExtractElementInst *Ext1, Instruction &I) { + assert(isa<CmpInst>(&I) && "Expected a compare"); + assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == + cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && + "Expected matching constant extract indexes"); + + // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C + ++NumVecCmp; + CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate(); + Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); + Value *VecCmp = Builder.CreateCmp(Pred, V0, V1); + Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand()); + replaceValue(I, *NewExt); +} + +/// Try to reduce extract element costs by converting scalar binops to vector +/// binops followed by extract. +/// bo (ext0 V0, C), (ext1 V1, C) +void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0, + ExtractElementInst *Ext1, Instruction &I) { + assert(isa<BinaryOperator>(&I) && "Expected a binary operator"); + assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == + cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && + "Expected matching constant extract indexes"); + + // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C + ++NumVecBO; + Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); + Value *VecBO = + Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1); + + // All IR flags are safe to back-propagate because any potential poison + // created in unused vector elements is discarded by the extract. + if (auto *VecBOInst = dyn_cast<Instruction>(VecBO)) + VecBOInst->copyIRFlags(&I); + + Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand()); + replaceValue(I, *NewExt); +} + +/// Match an instruction with extracted vector operands. +bool VectorCombine::foldExtractExtract(Instruction &I) { + // It is not safe to transform things like div, urem, etc. because we may + // create undefined behavior when executing those on unknown vector elements. + if (!isSafeToSpeculativelyExecute(&I)) + return false; + + Instruction *I0, *I1; + CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) && + !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1)))) + return false; + + Value *V0, *V1; + uint64_t C0, C1; + if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) || + !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) || + V0->getType() != V1->getType()) + return false; + + // If the scalar value 'I' is going to be re-inserted into a vector, then try + // to create an extract to that same element. The extract/insert can be + // reduced to a "select shuffle". + // TODO: If we add a larger pattern match that starts from an insert, this + // probably becomes unnecessary. + auto *Ext0 = cast<ExtractElementInst>(I0); + auto *Ext1 = cast<ExtractElementInst>(I1); + uint64_t InsertIndex = InvalidIndex; + if (I.hasOneUse()) + match(I.user_back(), + m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex))); + + ExtractElementInst *ExtractToChange; + if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), ExtractToChange, + InsertIndex)) + return false; + + if (ExtractToChange) { + unsigned CheapExtractIdx = ExtractToChange == Ext0 ? 
C1 : C0; + ExtractElementInst *NewExtract = + translateExtract(ExtractToChange, CheapExtractIdx, Builder); + if (!NewExtract) + return false; + if (ExtractToChange == Ext0) + Ext0 = NewExtract; + else + Ext1 = NewExtract; + } + + if (Pred != CmpInst::BAD_ICMP_PREDICATE) + foldExtExtCmp(Ext0, Ext1, I); + else + foldExtExtBinop(Ext0, Ext1, I); + + return true; +} + +/// If this is a bitcast of a shuffle, try to bitcast the source vector to the +/// destination type followed by shuffle. This can enable further transforms by +/// moving bitcasts or shuffles together. +bool VectorCombine::foldBitcastShuf(Instruction &I) { + Value *V; + ArrayRef<int> Mask; + if (!match(&I, m_BitCast( + m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask)))))) + return false; + + // Disallow non-vector casts and length-changing shuffles. + // TODO: We could allow any shuffle. + auto *DestTy = dyn_cast<VectorType>(I.getType()); + auto *SrcTy = cast<VectorType>(V->getType()); + if (!DestTy || I.getOperand(0)->getType() != SrcTy) + return false; + + // The new shuffle must not cost more than the old shuffle. The bitcast is + // moved ahead of the shuffle, so assume that it has the same cost as before. + if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy) > + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy)) + return false; + + unsigned DestNumElts = DestTy->getNumElements(); + unsigned SrcNumElts = SrcTy->getNumElements(); + SmallVector<int, 16> NewMask; + if (SrcNumElts <= DestNumElts) { + // The bitcast is from wide to narrow/equal elements. The shuffle mask can + // always be expanded to the equivalent form choosing narrower elements. + assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask"); + unsigned ScaleFactor = DestNumElts / SrcNumElts; + narrowShuffleMaskElts(ScaleFactor, Mask, NewMask); + } else { + // The bitcast is from narrow elements to wide elements. The shuffle mask + // must choose consecutive elements to allow casting first. + assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask"); + unsigned ScaleFactor = SrcNumElts / DestNumElts; + if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask)) + return false; + } + // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' + ++NumShufOfBitcast; + Value *CastV = Builder.CreateBitCast(V, DestTy); + Value *Shuf = + Builder.CreateShuffleVector(CastV, UndefValue::get(DestTy), NewMask); + replaceValue(I, *Shuf); + return true; +} + +/// Match a vector binop or compare instruction with at least one inserted +/// scalar operand and convert to scalar binop/cmp followed by insertelement. +bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { + CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + Value *Ins0, *Ins1; + if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) && + !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) + return false; + + // Do not convert the vector condition of a vector select into a scalar + // condition. That may cause problems for codegen because of differences in + // boolean formats and register-file transfers. + // TODO: Can we account for that in the cost model? 
+ bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE; + if (IsCmp) + for (User *U : I.users()) + if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value()))) + return false; + + // Match against one or both scalar values being inserted into constant + // vectors: + // vec_op VecC0, (inselt VecC1, V1, Index) + // vec_op (inselt VecC0, V0, Index), VecC1 + // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) + // TODO: Deal with mismatched index constants and variable indexes? + Constant *VecC0 = nullptr, *VecC1 = nullptr; + Value *V0 = nullptr, *V1 = nullptr; + uint64_t Index0 = 0, Index1 = 0; + if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0), + m_ConstantInt(Index0))) && + !match(Ins0, m_Constant(VecC0))) + return false; + if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1), + m_ConstantInt(Index1))) && + !match(Ins1, m_Constant(VecC1))) + return false; + + bool IsConst0 = !V0; + bool IsConst1 = !V1; + if (IsConst0 && IsConst1) + return false; + if (!IsConst0 && !IsConst1 && Index0 != Index1) + return false; + + // Bail for single insertion if it is a load. + // TODO: Handle this once getVectorInstrCost can cost for load/stores. + auto *I0 = dyn_cast_or_null<Instruction>(V0); + auto *I1 = dyn_cast_or_null<Instruction>(V1); + if ((IsConst0 && I1 && I1->mayReadFromMemory()) || + (IsConst1 && I0 && I0->mayReadFromMemory())) + return false; + + uint64_t Index = IsConst0 ? Index1 : Index0; + Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType(); + Type *VecTy = I.getType(); + assert(VecTy->isVectorTy() && + (IsConst0 || IsConst1 || V0->getType() == V1->getType()) && + (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() || + ScalarTy->isPointerTy()) && + "Unexpected types for insert element into binop or cmp"); + + unsigned Opcode = I.getOpcode(); + int ScalarOpCost, VectorOpCost; + if (IsCmp) { + ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy); + VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy); + } else { + ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy); + VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy); + } + + // Get cost estimate for the insert element. This cost will factor into + // both sequences. + int InsertCost = + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index); + int OldCost = (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + + VectorOpCost; + int NewCost = ScalarOpCost + InsertCost + + (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) + + (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost); + + // We want to scalarize unless the vector variant actually has lower cost. + if (OldCost < NewCost) + return false; + + // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) --> + // inselt NewVecC, (scalar_op V0, V1), Index + if (IsCmp) + ++NumScalarCmp; + else + ++NumScalarBO; + + // For constant cases, extract the scalar element, this should constant fold. + if (IsConst0) + V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index)); + if (IsConst1) + V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index)); + + Value *Scalar = + IsCmp ? Builder.CreateCmp(Pred, V0, V1) + : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1); + + Scalar->setName(I.getName() + ".scalar"); + + // All IR flags are safe to back-propagate. There is no potential for extra + // poison to be created by the scalar instruction. 
+ if (auto *ScalarInst = dyn_cast<Instruction>(Scalar)) + ScalarInst->copyIRFlags(&I); + + // Fold the vector constants in the original vectors into a new base vector. + Constant *NewVecC = IsCmp ? ConstantExpr::getCompare(Pred, VecC0, VecC1) + : ConstantExpr::get(Opcode, VecC0, VecC1); + Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index); + replaceValue(I, *Insert); + return true; +} + +/// Try to combine a scalar binop + 2 scalar compares of extracted elements of +/// a vector into vector operations followed by extract. Note: The SLP pass +/// may miss this pattern because of implementation problems. +bool VectorCombine::foldExtractedCmps(Instruction &I) { + // We are looking for a scalar binop of booleans. + // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1) + if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1)) + return false; + + // The compare predicates should match, and each compare should have a + // constant operand. + // TODO: Relax the one-use constraints. + Value *B0 = I.getOperand(0), *B1 = I.getOperand(1); + Instruction *I0, *I1; + Constant *C0, *C1; + CmpInst::Predicate P0, P1; + if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) || + !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) || + P0 != P1) + return false; + + // The compare operands must be extracts of the same vector with constant + // extract indexes. + // TODO: Relax the one-use constraints. + Value *X; + uint64_t Index0, Index1; + if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) || + !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))) + return false; + + auto *Ext0 = cast<ExtractElementInst>(I0); + auto *Ext1 = cast<ExtractElementInst>(I1); + ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1); + if (!ConvertToShuf) + return false; + + // The original scalar pattern is: + // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1) + CmpInst::Predicate Pred = P0; + unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp + : Instruction::ICmp; + auto *VecTy = dyn_cast<FixedVectorType>(X->getType()); + if (!VecTy) + return false; + + int OldCost = TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); + OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1); + OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2; + OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType()); + + // The proposed vector pattern is: + // vcmp = cmp Pred X, VecC + // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0 + int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0; + int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1; + auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType())); + int NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType()); + NewCost += + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy); + NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy); + NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex); + + // Aggressively form vector ops if the cost is equal because the transform + // may enable further optimization. + // Codegen can reverse this transform (scalarize) if it was not profitable. + if (OldCost < NewCost) + return false; + + // Create a vector constant from the 2 scalar constants. 
+ SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(), + UndefValue::get(VecTy->getElementType())); + CmpC[Index0] = C0; + CmpC[Index1] = C1; + Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC)); + + Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder); + Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(), + VCmp, Shuf); + Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex); + replaceValue(I, *NewExt); + ++NumVecCmpBO; + return true; +} + +/// This is the entry point for all transforms. Pass manager differences are +/// handled in the callers of this function. +bool VectorCombine::run() { + if (DisableVectorCombine) + return false; + + bool MadeChange = false; + for (BasicBlock &BB : F) { + // Ignore unreachable basic blocks. + if (!DT.isReachableFromEntry(&BB)) + continue; + // Do not delete instructions under here and invalidate the iterator. + // Walk the block forwards to enable simple iterative chains of transforms. + // TODO: It could be more efficient to remove dead instructions + // iteratively in this loop rather than waiting until the end. + for (Instruction &I : BB) { + if (isa<DbgInfoIntrinsic>(I)) + continue; + Builder.SetInsertPoint(&I); + MadeChange |= foldExtractExtract(I); + MadeChange |= foldBitcastShuf(I); + MadeChange |= scalarizeBinopOrCmp(I); + MadeChange |= foldExtractedCmps(I); + } + } + + // We're done with transforms, so remove dead instructions. + if (MadeChange) + for (BasicBlock &BB : F) + SimplifyInstructionsInBlock(&BB); + + return MadeChange; +} + +// Pass manager boilerplate below here. + +namespace { +class VectorCombineLegacyPass : public FunctionPass { +public: + static char ID; + VectorCombineLegacyPass() : FunctionPass(ID) { + initializeVectorCombineLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.setPreservesCFG(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + VectorCombine Combiner(F, TTI, DT); + return Combiner.run(); + } +}; +} // namespace + +char VectorCombineLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(VectorCombineLegacyPass, "vector-combine", + "Optimize scalar/vector ops", false, + false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(VectorCombineLegacyPass, "vector-combine", + "Optimize scalar/vector ops", false, false) +Pass *llvm::createVectorCombinePass() { + return new VectorCombineLegacyPass(); +} + +PreservedAnalyses VectorCombinePass::run(Function &F, + FunctionAnalysisManager &FAM) { + TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F); + DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); + VectorCombine Combiner(F, TTI, DT); + if (!Combiner.run()) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + PA.preserve<GlobalsAA>(); + PA.preserve<AAManager>(); + PA.preserve<BasicAA>(); + return PA; +} diff --git a/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/llvm/lib/Transforms/Vectorize/Vectorize.cpp index 
6a4f9169c2af0..0296a995ad29a 100644
--- a/llvm/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/Vectorize.cpp
@@ -21,12 +21,12 @@
 
 using namespace llvm;
 
-/// initializeVectorizationPasses - Initialize all passes linked into the
-/// Vectorization library.
+/// Initialize all passes linked into the Vectorization library.
 void llvm::initializeVectorization(PassRegistry &Registry) {
   initializeLoopVectorizePass(Registry);
   initializeSLPVectorizerPass(Registry);
   initializeLoadStoreVectorizerLegacyPassPass(Registry);
+  initializeVectorCombineLegacyPassPass(Registry);
 }
 
 void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
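
To make the new operand packing from the VPlan.h hunks above concrete: VPBlendRecipe now keeps its incoming values and masks interleaved in one VPUser operand list, [I0, M0, I1, M1, ...], and a single incoming value carries no mask. The standalone C++ sketch below mirrors only the index arithmetic behind getNumIncomingValues(), getIncomingValue() and getMask(); BlendOperands, the string operands and the optional-returning getMask are illustrative stand-ins under that assumption, not LLVM's API.

#include <cassert>
#include <cstddef>
#include <optional>
#include <string>
#include <vector>

// Stand-in for the interleaved operand list a VPBlendRecipe's VPUser holds:
// [I0, M0, I1, M1, ...]; a lone incoming value carries no mask.
struct BlendOperands {
  std::vector<std::string> Ops;

  // (N + 1) / 2 covers both the single-value case (1 operand) and the
  // even-sized interleaved case.
  std::size_t getNumIncomingValues() const { return (Ops.size() + 1) / 2; }

  // Incoming values sit at even slots, their masks at the following odd slot.
  const std::string &getIncomingValue(std::size_t Idx) const {
    return Ops[Idx * 2];
  }

  // The real recipe never queries a mask in the single-value case; this sketch
  // returns an empty optional there to keep the example total.
  std::optional<std::string> getMask(std::size_t Idx) const {
    if (Ops.size() == 1)
      return std::nullopt;
    return Ops[Idx * 2 + 1];
  }
};

int main() {
  BlendOperands Two{{"%i0", "%m0", "%i1", "%m1"}};
  assert(Two.getNumIncomingValues() == 2);
  assert(Two.getIncomingValue(1) == "%i1");
  assert(Two.getMask(1) && *Two.getMask(1) == "%m1");

  BlendOperands One{{"%i0"}}; // single incoming value, implicit all-true mask
  assert(One.getNumIncomingValues() == 1);
  assert(!One.getMask(0));
  return 0;
}

Compiled with -std=c++17, the asserts all hold; the same arithmetic is the reason the recipe's constructor insists on either a single operand or a positive even number of them.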
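
The cost gate in the new scalarizeBinopOrCmp() weighs the existing vector instruction plus the insertelements feeding it against a scalar instruction plus one re-insert, charging an extra insert only when an original insertelement has additional users. Below is a minimal sketch of just that comparison, with hand-picked cost numbers standing in for the TargetTransformInfo queries (the shouldScalarize name and the literal costs are assumptions for illustration).

#include <cassert>

// Mirror of the OldCost/NewCost comparison in scalarizeBinopOrCmp().
// IsConstN: operand N is already a constant vector (no insertelement feeds it).
// InsNHasOneUse: the insertelement feeding operand N has a single user.
bool shouldScalarize(bool IsConst0, bool IsConst1, bool Ins0HasOneUse,
                     bool Ins1HasOneUse, int ScalarOpCost, int VectorOpCost,
                     int InsertCost) {
  // Existing form: the vector op plus any insertelements it needs.
  int OldCost = (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) +
                VectorOpCost;
  // Proposed form: scalar op plus one re-insert, plus the original inserts
  // again if they have other users and so cannot be removed.
  int NewCost = ScalarOpCost + InsertCost +
                (IsConst0 ? 0 : (!Ins0HasOneUse ? InsertCost : 0)) +
                (IsConst1 ? 0 : (!Ins1HasOneUse ? InsertCost : 0));
  // Scalarize unless the vector variant is strictly cheaper.
  return OldCost >= NewCost;
}

int main() {
  // One variable operand inserted into a constant vector, single use:
  // old = insert + vector op (1 + 1), new = scalar op + re-insert (1 + 1).
  // The tie scalarizes, matching the "OldCost < NewCost" bail-out.
  assert(shouldScalarize(/*IsConst0=*/true, /*IsConst1=*/false,
                         /*Ins0HasOneUse=*/true, /*Ins1HasOneUse=*/true,
                         /*ScalarOpCost=*/1, /*VectorOpCost=*/1,
                         /*InsertCost=*/1));
  // An expensive scalar op against a cheap vector op keeps the vector form.
  assert(!shouldScalarize(true, false, true, true, /*ScalarOpCost=*/4,
                          /*VectorOpCost=*/1, /*InsertCost=*/1));
  return 0;
}

Note that ties scalarize: the pass keeps the vector form only when it is strictly cheaper, which is what the OldCost < NewCost early return in the diff expresses.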