diff options
Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 18 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 104 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h | 48 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3585 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2218 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 10 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.cpp | 302 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.h | 606 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp | 4 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanSLP.cpp | 25 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 10 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanValue.h | 212 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 8 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 207 |
14 files changed, 4875 insertions, 2482 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 9b81afbb4b6c..6ec5590d76ba 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -666,6 +666,10 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { cast<IntrinsicInst>(&I)->getIntrinsicID() == Intrinsic::sideeffect) { // Ignore llvm.sideeffect calls. + } else if (isa<IntrinsicInst>(&I) && + cast<IntrinsicInst>(&I)->getIntrinsicID() == + Intrinsic::pseudoprobe) { + // Ignore llvm.pseudoprobe calls. } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) { LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n'); @@ -762,8 +766,8 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { return Chain.slice(0, ChainIdx); } -static ChainID getChainID(const Value *Ptr, const DataLayout &DL) { - const Value *ObjPtr = GetUnderlyingObject(Ptr, DL); +static ChainID getChainID(const Value *Ptr) { + const Value *ObjPtr = getUnderlyingObject(Ptr); if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) { // The select's themselves are distinct instructions even if they share the // same condition and evaluate to consecutive pointers for true and false @@ -830,7 +834,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) { continue; // Save the load locations. - const ChainID ID = getChainID(Ptr, DL); + const ChainID ID = getChainID(Ptr); LoadRefs[ID].push_back(LI); } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { if (!SI->isSimple()) @@ -876,7 +880,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) { continue; // Save store location. 
- const ChainID ID = getChainID(Ptr, DL); + const ChainID ID = getChainID(Ptr); StoreRefs[ID].push_back(SI); } } @@ -1027,8 +1031,8 @@ bool Vectorizer::vectorizeStoreChain( unsigned EltSzInBytes = Sz / 8; unsigned SzInBytes = EltSzInBytes * ChainSize; - VectorType *VecTy; - VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy); + FixedVectorType *VecTy; + auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy); if (VecStoreTy) VecTy = FixedVectorType::get(StoreTy->getScalarType(), Chain.size() * VecStoreTy->getNumElements()); @@ -1180,7 +1184,7 @@ bool Vectorizer::vectorizeLoadChain( unsigned EltSzInBytes = Sz / 8; unsigned SzInBytes = EltSzInBytes * ChainSize; VectorType *VecTy; - VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy); + auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy); if (VecLoadTy) VecTy = FixedVectorType::get(LoadTy->getScalarType(), Chain.size() * VecLoadTy->getNumElements()); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 23613775d896..2ab0848193f6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -13,13 +13,16 @@ // pass. It should be easy to create an analysis pass around it if there // is a need (but D45420 needs to happen first). 
// + #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Transforms/Utils/SizeOpts.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" using namespace llvm; @@ -63,6 +66,7 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { return (Val <= 1); case HK_ISVECTORIZED: case HK_PREDICATE: + case HK_SCALABLE: return (Val == 0 || Val == 1); } return false; @@ -75,7 +79,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), IsVectorized("isvectorized", 0, HK_ISVECTORIZED), - Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), TheLoop(L), + Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), + Scalable("vectorize.scalable.enable", false, HK_SCALABLE), TheLoop(L), ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); @@ -88,7 +93,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, // If the vectorization width and interleaving count are both 1 then // consider the loop to have been already vectorized because there's // nothing more that we can do. 
- IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1; + IsVectorized.Value = + getWidth() == ElementCount::getFixed(1) && Interleave.Value == 1; LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n"); } @@ -161,7 +167,7 @@ void LoopVectorizeHints::emitRemarkWithHints() const { if (Force.Value == LoopVectorizeHints::FK_Enabled) { R << " (Force=" << NV("Force", true); if (Width.Value != 0) - R << ", Vector Width=" << NV("VectorWidth", Width.Value); + R << ", Vector Width=" << NV("VectorWidth", getWidth()); if (Interleave.Value != 0) R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value); R << ")"; @@ -172,11 +178,11 @@ void LoopVectorizeHints::emitRemarkWithHints() const { } const char *LoopVectorizeHints::vectorizeAnalysisPassName() const { - if (getWidth() == 1) + if (getWidth() == ElementCount::getFixed(1)) return LV_NAME; if (getForce() == LoopVectorizeHints::FK_Disabled) return LV_NAME; - if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0) + if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero()) return LV_NAME; return OptimizationRemarkAnalysis::AlwaysPrint; } @@ -227,7 +233,8 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { return; unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate}; + Hint *Hints[] = {&Width, &Interleave, &Force, + &IsVectorized, &Predicate, &Scalable}; for (auto H : Hints) { if (Name == H->Name) { if (H->validate(Val)) @@ -412,7 +419,11 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { const ValueToValueMap &Strides = getSymbolicStrides() ? 
*getSymbolicStrides() : ValueToValueMap(); - bool CanAddPredicate = !TheLoop->getHeader()->getParent()->hasOptSize(); + Function *F = TheLoop->getHeader()->getParent(); + bool OptForSize = F->hasOptSize() || + llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); + bool CanAddPredicate = !OptForSize; int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false); if (Stride == 1 || Stride == -1) return Stride; @@ -424,7 +435,7 @@ bool LoopVectorizationLegality::isUniform(Value *V) { } bool LoopVectorizationLegality::canVectorizeOuterLoop() { - assert(!TheLoop->empty() && "We are not vectorizing an outer loop."); + assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop."); // Store the result and return it at the end instead of exiting early, in case // allowExtraAnalysis is used to report multiple reasons for not vectorizing. bool Result = true; @@ -768,7 +779,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // supported on the target. if (ST->getMetadata(LLVMContext::MD_nontemporal)) { // Arbitrarily try a vector of 2 elements. - auto *VecTy = FixedVectorType::get(T, /*NumElements=*/2); + auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2); assert(VecTy && "did not find vectorized version of stored type"); if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) { reportVectorizationFailure( @@ -783,7 +794,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (LD->getMetadata(LLVMContext::MD_nontemporal)) { // For nontemporal loads, check that a nontemporal vector version is // supported on the target (arbitrarily try a vector of 2 elements). 
- auto *VecTy = FixedVectorType::get(I.getType(), /*NumElements=*/2); + auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2); assert(VecTy && "did not find vectorized version of load type"); if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) { reportVectorizationFailure( @@ -912,7 +923,10 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { } bool LoopVectorizationLegality::blockCanBePredicated( - BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, bool PreserveGuards) { + BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, + SmallPtrSetImpl<const Instruction *> &MaskedOp, + SmallPtrSetImpl<Instruction *> &ConditionalAssumes, + bool PreserveGuards) const { const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); for (Instruction &I : *BB) { @@ -930,6 +944,12 @@ bool LoopVectorizationLegality::blockCanBePredicated( continue; } + // Do not let llvm.experimental.noalias.scope.decl block the vectorization. + // TODO: there might be cases that it should block the vectorization. Let's + // ignore those for now. + if (isa<NoAliasScopeDeclInst>(&I)) + continue; + // We might be able to hoist the load. if (I.mayReadFromMemory()) { auto *LI = dyn_cast<LoadInst>(&I); @@ -999,7 +1019,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { ScalarEvolution &SE = *PSE.getSE(); for (Instruction &I : *BB) { LoadInst *LI = dyn_cast<LoadInst>(&I); - if (LI && !mustSuppressSpeculation(*LI) && + if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) && isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT)) SafePointers.insert(LI->getPointerOperand()); } @@ -1019,7 +1039,8 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // We must be able to predicate all blocks that need to be predicated. 
if (blockNeedsPredication(BB)) { - if (!blockCanBePredicated(BB, SafePointers)) { + if (!blockCanBePredicated(BB, SafePointers, MaskedOp, + ConditionalAssumes)) { reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", @@ -1044,7 +1065,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // Helper function to canVectorizeLoopNestCFG. bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, bool UseVPlanNativePath) { - assert((UseVPlanNativePath || Lp->empty()) && + assert((UseVPlanNativePath || Lp->isInnermost()) && "VPlan-native path is not enabled."); // TODO: ORE should be improved to show more accurate information when an @@ -1080,22 +1101,14 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, return false; } - // We must have a single exiting block. - if (!Lp->getExitingBlock()) { - reportVectorizationFailure("The loop must have an exiting block", - "loop control flow is not understood by vectorizer", - "CFGNotUnderstood", ORE, TheLoop); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // We only handle bottom-tested loops, i.e. loop in which the condition is - // checked at the end of each iteration. With that we can assume that all - // instructions in the loop are executed the same number of times. - if (Lp->getExitingBlock() != Lp->getLoopLatch()) { - reportVectorizationFailure("The exiting block is not the loop latch", + // We currently must have a single "exit block" after the loop. Note that + // multiple "exiting blocks" inside the loop are allowed, provided they all + // reach the single exit block. + // TODO: This restriction can be relaxed in the near future, it's here solely + // to allow separation of changes for review. We need to generalize the phi + // update logic in a number of places. 
+ if (!Lp->getUniqueExitBlock()) { + reportVectorizationFailure("The loop must have a unique exit block", "loop control flow is not understood by vectorizer", "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) @@ -1103,7 +1116,6 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, else return false; } - return Result; } @@ -1154,7 +1166,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { // Specific checks for outer loops. We skip the remaining legal checks at this // point because they don't support outer loops. - if (!TheLoop->empty()) { + if (!TheLoop->isInnermost()) { assert(UseVPlanNativePath && "VPlan-native path is not enabled."); if (!canVectorizeOuterLoop()) { @@ -1171,7 +1183,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { return Result; } - assert(TheLoop->empty() && "Inner loop expected."); + assert(TheLoop->isInnermost() && "Inner loop expected."); // Check if we can if-convert non-single-bb loops. unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { @@ -1246,10 +1258,10 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { Instruction *UI = cast<Instruction>(U); if (TheLoop->contains(UI)) continue; - reportVectorizationFailure( - "Cannot fold tail by masking, loop has an outside user for", - "Cannot fold tail by masking in the presence of live outs.", - "LiveOutFoldingTailByMasking", ORE, TheLoop, UI); + LLVM_DEBUG( + dbgs() + << "LV: Cannot fold tail by masking, loop has an outside user for " + << *UI << "\n"); return false; } } @@ -1257,20 +1269,26 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { // The list of pointers that we can safely read and write to remains empty. 
SmallPtrSet<Value *, 8> SafePointers; + SmallPtrSet<const Instruction *, 8> TmpMaskedOp; + SmallPtrSet<Instruction *, 8> TmpConditionalAssumes; + // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers, /* MaskAllLoads= */ true)) { - reportVectorizationFailure( - "Cannot fold tail by masking as required", - "control flow cannot be substituted for a select", - "NoCFGForSelect", ORE, TheLoop, - BB->getTerminator()); + if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp, + TmpConditionalAssumes, + /* MaskAllLoads= */ true)) { + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n"); return false; } } LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); + + MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end()); + ConditionalAssumes.insert(TmpConditionalAssumes.begin(), + TmpConditionalAssumes.end()); + return true; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 8dd06983cd84..1795470fa58c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -34,6 +34,7 @@ namespace llvm { class LoopVectorizationLegality; class LoopVectorizationCostModel; class PredicatedScalarEvolution; +class VPRecipeBuilder; /// VPlan-based builder utility analogous to IRBuilder. class VPBuilder { @@ -171,16 +172,22 @@ public: /// Information about vectorization costs struct VectorizationFactor { // Vector width with best cost - unsigned Width; + ElementCount Width; // Cost of the loop with that width unsigned Cost; // Width 1 means no vectorization, cost 0 means uncomputed cost. 
- static VectorizationFactor Disabled() { return {1, 0}; } + static VectorizationFactor Disabled() { + return {ElementCount::getFixed(1), 0}; + } bool operator==(const VectorizationFactor &rhs) const { return Width == rhs.Width && Cost == rhs.Cost; } + + bool operator!=(const VectorizationFactor &rhs) const { + return !(*this == rhs); + } }; /// Planner drives the vectorization process after having passed @@ -226,7 +233,10 @@ class LoopVectorizationPlanner { /// A builder used to construct the current plan. VPBuilder Builder; - unsigned BestVF = 0; + /// The best number of elements of the vector types used in the + /// transformed loop. BestVF = None means that vectorization is + /// disabled. + Optional<ElementCount> BestVF = None; unsigned BestUF = 0; public: @@ -241,14 +251,14 @@ public: /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional<VectorizationFactor> plan(unsigned UserVF, unsigned UserIC); + Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(unsigned UserVF); + VectorizationFactor planInVPlanNativePath(ElementCount UserVF); /// Finalize the best decision and dispose of all other VPlans. - void setBestPlan(unsigned VF, unsigned UF); + void setBestPlan(ElementCount VF, unsigned UF); /// Generate the IR code for the body of the vectorized loop according to the /// best selected VPlan. @@ -259,11 +269,21 @@ public: O << *Plan; } + /// Look through the existing plans and return true if we have one with all + /// the vectorization factors in question. 
+ bool hasPlanWithVFs(const ArrayRef<ElementCount> VFs) const { + return any_of(VPlans, [&](const VPlanPtr &Plan) { + return all_of(VFs, [&](const ElementCount &VF) { + return Plan->hasVF(VF); + }); + }); + } + /// Test a \p Predicate on a \p Range of VF's. Return the value of applying /// \p Predicate on Range.Start, possibly decreasing Range.End such that the /// returned value holds for the entire \p Range. static bool - getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate, + getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate, VFRange &Range); protected: @@ -275,7 +295,7 @@ protected: /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. - void buildVPlans(unsigned MinVF, unsigned MaxVF); + void buildVPlans(ElementCount MinVF, ElementCount MaxVF); private: /// Build a VPlan according to the information gathered by Legal. \return a @@ -286,14 +306,20 @@ private: /// Build a VPlan using VPRecipes according to the information gather by /// Legal. This method is only used for the legacy inner loop vectorizer. VPlanPtr buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, - SmallPtrSetImpl<Instruction *> &DeadInstructions, + VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, const DenseMap<Instruction *, Instruction *> &SinkAfter); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF); + void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + + /// Adjust the recipes for any inloop reductions. 
The chain of instructions + /// leading from the loop exit instr to the phi need to be converted to + /// reductions, with one operand being vector and the other being the scalar + /// reduction chain. + void adjustRecipesForInLoopReductions(VPlanPtr &Plan, + VPRecipeBuilder &RecipeBuilder); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 35af8e425778..ea0d7673edf6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -130,6 +130,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/InstructionCost.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -157,18 +158,37 @@ using namespace llvm; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME +#ifndef NDEBUG +const char VerboseDebug[] = DEBUG_TYPE "-verbose"; +#endif + /// @{ /// Metadata attribute names -static const char *const LLVMLoopVectorizeFollowupAll = - "llvm.loop.vectorize.followup_all"; -static const char *const LLVMLoopVectorizeFollowupVectorized = +const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; +const char LLVMLoopVectorizeFollowupVectorized[] = "llvm.loop.vectorize.followup_vectorized"; -static const char *const LLVMLoopVectorizeFollowupEpilogue = +const char LLVMLoopVectorizeFollowupEpilogue[] = "llvm.loop.vectorize.followup_epilogue"; /// @} STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); +STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); + +static cl::opt<bool> EnableEpilogueVectorization( + "enable-epilogue-vectorization", cl::init(true), cl::Hidden, + cl::desc("Enable vectorization of epilogue loops.")); + +static cl::opt<unsigned> 
EpilogueVectorizationForceVF( + "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, + cl::desc("When epilogue vectorization is enabled, and a value greater than " + "1 is specified, forces the given VF for all applicable epilogue " + "loops.")); + +static cl::opt<unsigned> EpilogueVectorizationMinVF( + "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, + cl::desc("Only loops with vectorization factor equal to or larger than " + "the specified value are considered for epilogue vectorization.")); /// Loops with a known constant trip count below this number are vectorized only /// if no scalar iteration overheads are incurred. @@ -178,13 +198,36 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); -// Indicates that an epilogue is undesired, predication is preferred. -// This means that the vectorizer will try to fold the loop-tail (epilogue) -// into the loop and predicate the loop body accordingly. -static cl::opt<bool> PreferPredicateOverEpilog( - "prefer-predicate-over-epilog", cl::init(false), cl::Hidden, - cl::desc("Indicate that an epilogue is undesired, predication should be " - "used instead.")); +// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, +// that predication is preferred, and this lists all options. I.e., the +// vectorizer will try to fold the tail-loop (epilogue) into the vector body +// and predicate the instructions accordingly. 
If tail-folding fails, there are +// different fallback strategies depending on these values: +namespace PreferPredicateTy { + enum Option { + ScalarEpilogue = 0, + PredicateElseScalarEpilogue, + PredicateOrDontVectorize + }; +} // namespace PreferPredicateTy + +static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( + "prefer-predicate-over-epilogue", + cl::init(PreferPredicateTy::ScalarEpilogue), + cl::Hidden, + cl::desc("Tail-folding and predication preferences over creating a scalar " + "epilogue loop."), + cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, + "scalar-epilogue", + "Don't tail-predicate loops, create scalar epilogue"), + clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, + "predicate-else-scalar-epilogue", + "prefer tail-folding, create scalar epilogue if tail " + "folding fails."), + clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, + "predicate-dont-vectorize", + "prefers tail-folding, don't attempt vectorization if " + "tail-folding fails."))); static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, @@ -196,7 +239,7 @@ static cl::opt<bool> EnableInterleavedMemAccesses( cl::desc("Enable vectorization on interleaved memory accesses in a loop")); /// An interleave-group may need masking if it resides in a block that needs -/// predication, or in order to mask away gaps. +/// predication, or in order to mask away gaps. static cl::opt<bool> EnableMaskedInterleavedMemAccesses( "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); @@ -230,6 +273,12 @@ static cl::opt<unsigned> ForceTargetInstructionCost( "an instruction to a single constant value. 
Mostly " "useful for getting consistent testing.")); +static cl::opt<bool> ForceTargetSupportsScalableVectors( + "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, + cl::desc( + "Pretend that scalable vectors are supported, even if the target does " + "not support them. This flag should only be used for testing.")); + static cl::opt<unsigned> SmallLoopCost( "small-loop-cost", cl::init(20), cl::Hidden, cl::desc( @@ -247,6 +296,12 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave( cl::desc( "Enable runtime interleaving until load/store ports are saturated")); +/// Interleave small loops with scalar reductions. +static cl::opt<bool> InterleaveSmallLoopScalarReduction( + "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, + cl::desc("Enable interleaving for loops with small iteration counts that " + "contain scalar reductions to expose ILP.")); + /// The number of stores in a loop that are allowed to need predication. static cl::opt<unsigned> NumberOfStoresToPredicate( "vectorize-num-stores-pred", cl::init(1), cl::Hidden, @@ -265,6 +320,17 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC( cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop.")); +static cl::opt<bool> + PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), + cl::Hidden, + cl::desc("Prefer in-loop vector reductions, " + "overriding the targets preference.")); + +static cl::opt<bool> PreferPredicatedReductionSelect( + "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, + cl::desc( + "Prefer predicating a reduction operation over an after loop select.")); + cl::opt<bool> EnableVPlanNativePath( "enable-vplan-native-path", cl::init(false), cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " @@ -307,12 +373,14 @@ static Type *getMemInstValueType(Value *I) { /// A helper function that returns true if the given type is irregular. 
The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type at the given vectorization factor. -static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { +static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { // Determine if an array of VF elements of type Ty is "bitcast compatible" // with a <VF x Ty> vector. - if (VF > 1) { - auto *VectorTy = FixedVectorType::get(Ty, VF); - return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); + if (VF.isVector()) { + auto *VectorTy = VectorType::get(Ty, VF); + return TypeSize::get(VF.getKnownMinValue() * + DL.getTypeAllocSize(Ty).getFixedValue(), + VF.isScalable()) != DL.getTypeStoreSize(VectorTy); } // If the vectorization factor is one, we just check if an array of type Ty @@ -393,29 +461,42 @@ public: LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, unsigned VecWidth, + OptimizationRemarkEmitter *ORE, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, - LoopVectorizationCostModel *CM) + LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), - VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {} + VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), + BFI(BFI), PSI(PSI) { + // Query this against the original loop and save it here because the profile + // of the original loop header may change as the transformation happens. + OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( + OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); + } + virtual ~InnerLoopVectorizer() = default; - /// Create a new empty loop. Unlink the old loop and connect the new one. 
- /// Return the pre-header block of the new loop. - BasicBlock *createVectorizedLoopSkeleton(); + /// Create a new empty loop that will contain vectorized instructions later + /// on, while the old loop will be used as the scalar remainder. Control flow + /// is generated around the vectorized (and scalar epilogue) loops consisting + /// of various checks and bypasses. Return the pre-header block of the new + /// loop. + /// In the case of epilogue vectorization, this function is overriden to + /// handle the more complex control flow around the loops. + virtual BasicBlock *createVectorizedLoopSkeleton(); /// Widen a single instruction within the innermost loop. - void widenInstruction(Instruction &I, VPUser &Operands, + void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, VPTransformState &State); /// Widen a single call instruction within the innermost loop. - void widenCallInstruction(CallInst &I, VPUser &ArgOperands, + void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, VPTransformState &State); /// Widen a single select instruction within the innermost loop. - void widenSelectInstruction(SelectInst &I, VPUser &Operands, + void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, bool InvariantCond, VPTransformState &State); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. @@ -431,14 +512,15 @@ public: /// Vectorize a single GetElementPtrInst based on information gathered and /// decisions taken during planning. - void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF, - unsigned VF, bool IsPtrLoopInvariant, + void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, + unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. 
It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); + void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, + Value *StartV, unsigned UF, ElementCount VF); /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane @@ -452,7 +534,8 @@ public: /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); + void widenIntOrFpInduction(PHINode *IV, Value *Start, + TruncInst *Trunc = nullptr); /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a /// vector or scalar value on-demand if one is not yet available. When @@ -477,6 +560,10 @@ public: /// value into a vector. Value *getOrCreateVectorValue(Value *V, unsigned Part); + void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) { + VectorLoopValueMap.setVectorValue(Scalar, Part, Vector); + } + /// Return a value in the new loop corresponding to \p V from the original /// loop at unroll and vector indices \p Instance. If the value has been /// vectorized but not scalarized, the necessary extractelement instruction @@ -491,7 +578,9 @@ public: /// BlockInMask is non-null. Use \p State to translate given VPValues to IR /// values in the vectorized loop. void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, + ArrayRef<VPValue *> VPDefs, VPTransformState &State, VPValue *Addr, + ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask = nullptr); /// Vectorize Load and Store instructions with the base address given in \p @@ -499,8 +588,8 @@ public: /// non-null. Use \p State to translate given VPValues to IR values in the /// vectorized loop. 
void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, - VPValue *Addr, VPValue *StoredValue, - VPValue *BlockInMask); + VPValue *Def, VPValue *Addr, + VPValue *StoredValue, VPValue *BlockInMask); /// Set the debug location in the builder using the debug location in /// the instruction. @@ -544,10 +633,11 @@ protected: /// Clear NSW/NUW flags from reduction instructions if necessary. void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); - /// The Loop exit block may have single value PHI nodes with some - /// incoming value. While vectorizing we only handled real values - /// that were defined inside the loop and we should have one value for - /// each predecessor of its parent basic block. See PR14725. + /// Fixup the LCSSA phi nodes in the unique exit block. This simply + /// means we need to add the appropriate incoming value from the middle + /// block as exiting edges from the scalar epilogue loop (if present) are + /// already in place, and we exit the vector loop exclusively to the middle + /// block. void fixLCSSAPHIs(); /// Iteratively sink the scalarized operands of a predicated instruction into @@ -586,7 +676,8 @@ protected: /// truncate instruction, instead of widening the original IV, we widen a /// version of the IV truncated to \p EntryVal's type. void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, - Value *Step, Instruction *EntryVal); + Value *Step, Value *Start, + Instruction *EntryVal); /// Returns true if an instruction \p I should be scalarized instead of /// vectorized for the chosen vectorization factor. @@ -654,6 +745,28 @@ protected: const DataLayout &DL, const InductionDescriptor &ID) const; + /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, + /// vector loop preheader, middle block and scalar preheader. Also + /// allocate a loop object for the new vector loop and return it. 
+ Loop *createVectorLoopSkeleton(StringRef Prefix); + + /// Create new phi nodes for the induction variables to resume iteration count + /// in the scalar epilogue, from where the vectorized loop left off (given by + /// \p VectorTripCount). + /// In cases where the loop skeleton is more complicated (eg. epilogue + /// vectorization) and the resume values can come from an additional bypass + /// block, the \p AdditionalBypass pair provides information about the bypass + /// block and the end value on the edge from bypass to this loop. + void createInductionResumeValues( + Loop *L, Value *VectorTripCount, + std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); + + /// Complete the loop skeleton by adding debug MDs, creating appropriate + /// conditional branches in the middle block, preparing the builder and + /// running the verifier. Take in the vector loop \p L as argument, and return + /// the preheader of the completed vector loop. + BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); + /// Add additional metadata to \p To that was not present on \p Orig. /// /// Currently this is used to add the noalias annotations based on the @@ -672,6 +785,11 @@ protected: /// vector of instructions. void addMetadata(ArrayRef<Value *> To, Instruction *From); + /// Allow subclasses to override and print debug traces before/after vplan + /// execution, when trace information is requested. + virtual void printDebugTracesAtStart(){}; + virtual void printDebugTracesAtEnd(){}; + /// The original loop. Loop *OrigLoop; @@ -710,7 +828,7 @@ protected: /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. - unsigned VF; + ElementCount VF; /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. @@ -730,7 +848,8 @@ protected: /// Middle Block between the vector and the scalar. BasicBlock *LoopMiddleBlock; - /// The ExitBlock of the scalar loop. 
+ /// The (unique) ExitBlock of the scalar loop. Note that + /// there can be multiple exiting edges reaching this block. BasicBlock *LoopExitBlock; /// The vector loop body. @@ -779,6 +898,14 @@ protected: // Vector of original scalar PHIs whose corresponding widened PHIs need to be // fixed up at the end of vector code generation. SmallVector<PHINode *, 8> OrigPHIsToFix; + + /// BFI and PSI are used to check for profile guided size optimizations. + BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; + + // Whether this loop should be optimized for size based on profile guided size + // optimizatios. + bool OptForSizeBasedOnProfile; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -789,9 +916,11 @@ public: const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, - LoopVectorizationCostModel *CM) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, - UnrollFactor, LVL, CM) {} + LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + ElementCount::getFixed(1), UnrollFactor, LVL, CM, + BFI, PSI) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -801,6 +930,128 @@ private: Value *reverseVector(Value *Vec) override; }; +/// Encapsulate information regarding vectorization of a loop and its epilogue. +/// This information is meant to be updated and used across two stages of +/// epilogue vectorization. 
+struct EpilogueLoopVectorizationInfo { + ElementCount MainLoopVF = ElementCount::getFixed(0); + unsigned MainLoopUF = 0; + ElementCount EpilogueVF = ElementCount::getFixed(0); + unsigned EpilogueUF = 0; + BasicBlock *MainLoopIterationCountCheck = nullptr; + BasicBlock *EpilogueIterationCountCheck = nullptr; + BasicBlock *SCEVSafetyCheck = nullptr; + BasicBlock *MemSafetyCheck = nullptr; + Value *TripCount = nullptr; + Value *VectorTripCount = nullptr; + + EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, + unsigned EUF) + : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), + EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { + assert(EUF == 1 && + "A high UF for the epilogue loop is likely not beneficial."); + } +}; + +/// An extension of the inner loop vectorizer that creates a skeleton for a +/// vectorized loop that has its epilogue (residual) also vectorized. +/// The idea is to run the vplan on a given loop twice, firstly to setup the +/// skeleton and vectorize the main loop, and secondly to complete the skeleton +/// from the first step and vectorize the epilogue. This is achieved by +/// deriving two concrete strategy classes from this base class and invoking +/// them in succession from the loop vectorizer planner. +class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { +public: + InnerLoopAndEpilogueVectorizer( + Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI), + EPI(EPI) {} + + // Override this function to handle the more complex control flow around the + // three loops. 
+ BasicBlock *createVectorizedLoopSkeleton() final override { + return createEpilogueVectorizedLoopSkeleton(); + } + + /// The interface for creating a vectorized skeleton using one of two + /// different strategies, each corresponding to one execution of the vplan + /// as described above. + virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; + + /// Holds and updates state information required to vectorize the main loop + /// and its epilogue in two separate passes. This setup helps us avoid + /// regenerating and recomputing runtime safety checks. It also helps us to + /// shorten the iteration-count-check path length for the cases where the + /// iteration count of the loop is so small that the main vector loop is + /// completely skipped. + EpilogueLoopVectorizationInfo &EPI; +}; + +/// A specialized derived class of inner loop vectorizer that performs +/// vectorization of *main* loops in the process of vectorizing loops and their +/// epilogues. +class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { +public: + EpilogueVectorizerMainLoop( + Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI, LVL, CM, BFI, PSI) {} + /// Implements the interface for creating a vectorized skeleton using the + /// *main loop* strategy (ie the first pass of vplan execution). + BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + +protected: + /// Emits an iteration count bypass check once for the main loop (when \p + /// ForEpilogue is false) and once for the epilogue loop (when \p + /// ForEpilogue is true). 
+ BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, + bool ForEpilogue); + void printDebugTracesAtStart() override; + void printDebugTracesAtEnd() override; +}; + +// A specialized derived class of inner loop vectorizer that performs +// vectorization of *epilogue* loops in the process of vectorizing loops and +// their epilogues. +class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { +public: + EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, + LoopInfo *LI, DominatorTree *DT, + const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, + EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, + llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI, LVL, CM, BFI, PSI) {} + /// Implements the interface for creating a vectorized skeleton using the + /// *epilogue loop* strategy (ie the second pass of vplan execution). + BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + +protected: + /// Emits an iteration count bypass check after the main vector loop has + /// finished to see if there are any iterations left to execute by either + /// the vector epilogue or the scalar epilogue. 
+ BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, + BasicBlock *Bypass, + BasicBlock *Insert); + void printDebugTracesAtStart() override; + void printDebugTracesAtEnd() override; +}; } // end namespace llvm /// Look for a meaningful debug location on the instruction or it's @@ -827,7 +1078,9 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) const DILocation *DIL = Inst->getDebugLoc(); if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && !isa<DbgInfoIntrinsic>(Inst)) { - auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto NewDIL = + DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); if (NewDIL) B.SetCurrentDebugLocation(NewDIL.getValue()); else @@ -881,6 +1134,15 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, return R; } +/// Return a value for Step multiplied by VF. +static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { + assert(isa<ConstantInt>(Step) && "Expected an integer step"); + Constant *StepVal = ConstantInt::get( + Step->getType(), + cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); + return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; +} + namespace llvm { void reportVectorizationFailure(const StringRef DebugMsg, @@ -952,7 +1214,10 @@ enum ScalarEpilogueLowering { CM_ScalarEpilogueNotAllowedLowTripLoop, // Loop hint predicate indicating an epilogue is undesired. - CM_ScalarEpilogueNotNeededUsePredicate + CM_ScalarEpilogueNotNeededUsePredicate, + + // Directive indicating we must either tail fold or not vectorize + CM_ScalarEpilogueNotAllowedUsePredicate }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -979,7 +1244,7 @@ public: /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. 
- Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC); + Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); /// \return True if runtime checks are required for vectorization, and false /// otherwise. @@ -989,10 +1254,13 @@ public: /// This method checks every power of two up to MaxVF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. - VectorizationFactor selectVectorizationFactor(unsigned MaxVF); + VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); + VectorizationFactor + selectEpilogueVectorizationFactor(const ElementCount MaxVF, + const LoopVectorizationPlanner &LVP); /// Setup cost-based decisions for user vectorization factor. - void selectUserVectorizationFactor(unsigned UserVF) { + void selectUserVectorizationFactor(ElementCount UserVF) { collectUniformsAndScalars(UserVF); collectInstsToScalarize(UserVF); } @@ -1006,7 +1274,7 @@ public: /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); + unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1015,7 +1283,7 @@ public: /// the lists of loop-uniform and loop-scalar instructions. /// The calculated cost is saved with widening decision in order to /// avoid redundant calculations. - void setCostBasedWideningDecision(unsigned VF); + void setCostBasedWideningDecision(ElementCount VF); /// A struct that represents some properties of the register usage /// of a loop. @@ -1030,11 +1298,16 @@ public: /// \return Returns information about the register usages of the loop for the /// given vectorization factors. 
- SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs); + SmallVector<RegisterUsage, 8> + calculateRegisterUsage(ArrayRef<ElementCount> VFs); /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); + /// Split reductions into those that happen in the loop, and those that happen + /// outside. In loop reductions are collected into InLoopReductionChains. + void collectInLoopReductions(); + /// \returns The smallest bitwidth each instruction can be represented with. /// The vector equivalents of these instructions should be truncated to this /// type. @@ -1044,8 +1317,9 @@ public: /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. - bool isProfitableToScalarize(Instruction *I, unsigned VF) const { - assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); + bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { + assert(VF.isVector() && + "Profitable to scalarize relevant only for VF > 1."); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. @@ -1059,8 +1333,8 @@ public: } /// Returns true if \p I is known to be uniform after vectorization. - bool isUniformAfterVectorization(Instruction *I, unsigned VF) const { - if (VF == 1) + bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { + if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1075,8 +1349,8 @@ public: } /// Returns true if \p I is known to be scalar after vectorization. 
- bool isScalarAfterVectorization(Instruction *I, unsigned VF) const { - if (VF == 1) + bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { + if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1092,8 +1366,8 @@ public: /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. - bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const { - return VF > 1 && MinBWs.find(I) != MinBWs.end() && + bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { + return VF.isVector() && MinBWs.find(I) != MinBWs.end() && !isProfitableToScalarize(I, VF) && !isScalarAfterVectorization(I, VF); } @@ -1110,17 +1384,18 @@ public: /// Save vectorization decision \p W and \p Cost taken by the cost model for /// instruction \p I and vector width \p VF. - void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, - unsigned Cost) { - assert(VF >= 2 && "Expected VF >=2"); + void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, + InstructionCost Cost) { + assert(VF.isVector() && "Expected VF >=2"); WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); } /// Save vectorization decision \p W and \p Cost taken by the cost model for /// interleaving group \p Grp and vector width \p VF. - void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF, - InstWidening W, unsigned Cost) { - assert(VF >= 2 && "Expected VF >=2"); + void setWideningDecision(const InterleaveGroup<Instruction> *Grp, + ElementCount VF, InstWidening W, + InstructionCost Cost) { + assert(VF.isVector() && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. /// But the cost will be assigned to one instruction only. 
for (unsigned i = 0; i < Grp->getFactor(); ++i) { @@ -1136,15 +1411,14 @@ public: /// Return the cost model decision for the given instruction \p I and vector /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. - InstWidening getWideningDecision(Instruction *I, unsigned VF) { - assert(VF >= 2 && "Expected VF >=2"); - + InstWidening getWideningDecision(Instruction *I, ElementCount VF) { + assert(VF.isVector() && "Expected VF to be a vector VF"); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. if (EnableVPlanNativePath) return CM_GatherScatter; - std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); + std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); auto Itr = WideningDecisions.find(InstOnVF); if (Itr == WideningDecisions.end()) return CM_Unknown; @@ -1153,9 +1427,9 @@ public: /// Return the vectorization cost for the given instruction \p I and vector /// width \p VF. - unsigned getWideningCost(Instruction *I, unsigned VF) { - assert(VF >= 2 && "Expected VF >=2"); - std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); + InstructionCost getWideningCost(Instruction *I, ElementCount VF) { + assert(VF.isVector() && "Expected VF >=2"); + std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && "The cost is not calculated"); return WideningDecisions[InstOnVF].second; @@ -1164,7 +1438,7 @@ public: /// Return True if instruction \p I is an optimizable truncate whose operand /// is an induction variable. Such a truncate will be removed by adding a new /// induction variable with the destination type. - bool isOptimizableIVTruncate(Instruction *I, unsigned VF) { + bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { // If the instruction is not a truncate, return false. 
auto *Trunc = dyn_cast<TruncInst>(I); if (!Trunc) @@ -1189,14 +1463,14 @@ public: /// Collects the instructions to scalarize for each predicated instruction in /// the loop. - void collectInstsToScalarize(unsigned VF); + void collectInstsToScalarize(ElementCount VF); /// Collect Uniform and Scalar values for the given \p VF. /// The sets depend on CM decision for Load/Store instructions /// that may be vectorized as interleave, gather-scatter or scalarized. - void collectUniformsAndScalars(unsigned VF) { + void collectUniformsAndScalars(ElementCount VF) { // Do the analysis once. - if (VF == 1 || Uniforms.find(VF) != Uniforms.end()) + if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) return; setCostBasedWideningDecision(VF); collectLoopUniforms(VF); @@ -1247,7 +1521,8 @@ public: /// instructions that may divide by zero. /// If a non-zero VF has been calculated, we check if I will be scalarized /// predication for that VF. - bool isScalarWithPredication(Instruction *I, unsigned VF = 1); + bool isScalarWithPredication(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. @@ -1264,12 +1539,16 @@ public: /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. - bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + bool + memoryInstructionCanBeWidened(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); /// Returns true if \p I is a memory instruction in an interleaved-group /// of memory accesses that can be vectorized with wide vector loads/stores /// and shuffles. - bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1); + bool + interleavedAccessCanBeWidened(Instruction *I, + ElementCount VF = ElementCount::getFixed(1)); /// Check if \p Instr belongs to any interleaved access group. 
bool isAccessInterleaved(Instruction *Instr) { @@ -1282,11 +1561,16 @@ public: return InterleaveInfo.getInterleaveGroup(Instr); } - /// Returns true if an interleaved group requires a scalar iteration - /// to handle accesses with gaps, and there is nothing preventing us from - /// creating a scalar epilogue. + /// Returns true if we're required to use a scalar epilogue for at least + /// the final iteration of the original loop. bool requiresScalarEpilogue() const { - return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue(); + if (!isScalarEpilogueAllowed()) + return false; + // If we might exit from anywhere but the latch, must run the exiting + // iteration in scalar form. + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) + return true; + return InterleaveInfo.requiresScalarEpilogue(); } /// Returns true if a scalar epilogue is not allowed due to optsize or a @@ -1302,17 +1586,34 @@ public: return foldTailByMasking() || Legal->blockNeedsPredication(BB); } + /// A SmallMapVector to store the InLoop reduction op chains, mapping phi + /// nodes to the chain of instructions representing the reductions. Uses a + /// MapVector to ensure deterministic iteration order. + using ReductionChainMap = + SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; + + /// Return the chain of instructions representing an inloop reduction. + const ReductionChainMap &getInLoopReductionChains() const { + return InLoopReductionChains; + } + + /// Returns true if the Phi is part of an inloop reduction. + bool isInLoopReduction(PHINode *Phi) const { + return InLoopReductionChains.count(Phi); + } + /// Estimate cost of an intrinsic call instruction CI if it were vectorized /// with factor VF. Return the cost of the instruction, including /// scalarization overhead if it's needed. 
- unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF); + InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF); /// Estimate cost of a call instruction CI if it were vectorized with factor /// VF. Return the cost of the instruction, including scalarization overhead /// if it's needed. The flag NeedToScalarize shows if the call needs to be /// scalarized - /// i.e. either vector version isn't available, or is too expensive. - unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); + InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, + bool &NeedToScalarize); /// Invalidates decisions already taken by the cost model. void invalidateCostModelingDecisions() { @@ -1327,7 +1628,8 @@ private: /// \return An upper bound for the vectorization factor, a power-of-2 larger /// than zero. One is returned if vectorization should best be avoided due /// to cost. - unsigned computeFeasibleMaxVF(unsigned ConstTripCount); + ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -1336,47 +1638,54 @@ private: /// is /// false, then all operations will be scalarized (i.e. no vectorization has /// actually taken place). - using VectorizationCostTy = std::pair<unsigned, bool>; + using VectorizationCostTy = std::pair<InstructionCost, bool>; /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by /// the factor width. - VectorizationCostTy expectedCost(unsigned VF); + VectorizationCostTy expectedCost(ElementCount VF); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. 
- VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF); + VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. - unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy); + InstructionCost getInstructionCost(Instruction *I, ElementCount VF, + Type *&VectorTy); + + /// Return the cost of instructions in an inloop reduction pattern, if I is + /// part of that pattern. + InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, + Type *VectorTy, + TTI::TargetCostKind CostKind); /// Calculate vectorization cost of memory instruction \p I. - unsigned getMemoryInstructionCost(Instruction *I, unsigned VF); + InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); /// The cost computation for scalarized memory instruction. - unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF); + InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); /// The cost computation for interleaving group of memory instructions. - unsigned getInterleaveGroupCost(Instruction *I, unsigned VF); + InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); /// The cost computation for Gather/Scatter instruction. - unsigned getGatherScatterCost(Instruction *I, unsigned VF); + InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); /// The cost computation for widening instruction \p I with consecutive /// memory access. - unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF); + InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); /// The cost calculation for Load/Store instruction \p I with uniform pointer - /// Load: scalar load + broadcast. /// Store: scalar store + (loop invariant value stored? 
0 : extract of last /// element) - unsigned getUniformMemOpCost(Instruction *I, unsigned VF); + InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. - unsigned getScalarizationOverhead(Instruction *I, unsigned VF); + InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF); /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. @@ -1394,7 +1703,7 @@ private: /// A type representing the costs for instructions if they were to be /// scalarized rather than vectorized. The entries are Instruction-Cost /// pairs. - using ScalarCostsTy = DenseMap<Instruction *, unsigned>; + using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; /// A set containing all BasicBlocks that are known to present after /// vectorization as a predicated block. @@ -1416,19 +1725,30 @@ private: /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated /// vectorization factor. The entries are VF-ScalarCostTy pairs. - DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; + DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; /// Holds the instructions known to be uniform after vectorization. /// The data is collected per VF. - DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms; + DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; /// Holds the instructions known to be scalar after vectorization. /// The data is collected per VF. - DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; + DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; /// Holds the instructions (address computations) that are forced to be /// scalarized. 
- DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; + DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; + + /// PHINodes of the reductions that should be expanded in-loop along with + /// their associated chains of reduction operations, in program order from top + /// (PHI) to bottom + ReductionChainMap InLoopReductionChains; + + /// A Map of inloop reduction operations and their immediate chain operand. + /// FIXME: This can be removed once reductions can be costed correctly in + /// vplan. This was added to allow quick lookup to the inloop operations, + /// without having to loop through InLoopReductionChains. + DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; /// Returns the expected difference in cost from scalarizing the expression /// feeding a predicated instruction \p PredInst. The instructions to @@ -1436,7 +1756,7 @@ private: /// non-negative return value implies the expression will be scalarized. /// Currently, only single-use chains are considered for scalarization. int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, - unsigned VF); + ElementCount VF); /// Collect the instructions that are uniform after vectorization. An /// instruction is uniform if we represent it with a single scalar value in @@ -1447,27 +1767,28 @@ private: /// scalarized instruction will be represented by VF scalar values in the /// vectorized loop, each corresponding to an iteration of the original /// scalar loop. - void collectLoopUniforms(unsigned VF); + void collectLoopUniforms(ElementCount VF); /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized /// during vectorization. Non-uniform scalarized instructions will be /// represented by VF values in the vectorized loop, each corresponding to an /// iteration of the original scalar loop. 
- void collectLoopScalars(unsigned VF); + void collectLoopScalars(ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. - using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, - std::pair<InstWidening, unsigned>>; + using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, + std::pair<InstWidening, InstructionCost>>; DecisionList WideningDecisions; /// Returns true if \p V is expected to be vectorized and it needs to be /// extracted. - bool needsExtract(Value *V, unsigned VF) const { + bool needsExtract(Value *V, ElementCount VF) const { Instruction *I = dyn_cast<Instruction>(V); - if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) + if (VF.isScalar() || !I || !TheLoop->contains(I) || + TheLoop->isLoopInvariant(I)) return false; // Assume we can vectorize V (and hence we need extraction) if the @@ -1482,11 +1803,21 @@ private: /// Returns a range containing only operands needing to be extracted. SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, - unsigned VF) { + ElementCount VF) { return SmallVector<Value *, 4>(make_filter_range( Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } + /// Determines if we have the infrastructure to vectorize loop \p L and its + /// epilogue, assuming the main loop is vectorized by \p VF. + bool isCandidateForEpilogueVectorization(const Loop &L, + const ElementCount VF) const; + + /// Returns true if epilogue vectorization is considered profitable, and + /// false otherwise. + /// \p VF is the vectorization factor chosen for the original loop. + bool isEpilogueVectorizationProfitable(const ElementCount VF) const; + public: /// The loop that we evaluate. Loop *TheLoop; @@ -1529,6 +1860,9 @@ public: /// Values to ignore in the cost model when VF > 1. SmallPtrSet<const Value *, 16> VecValuesToIgnore; + + /// Profitable vector factors. 
+ SmallVector<VectorizationFactor, 8> ProfitableVFs; }; } // end namespace llvm @@ -1549,7 +1883,7 @@ public: // representation for pragma 'omp simd' is introduced. static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE) { - assert(!OuterLp->empty() && "This is not an outer loop"); + assert(!OuterLp->isInnermost() && "This is not an outer loop"); LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); // Only outer loops with an explicit vectorization hint are supported. @@ -1582,7 +1916,7 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI, // now, only collect outer loops that have explicit vectorization hints. If we // are stress testing the VPlan H-CFG construction, we collect the outermost // loop of every loop nest. - if (L.empty() || VPlanBuildStressTest || + if (L.isInnermost() || VPlanBuildStressTest || (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { LoopBlocksRPO RPOT(&L); RPOT.perform(LI); @@ -1696,10 +2030,10 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { } void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( - const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { + const InductionDescriptor &II, Value *Step, Value *Start, + Instruction *EntryVal) { assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); - Value *Start = II.getStartValue(); // Construct the initial value of the vector IV in the vector loop preheader auto CurrIP = Builder.saveIP(); @@ -1729,7 +2063,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // Multiply the vectorization factor by the step using integer or // floating-point arithmetic as appropriate. 
- Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); + Value *ConstVF = + getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); // Create a vector splat to use in the induction update. @@ -1737,10 +2072,10 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // FIXME: If the step is non-constant, we create the vector splat with // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. - Value *SplatVF = - isa<Constant>(Mul) - ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) - : Builder.CreateVectorSplat(VF, Mul); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + Value *SplatVF = isa<Constant>(Mul) + ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(VF, Mul); Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -1816,7 +2151,8 @@ void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); } -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { +void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, + TruncInst *Trunc) { assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); @@ -1874,8 +2210,10 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *EntryPart = - getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); + getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, + ID.getInductionOpcode()); VectorLoopValueMap.setVectorValue(EntryVal, 
Part, EntryPart); if (Trunc) addMetadata(EntryPart, Trunc); @@ -1885,7 +2223,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { // Now do the actual transformations, and start with creating the step value. Value *Step = CreateStepValue(ID.getStep()); - if (VF <= 1) { + if (VF.isZero() || VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); CreateSplatIV(ScalarIV, Step); return; @@ -1896,7 +2234,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { // least one user in the loop that is not widened. auto NeedsScalarIV = needsScalarInduction(EntryVal); if (!NeedsScalarIV) { - createVectorIntOrFpInductionPHI(ID, Step, EntryVal); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); return; } @@ -1904,7 +2242,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { // create the phi node, we will splat the scalar induction variable in each // loop iteration. if (!shouldScalarizeInstruction(EntryVal)) { - createVectorIntOrFpInductionPHI(ID, Step, EntryVal); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); Value *ScalarIV = CreateScalarIV(Step); // Create scalar steps that can be used by instructions we will later // scalarize. Note that the addition of the scalar steps will not increase @@ -1926,7 +2264,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, Instruction::BinaryOps BinOp) { // Create and check the types. - auto *ValVTy = cast<VectorType>(Val->getType()); + auto *ValVTy = cast<FixedVectorType>(Val->getType()); int VLen = ValVTy->getNumElements(); Type *STy = Val->getType()->getScalarType(); @@ -1983,8 +2321,7 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. 
- assert(VF > 1 && "VF should be greater than one"); - + assert(VF.isVector() && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); assert(ScalarIVTy == Step->getType() && @@ -2006,12 +2343,27 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 - : VF; + Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) + ? 1 + : VF.getKnownMinValue(); + assert((!VF.isScalable() || Lanes == 1) && + "Should never scalarize a scalable vector"); // Compute the scalar steps and save the results in VectorLoopValueMap. for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); + auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), + ScalarIVTy->getScalarSizeInBits()); + Value *StartIdx = + createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); + if (ScalarIVTy->isFloatingPointTy()) + StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); + StartIdx = addFastMathFlag(Builder.CreateBinOp( + AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); + // The step returned by `createStepForVF` is a runtime-evaluated value + // when VF is scalable. Otherwise, it should be folded into a Constant. 
+ assert((VF.isScalable() || isa<Constant>(StartIdx)) && + "Expected StartIdx to be folded to a constant when VF is not " + "scalable"); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); @@ -2045,7 +2397,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // If we aren't vectorizing, we can just copy the scalar map values over to // the vector map. - if (VF == 1) { + if (VF.isScalar()) { VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); return ScalarValue; } @@ -2054,7 +2406,11 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // is known to be uniform after vectorization, this corresponds to lane zero // of the Part unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. - unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; + unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) + ? 0 + : VF.getKnownMinValue() - 1; + assert((!VF.isScalable() || LastLane == 0) && + "Scalable vectorization can't lead to any scalarized values."); auto *LastInst = cast<Instruction>( VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); @@ -2075,10 +2431,11 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { VectorValue = getBroadcastInstrs(ScalarValue); VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { - // Initialize packing with insertelements to start from undef. - Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF)); - VectorLoopValueMap.setVectorValue(V, Part, Undef); - for (unsigned Lane = 0; Lane < VF; ++Lane) + // Initialize packing with insertelements to start from poison. 
+ assert(!VF.isScalable() && "VF is assumed to be non scalable."); + Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF)); + VectorLoopValueMap.setVectorValue(V, Part, Poison); + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) packScalarIntoVectorValue(V, {Part, Lane}); VectorValue = VectorLoopValueMap.getVectorValue(V, Part); } @@ -2117,7 +2474,7 @@ InnerLoopVectorizer::getOrCreateScalarValue(Value *V, // extractelement instruction. auto *U = getOrCreateVectorValue(V, Instance.Part); if (!U->getType()->isVectorTy()) { - assert(VF == 1 && "Value not scalarized has non-vector type"); + assert(VF.isScalar() && "Value not scalarized has non-vector type"); return U; } @@ -2142,12 +2499,12 @@ void InnerLoopVectorizer::packScalarIntoVectorValue( Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); + assert(!VF.isScalable() && "Cannot reverse scalable vectors"); SmallVector<int, 8> ShuffleMask; - for (unsigned i = 0; i < VF; ++i) - ShuffleMask.push_back(VF - i - 1); + for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) + ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); - return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), - ShuffleMask, "reverse"); + return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); } // Return whether we allow using masked interleave-groups (for dealing with @@ -2172,9 +2529,9 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // } // To: // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B -// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements -// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements -// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements +// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements +// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements +// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements // // Or 
translate following interleaved store group (factor = 3): // for (i = 0; i < N; i+=3) { @@ -2185,20 +2542,22 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // } // To: // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> -// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void InnerLoopVectorizer::vectorizeInterleaveGroup( - const InterleaveGroup<Instruction> *Group, VPTransformState &State, - VPValue *Addr, VPValue *BlockInMask) { + const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, + VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, + VPValue *BlockInMask) { Instruction *Instr = Group->getInsertPos(); const DataLayout &DL = Instr->getModule()->getDataLayout(); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); - auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); // Prepare for the new pointers. SmallVector<Value *, 2> AddrParts; @@ -2214,8 +2573,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // pointer operand of the interleaved access is supposed to be uniform. For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. 
+ assert(!VF.isScalable() && + "scalable vector reverse operation is not implemented"); if (Group->isReverse()) - Index += (VF - 1) * Group->getFactor(); + Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, {Part, 0}); @@ -2246,11 +2607,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } setDebugLocFromInst(Builder, Instr); - Value *UndefVec = UndefValue::get(VecTy); + Value *PoisonVec = PoisonValue::get(VecTy); Value *MaskForGaps = nullptr; if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { - MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); } @@ -2266,10 +2628,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *GroupMask = MaskForGaps; if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); - auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, Undefs, - createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); + BlockInMaskPart, + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), + "interleaved.mask"); GroupMask = MaskForGaps ? 
Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) @@ -2277,7 +2640,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } NewLoad = Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), - GroupMask, UndefVec, "wide.masked.vec"); + GroupMask, PoisonVec, "wide.masked.vec"); } else NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], @@ -2288,6 +2651,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // For each member in the group, shuffle out the appropriate data from the // wide loads. + unsigned J = 0; for (unsigned I = 0; I < InterleaveFactor; ++I) { Instruction *Member = Group->getMember(I); @@ -2295,28 +2659,33 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (!Member) continue; - auto StrideMask = createStrideMask(I, InterleaveFactor, VF); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto StrideMask = + createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( - NewLoads[Part], UndefVec, StrideMask, "strided.vec"); + NewLoads[Part], StrideMask, "strided.vec"); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { - VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } if (Group->isReverse()) StridedVec = reverseVector(StridedVec); - VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); + State.set(VPDefs[J], Member, StridedVec, Part); } + ++J; } return; } // The sub vector type for current instruction. - auto *SubVT = FixedVectorType::get(ScalarTy, VF); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + auto *SubVT = VectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. 
for (unsigned Part = 0; Part < UF; Part++) { @@ -2324,11 +2693,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( SmallVector<Value *, 4> StoredVecs; for (unsigned i = 0; i < InterleaveFactor; i++) { // Interleaved store group doesn't allow a gap, so each index has a member - Instruction *Member = Group->getMember(i); - assert(Member && "Fail to get a member from an interleaved store group"); + assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); + + Value *StoredVec = State.get(StoredValues[i], Part); - Value *StoredVec = getOrCreateVectorValue( - cast<StoreInst>(Member)->getValueOperand(), Part); if (Group->isReverse()) StoredVec = reverseVector(StoredVec); @@ -2344,16 +2712,17 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *WideVec = concatenateVectors(Builder, StoredVecs); // Interleave the elements in the wide vector. + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *IVec = Builder.CreateShuffleVector( - WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), + WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), "interleaved.vec"); Instruction *NewStoreInstr; if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); - auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), + BlockInMaskPart, + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); @@ -2366,11 +2735,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } } -void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, - VPTransformState &State, - VPValue *Addr, - VPValue *StoredValue, - VPValue *BlockInMask) { +void InnerLoopVectorizer::vectorizeMemoryInstruction( + Instruction *Instr, VPTransformState 
&State, VPValue *Def, VPValue *Addr, + VPValue *StoredValue, VPValue *BlockInMask) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); @@ -2387,7 +2754,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, "CM decision is not to widen the memory instruction"); Type *ScalarDataTy = getMemInstValueType(Instr); - auto *DataTy = FixedVectorType::get(ScalarDataTy, VF); + + auto *DataTy = VectorType::get(ScalarDataTy, VF); const Align Alignment = getLoadStoreAlignment(Instr); // Determine if the pointer operand of the access is either consecutive or @@ -2419,19 +2787,23 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, InBounds = gep->isInBounds(); if (Reverse) { + assert(!VF.isScalable() && + "Reversing vectors is not yet supported for scalable vectors."); + // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); + PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( + ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); PartPtr->setIsInBounds(InBounds); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); + PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( + ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { + Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); + Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); PartPtr->setIsInBounds(InBounds); } @@ -2486,7 +2858,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) NewLI = Builder.CreateMaskedLoad( - VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), + VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), "wide.masked.load"); else NewLI = @@ -2497,7 +2869,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (Reverse) NewLI = reverseVector(NewLI); } - VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); + + State.set(Def, Instr, NewLI, Part); } } @@ -2507,6 +2880,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, VPTransformState &State) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for + // the first lane and part. + if (isa<NoAliasScopeDeclInst>(Instr)) + if (Instance.Lane != 0 || Instance.Part != 0) + return; + setDebugLocFromInst(Builder, Instr); // Does this instruction return a value ? @@ -2519,7 +2898,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. 
for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { - auto *NewOp = State.get(User.getOperand(op), Instance); + auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); + auto InputInstance = Instance; + if (!Operand || !OrigLoop->contains(Operand) || + (Cost->isUniformAfterVectorization(Operand, State.VF))) + InputInstance.Lane = 0; + auto *NewOp = State.get(User.getOperand(op), InputInstance); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -2527,7 +2911,9 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, // Place the cloned scalar in the new loop. Builder.Insert(Cloned); - // Add the cloned scalar to the scalar map entry. + // TODO: Set result for VPValue of VPReplicateRecipe. This requires + // representing scalar values in VPTransformState. Add the cloned scalar to + // the scalar map entry. VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); // If we just cloned a new assumption, add it the assumption cache. @@ -2564,7 +2950,7 @@ PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, Induction->addIncoming(Next, Latch); // Create the compare. Value *ICmp = Builder.CreateICmpEQ(Next, End); - Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); + Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); // Now we have two terminators. Remove the old one from the block. Latch->getTerminator()->eraseFromParent(); @@ -2581,7 +2967,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { // Find the loop boundaries. 
ScalarEvolution *SE = PSE.getSE(); const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); - assert(BackedgeTakenCount != SE->getCouldNotCompute() && + assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count"); Type *IdxTy = Legal->getWidestInductionType(); @@ -2627,7 +3013,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Type *Ty = TC->getType(); - Constant *Step = ConstantInt::get(Ty, VF * UF); + // This is where we can make the step a runtime constant. + Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); // If the tail is to be folded by masking, round the number of iterations N // up to a multiple of Step instead of rounding down. This is done by first @@ -2636,9 +3023,12 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // that it starts at zero and its Step is a power of two; the loop will then // exit, with the last early-exit vector comparison also producing all-true. if (Cost->foldTailByMasking()) { - assert(isPowerOf2_32(VF * UF) && + assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); - TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); + assert(!VF.isScalable() && + "Tail folding not yet supported for scalable vectors"); + TC = Builder.CreateAdd( + TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); } // Now we need to generate the expression for the part of the loop that the @@ -2648,14 +3038,18 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // unroll factor (number of SIMD instructions). Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); - // If there is a non-reversed interleaved group that may speculatively access - // memory out-of-bounds, we need to ensure that there will be at least one - // iteration of the scalar epilogue loop. 
Thus, if the step evenly divides + // There are two cases where we need to ensure (at least) the last iteration + // runs in the scalar remainder loop. Thus, if the step evenly divides // the trip count, we set the remainder to be equal to the step. If the step // does not evenly divide the trip count, no adjustment is necessary since // there will already be scalar iterations. Note that the minimum iterations - // check ensures that N >= Step. - if (VF > 1 && Cost->requiresScalarEpilogue()) { + // check ensures that N >= Step. The cases are: + // 1) If there is a non-reversed interleaved group that may speculatively + // access memory out-of-bounds. + // 2) If any instruction may follow a conditionally taken exit. That is, if + // the loop contains multiple exiting blocks, or a single exiting block + // which is not the latch. + if (VF.isVector() && Cost->requiresScalarEpilogue()) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -2668,17 +3062,18 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL) { // Verify that V is a vector type with same number of elements as DstVTy. - unsigned VF = DstVTy->getNumElements(); - VectorType *SrcVecTy = cast<VectorType>(V->getType()); + auto *DstFVTy = cast<FixedVectorType>(DstVTy); + unsigned VF = DstFVTy->getNumElements(); + auto *SrcVecTy = cast<FixedVectorType>(V->getType()); assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); Type *SrcElemTy = SrcVecTy->getElementType(); - Type *DstElemTy = DstVTy->getElementType(); + Type *DstElemTy = DstFVTy->getElementType(); assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && "Vector elements must have same size"); // Do a direct cast if element types are castable. 
if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { - return Builder.CreateBitOrPointerCast(V, DstVTy); + return Builder.CreateBitOrPointerCast(V, DstFVTy); } // V cannot be directly casted to desired vector type. // May happen when V is a floating point vector but DstVTy is a vector of @@ -2692,7 +3087,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); auto *VecIntTy = FixedVectorType::get(IntTy, VF); Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); - return Builder.CreateBitOrPointerCast(CastVal, DstVTy); + return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, @@ -2713,11 +3108,11 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, // If tail is to be folded, vector loop takes care of all iterations. Value *CheckMinIters = Builder.getFalse(); - if (!Cost->foldTailByMasking()) - CheckMinIters = Builder.CreateICmp( - P, Count, ConstantInt::get(Count->getType(), VF * UF), - "min.iters.check"); - + if (!Cost->foldTailByMasking()) { + Value *Step = + createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); + CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); + } // Create new preheader for vector loop. 
LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, @@ -2754,7 +3149,9 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { if (C->isZero()) return; - assert(!SCEVCheckBlock->getParent()->hasOptSize() && + assert(!(SCEVCheckBlock->getParent()->hasOptSize() || + (OptForSizeBasedOnProfile && + Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && "Cannot SCEV check stride or overflow when optimizing for size"); SCEVCheckBlock->setName("vector.scevcheck"); @@ -2792,15 +3189,8 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); if (!RtPtrChecking.Need) return; - Instruction *FirstCheckInst; - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, - RtPtrChecking.getChecks(), RtPtrChecking.getSE()); - assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " - "claimed checks are required"); - if (MemCheckBlock->getParent()->hasOptSize()) { + if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced " "to vectorize."); @@ -2820,22 +3210,33 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, "vector.ph"); + auto *CondBranch = cast<BranchInst>( + Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); + ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); + LoopBypassBlocks.push_back(MemCheckBlock); + AddedSafetyChecks = true; + // Update dominator only if this is first RT check. 
if (LoopBypassBlocks.empty()) { DT->changeImmediateDominator(Bypass, MemCheckBlock); DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); } - ReplaceInstWithInst( - MemCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); - LoopBypassBlocks.push_back(MemCheckBlock); - AddedSafetyChecks = true; + Instruction *FirstCheckInst; + Instruction *MemRuntimeCheck; + std::tie(FirstCheckInst, MemRuntimeCheck) = + addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, + RtPtrChecking.getChecks(), RtPtrChecking.getSE()); + assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " + "claimed checks are required"); + CondBranch->setCondition(MemRuntimeCheck); // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. - LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, - PSE.getSE()); + LVer = std::make_unique<LoopVersioning>( + *Legal->getLAI(), + Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, + DT, PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -2939,74 +3340,35 @@ Value *InnerLoopVectorizer::emitTransformedIndex( llvm_unreachable("invalid enum"); } -BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { - /* - In this function we generate a new loop. The new loop will contain - the vectorized instructions while the old loop will continue to run the - scalar remainder. - - [ ] <-- loop iteration number check. - / | - / v - | [ ] <-- vector loop bypass (may consist of multiple blocks). - | / | - | / v - || [ ] <-- vector pre header. - |/ | - | v - | [ ] \ - | [ ]_| <-- vector loop. - | | - | v - | -[ ] <--- middle-block. - | / | - | / v - -|- >[ ] <--- new preheader. - | | - | v - | [ ] \ - | [ ]_| <-- old scalar loop to handle remainder. - \ | - \ v - >[ ] <-- exit block. - ... 
- */ - - MDNode *OrigLoopID = OrigLoop->getLoopID(); - - // Some loops have a single integer induction variable, while other loops - // don't. One example is c++ iterators that often have multiple pointer - // induction variables. In the code below we also support a case where we - // don't have a single induction variable. - // - // We try to obtain an induction variable from the original loop as hard - // as possible. However if we don't find one that: - // - is an integer - // - counts from zero, stepping by one - // - is the size of the widest induction variable type - // then we create a new one. - OldInduction = Legal->getPrimaryInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - - // Split the single block loop into the two loop structure described above. +Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopScalarBody = OrigLoop->getHeader(); LoopVectorPreHeader = OrigLoop->getLoopPreheader(); - LoopExitBlock = OrigLoop->getExitBlock(); + LoopExitBlock = OrigLoop->getUniqueExitBlock(); assert(LoopExitBlock && "Must have an exit block"); assert(LoopVectorPreHeader && "Invalid loop structure"); LoopMiddleBlock = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - LI, nullptr, "middle.block"); + LI, nullptr, Twine(Prefix) + "middle.block"); LoopScalarPreHeader = SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, - nullptr, "scalar.ph"); + nullptr, Twine(Prefix) + "scalar.ph"); + + // Set up branch from middle block to the exit and scalar preheader blocks. + // completeLoopSkeleton will update the condition to use an iteration check, + // if required to decide whether to execute the remainder. 
+ BranchInst *BrInst = + BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); + auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); + BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); + ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); + // We intentionally don't let SplitBlock to update LoopInfo since // LoopVectorBody should belong to another loop than LoopVectorPreHeader. // LoopVectorBody is explicitly added to the correct place few lines later. LoopVectorBody = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - nullptr, nullptr, "vector.body"); + nullptr, nullptr, Twine(Prefix) + "vector.body"); // Update dominator for loop exit. DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); @@ -3023,37 +3385,16 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { LI->addTopLevelLoop(Lp); } Lp->addBasicBlockToLoop(LoopVectorBody, *LI); + return Lp; +} - // Find the loop boundaries. - Value *Count = getOrCreateTripCount(Lp); - - Value *StartIdx = ConstantInt::get(IdxTy, 0); - - // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. This check also covers the case where the - // backedge-taken count is uint##_max: adding one to it will overflow leading - // to an incorrect trip count of zero. In this (rare) case we will also jump - // to the scalar loop. - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); - - // Generate the code to check any assumptions that we've made for SCEV - // expressions. - emitSCEVChecks(Lp, LoopScalarPreHeader); - - // Generate the code that checks in runtime if arrays overlap. We put the - // checks into a separate block to make the more common case of few elements - // faster. - emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - - // Generate the induction variable. 
- // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - Constant *Step = ConstantInt::get(IdxTy, VF * UF); - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); - +void InnerLoopVectorizer::createInductionResumeValues( + Loop *L, Value *VectorTripCount, + std::pair<BasicBlock *, Value *> AdditionalBypass) { + assert(VectorTripCount && L && "Expected valid arguments"); + assert(((AdditionalBypass.first && AdditionalBypass.second) || + (!AdditionalBypass.first && !AdditionalBypass.second)) && + "Inconsistent information about additional bypass."); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3061,10 +3402,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // iteration in the vectorized loop. // If we come from a bypass edge then we need to start from the original // start value. - - // This variable saves the new starting index for the scalar loop. It is used - // to test if there are any tail iterations left once the vector loop has - // completed. for (auto &InductionEntry : Legal->getInductionVars()) { PHINode *OrigPhi = InductionEntry.first; InductionDescriptor II = InductionEntry.second; @@ -3076,20 +3413,32 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // Copy original phi DL over to the new one. BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; + Value *EndValueFromAdditionalBypass = AdditionalBypass.second; if (OrigPhi == OldInduction) { // We know what the end value is. 
- EndValue = CountRoundDown; + EndValue = VectorTripCount; } else { - IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); + IRBuilder<> B(L->getLoopPreheader()->getTerminator()); Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = - CastInst::getCastOpcode(CountRoundDown, true, StepType, true); - Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); + CastInst::getCastOpcode(VectorTripCount, true, StepType, true); + Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); - } + // Compute the end value for the additional bypass (if applicable). + if (AdditionalBypass.first) { + B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); + CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, + StepType, true); + CRD = + B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); + EndValueFromAdditionalBypass = + emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + EndValueFromAdditionalBypass->setName("ind.end"); + } + } // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); @@ -3099,42 +3448,44 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // value. 
for (BasicBlock *BB : LoopBypassBlocks) BCResumeVal->addIncoming(II.getStartValue(), BB); + + if (AdditionalBypass.first) + BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, + EndValueFromAdditionalBypass); + OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } +} + +BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, + MDNode *OrigLoopID) { + assert(L && "Expected valid loop."); - // We need the OrigLoop (scalar loop part) latch terminator to help - // produce correct debug info for the middle block BB instructions. - // The legality check stage guarantees that the loop will have a single - // latch. - assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && - "Scalar loop latch terminator isn't a branch"); - BranchInst *ScalarLatchBr = - cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); + // The trip counts should be cached by now. + Value *Count = getOrCreateTripCount(L); + Value *VectorTripCount = getOrCreateVectorTripCount(L); + + auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. // If tail is to be folded, we know we don't need to run the remainder. - Value *CmpN = Builder.getTrue(); if (!Cost->foldTailByMasking()) { - CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, - CountRoundDown, "cmp.n", - LoopMiddleBlock->getTerminator()); + Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + Count, VectorTripCount, "cmp.n", + LoopMiddleBlock->getTerminator()); - // Here we use the same DebugLoc as the scalar loop latch branch instead + // Here we use the same DebugLoc as the scalar loop latch terminator instead // of the corresponding compare because they may have ended up with // different line numbers and we want to avoid awkward line stepping while // debugging. Eg. 
if the compare has got a line number inside the loop. - cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); + CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); + cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); } - BranchInst *BrInst = - BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); - BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); - ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); - // Get ready to start creating new instructions into the vectorized body. - assert(LoopVectorPreHeader == Lp->getLoopPreheader() && + assert(LoopVectorPreHeader == L->getLoopPreheader() && "Inconsistent vector loop preheader"); Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); @@ -3142,7 +3493,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized}); if (VectorizedLoopID.hasValue()) { - Lp->setLoopID(VectorizedLoopID.getValue()); + L->setLoopID(VectorizedLoopID.getValue()); // Do not setAlreadyVectorized if loop attributes have been defined // explicitly. @@ -3152,9 +3503,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). if (MDNode *LID = OrigLoop->getLoopID()) - Lp->setLoopID(LID); + L->setLoopID(LID); - LoopVectorizeHints Hints(Lp, true, *ORE); + LoopVectorizeHints Hints(L, true, *ORE); Hints.setAlreadyVectorized(); #ifdef EXPENSIVE_CHECKS @@ -3165,6 +3516,91 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { return LoopVectorPreHeader; } +BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { + /* + In this function we generate a new loop. The new loop will contain + the vectorized instructions while the old loop will continue to run the + scalar remainder. + + [ ] <-- loop iteration number check. 
+ / | + / v + | [ ] <-- vector loop bypass (may consist of multiple blocks). + | / | + | / v + || [ ] <-- vector pre header. + |/ | + | v + | [ ] \ + | [ ]_| <-- vector loop. + | | + | v + | -[ ] <--- middle-block. + | / | + | / v + -|- >[ ] <--- new preheader. + | | + | v + | [ ] \ + | [ ]_| <-- old scalar loop to handle remainder. + \ | + \ v + >[ ] <-- exit block. + ... + */ + + // Get the metadata of the original loop before it gets modified. + MDNode *OrigLoopID = OrigLoop->getLoopID(); + + // Create an empty vector loop, and prepare basic blocks for the runtime + // checks. + Loop *Lp = createVectorLoopSkeleton(""); + + // Now, compare the new count to zero. If it is zero skip the vector loop and + // jump to the scalar loop. This check also covers the case where the + // backedge-taken count is uint##_max: adding one to it will overflow leading + // to an incorrect trip count of zero. In this (rare) case we will also jump + // to the scalar loop. + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); + + // Generate the code to check any assumptions that we've made for SCEV + // expressions. + emitSCEVChecks(Lp, LoopScalarPreHeader); + + // Generate the code that checks in runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); + + // Some loops have a single integer induction variable, while other loops + // don't. One example is c++ iterators that often have multiple pointer + // induction variables. In the code below we also support a case where we + // don't have a single induction variable. + // + // We try to obtain an induction variable from the original loop as hard + // as possible. However if we don't find one that: + // - is an integer + // - counts from zero, stepping by one + // - is the size of the widest induction variable type + // then we create a new one. 
+ OldInduction = Legal->getPrimaryInduction(); + Type *IdxTy = Legal->getWidestInductionType(); + Value *StartIdx = ConstantInt::get(IdxTy, 0); + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); + Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + + // Emit phis for the new starting index of the scalar loop. + createInductionResumeValues(Lp, CountRoundDown); + + return completeLoopSkeleton(Lp, OrigLoopID); +} + // Fix up external users of the induction variable. At this point, we are // in LCSSA form, with all external PHIs that use the IV having one input value, // coming from the remainder loop. We need those PHIs to also have a correct @@ -3178,7 +3614,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, // value (the value that feeds into the phi from the loop latch). // We allow both, but they, obviously, have different values. 
- assert(OrigLoop->getExitBlock() && "Expected a single exit block"); + assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); DenseMap<Value *, Value *> MissingVals; @@ -3284,9 +3720,10 @@ static void cse(BasicBlock *BB) { } } -unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, - unsigned VF, - bool &NeedToScalarize) { +InstructionCost +LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, + bool &NeedToScalarize) { + assert(!VF.isScalable() && "scalable vectors not yet supported."); Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector<Type *, 4> Tys, ScalarTys; @@ -3297,9 +3734,9 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // to be vectors, so we need to extract individual elements from there, // execute VF scalar calls, and then gather the result into the vector return // value. - unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, - TTI::TCK_RecipThroughput); - if (VF == 1) + InstructionCost ScalarCallCost = + TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); + if (VF.isScalar()) return ScalarCallCost; // Compute corresponding vector type for return value and arguments. @@ -3309,31 +3746,33 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. - unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); + InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); - unsigned Cost = ScalarCallCost * VF + ScalarizationCost; + InstructionCost Cost = + ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. 
NeedToScalarize = true; - VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); + VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); if (!TLI || CI->isNoBuiltin() || !VecFunc) return Cost; // If the corresponding vector cost is cheaper, return its cost. - unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, - TTI::TCK_RecipThroughput); + InstructionCost VectorCallCost = + TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); if (VectorCallCost < Cost) { NeedToScalarize = false; - return VectorCallCost; + Cost = VectorCallCost; } return Cost; } -unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, - unsigned VF) { +InstructionCost +LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, + ElementCount VF) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); @@ -3373,7 +3812,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), KV.second); auto *TruncatedTy = FixedVectorType::get( - ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements()); + ScalarTruncatedTy, + cast<FixedVectorType>(OriginalTy)->getNumElements()); if (TruncatedTy == OriginalTy) continue; @@ -3423,13 +3863,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { break; } } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { - auto Elements0 = - cast<VectorType>(SI->getOperand(0)->getType())->getNumElements(); + auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) + ->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( SI->getOperand(0), FixedVectorType::get(ScalarTruncatedTy, Elements0)); - auto Elements1 = - cast<VectorType>(SI->getOperand(1)->getType())->getNumElements(); + auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) + ->getNumElements(); auto *O1 = 
B.CreateZExtOrTrunc( SI->getOperand(1), FixedVectorType::get(ScalarTruncatedTy, Elements1)); @@ -3439,16 +3879,16 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { // Don't do anything with the operands, just extend the result. continue; } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { - auto Elements = - cast<VectorType>(IE->getOperand(0)->getType())->getNumElements(); + auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) + ->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( IE->getOperand(0), FixedVectorType::get(ScalarTruncatedTy, Elements)); auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { - auto Elements = - cast<VectorType>(EE->getOperand(0)->getType())->getNumElements(); + auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) + ->getNumElements(); auto *O0 = B.CreateZExtOrTrunc( EE->getOperand(0), FixedVectorType::get(ScalarTruncatedTy, Elements)); @@ -3490,7 +3930,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { void InnerLoopVectorizer::fixVectorizedLoop() { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. - if (VF > 1) + if (VF.isVector()) truncateToMinimalBitwidths(); // Fix widened non-induction PHIs by setting up the PHI operands. @@ -3531,9 +3971,13 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // profile is not inherently precise anyway. Note also possible bypass of // vector code caused by legality checks is ignored, assigning all the weight // to the vector loop, optimistically. - setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), - LI->getLoopFor(LoopVectorBody), - LI->getLoopFor(LoopScalarBody), VF * UF); + // + // For scalable vectorization we can't know at compile time how many iterations + // of the loop are handled in one vector iteration, so instead assume a pessimistic + // vscale of '1'. 
+ setProfileInfoAfterUnrolling( + LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), + LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs() { @@ -3612,11 +4056,12 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Create a vector from the initial value. auto *VectorInit = ScalarInit; - if (VF > 1) { + if (VF.isVector()) { Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); VectorInit = Builder.CreateInsertElement( - UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)), - VectorInit, Builder.getInt32(VF - 1), "vector.recur.init"); + PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, + Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); } // We constructed a temporary phi node in the first phase of vectorization. @@ -3657,10 +4102,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. - SmallVector<int, 8> ShuffleMask(VF); - ShuffleMask[0] = VF - 1; - for (unsigned I = 1; I < VF; ++I) - ShuffleMask[I] = I + VF - 1; + assert(!VF.isScalable()); + SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); + ShuffleMask[0] = VF.getKnownMinValue() - 1; + for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) + ShuffleMask[I] = I + VF.getKnownMinValue() - 1; // The vector from which to take the initial value for the current iteration // (actual or unrolled). Initially, this is the vector phi node. @@ -3670,9 +4116,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { for (unsigned Part = 0; Part < UF; ++Part) { Value *PreviousPart = getOrCreateVectorValue(Previous, Part); Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); - auto *Shuffle = VF > 1 ? 
Builder.CreateShuffleVector(Incoming, PreviousPart, - ShuffleMask) - : Incoming; + auto *Shuffle = + VF.isVector() + ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) + : Incoming; PhiPart->replaceAllUsesWith(Shuffle); cast<Instruction>(PhiPart)->eraseFromParent(); VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); @@ -3685,10 +4132,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Extract the last vector element in the middle block. This will be the // initial value for the recurrence when jumping to the scalar loop. auto *ExtractForScalar = Incoming; - if (VF > 1) { + if (VF.isVector()) { Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); ExtractForScalar = Builder.CreateExtractElement( - ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); + ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), + "vector.recur.extract"); } // Extract the second last element in the middle block if the // Phi is used outside the loop. We need to extract the phi itself @@ -3696,9 +4144,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // will be the value when jumping to the exit block from the LoopMiddleBlock, // when the scalar loop is not run at all. Value *ExtractForPhiUsedOutsideLoop = nullptr; - if (VF > 1) + if (VF.isVector()) ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( - Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); + Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), + "vector.recur.extract.for.phi"); // When loop is unrolled without vectorizing, initialize // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of // `Incoming`. This is analogous to the vectorized case above: extracting the @@ -3722,69 +4171,31 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // vector recurrence we extracted in the middle block. 
Since the loop is in // LCSSA form, we just need to find all the phi nodes for the original scalar // recurrence in the exit block, and then add an edge for the middle block. - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { - if (LCSSAPhi.getIncomingValue(0) == Phi) { + // Note that LCSSA does not imply single entry when the original scalar loop + // had multiple exiting edges (as we always run the last iteration in the + // scalar epilogue); in that case, the exiting path through middle will be + // dynamically dead and the value picked for the phi doesn't matter. + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) + if (any_of(LCSSAPhi.incoming_values(), + [Phi](Value *V) { return V == Phi; })) LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); - } - } } void InnerLoopVectorizer::fixReduction(PHINode *Phi) { - Constant *Zero = Builder.getInt32(0); - // Get it's reduction variable descriptor. assert(Legal->isReductionVariable(Phi) && "Unable to find the reduction variable"); RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; - RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); + RecurKind RK = RdxDesc.getRecurrenceKind(); TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = - RdxDesc.getMinMaxRecurrenceKind(); setDebugLocFromInst(Builder, ReductionStartValue); - - // We need to generate a reduction vector from the incoming scalar. - // To do so, we need to generate the 'identity' vector and override - // one of the elements with the incoming scalar reduction. We need - // to do it in the vector-loop preheader. - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); // This is the vector-clone of the value that leaves the loop. 
Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); - // Find the reduction identity variable. Zero for addition, or, xor, - // one for multiplication, -1 for And. - Value *Identity; - Value *VectorStart; - if (RK == RecurrenceDescriptor::RK_IntegerMinMax || - RK == RecurrenceDescriptor::RK_FloatMinMax) { - // MinMax reduction have the start value as their identify. - if (VF == 1) { - VectorStart = Identity = ReductionStartValue; - } else { - VectorStart = Identity = - Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); - } - } else { - // Handle other reduction kinds: - Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( - RK, VecTy->getScalarType()); - if (VF == 1) { - Identity = Iden; - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - VectorStart = ReductionStartValue; - } else { - Identity = ConstantVector::getSplat({VF, false}, Iden); - - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - VectorStart = - Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); - } - } - // Wrap flags are in general invalid after vectorization, clear them. clearReductionWrapFlags(RdxDesc); @@ -3798,10 +4209,6 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); Value *Val = getOrCreateVectorValue(LoopVal, Part); - // Make sure to add the reduction start value only to the - // first unroll part. - Value *StartVal = (Part == 0) ? 
VectorStart : Identity; - cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); cast<PHINode>(VecRdxPhi) ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); } @@ -3816,8 +4223,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // If tail is folded by masking, the vector value to leave the loop should be // a Select choosing between the vectorized LoopExitInst and vectorized Phi, - // instead of the former. - if (Cost->foldTailByMasking()) { + // instead of the former. For an inloop reduction the reduction will already + // be predicated, and does not need to be handled here. + if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecLoopExitInst = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); @@ -3831,14 +4239,31 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } assert(Sel && "Reduction exit feeds no select"); VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); + + // If the target can create a predicated operator for the reduction at no + // extra cost in the loop (for example a predicated vadd), it can be + // cheaper for the select to remain in the loop than be sunk out of it, + // and so use the select value for the phi instead of the old + // LoopExitValue. + RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; + if (PreferPredicatedReductionSelect || + TTI->preferPredicatedReductionSelect( + RdxDesc.getOpcode(), Phi->getType(), + TargetTransformInfo::ReductionFlags())) { + auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); + VecRdxPhi->setIncomingValueForBlock( + LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); + } } } // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. 
- if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { - Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF); + if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { + assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); Builder.SetInsertPoint( LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); VectorParts RdxParts(UF); @@ -3865,7 +4290,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // Reduce all of the unrolled parts into a single vector. Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); - unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); + unsigned Op = RecurrenceDescriptor::getOpcode(RK); // The middle block terminator has already been assigned a DebugLoc here (the // OrigLoop's single latch terminator). We want the whole middle block to @@ -3884,14 +4309,14 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { ReducedPartRdx, "bin.rdx"), RdxDesc.getFastMathFlags()); else - ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, - RdxPart); + ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); } - if (VF > 1) { - bool NoNaN = Legal->hasFunNoNaNAttr(); + // Create the reduction after the loop. Note that inloop reductions create the + // target reduction in the loop using a Reduction recipe. + if (VF.isVector() && !IsInLoopReductionPhi) { ReducedPartRdx = - createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); + createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. 
if (Phi->getType() != RdxDesc.getRecurrenceType()) @@ -3911,21 +4336,17 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. - // We know that the loop is in LCSSA form. We need to update the - // PHI nodes in the exit blocks. - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { - // All PHINodes need to have a single entry edge, or two if - // we already fixed them. - assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); - // We found a reduction value exit-PHI. Update it with the - // incoming bypass edge. - if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) + // We know that the loop is in LCSSA form. We need to update the PHI nodes + // in the exit blocks. See comment on analogous loop in + // fixFirstOrderRecurrence for a more complete explaination of the logic. + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) + if (any_of(LCSSAPhi.incoming_values(), + [LoopExitInst](Value *V) { return V == LoopExitInst; })) LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); - } // end of the LCSSA phi scan. - // Fix the scalar loop reduction variable with the incoming reduction sum - // from the vector body and from the backedge value. + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. 
int IncomingEdgeBlockIdx = Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); @@ -3937,9 +4358,8 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { void InnerLoopVectorizer::clearReductionWrapFlags( RecurrenceDescriptor &RdxDesc) { - RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); - if (RK != RecurrenceDescriptor::RK_IntegerAdd && - RK != RecurrenceDescriptor::RK_IntegerMult) + RecurKind RK = RdxDesc.getRecurrenceKind(); + if (RK != RecurKind::Add && RK != RecurKind::Mul) return; Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); @@ -3968,22 +4388,27 @@ void InnerLoopVectorizer::clearReductionWrapFlags( void InnerLoopVectorizer::fixLCSSAPHIs() { for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { - if (LCSSAPhi.getNumIncomingValues() == 1) { - auto *IncomingValue = LCSSAPhi.getIncomingValue(0); - // Non-instruction incoming values will have only one value. - unsigned LastLane = 0; - if (isa<Instruction>(IncomingValue)) - LastLane = Cost->isUniformAfterVectorization( - cast<Instruction>(IncomingValue), VF) - ? 0 - : VF - 1; - // Can be a loop invariant incoming value or the last scalar value to be - // extracted from the vectorized loop. - Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - Value *lastIncomingValue = - getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); - LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); - } + if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) + // Some phis were already hand updated by the reduction and recurrence + // code above, leave them alone. + continue; + + auto *IncomingValue = LCSSAPhi.getIncomingValue(0); + // Non-instruction incoming values will have only one value. + unsigned LastLane = 0; + if (isa<Instruction>(IncomingValue)) + LastLane = Cost->isUniformAfterVectorization( + cast<Instruction>(IncomingValue), VF) + ? 
0 + : VF.getKnownMinValue() - 1; + assert((!VF.isScalable() || LastLane == 0) && + "scalable vectors dont support non-uniform scalars yet"); + // Can be a loop invariant incoming value or the last scalar value to be + // extracted from the vectorized loop. + Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); + Value *lastIncomingValue = + getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); + LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); } } @@ -4087,9 +4512,9 @@ void InnerLoopVectorizer::fixNonInductionPHIs() { } } -void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, - unsigned UF, unsigned VF, - bool IsPtrLoopInvariant, +void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, + VPUser &Operands, unsigned UF, + ElementCount VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State) { // Construct a vector GEP by widening the operands of the scalar GEP as @@ -4098,7 +4523,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, // is vector-typed. Thus, to keep the representation compact, we only use // vector-typed operands for loop-varying values. - if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { // If we are vectorizing, but the GEP has only loop-invariant operands, // the GEP we build (by only using vector-typed operands for // loop-varying values) would be a scalar pointer. 
Thus, to ensure we @@ -4114,7 +4539,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, auto *Clone = Builder.Insert(GEP->clone()); for (unsigned Part = 0; Part < UF; ++Part) { Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); + State.set(VPDef, GEP, EntryPart, Part); addMetadata(EntryPart, GEP); } } else { @@ -4149,16 +4574,19 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, Indices) : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); - assert((VF == 1 || NewGEP->getType()->isVectorTy()) && + assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && "NewGEP is not a pointer vector"); - VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); + State.set(VPDef, GEP, NewGEP, Part); addMetadata(NewGEP, GEP); } } } -void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, - unsigned VF) { +void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, + RecurrenceDescriptor *RdxDesc, + Value *StartV, unsigned UF, + ElementCount VF) { + assert(!VF.isScalable() && "scalable vectors not yet supported."); PHINode *P = cast<PHINode>(PN); if (EnableVPlanNativePath) { // Currently we enter here in the VPlan-native path for non-induction @@ -4166,7 +4594,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Create a vector phi with no operands - the vector phi operands will be // set at the end of vector code generation. Type *VecTy = - (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF); + (VF.isScalar()) ? 
PN->getType() : VectorType::get(PN->getType(), VF); Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); VectorLoopValueMap.setVectorValue(P, 0, VecPhi); OrigPHIsToFix.push_back(P); @@ -4181,18 +4609,60 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Phi nodes have cycles, so we need to vectorize them in two stages. This is // stage #1: We create a new vector PHI node with no incoming edges. We'll use // this value when we vectorize all of the instructions that use the PHI. - if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { + if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { + Value *Iden = nullptr; + bool ScalarPHI = + (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); + Type *VecTy = + ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); + + if (RdxDesc) { + assert(Legal->isReductionVariable(P) && StartV && + "RdxDesc should only be set for reduction variables; in that case " + "a StartV is also required"); + RecurKind RK = RdxDesc->getRecurrenceKind(); + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { + // MinMax reduction have the start value as their identify. + if (ScalarPHI) { + Iden = StartV; + } else { + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident"); + } + } else { + Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( + RK, VecTy->getScalarType()); + Iden = IdenC; + + if (!ScalarPHI) { + Iden = ConstantVector::getSplat(VF, IdenC); + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + Constant *Zero = Builder.getInt32(0); + StartV = Builder.CreateInsertElement(Iden, StartV, Zero); + } + } + } + for (unsigned Part = 0; Part < UF; ++Part) { // This is phase one of vectorizing PHIs. - Type *VecTy = - (VF == 1) ? 
PN->getType() : FixedVectorType::get(PN->getType(), VF); Value *EntryPart = PHINode::Create( VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); VectorLoopValueMap.setVectorValue(P, Part, EntryPart); + if (StartV) { + // Make sure to add the reduction start value only to the + // first unroll part. + Value *StartVal = (Part == 0) ? StartV : Iden; + cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); + } } return; } + assert(!Legal->isReductionVariable(P) && + "reductions should be handled above"); + setDebugLocFromInst(Builder, P); // This PHINode must be an induction variable. @@ -4213,26 +4683,74 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, case InductionDescriptor::IK_PtrInduction: { // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); - // This is the normalized GEP that starts counting at zero. - Value *PtrInd = Induction; - PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); - // Determine the number of scalars we need to generate for each unroll - // iteration. If the instruction is uniform, we only need to generate the - // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; - // These are the scalar results. Notice that we don't generate vector GEPs - // because scalar GEPs result in better code. - for (unsigned Part = 0; Part < UF; ++Part) { - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); - Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); - Value *SclrGep = - emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); - SclrGep->setName("next.gep"); - VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); + + if (Cost->isScalarAfterVectorization(P, VF)) { + // This is the normalized GEP that starts counting at zero. 
+ Value *PtrInd = + Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); + // Determine the number of scalars we need to generate for each unroll + // iteration. If the instruction is uniform, we only need to generate the + // first lane. Otherwise, we generate all VF values. + unsigned Lanes = + Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue(); + for (unsigned Part = 0; Part < UF; ++Part) { + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + Constant *Idx = ConstantInt::get(PtrInd->getType(), + Lane + Part * VF.getKnownMinValue()); + Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); + Value *SclrGep = + emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); + SclrGep->setName("next.gep"); + VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); + } } + return; + } + assert(isa<SCEVConstant>(II.getStep()) && + "Induction step not a SCEV constant!"); + Type *PhiType = II.getStep()->getType(); + + // Build a pointer phi + Value *ScalarStartValue = II.getStartValue(); + Type *ScStValueType = ScalarStartValue->getType(); + PHINode *NewPointerPhi = + PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); + NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); + + // A pointer induction, performed by using a gep + BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); + Instruction *InductionLoc = LoopLatch->getTerminator(); + const SCEV *ScalarStep = II.getStep(); + SCEVExpander Exp(*PSE.getSE(), DL, "induction"); + Value *ScalarStepValue = + Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); + Value *InductionGEP = GetElementPtrInst::Create( + ScStValueType->getPointerElementType(), NewPointerPhi, + Builder.CreateMul( + ScalarStepValue, + ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), + "ptr.ind", InductionLoc); + NewPointerPhi->addIncoming(InductionGEP, LoopLatch); + + // Create UF many actual address geps that use the pointer + // phi as base and a vectorized version of the 
step value + // (<step*0, ..., step*N>) as offset. + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<Constant *, 8> Indices; + // Create a vector of consecutive numbers from zero to VF. + for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) + Indices.push_back( + ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); + Constant *StartOffset = ConstantVector::get(Indices); + + Value *GEP = Builder.CreateGEP( + ScStValueType->getPointerElementType(), NewPointerPhi, + Builder.CreateMul( + StartOffset, + Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), + "vector.gep")); + VectorLoopValueMap.setVectorValue(P, Part, GEP); } - return; } } } @@ -4255,7 +4773,8 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, +void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, + VPUser &User, VPTransformState &State) { switch (I.getOpcode()) { case Instruction::Call: @@ -4297,7 +4816,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, VecOp->copyIRFlags(&I); // Use this vector value for all users of the original instruction. - VectorLoopValueMap.setVectorValue(&I, Part, V); + State.set(Def, &I, V, Part); addMetadata(V, &I); } @@ -4321,7 +4840,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, } else { C = Builder.CreateICmp(Cmp->getPredicate(), A, B); } - VectorLoopValueMap.setVectorValue(&I, Part, C); + State.set(Def, &I, C, Part); addMetadata(C, &I); } @@ -4345,12 +4864,12 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, /// Vectorize casts. Type *DestTy = - (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF); + (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); for (unsigned Part = 0; Part < UF; ++Part) { Value *A = State.get(User.getOperand(0), Part); Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); - VectorLoopValueMap.setVectorValue(&I, Part, Cast); + State.set(Def, &I, Cast, Part); addMetadata(Cast, &I); } break; @@ -4362,7 +4881,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, } // end of switch. } -void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, +void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, + VPUser &ArgOperands, VPTransformState &State) { assert(!isa<DbgInfoIntrinsic>(I) && "DbgInfoIntrinsic should have been dropped during VPlan construction"); @@ -4373,7 +4893,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, SmallVector<Type *, 4> Tys; for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); @@ -4381,11 +4901,13 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? bool NeedToScalarize = false; - unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); - bool UseVectorIntrinsic = - ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; + InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost IntrinsicCost = ID ? 
Cost->getVectorIntrinsicCost(CI, VF) : 0; + bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; assert((UseVectorIntrinsic || !NeedToScalarize) && "Instruction should be scalarized elsewhere."); + assert(IntrinsicCost.isValid() && CallCost.isValid() && + "Cannot have invalid costs while widening"); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector<Value *, 4> Args; @@ -4404,15 +4926,15 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, if (UseVectorIntrinsic) { // Use vector version of the intrinsic. Type *TysForDecl[] = {CI->getType()}; - if (VF > 1) - TysForDecl[0] = - FixedVectorType::get(CI->getType()->getScalarType(), VF); + if (VF.isVector()) { + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); + } VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); assert(VectorF && "Can't retrieve vector intrinsic."); } else { // Use vector version of the function call. 
- const VFShape Shape = - VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); + const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); #ifndef NDEBUG assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && "Can't create vector function."); @@ -4426,12 +4948,12 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, if (isa<FPMathOperator>(V)) V->copyFastMathFlags(CI); - VectorLoopValueMap.setVectorValue(&I, Part, V); + State.set(Def, &I, V, Part); addMetadata(V, &I); } } -void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, +void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, bool InvariantCond, VPTransformState &State) { @@ -4450,16 +4972,16 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, Value *Op0 = State.get(Operands.getOperand(1), Part); Value *Op1 = State.get(Operands.getOperand(2), Part); Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); - VectorLoopValueMap.setVectorValue(&I, Part, Sel); + State.set(VPDef, &I, Sel, Part); addMetadata(Sel, &I); } } -void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { +void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. - assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && + assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && "This function should not be visited twice for the same VF"); SmallSetVector<Instruction *, 8> Worklist; @@ -4468,6 +4990,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // accesses that will remain scalar. 
SmallSetVector<Instruction *, 8> ScalarPtrs; SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; + auto *Latch = TheLoop->getLoopLatch(); // A helper that returns true if the use of Ptr by MemAccess will be scalar. // The pointer operands of loads and stores will be scalar as long as the @@ -4493,11 +5016,33 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { !TheLoop->isLoopInvariant(V); }; - // A helper that evaluates a memory access's use of a pointer. If the use - // will be a scalar use, and the pointer is only used by memory accesses, we - // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in - // PossibleNonScalarPtrs. + auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { + if (!isa<PHINode>(Ptr) || + !Legal->getInductionVars().count(cast<PHINode>(Ptr))) + return false; + auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; + if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) + return false; + return isScalarUse(MemAccess, Ptr); + }; + + // A helper that evaluates a memory access's use of a pointer. If the + // pointer is actually the pointer induction of a loop, it is being + // inserted into Worklist. If the use will be a scalar use, and the + // pointer is only used by memory accesses, we place the pointer in + // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { + if (isScalarPtrInduction(MemAccess, Ptr)) { + Worklist.insert(cast<Instruction>(Ptr)); + Instruction *Update = cast<Instruction>( + cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); + Worklist.insert(Update); + LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr + << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update + << "\n"); + return; + } // We only care about bitcast and getelementptr instructions contained in // the loop. 
if (!isLoopVaryingBitCastOrGEP(Ptr)) @@ -4521,10 +5066,9 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { }; // We seed the scalars analysis with three classes of instructions: (1) - // instructions marked uniform-after-vectorization, (2) bitcast and - // getelementptr instructions used by memory accesses requiring a scalar use, - // and (3) pointer induction variables and their update instructions (we - // currently only scalarize these). + // instructions marked uniform-after-vectorization and (2) bitcast, + // getelementptr and (pointer) phi instructions used by memory accesses + // requiring a scalar use. // // (1) Add to the worklist all instructions that have been identified as // uniform-after-vectorization. @@ -4550,24 +5094,6 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { Worklist.insert(I); } - // (3) Add to the worklist all pointer induction variables and their update - // instructions. - // - // TODO: Once we are able to vectorize pointer induction variables we should - // no longer insert them into the worklist here. - auto *Latch = TheLoop->getLoopLatch(); - for (auto &Induction : Legal->getInductionVars()) { - auto *Ind = Induction.first; - auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); - if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) - continue; - Worklist.insert(Ind); - Worklist.insert(IndUpdate); - LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); - LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate - << "\n"); - } - // Insert the forced scalars. // FIXME: Currently widenPHIInstruction() often creates a dead vector // induction variable when the PHI user is scalarized. 
@@ -4603,14 +5129,6 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { auto *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); - // We already considered pointer induction variables, so there's no reason - // to look at their users again. - // - // TODO: Once we are able to vectorize pointer induction variables we - // should no longer skip over them here. - if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) - continue; - // If tail-folding is applied, the primary induction variable will be used // to feed a vector compare. if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) @@ -4646,7 +5164,8 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { +bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, + ElementCount VF) { if (!blockNeedsPredication(I->getParent())) return false; switch(I->getOpcode()) { @@ -4660,7 +5179,7 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne auto *Ty = getMemInstValueType(I); // We have already decided how to vectorize this instruction, get that // result. 
- if (VF > 1) { + if (VF.isVector()) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -4681,8 +5200,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne return false; } -bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, - unsigned VF) { +bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( + Instruction *I, ElementCount VF) { assert(isAccessInterleaved(I) && "Expecting interleaved access."); assert(getWideningDecision(I, VF) == CM_Unknown && "Decision should not be set yet."); @@ -4718,8 +5237,8 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, : TTI.isLegalMaskedStore(Ty, Alignment); } -bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, - unsigned VF) { +bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( + Instruction *I, ElementCount VF) { // Get and ensure we have a valid memory instruction. LoadInst *LI = dyn_cast<LoadInst>(I); StoreInst *SI = dyn_cast<StoreInst>(I); @@ -4746,13 +5265,13 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, return true; } -void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { +void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which // already does this check. Collecting Uniforms for VF=1 does not make any // sense. - assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && + assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. 
If we'll not find any uniform value, we'll @@ -4778,6 +5297,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // replicating region where only a single instance out of VF should be formed. // TODO: optimize such seldom cases if found important, see PR40816. auto addToWorklistIfAllowed = [&](Instruction *I) -> void { + if (isOutOfScope(I)) { + LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " + << *I << "\n"); + return; + } if (isScalarWithPredication(I, VF)) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); @@ -4794,65 +5318,71 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) addToWorklistIfAllowed(Cmp); - // Holds consecutive and consecutive-like pointers. Consecutive-like pointers - // are pointers that are treated like consecutive pointers during - // vectorization. The pointer operands of interleaved accesses are an - // example. - SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; - - // Holds pointer operands of instructions that are possibly non-uniform. - SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; - - auto isUniformDecision = [&](Instruction *I, unsigned VF) { + auto isUniformDecision = [&](Instruction *I, ElementCount VF) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); + // A uniform memory op is itself uniform. We exclude uniform stores + // here as they demand the last lane, not the first one. + if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { + assert(WideningDecision == CM_Scalarize); + return true; + } + return (WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse || WideningDecision == CM_Interleave); }; - // Iterate over the instructions in the loop, and collect all - // consecutive-like pointer operands in ConsecutiveLikePtrs. 
If it's possible - // that a consecutive-like pointer operand will be scalarized, we collect it - // in PossibleNonUniformPtrs instead. We use two sets here because a single - // getelementptr instruction can be used by both vectorized and scalarized - // memory instructions. For example, if a loop loads and stores from the same - // location, but the store is conditional, the store will be scalarized, and - // the getelementptr won't remain uniform. + + + // Returns true if Ptr is the pointer operand of a memory access instruction + // I, and I is known to not require scalarization. + auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { + return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); + }; + + // Holds a list of values which are known to have at least one uniform use. + // Note that there may be other uses which aren't uniform. A "uniform use" + // here is something which only demands lane 0 of the unrolled iterations; + // it does not imply that all lanes produce the same value (e.g. this is not + // the usual meaning of uniform) + SmallPtrSet<Value *, 8> HasUniformUse; + + // Scan the loop for instructions which are either a) known to have only + // lane 0 demanded or b) are uses which demand only lane 0 of their operand. for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { // If there's no pointer operand, there's nothing to do. - auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); + auto *Ptr = getLoadStorePointerOperand(&I); if (!Ptr) continue; - // True if all users of Ptr are memory accesses that have Ptr as their - // pointer operand. - auto UsersAreMemAccesses = - llvm::all_of(Ptr->users(), [&](User *U) -> bool { - return getLoadStorePointerOperand(U) == Ptr; - }); - - // Ensure the memory instruction will not be scalarized or used by - // gather/scatter, making its pointer operand non-uniform. 
If the pointer - // operand is used by any instruction other than a memory access, we - // conservatively assume the pointer operand may be non-uniform. - if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) - PossibleNonUniformPtrs.insert(Ptr); + // A uniform memory op is itself uniform. We exclude uniform stores + // here as they demand the last lane, not the first one. + if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) + addToWorklistIfAllowed(&I); - // If the memory instruction will be vectorized and its pointer operand - // is consecutive-like, or interleaving - the pointer operand should - // remain uniform. - else - ConsecutiveLikePtrs.insert(Ptr); + if (isUniformDecision(&I, VF)) { + assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); + HasUniformUse.insert(Ptr); + } } - // Add to the Worklist all consecutive and consecutive-like pointers that - // aren't also identified as possibly non-uniform. - for (auto *V : ConsecutiveLikePtrs) - if (!PossibleNonUniformPtrs.count(V)) - addToWorklistIfAllowed(V); + // Add to the worklist any operands which have *only* uniform (e.g. lane 0 + // demanding) users. Since loops are assumed to be in LCSSA form, this + // disallows uses outside the loop as well. + for (auto *V : HasUniformUse) { + if (isOutOfScope(V)) + continue; + auto *I = cast<Instruction>(V); + auto UsersAreMemAccesses = + llvm::all_of(I->users(), [&](User *U) -> bool { + return isVectorizedMemAccessUse(cast<Instruction>(U), V); + }); + if (UsersAreMemAccesses) + addToWorklistIfAllowed(I); + } // Expand Worklist in topological order: whenever a new instruction // is added , its users should be already inside Worklist. 
It ensures @@ -4875,20 +5405,12 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { auto *OI = cast<Instruction>(OV); if (llvm::all_of(OI->users(), [&](User *U) -> bool { auto *J = cast<Instruction>(U); - return Worklist.count(J) || - (OI == getLoadStorePointerOperand(J) && - isUniformDecision(J, VF)); + return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); })) addToWorklistIfAllowed(OI); } } - // Returns true if Ptr is the pointer operand of a memory access instruction - // I, and I is known to not require scalarization. - auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { - return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); - }; - // For an instruction to be added into Worklist above, all its users inside // the loop should also be in Worklist. However, this condition cannot be // true for phi nodes that form a cyclic dependence. We must process phi @@ -4961,8 +5483,8 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { return false; } -Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, - unsigned UserIC) { +Optional<ElementCount> +LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically // uniform if the target can skip. @@ -4982,9 +5504,13 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, return None; } + ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); + switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return UserVF ? 
UserVF : computeFeasibleMaxVF(TC); + return MaxVF; + case CM_ScalarEpilogueNotAllowedUsePredicate: + LLVM_FALLTHROUGH; case CM_ScalarEpilogueNotNeededUsePredicate: LLVM_DEBUG( dbgs() << "LV: vector predicate hint/switch found.\n" @@ -5005,9 +5531,26 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, // for size. if (runtimeChecksRequired()) return None; + break; } + // The only loops we can vectorize without a scalar epilogue, are loops with + // a bottom-test and a single exiting block. We'd have to handle the fact + // that not every instruction executes on the last iteration. This will + // require a lane mask which varies through the vector loop body. (TODO) + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + // If there was a tail-folding hint/switch, but we can't fold the tail by + // masking, fallback to a vectorization with a scalar epilogue. + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " + "scalar epilogue instead.\n"); + ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + return MaxVF; + } + return None; + } + // Now try the tail folding // Invalidate interleave groups that require an epilogue if we can't mask @@ -5020,10 +5563,21 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC); - assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2"); - unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; - if (TC > 0 && TC % MaxVFtimesIC == 0) { + assert(!MaxVF.isScalable() && + "Scalable vectors do not yet support tail folding"); + assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && + "MaxVF must be a power of 2"); + unsigned MaxVFtimesIC = + UserIC ? 
MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); + // Avoid tail folding if the trip count is known to be a multiple of any VF we + // chose. + ScalarEvolution *SE = PSE.getSE(); + const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + const SCEV *Rem = SE->getURemExpr( + ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); + if (Rem->isZero()) { // Accept MaxVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); return MaxVF; @@ -5038,6 +5592,20 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, return MaxVF; } + // If there was a tail-folding hint/switch, but we can't fold the tail by + // masking, fallback to a vectorization with a scalar epilogue. + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " + "scalar epilogue instead.\n"); + ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + return MaxVF; + } + + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { + LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); + return None; + } + if (TC == 0) { reportVectorizationFailure( "Unable to calculate the loop count due to complex control flow", @@ -5055,8 +5623,33 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, return None; } -unsigned -LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { +ElementCount +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF) { + bool IgnoreScalableUserVF = UserVF.isScalable() && + !TTI.supportsScalableVectors() && + !ForceTargetSupportsScalableVectors; + if (IgnoreScalableUserVF) { + LLVM_DEBUG( + dbgs() << "LV: Ignoring VF=" << UserVF + << " because target does not support scalable 
vectors.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "Ignoring VF=" << ore::NV("UserVF", UserVF) + << " because target does not support scalable vectors."; + }); + } + + // Beyond this point two scenarios are handled. If UserVF isn't specified + // then a suitable VF is chosen. If UserVF is specified and there are + // dependencies, check if it's legal. However, if a UserVF is specified and + // there are no dependencies, then there's nothing to do. + if (UserVF.isNonZero() && !IgnoreScalableUserVF && + Legal->isSafeForAnyVectorWidth()) + return UserVF; + MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5066,9 +5659,62 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from // the memory accesses that is most restrictive (involved in the smallest // dependence distance). - unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); + unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); + + // If the user vectorization factor is legally unsafe, clamp it to a safe + // value. Otherwise, return as is. + if (UserVF.isNonZero() && !IgnoreScalableUserVF) { + unsigned MaxSafeElements = + PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); + ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); - WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); + if (UserVF.isScalable()) { + Optional<unsigned> MaxVScale = TTI.getMaxVScale(); + + // Scale VF by vscale before checking if it's safe. + MaxSafeVF = ElementCount::getScalable( + MaxVScale ? 
(MaxSafeElements / MaxVScale.getValue()) : 0); + + if (MaxSafeVF.isZero()) { + // The dependence distance is too small to use scalable vectors, + // fallback on fixed. + LLVM_DEBUG( + dbgs() + << "LV: Max legal vector width too small, scalable vectorization " + "unfeasible. Using fixed-width vectorization instead.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "Max legal vector width too small, scalable vectorization " + << "unfeasible. Using fixed-width vectorization instead."; + }); + return computeFeasibleMaxVF( + ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); + } + } + + LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); + + if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) + return UserVF; + + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe, clamping to max safe VF=" << MaxSafeVF + << ".\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is unsafe, clamping to maximum safe vectorization factor " + << ore::NV("VectorizationFactor", MaxSafeVF); + }); + return MaxSafeVF; + } + + WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. 
@@ -5079,12 +5725,13 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister << " bits.\n"); - assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" - " into one vector!"); + assert(MaxVectorSize <= WidestRegister && + "Did not expect to pack so many elements" + " into one vector!"); if (MaxVectorSize == 0) { LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); MaxVectorSize = 1; - return MaxVectorSize; + return ElementCount::getFixed(MaxVectorSize); } else if (ConstTripCount && ConstTripCount < MaxVectorSize && isPowerOf2_32(ConstTripCount)) { // We need to clamp the VF to be the ConstTripCount. There is no point in @@ -5092,7 +5739,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " << ConstTripCount << "\n"); MaxVectorSize = ConstTripCount; - return MaxVectorSize; + return ElementCount::getFixed(MaxVectorSize); } unsigned MaxVF = MaxVectorSize; @@ -5100,10 +5747,10 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { (MaximizeBandwidth && isScalarEpilogueAllowed())) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). - SmallVector<unsigned, 8> VFs; + SmallVector<ElementCount, 8> VFs; unsigned NewMaxVectorSize = WidestRegister / SmallestType; for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) - VFs.push_back(VS); + VFs.push_back(ElementCount::getFixed(VS)); // For each VF calculate its register usage. 
auto RUs = calculateRegisterUsage(VFs); @@ -5118,7 +5765,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { Selected = false; } if (Selected) { - MaxVF = VFs[i]; + MaxVF = VFs[i].getKnownMinValue(); break; } } @@ -5130,30 +5777,39 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { } } } - return MaxVF; + return ElementCount::getFixed(MaxVF); } VectorizationFactor -LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { - float Cost = expectedCost(1).first; - const float ScalarCost = Cost; +LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { + // FIXME: This can be fixed for scalable vectors later, because at this stage + // the LoopVectorizer will only consider vectorizing a loop with scalable + // vectors when the loop has a hint to enable vectorization for a given VF. + assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); + + InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; + LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); + assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); + unsigned Width = 1; - LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); + const float ScalarCost = *ExpectedCost.getValue(); + float Cost = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; - if (ForceVectorization && MaxVF > 1) { + if (ForceVectorization && MaxVF.isVector()) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost // evaluation. 
Cost = std::numeric_limits<float>::max(); } - for (unsigned i = 2; i <= MaxVF; i *= 2) { + for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. - VectorizationCostTy C = expectedCost(i); - float VectorCost = C.first / (float)i; + VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); + assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); + float VectorCost = *C.first.getValue() / (float)i; LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); if (!C.second && !ForceVectorization) { @@ -5162,6 +5818,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { << " because it will not generate any vector instructions.\n"); continue; } + + // If profitable add it to ProfitableVF list. + if (VectorCost < ScalarCost) { + ProfitableVFs.push_back(VectorizationFactor( + {ElementCount::getFixed(i), (unsigned)VectorCost})); + } + if (VectorCost < Cost) { Cost = VectorCost; Width = i; @@ -5180,10 +5843,131 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); - VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; + VectorizationFactor Factor = {ElementCount::getFixed(Width), + (unsigned)(Width * Cost)}; return Factor; } +bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( + const Loop &L, ElementCount VF) const { + // Cross iteration phis such as reductions need special handling and are + // currently unsupported. 
+ if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { + return Legal->isFirstOrderRecurrence(&Phi) || + Legal->isReductionVariable(&Phi); + })) + return false; + + // Phis with uses outside of the loop require special handling and are + // currently unsupported. + for (auto &Entry : Legal->getInductionVars()) { + // Look for uses of the value of the induction at the last iteration. + Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); + for (User *U : PostInc->users()) + if (!L.contains(cast<Instruction>(U))) + return false; + // Look for uses of penultimate value of the induction. + for (User *U : Entry.first->users()) + if (!L.contains(cast<Instruction>(U))) + return false; + } + + // Induction variables that are widened require special handling that is + // currently not supported. + if (any_of(Legal->getInductionVars(), [&](auto &Entry) { + return !(this->isScalarAfterVectorization(Entry.first, VF) || + this->isProfitableToScalarize(Entry.first, VF)); + })) + return false; + + return true; +} + +bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( + const ElementCount VF) const { + // FIXME: We need a much better cost-model to take different parameters such + // as register pressure, code size increase and cost of extra branches into + // account. For now we apply a very crude heuristic and only consider loops + // with vectorization factors larger than a certain value. + // We also consider epilogue vectorization unprofitable for targets that don't + // consider interleaving beneficial (eg. MVE). 
+ if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) + return false; + if (VF.getFixedValue() >= EpilogueVectorizationMinVF) + return true; + return false; +} + +VectorizationFactor +LoopVectorizationCostModel::selectEpilogueVectorizationFactor( + const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { + VectorizationFactor Result = VectorizationFactor::Disabled(); + if (!EnableEpilogueVectorization) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); + return Result; + } + + if (!isScalarEpilogueAllowed()) { + LLVM_DEBUG( + dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " + "allowed.\n";); + return Result; + } + + // FIXME: This can be fixed for scalable vectors later, because at this stage + // the LoopVectorizer will only consider vectorizing a loop with scalable + // vectors when the loop has a hint to enable vectorization for a given VF. + if (MainLoopVF.isScalable()) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " + "yet supported.\n"); + return Result; + } + + // Not really a cost consideration, but check for unsupported cases here to + // simplify the logic. 
+ if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { + LLVM_DEBUG( + dbgs() << "LEV: Unable to vectorize epilogue because the loop is " + "not a supported candidate.\n";); + return Result; + } + + if (EpilogueVectorizationForceVF > 1) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); + if (LVP.hasPlanWithVFs( + {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) + return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; + else { + LLVM_DEBUG( + dbgs() + << "LEV: Epilogue vectorization forced factor is not viable.\n";); + return Result; + } + } + + if (TheLoop->getHeader()->getParent()->hasOptSize() || + TheLoop->getHeader()->getParent()->hasMinSize()) { + LLVM_DEBUG( + dbgs() + << "LEV: Epilogue vectorization skipped due to opt for size.\n";); + return Result; + } + + if (!isEpilogueVectorizationProfitable(MainLoopVF)) + return Result; + + for (auto &NextVF : ProfitableVFs) + if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && + (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && + LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) + Result = NextVF; + + if (Result != VectorizationFactor::Disabled()) + LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " + << Result.Width.getFixedValue() << "\n";); + return Result; +} + std::pair<unsigned, unsigned> LoopVectorizationCostModel::getSmallestAndWidestTypes() { unsigned MinWidth = -1U; @@ -5210,6 +5994,11 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { if (!Legal->isReductionVariable(PN)) continue; RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; + if (PreferInLoopReductions || + TTI.preferInLoopReduction(RdxDesc.getOpcode(), + RdxDesc.getRecurrenceType(), + TargetTransformInfo::ReductionFlags())) + continue; T = RdxDesc.getRecurrenceType(); } @@ -5240,7 +6029,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { return {MinWidth, MaxWidth}; } -unsigned 
LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -5263,10 +6052,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, if (Legal->getMaxSafeDepDistBytes() != -1U) return 1; - // Do not interleave loops with a relatively small known or estimated trip - // count. auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); - if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) + const bool HasReductions = !Legal->getReductionVars().empty(); + // Do not interleave loops with a relatively small known or estimated trip + // count. But we will interleave when InterleaveSmallLoopScalarReduction is + // enabled, and the code has scalar reductions(HasReductions && VF = 1), + // because with the above conditions interleaving can expose ILP and break + // cross iteration dependences for reductions. + if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && + !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) return 1; RegisterUsage R = calculateRegisterUsage({VF})[0]; @@ -5294,7 +6088,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters << " registers of " << TTI.getRegisterClassName(pair.first) << " register class\n"); - if (VF == 1) { + if (VF.isScalar()) { if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) TargetNumRegisters = ForceTargetNumScalarRegs; } else { @@ -5318,10 +6112,11 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, } // Clamp the interleave ranges to reasonable counts. 
- unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); + unsigned MaxInterleaveCount = + TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); // Check if the user has overridden the max. - if (VF == 1) { + if (VF.isScalar()) { if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; } else { @@ -5330,28 +6125,47 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, } // If trip count is known or estimated compile time constant, limit the - // interleave count to be less than the trip count divided by VF. + // interleave count to be less than the trip count divided by VF, provided it + // is at least 1. + // + // For scalable vectors we can't know if interleaving is beneficial. It may + // not be beneficial for small loops if none of the lanes in the second vector + // iterations is enabled. However, for larger loops, there is likely to be a + // similar benefit as for fixed-width vectors. For now, we choose to leave + // the InterleaveCount as if vscale is '1', although if some information about + // the vector is known (e.g. min vector size), we can make a better decision. if (BestKnownTC) { - MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); + MaxInterleaveCount = + std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); + // Make sure MaxInterleaveCount is greater than 0. + MaxInterleaveCount = std::max(1u, MaxInterleaveCount); } - // If we did not calculate the cost for VF (because the user selected the VF) - // then we calculate the cost of VF here. - if (LoopCost == 0) - LoopCost = expectedCost(VF).first; - - assert(LoopCost && "Non-zero loop cost expected"); + assert(MaxInterleaveCount > 0 && + "Maximum interleave count must be greater than 0"); // Clamp the calculated IC to be between the 1 and the max interleave count // that the target and trip count allows. 
if (IC > MaxInterleaveCount) IC = MaxInterleaveCount; - else if (IC < 1) - IC = 1; + else + // Make sure IC is greater than 0. + IC = std::max(1u, IC); + + assert(IC > 0 && "Interleave count must be greater than 0."); + + // If we did not calculate the cost for VF (because the user selected the VF) + // then we calculate the cost of VF here. + if (LoopCost == 0) { + assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); + LoopCost = *expectedCost(VF).first.getValue(); + } + + assert(LoopCost && "Non-zero loop cost expected"); // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. - if (VF > 1 && !Legal->getReductionVars().empty()) { + if (VF.isVector() && HasReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -5359,11 +6173,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // Note that if we've already vectorized the loop we will have done the // runtime check and so interleaving won't require further checks. bool InterleavingRequiresRuntimePointerCheck = - (VF == 1 && Legal->getRuntimePointerChecking()->Need); + (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. 
- LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); + LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' + << "LV: IC is " << IC << '\n' + << "LV: VF is " << VF << '\n'); + const bool AggressivelyInterleaveReductions = + TTI.enableAggressiveInterleaving(HasReductions); if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and interleave until the cost of the @@ -5382,7 +6200,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // by this point), we can increase the critical path length if the loop // we're interleaving is inside another loop. Limit, by default to 2, so the // critical path only gets increased by one reduction operation. - if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { + if (HasReductions && TheLoop->getLoopDepth() > 1) { unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); SmallIC = std::min(SmallIC, F); StoresIC = std::min(StoresIC, F); @@ -5396,14 +6214,23 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, return std::max(StoresIC, LoadsIC); } - LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); - return SmallIC; + // If there are scalar reductions and TTI has enabled aggressive + // interleaving for reductions, we will interleave to expose ILP. + if (InterleaveSmallLoopScalarReduction && VF.isScalar() && + AggressivelyInterleaveReductions) { + LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); + // Interleave no less than SmallIC but not as aggressive as the normal IC + // to satisfy the rare situation when resources are too limited. 
+ return std::max(IC / 2, SmallIC); + } else { + LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); + return SmallIC; + } } // Interleave if this is a large loop (small loops are already dealt with by // this point) that could benefit from interleaving. - bool HasReductions = !Legal->getReductionVars().empty(); - if (TTI.enableAggressiveInterleaving(HasReductions)) { + if (AggressivelyInterleaveReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); return IC; } @@ -5413,7 +6240,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, } SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> -LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { +LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -5485,26 +6312,17 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { TransposeEnds[Interval.second].push_back(Interval.first); SmallPtrSet<Instruction *, 8> OpenIntervals; - - // Get the size of the widest register. - unsigned MaxSafeDepDist = -1U; - if (Legal->getMaxSafeDepDistBytes() != -1U) - MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; - unsigned WidestRegister = - std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); - const DataLayout &DL = TheFunction->getParent()->getDataLayout(); - SmallVector<RegisterUsage, 8> RUs(VFs.size()); SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); // A lambda that gets the register usage for the given type and VF. 
- auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { - if (Ty->isTokenTy()) + const auto &TTICapture = TTI; + auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { + if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) return 0U; - unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); - return std::max<unsigned>(1, VF * TypeSize / WidestRegister); + return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -5528,7 +6346,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { // Count the number of live intervals. SmallMapVector<unsigned, unsigned, 4> RegUsage; - if (VFs[j] == 1) { + if (VFs[j].isScalar()) { for (auto Inst : OpenIntervals) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); if (RegUsage.find(ClassID) == RegUsage.end()) @@ -5557,7 +6375,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { } } } - + for (auto& pair : RegUsage) { if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); @@ -5575,10 +6393,12 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { for (unsigned i = 0, e = VFs.size(); i < e; ++i) { SmallMapVector<unsigned, unsigned, 4> Invariant; - + for (auto Inst : LoopInvariants) { - unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); - unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); + unsigned Usage = + VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned ClassID = + TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); if (Invariant.find(ClassID) == Invariant.end()) Invariant[ClassID] = Usage; else @@ -5626,12 +6446,13 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ NumPredStores > NumberOfStoresToPredicate); } -void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { +void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { // If we aren't vectorizing the loop, or if we've already collected the // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. - if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) + if (VF.isScalar() || VF.isZero() || + InstsToScalarize.find(VF) != InstsToScalarize.end()) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's @@ -5660,14 +6481,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { } int LoopVectorizationCostModel::computePredInstDiscount( - Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, - unsigned VF) { + Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { assert(!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"); // Initialize the discount to zero, meaning that the scalar version and the // vector version cost the same. - int Discount = 0; + InstructionCost Discount = 0; // Holds instructions to analyze. The instructions we visit are mapped in // ScalarCosts. Those instructions are the ones that would be scalarized if @@ -5722,22 +6542,27 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the cost of the vector instruction. Note that this cost already // includes the scalarization overhead of the predicated instruction. 
- unsigned VectorCost = getInstructionCost(I, VF).first; + InstructionCost VectorCost = getInstructionCost(I, VF).first; // Compute the cost of the scalarized instruction. This cost is the cost of // the instruction as if it wasn't if-converted and instead remained in the // predicated block. We will scale this cost by block probability after // computing the scalarization overhead. - unsigned ScalarCost = VF * getInstructionCost(I, 1).first; + assert(!VF.isScalable() && "scalable vectors not yet supported."); + InstructionCost ScalarCost = + VF.getKnownMinValue() * + getInstructionCost(I, ElementCount::getFixed(1)).first; // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(I->getType(), VF)), - APInt::getAllOnesValue(VF), true, false); - ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI, - TTI::TCK_RecipThroughput); + APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + ScalarCost += + VF.getKnownMinValue() * + TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); } // Compute the scalarization overhead of needed extractelement @@ -5750,10 +6575,12 @@ int LoopVectorizationCostModel::computePredInstDiscount( "Instruction has non-scalar type"); if (canBeScalarized(J)) Worklist.push_back(J); - else if (needsExtract(J, VF)) + else if (needsExtract(J, VF)) { + assert(!VF.isScalable() && "scalable vectors not yet supported."); ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(J->getType(), VF)), - APInt::getAllOnesValue(VF), false, true); + APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); + } } // Scale the total scalar cost by block probability. 
@@ -5765,11 +6592,11 @@ int LoopVectorizationCostModel::computePredInstDiscount( ScalarCosts[I] = ScalarCost; } - return Discount; + return *Discount.getValue(); } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::expectedCost(unsigned VF) { +LoopVectorizationCostModel::expectedCost(ElementCount VF) { VectorizationCostTy Cost; // For each block. @@ -5779,14 +6606,15 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { // For each instruction in the old loop. for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. - if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I))) + if (ValuesToIgnore.count(&I) || + (VF.isVector() && VecValuesToIgnore.count(&I))) continue; VectorizationCostTy C = getInstructionCost(&I, VF); // Check if we should override the cost. if (ForceTargetInstructionCost.getNumOccurrences() > 0) - C.first = ForceTargetInstructionCost; + C.first = InstructionCost(ForceTargetInstructionCost); BlockCost.first += C.first; BlockCost.second |= C.second; @@ -5799,9 +6627,10 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { // if-converted. This means that the block's instructions (aside from // stores and instructions that may divide by zero) will now be // unconditionally executed. For the scalar case, we may not always execute - // the predicated block. Thus, scale the block's cost by the probability of - // executing it. - if (VF == 1 && blockNeedsPredication(BB)) + // the predicated block, if it is an if-else block. Thus, scale the block's + // cost by the probability of executing it. blockNeedsPredication from + // Legal is used so as to not include all blocks in tail folded loops. 
+ if (VF.isScalar() && Legal->blockNeedsPredication(BB)) BlockCost.first /= getReciprocalPredBlockProb(); Cost.first += BlockCost.first; @@ -5846,9 +6675,12 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { Legal->hasStride(I->getOperand(1)); } -unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, - unsigned VF) { - assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); +InstructionCost +LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, + ElementCount VF) { + assert(VF.isVector() && + "Scalarization cost of instruction implies vectorization."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); @@ -5861,14 +6693,15 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); // Get the cost of the scalar memory instruction and address computation. - unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + InstructionCost Cost = + VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. const Align Alignment = getLoadStoreAlignment(I); - Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment, AS, - TTI::TCK_RecipThroughput); + Cost += VF.getKnownMinValue() * + TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, + AS, TTI::TCK_RecipThroughput); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. 
@@ -5889,8 +6722,9 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, return Cost; } -unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, - unsigned VF) { +InstructionCost +LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); @@ -5901,7 +6735,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Stride should be 1 or -1 for consecutive memory access"); const Align Alignment = getLoadStoreAlignment(I); - unsigned Cost = 0; + InstructionCost Cost = 0; if (Legal->isMaskRequired(I)) Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, CostKind); @@ -5915,8 +6749,11 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, return Cost; } -unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, - unsigned VF) { +InstructionCost +LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, + ElementCount VF) { + assert(Legal->isUniformMemOp(*I)); + Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -5937,11 +6774,12 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, (isLoopInvariantStoreValue ? 
0 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, - VF - 1)); + VF.getKnownMinValue() - 1)); } -unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, - unsigned VF) { +InstructionCost +LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -5953,8 +6791,9 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, TargetTransformInfo::TCK_RecipThroughput, I); } -unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, - unsigned VF) { +InstructionCost +LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); @@ -5963,7 +6802,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); - auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); // Holds the indices of existing members in an interleaved load group. // An interleaved store group doesn't need this as it doesn't allow gaps. @@ -5977,7 +6817,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, // Calculate the cost of the whole interleaved group. 
bool UseMaskForGaps = Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); - unsigned Cost = TTI.getInterleavedMemoryOpCost( + InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); @@ -5991,11 +6831,122 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, return Cost; } -unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, - unsigned VF) { +InstructionCost LoopVectorizationCostModel::getReductionPatternCost( + Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { + // Early exit for no inloop reductions + if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) + return InstructionCost::getInvalid(); + auto *VectorTy = cast<VectorType>(Ty); + + // We are looking for a pattern of, and finding the minimal acceptable cost: + // reduce(mul(ext(A), ext(B))) or + // reduce(mul(A, B)) or + // reduce(ext(A)) or + // reduce(A). + // The basic idea is that we walk down the tree to do that, finding the root + // reduction instruction in InLoopReductionImmediateChains. From there we find + // the pattern of mul/ext and test the cost of the entire pattern vs the cost + // of the components. If the reduction cost is lower then we return it for the + // reduction instruction and 0 for the other instructions in the pattern. If + // it is not we return an invalid cost specifying the orignal cost method + // should be used. 
+ Instruction *RetI = I; + if ((RetI->getOpcode() == Instruction::SExt || + RetI->getOpcode() == Instruction::ZExt)) { + if (!RetI->hasOneUser()) + return InstructionCost::getInvalid(); + RetI = RetI->user_back(); + } + if (RetI->getOpcode() == Instruction::Mul && + RetI->user_back()->getOpcode() == Instruction::Add) { + if (!RetI->hasOneUser()) + return InstructionCost::getInvalid(); + RetI = RetI->user_back(); + } + + // Test if the found instruction is a reduction, and if not return an invalid + // cost specifying the parent to use the original cost modelling. + if (!InLoopReductionImmediateChains.count(RetI)) + return InstructionCost::getInvalid(); + + // Find the reduction this chain is a part of and calculate the basic cost of + // the reduction on its own. + Instruction *LastChain = InLoopReductionImmediateChains[RetI]; + Instruction *ReductionPhi = LastChain; + while (!isa<PHINode>(ReductionPhi)) + ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; + + RecurrenceDescriptor RdxDesc = + Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; + unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), + VectorTy, false, CostKind); + + // Get the operand that was not the reduction chain and match it to one of the + // patterns, returning the better cost if it is found. + Instruction *RedOp = RetI->getOperand(1) == LastChain + ? 
dyn_cast<Instruction>(RetI->getOperand(0)) + : dyn_cast<Instruction>(RetI->getOperand(1)); + + VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); + + if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && + !TheLoop->isLoopInvariant(RedOp)) { + bool IsUnsigned = isa<ZExtInst>(RedOp); + auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); + InstructionCost RedCost = TTI.getExtendedAddReductionCost( + /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, + CostKind); + + unsigned ExtCost = + TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, + TTI::CastContextHint::None, CostKind, RedOp); + if (RedCost.isValid() && RedCost < BaseCost + ExtCost) + return I == RetI ? *RedCost.getValue() : 0; + } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { + Instruction *Mul = RedOp; + Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); + Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); + if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && + Op0->getOpcode() == Op1->getOpcode() && + Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && + !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { + bool IsUnsigned = isa<ZExtInst>(Op0); + auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); + // reduce(mul(ext, ext)) + unsigned ExtCost = + TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, + TTI::CastContextHint::None, CostKind, Op0); + unsigned MulCost = + TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); + + InstructionCost RedCost = TTI.getExtendedAddReductionCost( + /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, + CostKind); + + if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) + return I == RetI ? 
*RedCost.getValue() : 0; + } else { + unsigned MulCost = + TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); + + InstructionCost RedCost = TTI.getExtendedAddReductionCost( + /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, + CostKind); + + if (RedCost.isValid() && RedCost < MulCost + BaseCost) + return I == RetI ? *RedCost.getValue() : 0; + } + } + + return I == RetI ? BaseCost : InstructionCost::getInvalid(); +} + +InstructionCost +LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, + ElementCount VF) { // Calculate scalar cost only. Vectorization cost should be ready at this // moment. - if (VF == 1) { + if (VF.isScalar()) { Type *ValTy = getMemInstValueType(I); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); @@ -6008,43 +6959,52 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { +LoopVectorizationCostModel::getInstructionCost(Instruction *I, + ElementCount VF) { // If we know that this instruction will remain uniform, check the cost of // the scalar version. if (isUniformAfterVectorization(I, VF)) - VF = 1; + VF = ElementCount::getFixed(1); - if (VF > 1 && isProfitableToScalarize(I, VF)) + if (VF.isVector() && isProfitableToScalarize(I, VF)) return VectorizationCostTy(InstsToScalarize[VF][I], false); // Forced scalars do not have any scalarization overhead. 
auto ForcedScalar = ForcedScalars.find(VF); - if (VF > 1 && ForcedScalar != ForcedScalars.end()) { + if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { auto InstSet = ForcedScalar->second; if (InstSet.count(I)) - return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); + return VectorizationCostTy( + (getInstructionCost(I, ElementCount::getFixed(1)).first * + VF.getKnownMinValue()), + false); } Type *VectorTy; - unsigned C = getInstructionCost(I, VF, VectorTy); + InstructionCost C = getInstructionCost(I, VF, VectorTy); bool TypeNotScalarized = - VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; + VF.isVector() && VectorTy->isVectorTy() && + TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); return VectorizationCostTy(C, TypeNotScalarized); } -unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, - unsigned VF) { +InstructionCost +LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, + ElementCount VF) { - if (VF == 1) + assert(!VF.isScalable() && + "cannot compute scalarization overhead for scalable vectorization"); + if (VF.isScalar()) return 0; - unsigned Cost = 0; + InstructionCost Cost = 0; Type *RetTy = ToVectorTy(I->getType(), VF); if (!RetTy->isVoidTy() && (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead( - cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false); + cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), + true, false); // Some targets keep addresses scalar. if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) @@ -6061,11 +7021,11 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, // Skip operands that do not require extraction/scalarization and do not incur // any overhead. 
return Cost + TTI.getOperandsScalarizationOverhead( - filterExtractingOperands(Ops, VF), VF); + filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); } -void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { - if (VF == 1) +void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { + if (VF.isScalar()) return; NumPredStores = 0; for (BasicBlock *BB : TheLoop->blocks()) { @@ -6082,23 +7042,19 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) NumPredStores++; - if (Legal->isUniform(Ptr) && - // Conditional loads and stores should be scalarized and predicated. - // isScalarWithPredication cannot be used here since masked - // gather/scatters are not considered scalar with predication. - !Legal->blockNeedsPredication(I.getParent())) { + if (Legal->isUniformMemOp(I)) { // TODO: Avoid replicating loads and stores instead of // relying on instcombine to remove them. // Load: Scalar load + broadcast // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract - unsigned Cost = getUniformMemOpCost(&I, VF); + InstructionCost Cost = getUniformMemOpCost(&I, VF); setWideningDecision(&I, VF, CM_Scalarize, Cost); continue; } // We assume that widening is the best solution when possible. if (memoryInstructionCanBeWidened(&I, VF)) { - unsigned Cost = getConsecutiveMemOpCost(&I, VF); + InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); int ConsecutiveStride = Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && @@ -6110,7 +7066,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { } // Choose between Interleaving, Gather/Scatter or Scalarization. 
- unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); + InstructionCost InterleaveCost = std::numeric_limits<int>::max(); unsigned NumAccesses = 1; if (isAccessInterleaved(&I)) { auto Group = getInterleavedAccessGroup(&I); @@ -6125,17 +7081,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { InterleaveCost = getInterleaveGroupCost(&I, VF); } - unsigned GatherScatterCost = + InstructionCost GatherScatterCost = isLegalGatherOrScatter(&I) ? getGatherScatterCost(&I, VF) * NumAccesses - : std::numeric_limits<unsigned>::max(); + : std::numeric_limits<int>::max(); - unsigned ScalarizationCost = + InstructionCost ScalarizationCost = getMemInstScalarizationCost(&I, VF) * NumAccesses; // Choose better solution for the current VF, // write down this decision and use it during vectorization. - unsigned Cost; + InstructionCost Cost; InstWidening Decision; if (InterleaveCost <= GatherScatterCost && InterleaveCost < ScalarizationCost) { @@ -6179,8 +7135,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { // Add all instructions used to generate the addresses. SmallVector<Instruction *, 4> Worklist; - for (auto *I : AddrDefs) - Worklist.push_back(I); + append_range(Worklist, AddrDefs); while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); for (auto &Op : I->operands()) @@ -6199,14 +7154,18 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { InstWidening Decision = getWideningDecision(I, VF); if (Decision == CM_Widen || Decision == CM_Widen_Reverse) // Scalarize a widened load of address. - setWideningDecision(I, VF, CM_Scalarize, - (VF * getMemoryInstructionCost(I, 1))); + setWideningDecision( + I, VF, CM_Scalarize, + (VF.getKnownMinValue() * + getMemoryInstructionCost(I, ElementCount::getFixed(1)))); else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. 
for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) - setWideningDecision(Member, VF, CM_Scalarize, - (VF * getMemoryInstructionCost(Member, 1))); + setWideningDecision( + Member, VF, CM_Scalarize, + (VF.getKnownMinValue() * + getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); } } } else @@ -6216,9 +7175,9 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { } } -unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, - unsigned VF, - Type *&VectorTy) { +InstructionCost +LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, + Type *&VectorTy) { Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); @@ -6240,19 +7199,22 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // blocks requires also an extract of its vector compare i1 element. bool ScalarPredicatedBB = false; BranchInst *BI = cast<BranchInst>(I); - if (VF > 1 && BI->isConditional() && + if (VF.isVector() && BI->isConditional() && (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { // Return cost for branches around scalarized and predicated blocks. 
+ assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *Vec_i1Ty = - FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); - return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), - false, true) + - (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF)); - } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) + VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); + return (TTI.getScalarizationOverhead( + Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), + false, true) + + (TTI.getCFInstrCost(Instruction::Br, CostKind) * + VF.getKnownMinValue())); + } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) // The back-edge branch will remain, as will all scalar branches. return TTI.getCFInstrCost(Instruction::Br, CostKind); else @@ -6267,20 +7229,20 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // First-order recurrences are replaced by vector shuffles inside the loop. // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. - if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) - return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - cast<VectorType>(VectorTy), VF - 1, - FixedVectorType::get(RetTy, 1)); + if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) + return TTI.getShuffleCost( + TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), + VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi // node, where N is the number of incoming values. 
- if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) + if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( Instruction::Select, ToVectorTy(Phi->getType(), VF), ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), - CostKind); + CmpInst::BAD_ICMP_PREDICATE, CostKind); return TTI.getCFInstrCost(Instruction::PHI, CostKind); } @@ -6292,17 +7254,19 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. - if (VF > 1 && isScalarWithPredication(I)) { - unsigned Cost = 0; + if (VF.isVector() && isScalarWithPredication(I)) { + InstructionCost Cost = 0; // These instructions have a non-void type, so account for the phi nodes // that we will create. This cost is likely to be zero. The phi node // cost, if any, should be scaled by the block probability because it // models a copy at the end of each predicated block. - Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind); + Cost += VF.getKnownMinValue() * + TTI.getCFInstrCost(Instruction::PHI, CostKind); // The cost of the non-predicated instruction. - Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); + Cost += VF.getKnownMinValue() * + TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); // The cost of insertelement and extractelement instructions needed for // scalarization. @@ -6331,6 +7295,13 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // Since we will replace the stride by 1 the multiplication should go away. 
if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) return 0; + + // Detect reduction patterns + InstructionCost RedCost; + if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + .isValid()) + return RedCost; + // Certain instructions can be cheaper to vectorize if they have a constant // second vector operand. One example of this are shifts on x86. Value *Op2 = I->getOperand(1); @@ -6341,14 +7312,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Op2VK = TargetTransformInfo::OK_UniformValue; SmallVector<const Value *, 4> Operands(I->operand_values()); - unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { - unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, @@ -6362,10 +7334,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) - CondTy = FixedVectorType::get(CondTy, VF); - + CondTy = VectorType::get(CondTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, - CostKind, I); + CmpInst::BAD_ICMP_PREDICATE, CostKind, I); } case Instruction::ICmp: case Instruction::FCmp: { @@ -6374,18 +7345,18 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, - I); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, + CmpInst::BAD_ICMP_PREDICATE, CostKind, I); } case Instruction::Store: case Instruction::Load: { - unsigned Width = VF; - if (Width > 1) { + ElementCount Width = VF; + if (Width.isVector()) { InstWidening Decision = getWideningDecision(I, Width); assert(Decision != CM_Unknown && "CM decision should be taken at this point"); if (Decision == CM_Scalarize) - Width = 1; + Width = ElementCount::getFixed(1); } VectorTy = ToVectorTy(getMemInstValueType(I), Width); return getMemoryInstructionCost(I, VF); @@ -6402,15 +7373,62 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { + // Computes the CastContextHint from a Load/Store instruction. 
+ auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { + assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && + "Expected a load or a store!"); + + if (VF.isScalar() || !TheLoop->contains(I)) + return TTI::CastContextHint::Normal; + + switch (getWideningDecision(I, VF)) { + case LoopVectorizationCostModel::CM_GatherScatter: + return TTI::CastContextHint::GatherScatter; + case LoopVectorizationCostModel::CM_Interleave: + return TTI::CastContextHint::Interleave; + case LoopVectorizationCostModel::CM_Scalarize: + case LoopVectorizationCostModel::CM_Widen: + return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked + : TTI::CastContextHint::Normal; + case LoopVectorizationCostModel::CM_Widen_Reverse: + return TTI::CastContextHint::Reversed; + case LoopVectorizationCostModel::CM_Unknown: + llvm_unreachable("Instr did not go through cost modelling?"); + } + + llvm_unreachable("Unhandled case!"); + }; + + unsigned Opcode = I->getOpcode(); + TTI::CastContextHint CCH = TTI::CastContextHint::None; + // For Trunc, the context is the only user, which must be a StoreInst. + if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { + if (I->hasOneUse()) + if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) + CCH = ComputeCCH(Store); + } + // For Z/Sext, the context is the operand, which must be a LoadInst. + else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || + Opcode == Instruction::FPExt) { + if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) + CCH = ComputeCCH(Load); + } + // We optimize the truncation of induction variables having constant // integer steps. The cost of these truncations is the same as the scalar // operation. 
if (isOptimizableIVTruncate(I, VF)) { auto *Trunc = cast<TruncInst>(I); return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), - Trunc->getSrcTy(), CostKind, Trunc); + Trunc->getSrcTy(), CCH, CostKind, Trunc); } + // Detect reduction patterns + InstructionCost RedCost; + if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + .isValid()) + return RedCost; + Type *SrcScalarTy = I->getOperand(0)->getType(); Type *SrcVecTy = VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; @@ -6421,35 +7439,39 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // // Calculate the modified src and dest types. Type *MinVecTy = VectorTy; - if (I->getOpcode() == Instruction::Trunc) { + if (Opcode == Instruction::Trunc) { SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); - } else if (I->getOpcode() == Instruction::ZExt || - I->getOpcode() == Instruction::SExt) { + } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); } } - unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; - return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, - CostKind, I); + assert(!VF.isScalable() && "VF is assumed to be non scalable"); + unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; + return N * + TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } case Instruction::Call: { bool NeedToScalarize; CallInst *CI = cast<CallInst>(I); - unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); - if (getVectorIntrinsicIDForCall(CI, TLI)) - return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); + InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); + if (getVectorIntrinsicIDForCall(CI, TLI)) { + InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); + return std::min(CallCost, IntrinsicCost); + } return CallCost; } + case Instruction::ExtractValue: + return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, - CostKind) + + return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( + Instruction::Mul, VectorTy, CostKind) + getScalarizationOverhead(I, VF); } // end of switch. } @@ -6502,7 +7524,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // detection. for (auto &Reduction : Legal->getReductionVars()) { RecurrenceDescriptor &RedDes = Reduction.second; - SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); + const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } // Ignore type-casting instructions we identified during induction @@ -6514,6 +7536,43 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { } } +void LoopVectorizationCostModel::collectInLoopReductions() { + for (auto &Reduction : Legal->getReductionVars()) { + PHINode *Phi = Reduction.first; + RecurrenceDescriptor &RdxDesc = Reduction.second; + + // We don't collect reductions that are type promoted (yet). 
+ if (RdxDesc.getRecurrenceType() != Phi->getType()) + continue; + + // If the target would prefer this reduction to happen "in-loop", then we + // want to record it as such. + unsigned Opcode = RdxDesc.getOpcode(); + if (!PreferInLoopReductions && + !TTI.preferInLoopReduction(Opcode, Phi->getType(), + TargetTransformInfo::ReductionFlags())) + continue; + + // Check that we can correctly put the reductions into the loop, by + // finding the chain of operations that leads from the phi to the loop + // exit value. + SmallVector<Instruction *, 4> ReductionOperations = + RdxDesc.getReductionOpChain(Phi, TheLoop); + bool InLoop = !ReductionOperations.empty(); + if (InLoop) { + InLoopReductionChains[Phi] = ReductionOperations; + // Add the elements to InLoopReductionImmediateChains for cost modelling. + Instruction *LastChain = Phi; + for (auto *I : ReductionOperations) { + InLoopReductionImmediateChains[I] = LastChain; + LastChain = I; + } + } + LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") + << " reduction for phi: " << *Phi << "\n"); + } +} + // TODO: we could return a pair of values that specify the max VF and // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment @@ -6527,37 +7586,40 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { - unsigned VF = UserVF; +LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { + assert(!UserVF.isScalable() && "scalable vectors not yet supported"); + ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in // the vectorization pipeline. 
- if (!OrigLoop->empty()) { + if (!OrigLoop->isInnermost()) { // If the user doesn't provide a vectorization factor, determine a // reasonable one. - if (!UserVF) { - VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); + if (UserVF.isZero()) { + VF = ElementCount::getFixed( + determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. - if (VPlanBuildStressTest && VF < 2) { + if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " << "overriding computed VF.\n"); - VF = 4; + VF = ElementCount::getFixed(4); } } assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); - LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF - << " to build VPlans.\n"); + assert(isPowerOf2_32(VF.getKnownMinValue()) && + "VF needs to be a power of two"); + LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") + << "VF " << VF << " to build VPlans.\n"); buildVPlans(VF, VF); // For VPlan build stress testing, we bail out after VPlan construction. 
if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0}; + return {VF, 0 /*Cost*/}; } LLVM_DEBUG( @@ -6566,10 +7628,10 @@ LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { return VectorizationFactor::Disabled(); } -Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF, - unsigned UserIC) { - assert(OrigLoop->empty() && "Inner loop expected."); - Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); +Optional<VectorizationFactor> +LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { + assert(OrigLoop->isInnermost() && "Inner loop expected."); + Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -6587,40 +7649,55 @@ Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF, CM.invalidateCostModelingDecisions(); } - if (UserVF) { - LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); - assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); + ElementCount MaxVF = MaybeMaxVF.getValue(); + assert(MaxVF.isNonZero() && "MaxVF is zero."); + + bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); + if (!UserVF.isZero() && + (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { + // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable + // VFs here, this should be reverted to only use legal UserVFs once the + // loop below supports scalable VFs. + ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; + LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") + << " VF " << VF << ".\n"); + assert(isPowerOf2_32(VF.getKnownMinValue()) && + "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. 
- CM.selectUserVectorizationFactor(UserVF); - buildVPlansWithVPRecipes(UserVF, UserVF); + CM.selectUserVectorizationFactor(VF); + CM.collectInLoopReductions(); + buildVPlansWithVPRecipes(VF, VF); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0}}; + return {{VF, 0}}; } - unsigned MaxVF = MaybeMaxVF.getValue(); - assert(MaxVF != 0 && "MaxVF is zero."); + assert(!MaxVF.isScalable() && + "Scalable vectors not yet supported beyond this point"); - for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { + for (ElementCount VF = ElementCount::getFixed(1); + ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { // Collect Uniform and Scalar instructions after vectorization with VF. CM.collectUniformsAndScalars(VF); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - if (VF > 1) + if (VF.isVector()) CM.collectInstsToScalarize(VF); } - buildVPlansWithVPRecipes(1, MaxVF); + CM.collectInLoopReductions(); + + buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); LLVM_DEBUG(printPlans(dbgs())); - if (MaxVF == 1) + if (MaxVF.isScalar()) return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. return CM.selectVectorizationFactor(MaxVF); } -void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { +void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); BestVF = VF; @@ -6639,13 +7716,23 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
VPCallbackILV CallbackILV(ILV); - VPTransformState State{BestVF, BestUF, LI, - DT, ILV.Builder, ILV.VectorLoopValueMap, - &ILV, CallbackILV}; + assert(BestVF.hasValue() && "Vectorization Factor is missing"); + + VPTransformState State{*BestVF, + BestUF, + OrigLoop, + LI, + DT, + ILV.Builder, + ILV.VectorLoopValueMap, + &ILV, + CallbackILV}; State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; + ILV.printDebugTracesAtStart(); + //===------------------------------------------------===// // // Notice: any optimization or new instruction that go @@ -6661,25 +7748,48 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. ILV.fixVectorizedLoop(); + + ILV.printDebugTracesAtEnd(); } void LoopVectorizationPlanner::collectTriviallyDeadInstructions( SmallPtrSetImpl<Instruction *> &DeadInstructions) { - BasicBlock *Latch = OrigLoop->getLoopLatch(); - // We create new control-flow for the vectorized loop, so the original - // condition will be dead after vectorization if it's only used by the - // branch. - auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); - if (Cmp && Cmp->hasOneUse()) - DeadInstructions.insert(Cmp); + // We create new control-flow for the vectorized loop, so the original exit + // conditions will be dead after vectorization if it's only used by the + // terminator + SmallVector<BasicBlock*> ExitingBlocks; + OrigLoop->getExitingBlocks(ExitingBlocks); + for (auto *BB : ExitingBlocks) { + auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); + if (!Cmp || !Cmp->hasOneUse()) + continue; + + // TODO: we should introduce a getUniqueExitingBlocks on Loop + if (!DeadInstructions.insert(Cmp).second) + continue; + + // The operands of the icmp is often a dead trunc, used by IndUpdate. 
+ // TODO: can recurse through operands in general + for (Value *Op : Cmp->operands()) { + if (isa<TruncInst>(Op) && Op->hasOneUse()) + DeadInstructions.insert(cast<Instruction>(Op)); + } + } // We create new "steps" for induction variable updates to which the original // induction variables map. An original update instruction will be dead if // all its users except the induction variable are dead. + auto *Latch = OrigLoop->getLoopLatch(); for (auto &Induction : Legal->getInductionVars()) { PHINode *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); + + // If the tail is to be folded by masking, the primary induction variable, + // if exists, isn't dead: it will be used for masking. Don't kill it. + if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) + continue; + if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { return U == Ind || DeadInstructions.count(cast<Instruction>(U)); })) @@ -6754,12 +7864,284 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } } +//===--------------------------------------------------------------------===// +// EpilogueVectorizerMainLoop +//===--------------------------------------------------------------------===// + +/// This function is partially responsible for generating the control flow +/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. +BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { + MDNode *OrigLoopID = OrigLoop->getLoopID(); + Loop *Lp = createVectorLoopSkeleton(""); + + // Generate the code to check the minimum iteration count of the vector + // epilogue (see below). + EPI.EpilogueIterationCountCheck = + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); + EPI.EpilogueIterationCountCheck->setName("iter.check"); + + // Generate the code to check any assumptions that we've made for SCEV + // expressions. 
+ BasicBlock *SavedPreHeader = LoopVectorPreHeader; + emitSCEVChecks(Lp, LoopScalarPreHeader); + + // If a safety check was generated save it. + if (SavedPreHeader != LoopVectorPreHeader) + EPI.SCEVSafetyCheck = SavedPreHeader; + + // Generate the code that checks at runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + SavedPreHeader = LoopVectorPreHeader; + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); + + // If a safety check was generated save/overwite it. + if (SavedPreHeader != LoopVectorPreHeader) + EPI.MemSafetyCheck = SavedPreHeader; + + // Generate the iteration count check for the main loop, *after* the check + // for the epilogue loop, so that the path-length is shorter for the case + // that goes directly through the vector epilogue. The longer-path length for + // the main loop is compensated for, by the gain from vectorizing the larger + // trip count. Note: the branch will get updated later on when we vectorize + // the epilogue. + EPI.MainLoopIterationCountCheck = + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); + + // Generate the induction variable. + OldInduction = Legal->getPrimaryInduction(); + Type *IdxTy = Legal->getWidestInductionType(); + Value *StartIdx = ConstantInt::get(IdxTy, 0); + Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + EPI.VectorTripCount = CountRoundDown; + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + + // Skip induction resume value creation here because they will be created in + // the second pass. If we created them here, they wouldn't be used anyway, + // because the vplan in the second pass still contains the inductions from the + // original loop. 
+ + return completeLoopSkeleton(Lp, OrigLoopID); +} + +void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { + LLVM_DEBUG({ + dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" + << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() + << ", Main Loop UF:" << EPI.MainLoopUF + << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() + << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + }); +} + +void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { + DEBUG_WITH_TYPE(VerboseDebug, { + dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; + }); +} + +BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( + Loop *L, BasicBlock *Bypass, bool ForEpilogue) { + assert(L && "Expected valid Loop."); + assert(Bypass && "Expected valid bypass basic block."); + unsigned VFactor = + ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); + unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; + Value *Count = getOrCreateTripCount(L); + // Reuse existing vector loop preheader for TC checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + IRBuilder<> Builder(TCCheckBlock->getTerminator()); + + // Generate code to check if the loop's trip count is less than VF * UF of the + // main vector loop. + auto P = + Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + + Value *CheckMinIters = Builder.CreateICmp( + P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), + "min.iters.check"); + + if (!ForEpilogue) + TCCheckBlock->setName("vector.main.loop.iter.check"); + + // Create new preheader for vector loop. 
+ LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), + DT, LI, nullptr, "vector.ph"); + + if (ForEpilogue) { + assert(DT->properlyDominates(DT->getNode(TCCheckBlock), + DT->getNode(Bypass)->getIDom()) && + "TC check is expected to dominate Bypass"); + + // Update dominator for Bypass & LoopExit. + DT->changeImmediateDominator(Bypass, TCCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); + + LoopBypassBlocks.push_back(TCCheckBlock); + + // Save the trip count so we don't have to regenerate it in the + // vec.epilog.iter.check. This is safe to do because the trip count + // generated here dominates the vector epilog iter check. + EPI.TripCount = Count; + } + + ReplaceInstWithInst( + TCCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + + return TCCheckBlock; +} + +//===--------------------------------------------------------------------===// +// EpilogueVectorizerEpilogueLoop +//===--------------------------------------------------------------------===// + +/// This function is partially responsible for generating the control flow +/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. +BasicBlock * +EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { + MDNode *OrigLoopID = OrigLoop->getLoopID(); + Loop *Lp = createVectorLoopSkeleton("vec.epilog."); + + // Now, compare the remaining count and if there aren't enough iterations to + // execute the vectorized epilogue skip to the scalar part. 
+ BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; + VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); + LoopVectorPreHeader = + SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, + LI, nullptr, "vec.epilog.ph"); + emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, + VecEpilogueIterationCountCheck); + + // Adjust the control flow taking the state info from the main loop + // vectorization into account. + assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && + "expected this to be saved from the previous pass."); + EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopVectorPreHeader); + + DT->changeImmediateDominator(LoopVectorPreHeader, + EPI.MainLoopIterationCountCheck); + + EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + + if (EPI.SCEVSafetyCheck) + EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + if (EPI.MemSafetyCheck) + EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + + DT->changeImmediateDominator( + VecEpilogueIterationCountCheck, + VecEpilogueIterationCountCheck->getSinglePredecessor()); + + DT->changeImmediateDominator(LoopScalarPreHeader, + EPI.EpilogueIterationCountCheck); + DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); + + // Keep track of bypass blocks, as they feed start values to the induction + // phis in the scalar loop preheader. 
+ if (EPI.SCEVSafetyCheck) + LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); + if (EPI.MemSafetyCheck) + LoopBypassBlocks.push_back(EPI.MemSafetyCheck); + LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); + + // Generate a resume induction for the vector epilogue and put it in the + // vector epilogue preheader + Type *IdxTy = Legal->getWidestInductionType(); + PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", + LoopVectorPreHeader->getFirstNonPHI()); + EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); + EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), + EPI.MainLoopIterationCountCheck); + + // Generate the induction variable. + OldInduction = Legal->getPrimaryInduction(); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); + Value *StartIdx = EPResumeVal; + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + + // Generate induction resume values. These variables save the new starting + // indexes for the scalar loop. They are used to test if there are any tail + // iterations left once the vector loop has completed. + // Note that when the vectorized epilogue is skipped due to iteration count + // check, then the resume value for the induction variable comes from + // the trip count of the main vector loop, hence passing the AdditionalBypass + // argument. 
+ createInductionResumeValues(Lp, CountRoundDown, + {VecEpilogueIterationCountCheck, + EPI.VectorTripCount} /* AdditionalBypass */); + + AddRuntimeUnrollDisableMetaData(Lp); + return completeLoopSkeleton(Lp, OrigLoopID); +} + +BasicBlock * +EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( + Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { + + assert(EPI.TripCount && + "Expected trip count to have been safed in the first pass."); + assert( + (!isa<Instruction>(EPI.TripCount) || + DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && + "saved trip count does not dominate insertion point."); + Value *TC = EPI.TripCount; + IRBuilder<> Builder(Insert->getTerminator()); + Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); + + // Generate code to check if the loop's trip count is less than VF * UF of the + // vector epilogue loop. + auto P = + Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + + Value *CheckMinIters = Builder.CreateICmp( + P, Count, + ConstantInt::get(Count->getType(), + EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), + "min.epilog.iters.check"); + + ReplaceInstWithInst( + Insert->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + + LoopBypassBlocks.push_back(Insert); + return Insert; +} + +void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { + LLVM_DEBUG({ + dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" + << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() + << ", Main Loop UF:" << EPI.MainLoopUF + << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() + << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + }); +} + +void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { + DEBUG_WITH_TYPE(VerboseDebug, { + dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; + }); +} + bool LoopVectorizationPlanner::getDecisionAndClampRange( - const 
std::function<bool(unsigned)> &Predicate, VFRange &Range) { - assert(Range.End > Range.Start && "Trying to test an empty VF range."); + const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { + assert(!Range.isEmpty() && "Trying to test an empty VF range."); bool PredicateAtRangeStart = Predicate(Range.Start); - for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) + for (ElementCount TmpVF = Range.Start * 2; + ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) if (Predicate(TmpVF) != PredicateAtRangeStart) { Range.End = TmpVF; break; @@ -6773,9 +8155,11 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange( /// of VF's starting at a given VF and extending it as much as possible. Each /// vectorization decision can potentially shorten this sub-range during /// buildVPlan(). -void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { - for (unsigned VF = MinVF; VF < MaxVF + 1;) { - VFRange SubRange = {VF, MaxVF + 1}; +void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, + ElementCount MaxVF) { + auto MaxVFPlusOne = MaxVF.getWithIncrement(1); + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { + VFRange SubRange = {VF, MaxVFPlusOne}; VPlans.push_back(buildVPlan(SubRange)); VF = SubRange.End; } @@ -6800,7 +8184,13 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) return EdgeMaskCache[Edge] = SrcMask; - VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); + // If source is an exiting block, we know the exit edge is dynamically dead + // in the vector loop, and thus we don't need to restrict the mask. Avoid + // adding uses of an otherwise potentially dead instruction. 
+ if (OrigLoop->isLoopExiting(Src)) + return EdgeMaskCache[Edge] = SrcMask; + + VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); assert(EdgeMask && "No Edge Mask found for condition"); if (BI->getSuccessor(0) != Dst) @@ -6828,23 +8218,34 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { if (!CM.blockNeedsPredication(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. + // Create the block in mask as the first non-phi instruction in the block. + VPBuilder::InsertPointGuard Guard(Builder); + auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); + Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); + // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. // Start by constructing the desired canonical IV. VPValue *IV = nullptr; if (Legal->getPrimaryInduction()) - IV = Plan->getVPValue(Legal->getPrimaryInduction()); + IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); else { auto IVRecipe = new VPWidenCanonicalIVRecipe(); - Builder.getInsertBlock()->appendRecipe(IVRecipe); + Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); IV = IVRecipe->getVPValue(); } VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); bool TailFolded = !CM.isScalarEpilogueAllowed(); - if (TailFolded && CM.TTI.emitGetActiveLaneMask()) - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC}); - else + + if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { + // While ActiveLaneMask is a binary op that consumes the loop tripcount + // as a second argument, we only pass the IV here and extract the + // tripcount from the transform state where codegen of the VP instructions + // happen. 
+ BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); + } else { BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); + } return BlockMaskCache[BB] = BlockMask; } @@ -6865,14 +8266,13 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { return BlockMaskCache[BB] = BlockMask; } -VPWidenMemoryInstructionRecipe * -VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan) { +VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan) { assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Must be called with either a load or store"); - auto willWiden = [&](unsigned VF) -> bool { - if (VF == 1) + auto willWiden = [&](ElementCount VF) -> bool { + if (VF.isScalar()) return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); @@ -6903,20 +8303,22 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, } VPWidenIntOrFpInductionRecipe * -VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { +VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. 
InductionDescriptor II = Legal->getInductionVars().lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) - return new VPWidenIntOrFpInductionRecipe(Phi); + II.getKind() == InductionDescriptor::IK_FpInduction) { + VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); + return new VPWidenIntOrFpInductionRecipe(Phi, Start); + } return nullptr; } VPWidenIntOrFpInductionRecipe * -VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, - VFRange &Range) const { +VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, + VPlan &Plan) const { // Optimize the special case where the source is a constant integer // induction variable. Notice that we can only optimize the 'trunc' case // because (a) FP conversions lose precision, (b) sext/zext may wrap, and @@ -6925,15 +8327,21 @@ VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, // Determine whether \p K is a truncation based on an induction variable that // can be optimized. 
auto isOptimizableIVTruncate = - [&](Instruction *K) -> std::function<bool(unsigned)> { - return - [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; + [&](Instruction *K) -> std::function<bool(ElementCount)> { + return [=](ElementCount VF) -> bool { + return CM.isOptimizableIVTruncate(K, VF); + }; }; if (LoopVectorizationPlanner::getDecisionAndClampRange( - isOptimizableIVTruncate(I), Range)) + isOptimizableIVTruncate(I), Range)) { + + InductionDescriptor II = + Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); + VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), - I); + Start, I); + } return nullptr; } @@ -6962,7 +8370,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, VPlan &Plan) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, + [this, CI](ElementCount VF) { + return CM.isScalarWithPredication(CI, VF); + }, Range); if (IsPredicated) @@ -6970,19 +8380,23 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || - ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) + ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || + ID == Intrinsic::pseudoprobe || + ID == Intrinsic::experimental_noalias_scope_decl)) return nullptr; - auto willWiden = [&](unsigned VF) -> bool { + auto willWiden = [&](ElementCount VF) -> bool { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // The following case may be scalarized depending on the VF. // The flag shows whether we use Intrinsic or a usual Call for vectorized // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? 
bool NeedToScalarize = false; - unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); - bool UseVectorIntrinsic = - ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; + InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; + bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; + assert(IntrinsicCost.isValid() && CallCost.isValid() && + "Cannot have invalid costs while widening"); return UseVectorIntrinsic || !NeedToScalarize; }; @@ -6997,7 +8411,7 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { !isa<StoreInst>(I) && "Instruction should have been handled earlier"); // Instruction should be widened, unless it is scalar after vectorization, // scalarization is profitable or it is predicated. - auto WillScalarize = [this, I](unsigned VF) -> bool { + auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I, VF); @@ -7060,15 +8474,17 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, VPlanPtr &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, + [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); + [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, + Range); auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), IsUniform, IsPredicated); setRecipe(I, Recipe); + Plan->addVPValue(I, Recipe); // Find if I uses a predicated instruction. If so, it will use its scalar // value. 
Avoid hoisting the insert-element which packs the scalar value into @@ -7110,8 +8526,9 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, assert(Instr->getParent() && "Predicated instruction not in any basic block"); auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); - auto *PHIRecipe = - Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); + auto *PHIRecipe = Instr->getType()->isVoidTy() + ? nullptr + : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); @@ -7139,13 +8556,21 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (auto Phi = dyn_cast<PHINode>(Instr)) { if (Phi->getParent() != OrigLoop->getHeader()) return tryToBlend(Phi, Plan); - if ((Recipe = tryToOptimizeInductionPHI(Phi))) + if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan))) return Recipe; + + if (Legal->isReductionVariable(Phi)) { + RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + VPValue *StartV = + Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue()); + return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV); + } + return new VPWidenPHIRecipe(Phi); } - if (isa<TruncInst>(Instr) && - (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) + if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( + cast<TruncInst>(Instr), Range, *Plan))) return Recipe; if (!shouldWiden(Instr, Range)) @@ -7165,35 +8590,9 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, return tryToWiden(Instr, *Plan); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, - unsigned MaxVF) { - assert(OrigLoop->empty() && "Inner loop expected."); - - // Collect 
conditions feeding internal conditional branches; they need to be - // represented in VPlan for it to model masking. - SmallPtrSet<Value *, 1> NeedDef; - - auto *Latch = OrigLoop->getLoopLatch(); - for (BasicBlock *BB : OrigLoop->blocks()) { - if (BB == Latch) - continue; - BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); - if (Branch && Branch->isConditional()) - NeedDef.insert(Branch->getCondition()); - } - - // If the tail is to be folded by masking, the primary induction variable, if - // exists needs to be represented in VPlan for it to model early-exit masking. - // Also, both the Phi and the live-out instruction of each reduction are - // required in order to introduce a select between them in VPlan. - if (CM.foldTailByMasking()) { - if (Legal->getPrimaryInduction()) - NeedDef.insert(Legal->getPrimaryInduction()); - for (auto &Reduction : Legal->getReductionVars()) { - NeedDef.insert(Reduction.first); - NeedDef.insert(Reduction.second.getLoopExitInstr()); - } - } +void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, + ElementCount MaxVF) { + assert(OrigLoop->isInnermost() && "Inner loop expected."); // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. 
For @@ -7216,17 +8615,17 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, for (Instruction *I : DeadInstructions) SinkAfter.erase(I); - for (unsigned VF = MinVF; VF < MaxVF + 1;) { - VFRange SubRange = {VF, MaxVF + 1}; - VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, - DeadInstructions, SinkAfter)); + auto MaxVFPlusOne = MaxVF.getWithIncrement(1); + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { + VFRange SubRange = {VF, MaxVFPlusOne}; + VPlans.push_back( + buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); VF = SubRange.End; } } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, - SmallPtrSetImpl<Instruction *> &DeadInstructions, + VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, const DenseMap<Instruction *, Instruction *> &SinkAfter) { // Hold a mapping from predicated instructions to their recipes, in order to @@ -7249,14 +8648,28 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RecipeBuilder.recordRecipeOf(Entry.first); RecipeBuilder.recordRecipeOf(Entry.second); } + for (auto &Reduction : CM.getInLoopReductionChains()) { + PHINode *Phi = Reduction.first; + RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); + const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; + + RecipeBuilder.recordRecipeOf(Phi); + for (auto &R : ReductionOperations) { + RecipeBuilder.recordRecipeOf(R); + // For min/max reducitons, where we have a pair of icmp/select, we also + // need to record the ICmp recipe, so it can be removed later. 
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) + RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); + } + } // For each interleave group which is relevant for this (possibly trimmed) // Range, add it to the set of groups to be later applied to the VPlan and add // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](unsigned VF) -> bool { - return (VF >= 2 && // Query is illegal for VF == 1 + auto applyIG = [IG, this](ElementCount VF) -> bool { + return (VF.isVector() && // Query is illegal for VF == 1 CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); }; @@ -7278,10 +8691,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); Plan->setEntry(VPBB); - // Represent values that will have defs inside VPlan. - for (Value *V : NeedDef) - Plan->addVPValue(V); - // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); @@ -7308,6 +8717,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( if (auto Recipe = RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { + for (auto *Def : Recipe->definedValues()) { + auto *UV = Def->getUnderlyingValue(); + Plan->addVPValue(UV, Def); + } + RecipeBuilder.setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); continue; @@ -7343,6 +8757,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( for (auto &Entry : SinkAfter) { VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); + // If the target is in a replication region, make sure to move Sink to the + // block after it, not into the replication region itself. 
+ if (auto *Region = + dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { + if (Region->isReplicator()) { + assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); + VPBasicBlock *NextBlock = + cast<VPBasicBlock>(Region->getSuccessors().front()); + Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); + continue; + } + } Sink->moveAfter(Target); } @@ -7352,33 +8778,52 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( for (auto IG : InterleaveGroups) { auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( RecipeBuilder.getRecipe(IG->getInsertPos())); - (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) - ->insertBefore(Recipe); + SmallVector<VPValue *, 4> StoredValues; + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) + StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); + auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, + Recipe->getMask()); + VPIG->insertBefore(Recipe); + unsigned J = 0; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *Member = IG->getMember(i)) { + if (!Member->getType()->isVoidTy()) { + VPValue *OriginalV = Plan->getVPValue(Member); + Plan->removeVPValueFor(Member); + Plan->addVPValue(Member, VPIG->getVPValue(J)); + OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); + J++; + } RecipeBuilder.getRecipe(Member)->eraseFromParent(); } } + // Adjust the recipes for any inloop reductions. + if (Range.Start.isVector()) + adjustRecipesForInLoopReductions(Plan, RecipeBuilder); + // Finally, if tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the end of the latch. 
- if (CM.foldTailByMasking()) { + if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { Builder.setInsertPoint(VPBB); auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); for (auto &Reduction : Legal->getReductionVars()) { - VPValue *Phi = Plan->getVPValue(Reduction.first); - VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); + if (CM.isInLoopReduction(Reduction.first)) + continue; + VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); + VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); } } std::string PlanName; raw_string_ostream RSO(PlanName); - unsigned VF = Range.Start; + ElementCount VF = Range.Start; Plan->addVF(VF); RSO << "Initial VPlan for VF={" << VF; - for (VF *= 2; VF < Range.End; VF *= 2) { + for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { Plan->addVF(VF); RSO << "," << VF; } @@ -7394,7 +8839,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in // the vectorization pipeline. - assert(!OrigLoop->empty()); + assert(!OrigLoop->isInnermost()); assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan @@ -7404,7 +8849,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); HCFGBuilder.buildHierarchicalCFG(); - for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) + for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); + VF *= 2) Plan->addVF(VF); if (EnableVPlanPredication) { @@ -7422,6 +8868,67 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { return Plan; } +// Adjust the recipes for any inloop reductions. 
The chain of instructions +// leading from the loop exit instr to the phi need to be converted to +// reductions, with one operand being vector and the other being the scalar +// reduction chain. +void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( + VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { + for (auto &Reduction : CM.getInLoopReductionChains()) { + PHINode *Phi = Reduction.first; + RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; + + // ReductionOperations are orders top-down from the phi's use to the + // LoopExitValue. We keep a track of the previous item (the Chain) to tell + // which of the two operands will remain scalar and which will be reduced. + // For minmax the chain will be the select instructions. + Instruction *Chain = Phi; + for (Instruction *R : ReductionOperations) { + VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); + RecurKind Kind = RdxDesc.getRecurrenceKind(); + + VPValue *ChainOp = Plan->getVPValue(Chain); + unsigned FirstOpId; + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { + assert(isa<VPWidenSelectRecipe>(WidenRecipe) && + "Expected to replace a VPWidenSelectSC"); + FirstOpId = 1; + } else { + assert(isa<VPWidenRecipe>(WidenRecipe) && + "Expected to replace a VPWidenSC"); + FirstOpId = 0; + } + unsigned VecOpId = + R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; + VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); + + auto *CondOp = CM.foldTailByMasking() + ? 
RecipeBuilder.createBlockInMask(R->getParent(), Plan) + : nullptr; + VPReductionRecipe *RedRecipe = new VPReductionRecipe( + &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); + WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); + Plan->removeVPValueFor(R); + Plan->addVPValue(R, RedRecipe); + WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); + WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); + WidenRecipe->eraseFromParent(); + + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { + VPRecipeBase *CompareRecipe = + RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); + assert(isa<VPWidenRecipe>(CompareRecipe) && + "Expected to replace a VPWidenSC"); + assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && + "Expected no remaining users"); + CompareRecipe->eraseFromParent(); + } + Chain = R; + } + } +} + Value* LoopVectorizationPlanner::VPCallbackILV:: getOrCreateVectorValues(Value *V, unsigned Part) { return ILV.getOrCreateVectorValue(V, Part); @@ -7449,29 +8956,35 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } void VPWidenCallRecipe::execute(VPTransformState &State) { - State.ILV->widenCallInstruction(Ingredient, User, State); + State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, + *this, State); } void VPWidenSelectRecipe::execute(VPTransformState &State) { - State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State); + State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), + this, *this, InvariantCond, State); } void VPWidenRecipe::execute(VPTransformState &State) { - State.ILV->widenInstruction(Ingredient, User, State); + State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); } void VPWidenGEPRecipe::execute(VPTransformState &State) { - State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant, + State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, + 
*this, State.UF, State.VF, IsPtrLoopInvariant, IsIndexLoopInvariant, State); } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction(IV, Trunc); + State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), + Trunc); } void VPWidenPHIRecipe::execute(VPTransformState &State) { - State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); + Value *StartV = + getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr; + State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF); } void VPBlendRecipe::execute(VPTransformState &State) { @@ -7515,22 +9028,59 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); + State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), + getStoredValues(), getMask()); +} + +void VPReductionRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "Reduction being replicated."); + for (unsigned Part = 0; Part < State.UF; ++Part) { + RecurKind Kind = RdxDesc->getRecurrenceKind(); + Value *NewVecOp = State.get(getVecOp(), Part); + if (VPValue *Cond = getCondOp()) { + Value *NewCond = State.get(Cond, Part); + VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); + Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( + Kind, VecTy->getElementType()); + Constant *IdenVec = + ConstantVector::getSplat(VecTy->getElementCount(), Iden); + Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); + NewVecOp = Select; + } + Value *NewRed = + createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); + Value *PrevInChain = State.get(getChainOp(), Part); + Value *NextInChain; + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { + 
NextInChain = + createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), + NewRed, PrevInChain); + } else { + NextInChain = State.Builder.CreateBinOp( + (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, + PrevInChain); + } + State.set(this, getUnderlyingInstr(), NextInChain, Part); + } } void VPReplicateRecipe::execute(VPTransformState &State) { if (State.Instance) { // Generate a single instance. - State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, - IsPredicated, State); + assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); + State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, + *State.Instance, IsPredicated, State); // Insert scalar instance packing it into a vector. - if (AlsoPack && State.VF > 1) { - // If we're constructing lane 0, initialize to start from undef. + if (AlsoPack && State.VF.isVector()) { + // If we're constructing lane 0, initialize to start from poison. if (State.Instance->Lane == 0) { - Value *Undef = UndefValue::get( - FixedVectorType::get(Ingredient->getType(), State.VF)); - State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); + assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); + Value *Poison = PoisonValue::get( + VectorType::get(getUnderlyingValue()->getType(), State.VF)); + State.ValueMap.setVectorValue(getUnderlyingInstr(), + State.Instance->Part, Poison); } - State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); + State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(), + *State.Instance); } return; } @@ -7538,10 +9088,12 @@ void VPReplicateRecipe::execute(VPTransformState &State) { // Generate scalar instances for all VF lanes of all UF parts, unless the // instruction is uniform inwhich case generate only the first lane for each // of the UF parts. - unsigned EndLane = IsUniform ? 1 : State.VF; + unsigned EndLane = IsUniform ? 
1 : State.VF.getKnownMinValue(); + assert((!State.VF.isScalable() || IsUniform) && + "Can't scalarize a scalable vector"); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane}, + State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane}, IsPredicated, State); } @@ -7573,8 +9125,8 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) { void VPPredInstPHIRecipe::execute(VPTransformState &State) { assert(State.Instance && "Predicated instruction PHI works per instance."); - Instruction *ScalarPredInst = cast<Instruction>( - State.ValueMap.getScalarValue(PredInst, *State.Instance)); + Instruction *ScalarPredInst = + cast<Instruction>(State.get(getOperand(0), *State.Instance)); BasicBlock *PredicatedBB = ScalarPredInst->getParent(); BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); assert(PredicatingBB && "Predicated block has no single predecessor."); @@ -7586,6 +9138,8 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { // also do that packing, thereby "hoisting" the insert-element sequence. // Otherwise, a phi node for the scalar value is needed. 
unsigned Part = State.Instance->Part; + Instruction *PredInst = + cast<Instruction>(getOperand(0)->getUnderlyingValue()); if (State.ValueMap.hasVectorValue(PredInst, Part)) { Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); @@ -7596,16 +9150,17 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { } else { Type *PredInstType = PredInst->getType(); PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); - Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); + Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); Phi->addIncoming(ScalarPredInst, PredicatedBB); State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); } } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; - State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, - getMask()); + VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; + State.ILV->vectorizeMemoryInstruction(&Ingredient, State, + StoredValue ? nullptr : getVPValue(), + getAddr(), StoredValue, getMask()); } // Determine how to lower the scalar epilogue, which depends on 1) optimising @@ -7617,35 +9172,53 @@ static ScalarEpilogueLowering getScalarEpilogueLowering( BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, LoopVectorizationLegality &LVL) { - bool OptSize = - F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, - PGSOQueryType::IRPass); // 1) OptSize takes precedence over all other options, i.e. if this is set, // don't look at hints or options, and don't request a scalar epilogue. 
- if (OptSize) + // (For PGSO, as shouldOptimizeForSize isn't currently accessible from + // LoopAccessInfo (due to code dependency and not being able to reliably get + // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection + // of strides in LoopAccessInfo::analyzeLoop() and vectorize without + // versioning when the vectorization is forced, unlike hasOptSize. So revert + // back to the old way and vectorize with versioning when forced. See D81345.) + if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass) && + Hints.getForce() != LoopVectorizeHints::FK_Enabled)) return CM_ScalarEpilogueNotAllowedOptSize; - bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && - !PreferPredicateOverEpilog; + // 2) If set, obey the directives + if (PreferPredicateOverEpilogue.getNumOccurrences()) { + switch (PreferPredicateOverEpilogue) { + case PreferPredicateTy::ScalarEpilogue: + return CM_ScalarEpilogueAllowed; + case PreferPredicateTy::PredicateElseScalarEpilogue: + return CM_ScalarEpilogueNotNeededUsePredicate; + case PreferPredicateTy::PredicateOrDontVectorize: + return CM_ScalarEpilogueNotAllowedUsePredicate; + }; + } - // 2) Next, if disabling predication is requested on the command line, honour - // this and request a scalar epilogue. - if (PredicateOptDisabled) + // 3) If set, obey the hints + switch (Hints.getPredicate()) { + case LoopVectorizeHints::FK_Enabled: + return CM_ScalarEpilogueNotNeededUsePredicate; + case LoopVectorizeHints::FK_Disabled: return CM_ScalarEpilogueAllowed; + }; - // 3) and 4) look if enabling predication is requested on the command line, - // with a loop hint, or if the TTI hook indicates this is profitable, request - // predication . 
- if (PreferPredicateOverEpilog || - Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || - (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, - LVL.getLAI()) && - Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) + // 4) if the TTI hook indicates this is profitable, request predication. + if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, + LVL.getLAI())) return CM_ScalarEpilogueNotNeededUsePredicate; return CM_ScalarEpilogueAllowed; } +void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, + unsigned Part) { + set(Def, V, Part); + ILV->setVectorValue(IRDef, Part, V); +} + // Process the loop in the VPlan-native vectorization path. This path builds // VPlan upfront in the vectorization pipeline, which allows to apply // VPlan-to-VPlan transformations from the very beginning without modifying the @@ -7657,7 +9230,7 @@ static bool processLoopInVPlanNativePath( OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { - if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) { + if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); return false; } @@ -7676,7 +9249,7 @@ static bool processLoopInVPlanNativePath( LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); // Get user vectorization factor. - const unsigned UserVF = Hints.getWidth(); + ElementCount UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. 
const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); @@ -7691,7 +9264,7 @@ static bool processLoopInVPlanNativePath( LVP.setBestPlan(VF.Width, 1); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM); + &CM, BFI, PSI); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(LB, DT); @@ -7710,7 +9283,7 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) !EnableLoopVectorization) {} bool LoopVectorizePass::processLoop(Loop *L) { - assert((EnableVPlanNativePath || L->empty()) && + assert((EnableVPlanNativePath || L->isInnermost()) && "VPlan-native path is not enabled. Only process inner loops."); #ifndef NDEBUG @@ -7755,7 +9328,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check if it is legal to vectorize the loop. LoopVectorizationRequirements Requirements(*ORE); LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, - &Requirements, &Hints, DB, AC); + &Requirements, &Hints, DB, AC, BFI, PSI); if (!LVL.canVectorize(EnableVPlanNativePath)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); Hints.emitRemarkWithHints(); @@ -7772,11 +9345,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { // even evaluating whether vectorization is profitable. Since we cannot modify // the incoming IR, we need to build VPlan upfront in the vectorization // pipeline. - if (!L->empty()) + if (!L->isInnermost()) return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, ORE, BFI, PSI, Hints); - assert(L->empty() && "Inner loop expected."); + assert(L->isInnermost() && "Inner loop expected."); // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. 
@@ -7841,7 +9414,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); // Get user vectorization factor and interleave count. - unsigned UserVF = Hints.getWidth(); + ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); // Plan how to best vectorize, return the best VF and its cost. @@ -7866,7 +9439,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - if (VF.Width == 1) { + if (VF.Width.isScalar()) { LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); VecDiagMsg = std::make_pair( "VectorizationNotBeneficial", @@ -7955,8 +9528,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. - InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, - &CM); + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, + BFI, PSI); LVP.executePlan(Unroller, DT); ORE->emit([&]() { @@ -7967,16 +9540,51 @@ bool LoopVectorizePass::processLoop(Loop *L) { }); } else { // If we decided that it is *legal* to vectorize the loop, then do it. - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM); - LVP.executePlan(LB, DT); - ++LoopsVectorized; - // Add metadata to disable runtime unrolling a scalar loop when there are - // no runtime checks about strides and memory. A scalar loop that is - // rarely used is not worth unrolling. - if (!LB.areSafetyChecksAdded()) - DisableRuntimeUnroll = true; + // Consider vectorizing the epilogue too if it's profitable. 
+ VectorizationFactor EpilogueVF = + CM.selectEpilogueVectorizationFactor(VF.Width, LVP); + if (EpilogueVF.Width.isVector()) { + + // The first pass vectorizes the main loop and creates a scalar epilogue + // to be vectorized by executing the plan (potentially with a different + // factor) again shortly afterwards. + EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, + EpilogueVF.Width.getKnownMinValue(), 1); + EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, + &LVL, &CM, BFI, PSI); + + LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); + LVP.executePlan(MainILV, DT); + ++LoopsVectorized; + + simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); + formLCSSARecursively(*L, *DT, LI, SE); + + // Second pass vectorizes the epilogue and adjusts the control flow + // edges from the first pass. + LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); + EPI.MainLoopVF = EPI.EpilogueVF; + EPI.MainLoopUF = EPI.EpilogueUF; + EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, + ORE, EPI, &LVL, &CM, BFI, PSI); + LVP.executePlan(EpilogILV, DT); + ++LoopsEpilogueVectorized; + + if (!MainILV.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } else { + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, + &LVL, &CM, BFI, PSI); + LVP.executePlan(LB, DT); + ++LoopsVectorized; + + // Add metadata to disable runtime unrolling a scalar loop when there are + // no runtime checks about strides and memory. A scalar loop that is + // rarely used is not worth unrolling. + if (!LB.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } // Report the vectorization decision. 
ORE->emit([&]() { @@ -8090,7 +9698,8 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, + TLI, TTI, nullptr, MSSA}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }; auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 5bc35aa4695f..0b630197911a 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -17,11 +17,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/SLPVectorizer.h" -#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/MapVector.h" -#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" @@ -29,14 +26,16 @@ #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryLocation.h" @@ -47,7 +46,6 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include 
"llvm/Analysis/VectorUtils.h" -#include "llvm/Analysis/AssumptionCache.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -66,7 +64,6 @@ #include "llvm/IR/Module.h" #include "llvm/IR/NoFolder.h" #include "llvm/IR/Operator.h" -#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" @@ -83,6 +80,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" +#include "llvm/Support/InstructionCost.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -130,6 +128,10 @@ static cl::opt<int> MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); +static cl::opt<unsigned> +MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, + cl::desc("Maximum SLP vectorization factor (0=unlimited)")); + static cl::opt<int> MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, cl::desc("Maximum depth of the lookup for consecutive stores.")); @@ -204,12 +206,12 @@ static bool allSameBlock(ArrayRef<Value *> VL) { if (!I0) return false; BasicBlock *BB = I0->getParent(); - for (int i = 1, e = VL.size(); i < e; i++) { - Instruction *I = dyn_cast<Instruction>(VL[i]); - if (!I) + for (int I = 1, E = VL.size(); I < E; I++) { + auto *II = dyn_cast<Instruction>(VL[I]); + if (!II) return false; - if (BB != I->getParent()) + if (BB != II->getParent()) return false; } return true; @@ -234,11 +236,16 @@ static bool isSplat(ArrayRef<Value *> VL) { return true; } -/// \returns True if \p I is commutative, handles CmpInst as well as Instruction. +/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator. 
static bool isCommutative(Instruction *I) { - if (auto *IC = dyn_cast<CmpInst>(I)) - return IC->isCommutative(); - return I->isCommutative(); + if (auto *Cmp = dyn_cast<CmpInst>(I)) + return Cmp->isCommutative(); + if (auto *BO = dyn_cast<BinaryOperator>(I)) + return BO->isCommutative(); + // TODO: This should check for generic Instruction::isCommutative(), but + // we need to confirm that the caller code correctly handles Intrinsics + // for example (does not have 2 operands). + return false; } /// Checks if the vector of instructions can be represented as a shuffle, like: @@ -250,7 +257,7 @@ static bool isCommutative(Instruction *I) { /// %x3x3 = mul i8 %x3, %x3 /// %y1y1 = mul i8 %y1, %y1 /// %y2y2 = mul i8 %y2, %y2 -/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0 +/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0 /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 @@ -265,13 +272,13 @@ static bool isCommutative(Instruction *I) { /// %x3 = extractelement <4 x i8> %x, i32 3 /// %y1 = extractelement <4 x i8> %y, i32 1 /// %y2 = extractelement <4 x i8> %y, i32 2 -/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0 +/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 /// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 /// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 /// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 /// %5 = mul <4 x i8> %4, %4 /// %6 = extractelement <4 x i8> %5, i32 0 -/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0 +/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 /// %7 = extractelement <4 x i8> %5, i32 1 /// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 /// %8 = extractelement <4 x i8> %5, i32 2 @@ -285,7 +292,8 @@ static bool isCommutative(Instruction *I) { static Optional<TargetTransformInfo::ShuffleKind> isShuffle(ArrayRef<Value *> VL) { auto *EI0 = cast<ExtractElementInst>(VL[0]); - 
unsigned Size = EI0->getVectorOperandType()->getNumElements(); + unsigned Size = + cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements(); Value *Vec1 = nullptr; Value *Vec2 = nullptr; enum ShuffleMode { Unknown, Select, Permute }; @@ -294,7 +302,7 @@ isShuffle(ArrayRef<Value *> VL) { auto *EI = cast<ExtractElementInst>(VL[I]); auto *Vec = EI->getVectorOperand(); // All vector operands must have the same number of vector elements. - if (cast<VectorType>(Vec->getType())->getNumElements() != Size) + if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size) return None; auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); if (!Idx) @@ -303,7 +311,7 @@ isShuffle(ArrayRef<Value *> VL) { if (Idx->getValue().uge(Size)) continue; unsigned IntIdx = Idx->getValue().getZExtValue(); - // We can extractelement from undef vector. + // We can extractelement from undef or poison vector. if (isa<UndefValue>(Vec)) continue; // For correct shuffling we have to have at most 2 different vector operands @@ -500,7 +508,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, } /// \returns the AA location that is being access by the instruction. -static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) { +static MemoryLocation getLocation(Instruction *I, AAResults *AA) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) return MemoryLocation::get(SI); if (LoadInst *LI = dyn_cast<LoadInst>(I)) @@ -521,6 +529,15 @@ static bool isSimple(Instruction *I) { namespace llvm { +static void inversePermutation(ArrayRef<unsigned> Indices, + SmallVectorImpl<int> &Mask) { + Mask.clear(); + const unsigned E = Indices.size(); + Mask.resize(E, E + 1); + for (unsigned I = 0; I < E; ++I) + Mask[Indices[I]] = I; +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. 
@@ -535,9 +552,10 @@ public: using StoreList = SmallVector<StoreInst *, 8>; using ExtraValueToDebugLocsMap = MapVector<Value *, SmallVector<Instruction *, 2>>; + using OrdersType = SmallVector<unsigned, 4>; BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, - TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, + TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE) : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), @@ -571,11 +589,11 @@ public: /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. - int getSpillCost() const; + InstructionCost getSpillCost() const; /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - int getTreeCost(); + InstructionCost getTreeCost(); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. @@ -612,6 +630,14 @@ public: /// \returns The best order of instructions for vectorization. Optional<ArrayRef<unsigned>> bestOrder() const { + assert(llvm::all_of( + NumOpsWantToKeepOrder, + [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) { + return D.getFirst().size() == + VectorizableTree[0]->Scalars.size(); + }) && + "All orders must have the same size as number of instructions in " + "tree node."); auto I = std::max_element( NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), [](const decltype(NumOpsWantToKeepOrder)::value_type &D1, @@ -625,6 +651,81 @@ public: return makeArrayRef(I->getFirst()); } + /// Builds the correct order for root instructions. 
+ /// If some leaves have the same instructions to be vectorized, we may + /// incorrectly evaluate the best order for the root node (it is built for the + /// vector of instructions without repeated instructions and, thus, has less + /// elements than the root node). This function builds the correct order for + /// the root node. + /// For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves + /// are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first + /// leaf, it will be shrink to \<a, b\>. If instructions in this leaf should + /// be reordered, the best order will be \<1, 0\>. We need to extend this + /// order for the root node. For the root node this order should look like + /// \<3, 0, 1, 2\>. This function extends the order for the reused + /// instructions. + void findRootOrder(OrdersType &Order) { + // If the leaf has the same number of instructions to vectorize as the root + // - order must be set already. + unsigned RootSize = VectorizableTree[0]->Scalars.size(); + if (Order.size() == RootSize) + return; + SmallVector<unsigned, 4> RealOrder(Order.size()); + std::swap(Order, RealOrder); + SmallVector<int, 4> Mask; + inversePermutation(RealOrder, Mask); + Order.assign(Mask.begin(), Mask.end()); + // The leaf has less number of instructions - need to find the true order of + // the root. + // Scan the nodes starting from the leaf back to the root. + const TreeEntry *PNode = VectorizableTree.back().get(); + SmallVector<const TreeEntry *, 4> Nodes(1, PNode); + SmallPtrSet<const TreeEntry *, 4> Visited; + while (!Nodes.empty() && Order.size() != RootSize) { + const TreeEntry *PNode = Nodes.pop_back_val(); + if (!Visited.insert(PNode).second) + continue; + const TreeEntry &Node = *PNode; + for (const EdgeInfo &EI : Node.UserTreeIndices) + if (EI.UserTE) + Nodes.push_back(EI.UserTE); + if (Node.ReuseShuffleIndices.empty()) + continue; + // Build the order for the parent node. 
+ OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize); + SmallVector<unsigned, 4> OrderCounter(Order.size(), 0); + // The algorithm of the order extension is: + // 1. Calculate the number of the same instructions for the order. + // 2. Calculate the index of the new order: total number of instructions + // with order less than the order of the current instruction + reuse + // number of the current instruction. + // 3. The new order is just the index of the instruction in the original + // vector of the instructions. + for (unsigned I : Node.ReuseShuffleIndices) + ++OrderCounter[Order[I]]; + SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0); + for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) { + unsigned ReusedIdx = Node.ReuseShuffleIndices[I]; + unsigned OrderIdx = Order[ReusedIdx]; + unsigned NewIdx = 0; + for (unsigned J = 0; J < OrderIdx; ++J) + NewIdx += OrderCounter[J]; + NewIdx += CurrentCounter[OrderIdx]; + ++CurrentCounter[OrderIdx]; + assert(NewOrder[NewIdx] == RootSize && + "The order index should not be written already."); + NewOrder[NewIdx] = I; + } + std::swap(Order, NewOrder); + } + assert(Order.size() == RootSize && + "Root node is expected or the size of the order must be the same as " + "the number of elements in the root node."); + assert(llvm::all_of(Order, + [RootSize](unsigned Val) { return Val != RootSize; }) && + "All indices must be initialized"); + } + /// \return The vector element size in bits to use when vectorizing the /// expression tree ending at \p V. If V is a store, the size is the width of /// the stored value. Otherwise, the size is the width of the largest loaded @@ -646,6 +747,12 @@ public: return MinVecRegSize; } + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { + unsigned MaxVF = MaxVFOption.getNumOccurrences() ? + MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode); + return MaxVF ? 
MaxVF : UINT_MAX; + } + /// Check if homogeneous aggregate is isomorphic to some VectorType. /// Accepts homogeneous multidimensional aggregate of scalars/vectors like /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, @@ -665,7 +772,7 @@ public: /// effectively impossible for the backend to undo. /// TODO: If load combining is allowed in the IR optimizer, this analysis /// may not be necessary. - bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const; + bool isLoadCombineReductionCandidate(RecurKind RdxKind) const; /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values /// can be load combined in the backend. Load combining may not be allowed in @@ -880,6 +987,14 @@ public: std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}}; for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { Value *V = Values[Idx].first; + if (isa<Constant>(V)) { + // Since this is a function pass, it doesn't make semantic sense to + // walk the users of a subclass of Constant. The users could be in + // another function, or even another module that happens to be in + // the same LLVMContext. + continue; + } + // Calculate the absolute lane, using the minimum relative lane of LHS // and RHS as base and Idx as the offset. int Ln = std::min(LHS.second, RHS.second) + Idx; @@ -1388,7 +1503,7 @@ private: bool areAllUsersVectorized(Instruction *I) const; /// \returns the cost of the vectorizable entry. - int getEntryCost(TreeEntry *E); + InstructionCost getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, @@ -1410,20 +1525,21 @@ private: /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. 
- int getGatherCost(VectorType *Ty, - const DenseSet<unsigned> &ShuffledIndices) const; + InstructionCost + getGatherCost(FixedVectorType *Ty, + const DenseSet<unsigned> &ShuffledIndices) const; /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. - int getGatherCost(ArrayRef<Value *> VL) const; + InstructionCost getGatherCost(ArrayRef<Value *> VL) const; /// Set the Builder insert point to one after the last instruction in /// the bundle void setInsertPointAfterBundle(TreeEntry *E); /// \returns a vector from a collection of scalars in \p VL. - Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); + Value *gather(ArrayRef<Value *> VL); /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. @@ -1457,15 +1573,17 @@ private: /// The Scalars are vectorized into this value. It is initialized to Null. Value *VectorizedValue = nullptr; - /// Do we need to gather this sequence ? - enum EntryState { Vectorize, NeedToGather }; + /// Do we need to gather this sequence or vectorize it + /// (either with vector instruction or with scatter/gather + /// intrinsics for store/load)? + enum EntryState { Vectorize, ScatterVectorize, NeedToGather }; EntryState State; /// Does this sequence require some shuffling? SmallVector<int, 4> ReuseShuffleIndices; /// Does this entry require reordering? - ArrayRef<unsigned> ReorderIndices; + SmallVector<unsigned, 4> ReorderIndices; /// Points back to the VectorizableTree. 
/// @@ -1606,6 +1724,9 @@ private: case Vectorize: dbgs() << "Vectorize\n"; break; + case ScatterVectorize: + dbgs() << "ScatterVectorize\n"; + break; case NeedToGather: dbgs() << "NeedToGather\n"; break; @@ -1627,7 +1748,7 @@ private: dbgs() << "NULL\n"; dbgs() << "ReuseShuffleIndices: "; if (ReuseShuffleIndices.empty()) - dbgs() << "Emtpy"; + dbgs() << "Empty"; else for (unsigned ReuseIdx : ReuseShuffleIndices) dbgs() << ReuseIdx << ", "; @@ -1644,26 +1765,55 @@ private: #endif }; +#ifndef NDEBUG + void dumpTreeCosts(TreeEntry *E, InstructionCost ReuseShuffleCost, + InstructionCost VecCost, + InstructionCost ScalarCost) const { + dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump(); + dbgs() << "SLP: Costs:\n"; + dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n"; + dbgs() << "SLP: VectorCost = " << VecCost << "\n"; + dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n"; + dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " << + ReuseShuffleCost + VecCost - ScalarCost << "\n"; + } +#endif + /// Create a new VectorizableTree entry. TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef<unsigned> ReuseShuffleIndices = None, ArrayRef<unsigned> ReorderIndices = None) { - bool Vectorized = (bool)Bundle; + TreeEntry::EntryState EntryState = + Bundle ? 
TreeEntry::Vectorize : TreeEntry::NeedToGather; + return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, + ReuseShuffleIndices, ReorderIndices); + } + + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, + TreeEntry::EntryState EntryState, + Optional<ScheduleData *> Bundle, + const InstructionsState &S, + const EdgeInfo &UserTreeIdx, + ArrayRef<unsigned> ReuseShuffleIndices = None, + ArrayRef<unsigned> ReorderIndices = None) { + assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || + (Bundle && EntryState != TreeEntry::NeedToGather)) && + "Need to vectorize gather entry?"); VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); - Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather; + Last->State = EntryState; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); - Last->ReorderIndices = ReorderIndices; + Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); Last->setOperations(S); - if (Vectorized) { - for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = Last; + if (Last->State != TreeEntry::NeedToGather) { + for (Value *V : VL) { + assert(!getTreeEntry(V) && "Scalar already in tree!"); + ScalarToTreeEntry[V] = Last; } // Update the scheduler bundle to point to this TreeEntry. 
unsigned Lane = 0; @@ -1699,18 +1849,10 @@ private: } #endif - TreeEntry *getTreeEntry(Value *V) { - auto I = ScalarToTreeEntry.find(V); - if (I != ScalarToTreeEntry.end()) - return I->second; - return nullptr; - } + TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); } const TreeEntry *getTreeEntry(Value *V) const { - auto I = ScalarToTreeEntry.find(V); - if (I != ScalarToTreeEntry.end()) - return I->second; - return nullptr; + return ScalarToTreeEntry.lookup(V); } /// Maps a specific scalar to its tree entry. @@ -2195,7 +2337,6 @@ private: /// List of users to ignore during scheduling and that don't need extracting. ArrayRef<Value *> UserIgnoreList; - using OrdersType = SmallVector<unsigned, 4>; /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of /// sorted SmallVectors of unsigned. struct OrdersTypeDenseMapInfo { @@ -2233,7 +2374,7 @@ private: ScalarEvolution *SE; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; - AliasAnalysis *AA; + AAResults *AA; LoopInfo *LI; DominatorTree *DT; AssumptionCache *AC; @@ -2332,9 +2473,9 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { } for (auto V : Entry->Scalars) { OS << *V; - if (std::any_of( - R->ExternalUses.begin(), R->ExternalUses.end(), - [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; })) + if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) { + return EU.Scalar == V; + })) OS << " <extract>"; OS << "\n"; } @@ -2366,13 +2507,17 @@ BoUpSLP::~BoUpSLP() { "trying to erase instruction with users."); Pair.getFirst()->eraseFromParent(); } +#ifdef EXPENSIVE_CHECKS + // If we could guarantee that this call is not extremely slow, we could + // remove the ifdef limitation (see PR47712). 
assert(!verifyFunction(*F, &dbgs())); +#endif } void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { for (auto *V : AV) { if (auto *I = dyn_cast<Instruction>(V)) - eraseInstruction(I, /*ReplaceWithUndef=*/true); + eraseInstruction(I, /*ReplaceOpsWithUndef=*/true); }; } @@ -2597,11 +2742,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto *PH = cast<PHINode>(VL0); // Check for terminator values (e.g. invoke). - for (unsigned j = 0; j < VL.size(); ++j) - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + for (Value *V : VL) + for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { Instruction *Term = dyn_cast<Instruction>( - cast<PHINode>(VL[j])->getIncomingValueForBlock( - PH->getIncomingBlock(i))); + cast<PHINode>(V)->getIncomingValueForBlock( + PH->getIncomingBlock(I))); if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); @@ -2618,13 +2763,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Keeps the reordered operands to avoid code duplication. SmallVector<ValueList, 2> OperandsVec; - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock( - PH->getIncomingBlock(i))); - TE->setOperand(i, Operands); + for (Value *V : VL) + Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock( + PH->getIncomingBlock(I))); + TE->setOperand(I, Operands); OperandsVec.push_back(Operands); } for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) @@ -2657,12 +2802,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, }); // Insert new order with initial value 0, if it does not exist, // otherwise return the iterator to the existing one. 
- auto StoredCurrentOrderAndNum = - NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; - ++StoredCurrentOrderAndNum->getSecond(); newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, - StoredCurrentOrderAndNum->getFirst()); + ReuseShuffleIndicies, CurrentOrder); + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; @@ -2739,16 +2882,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. - auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; - ++I->getSecond(); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, I->getFirst()); + ReuseShuffleIndicies, CurrentOrder); TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; } return; } + // Vectorizing non-consecutive loads with `llvm.masked.gather`. + TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + buildTree_rec(PointerOps, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); + return; } LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); @@ -2883,8 +3033,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
- for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2952,6 +3102,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::Store: { // Check if the stores are consecutive or if we need to swizzle them. llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType(); + // Avoid types that are padded when being allocated as scalars, while + // being packed together in a vector (such as i1). + if (DL->getTypeSizeInBits(ScalarTy) != + DL->getTypeAllocSizeInBits(ScalarTy)) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n"); + return; + } // Make sure all stores in the bundle are simple - we can't vectorize // atomic or volatile stores. SmallVector<Value *, 4> PointerOps(VL.size()); @@ -3001,15 +3161,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); } else { - // Need to reorder. 
- auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; - ++(I->getSecond()); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, I->getFirst()); + ReuseShuffleIndicies, CurrentOrder); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; } return; } @@ -3028,7 +3187,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); VFShape Shape = VFShape::get( - *CI, {static_cast<unsigned int>(VL.size()), false /*Scalable*/}, + *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())), false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); @@ -3165,7 +3324,7 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { N *= AT->getNumElements(); EltTy = AT->getElementType(); } else { - auto *VT = cast<VectorType>(EltTy); + auto *VT = cast<FixedVectorType>(EltTy); N *= VT->getNumElements(); EltTy = VT->getElementType(); } @@ -3203,7 +3362,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) return false; } else { - NElts = cast<VectorType>(Vec->getType())->getNumElements(); + NElts = cast<FixedVectorType>(Vec->getType())->getNumElements(); } if (NElts != VL.size()) @@ -3247,27 +3406,26 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, } bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { - return I->hasOneUse() || - std::all_of(I->user_begin(), I->user_end(), [this](User *U) { + return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) { return ScalarToTreeEntry.count(U) > 0; }); } -static std::pair<unsigned, unsigned> -getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI, - TargetLibraryInfo *TLI) { +static 
std::pair<InstructionCost, InstructionCost> +getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, + TargetTransformInfo *TTI, TargetLibraryInfo *TLI) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. - IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getNumElements()); - int IntrinsicCost = + IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount()); + auto IntrinsicCost = TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput); - auto Shape = - VFShape::get(*CI, {static_cast<unsigned>(VecTy->getNumElements()), false}, - false /*HasGlobalPred*/); + auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( + VecTy->getNumElements())), + false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); - int LibCost = IntrinsicCost; + auto LibCost = IntrinsicCost; if (!CI->isNoBuiltin() && VecFunc) { // Calculate the cost of the vector library call. SmallVector<Type *, 4> VecTys; @@ -3282,7 +3440,7 @@ getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI, return {IntrinsicCost, LibCost}; } -int BoUpSLP::getEntryCost(TreeEntry *E) { +InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { ArrayRef<Value*> VL = E->Scalars; Type *ScalarTy = VL[0]->getType(); @@ -3301,7 +3459,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - int ReuseShuffleCost = 0; + InstructionCost ReuseShuffleCost = 0; if (NeedToShuffleReuses) { ReuseShuffleCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); @@ -3317,7 +3475,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { allSameType(VL) && allSameBlock(VL)) { Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { - int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); + InstructionCost Cost = + 
TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); for (auto *V : VL) { // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this @@ -3336,7 +3495,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return ReuseShuffleCost + getGatherCost(VL); } - assert(E->State == TreeEntry::Vectorize && "Unhandled state"); + assert((E->State == TreeEntry::Vectorize || + E->State == TreeEntry::ScatterVectorize) && + "Unhandled state"); assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = @@ -3375,37 +3536,37 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } - int DeadCost = ReuseShuffleCost; + InstructionCost DeadCost = ReuseShuffleCost; if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. DeadCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *E = cast<Instruction>(VL[i]); + for (unsigned I = 0, E = VL.size(); I < E; ++I) { + Instruction *EI = cast<Instruction>(VL[I]); // If all users are going to be vectorized, instruction can be // considered as dead. // The same, if have only one user, it will be vectorized for sure. - if (areAllUsersVectorized(E)) { + if (areAllUsersVectorized(EI)) { // Take credit for instruction that will become dead. - if (E->hasOneUse()) { - Instruction *Ext = E->user_back(); + if (EI->hasOneUse()) { + Instruction *Ext = EI->user_back(); if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && all_of(Ext->users(), [](User *U) { return isa<GetElementPtrInst>(U); })) { // Use getExtractWithExtendCost() to calculate the cost of // extractelement/ext pair. 
DeadCost -= TTI->getExtractWithExtendCost( - Ext->getOpcode(), Ext->getType(), VecTy, i); + Ext->getOpcode(), Ext->getType(), VecTy, I); // Add back the cost of s|zext which is subtracted separately. DeadCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), E->getType(), CostKind, - Ext); + Ext->getOpcode(), Ext->getType(), EI->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); continue; } } DeadCost -= - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); } } return DeadCost; @@ -3423,40 +3584,78 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - int ScalarEltCost = - TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, CostKind, - VL0); + InstructionCost ScalarEltCost = + TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, + TTI::getCastContextHint(VL0), CostKind, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } // Calculate the cost of this instruction. - int ScalarCost = VL.size() * ScalarEltCost; + InstructionCost ScalarCost = VL.size() * ScalarEltCost; auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); - int VecCost = 0; + InstructionCost VecCost = 0; // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { - VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, - CostKind, VL0); + VecCost = + ReuseShuffleCost + + TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, + TTI::getCastContextHint(VL0), CostKind, VL0); } + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return VecCost - ScalarCost; } case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. 
- int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, - Builder.getInt1Ty(), - CostKind, VL0); + InstructionCost ScalarEltCost = + TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), + CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); - int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, - CostKind, VL0); + InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; + + // Check if all entries in VL are either compares or selects with compares + // as condition that have the same predicates. + CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE; + bool First = true; + for (auto *V : VL) { + CmpInst::Predicate CurrentPred; + auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); + if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) && + !match(V, MatchCmp)) || + (!First && VecPred != CurrentPred)) { + VecPred = CmpInst::BAD_ICMP_PREDICATE; + break; + } + First = false; + VecPred = CurrentPred; + } + + InstructionCost VecCost = TTI->getCmpSelInstrCost( + E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); + // Check if it is possible and profitable to use min/max for selects in + // VL. + // + auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL); + if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) { + IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy, + {VecTy, VecTy}); + InstructionCost IntrinsicCost = + TTI->getIntrinsicInstrCost(CostAttrs, CostKind); + // If the selects are the only uses of the compares, they will be dead + // and we can adjust the cost by removing their cost. 
+ if (IntrinsicAndUse.second) + IntrinsicCost -= + TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); + VecCost = std::min(VecCost, IntrinsicCost); + } + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::FNeg: @@ -3516,16 +3715,17 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } SmallVector<const Value *, 4> Operands(VL0->operand_values()); - int ScalarEltCost = TTI->getArithmeticInstrCost( - E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, - Operands, VL0); + InstructionCost ScalarEltCost = + TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK, + Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } - int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost( - E->getOpcode(), VecTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, - Operands, VL0); + InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; + InstructionCost VecCost = + TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK, + Op2VK, Op1VP, Op2VP, Operands, VL0); + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -3534,36 +3734,42 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; - int ScalarEltCost = - TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind, - Op1VK, Op2VK); + InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( + Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } - int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = - 
TTI->getArithmeticInstrCost(Instruction::Add, VecTy, CostKind, - Op1VK, Op2VK); + InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; + InstructionCost VecCost = TTI->getArithmeticInstrCost( + Instruction::Add, VecTy, CostKind, Op1VK, Op2VK); + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Load: { // Cost of wide load - cost of scalar loads. Align alignment = cast<LoadInst>(VL0)->getAlign(); - int ScalarEltCost = - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, - CostKind, VL0); + InstructionCost ScalarEltCost = TTI->getMemoryOpCost( + Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } - int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; - int VecLdCost = - TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, - CostKind, VL0); + InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; + InstructionCost VecLdCost; + if (E->State == TreeEntry::Vectorize) { + VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, + CostKind, VL0); + } else { + assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); + VecLdCost = TTI->getGatherScatterOpCost( + Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(), + /*VariableMask=*/false, alignment, CostKind, VL0); + } if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecLdCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost)); return ReuseShuffleCost + VecLdCost - ScalarLdCost; } case Instruction::Store: { @@ -3572,19 +3778,19 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { auto *SI = cast<StoreInst>(IsReorder ? 
VL[E->ReorderIndices.front()] : VL0); Align Alignment = SI->getAlign(); - int ScalarEltCost = - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, - CostKind, VL0); + InstructionCost ScalarEltCost = TTI->getMemoryOpCost( + Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0); if (NeedToShuffleReuses) ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; - int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; - int VecStCost = TTI->getMemoryOpCost(Instruction::Store, - VecTy, Alignment, 0, CostKind, VL0); + InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost; + InstructionCost VecStCost = TTI->getMemoryOpCost( + Instruction::Store, VecTy, Alignment, 0, CostKind, VL0); if (IsReorder) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecStCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost)); return ReuseShuffleCost + VecStCost - ScalarStCost; } case Instruction::Call: { @@ -3592,15 +3798,17 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. 
- IntrinsicCostAttributes CostAttrs(ID, *CI, 1, 1); - int ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind); + IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1); + InstructionCost ScalarEltCost = + TTI->getIntrinsicInstrCost(CostAttrs, CostKind); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } - int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; + InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); - int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second); + InstructionCost VecCallCost = + std::min(VecCallCosts.first, VecCallCosts.second); LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost << " (" << VecCallCost << "-" << ScalarCallCost << ")" @@ -3615,7 +3823,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); - int ScalarCost = 0; + InstructionCost ScalarCost = 0; if (NeedToShuffleReuses) { for (unsigned Idx : E->ReuseShuffleIndices) { Instruction *I = cast<Instruction>(VL[Idx]); @@ -3633,7 +3841,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. 
- int VecCost = 0; + InstructionCost VecCost = 0; if (Instruction::isBinaryOp(E->getOpcode())) { VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, @@ -3644,11 +3852,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size()); auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size()); VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty, - CostKind); + TTI::CastContextHint::None, CostKind); VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, - CostKind); + TTI::CastContextHint::None, CostKind); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); + LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return ReuseShuffleCost + VecCost - ScalarCost; } default: @@ -3686,11 +3895,13 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI) { // Look past the root to find a source value. Arbitrarily follow the // path through operand 0 of any 'or'. Also, peek through optional - // shift-left-by-constant. + // shift-left-by-multiple-of-8-bits. Value *ZextLoad = Root; + const APInt *ShAmtC; while (!isa<ConstantExpr>(ZextLoad) && (match(ZextLoad, m_Or(m_Value(), m_Value())) || - match(ZextLoad, m_Shl(m_Value(), m_Constant())))) + (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) && + ShAmtC->urem(8) == 0))) ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0); // Check if the input is an extended load of the required or/shift expression. 
@@ -3714,8 +3925,8 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, return true; } -bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const { - if (RdxOpcode != Instruction::Or) +bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const { + if (RdxKind != RecurKind::Or) return false; unsigned NumElts = VectorizableTree[0]->Scalars.size(); @@ -3756,22 +3967,35 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { return true; } -int BoUpSLP::getSpillCost() const { +InstructionCost BoUpSLP::getSpillCost() const { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, // query TTI to see if there is a cost to keeping values live over it // (for example, if spills and fills are required). unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); - int Cost = 0; + InstructionCost Cost = 0; SmallPtrSet<Instruction*, 4> LiveValues; Instruction *PrevInst = nullptr; + // The entries in VectorizableTree are not necessarily ordered by their + // position in basic blocks. Collect them and order them by dominance so later + // instructions are guaranteed to be visited first. For instructions in + // different basic blocks, we only scan to the beginning of the block, so + // their order does not matter, as long as all instructions in a basic block + // are grouped together. Using dominance ensures a deterministic order. 
+ SmallVector<Instruction *, 16> OrderedScalars; for (const auto &TEPtr : VectorizableTree) { Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]); if (!Inst) continue; + OrderedScalars.push_back(Inst); + } + llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) { + return DT->dominates(B, A); + }); + for (Instruction *Inst : OrderedScalars) { if (!PrevInst) { PrevInst = Inst; continue; @@ -3825,8 +4049,8 @@ int BoUpSLP::getSpillCost() const { return Cost; } -int BoUpSLP::getTreeCost() { - int Cost = 0; +InstructionCost BoUpSLP::getTreeCost() { + InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); @@ -3856,15 +4080,16 @@ int BoUpSLP::getTreeCost() { })) continue; - int C = getEntryCost(&TE); + InstructionCost C = getEntryCost(&TE); + Cost += C; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " << *TE.Scalars[0] - << ".\n"); - Cost += C; + << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); } SmallPtrSet<Value *, 16> ExtractCostCalculated; - int ExtractCost = 0; + InstructionCost ExtractCost = 0; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. 
if (!ExtractCostCalculated.insert(EU.Scalar).second) @@ -3894,39 +4119,42 @@ int BoUpSLP::getTreeCost() { } } - int SpillCost = getSpillCost(); + InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; - std::string Str; +#ifndef NDEBUG + SmallString<256> Str; { - raw_string_ostream OS(Str); + raw_svector_ostream OS(Str); OS << "SLP: Spill Cost = " << SpillCost << ".\n" << "SLP: Extract Cost = " << ExtractCost << ".\n" << "SLP: Total Cost = " << Cost << ".\n"; } LLVM_DEBUG(dbgs() << Str); - if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); +#endif return Cost; } -int BoUpSLP::getGatherCost(VectorType *Ty, - const DenseSet<unsigned> &ShuffledIndices) const { +InstructionCost +BoUpSLP::getGatherCost(FixedVectorType *Ty, + const DenseSet<unsigned> &ShuffledIndices) const { unsigned NumElts = Ty->getNumElements(); APInt DemandedElts = APInt::getNullValue(NumElts); - for (unsigned i = 0; i < NumElts; ++i) - if (!ShuffledIndices.count(i)) - DemandedElts.setBit(i); - int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, - /*Extract*/ false); + for (unsigned I = 0; I < NumElts; ++I) + if (!ShuffledIndices.count(I)) + DemandedElts.setBit(I); + InstructionCost Cost = + TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, + /*Extract*/ false); if (!ShuffledIndices.empty()) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; } -int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { +InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Find the type of the operands in VL. Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) @@ -3968,11 +4196,10 @@ void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { // should be in this block. 
auto *Front = E->getMainOp(); auto *BB = Front->getParent(); - assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()), - [=](Value *V) -> bool { - auto *I = cast<Instruction>(V); - return !E->isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool { + auto *I = cast<Instruction>(V); + return !E->isOpcodeOrAlt(I) || I->getParent() == BB; + })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -4025,34 +4252,30 @@ void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } -Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { - Value *Vec = UndefValue::get(Ty); - // Generate the 'InsertElement' instruction. - for (unsigned i = 0; i < Ty->getNumElements(); ++i) { - Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); - if (auto *Insrt = dyn_cast<InsertElementInst>(Vec)) { - GatherSeq.insert(Insrt); - CSEBlocks.insert(Insrt->getParent()); - - // Add to our 'need-to-extract' list. - if (TreeEntry *E = getTreeEntry(VL[i])) { - // Find which lane we need to extract. - int FoundLane = -1; - for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) { - // Is this the lane of the scalar that we are looking for ? - if (E->Scalars[Lane] == VL[i]) { - FoundLane = Lane; - break; - } - } - assert(FoundLane >= 0 && "Could not find the correct lane"); - if (!E->ReuseShuffleIndices.empty()) { - FoundLane = - std::distance(E->ReuseShuffleIndices.begin(), - llvm::find(E->ReuseShuffleIndices, FoundLane)); - } - ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); +Value *BoUpSLP::gather(ArrayRef<Value *> VL) { + Value *Val0 = + isa<StoreInst>(VL[0]) ? 
cast<StoreInst>(VL[0])->getValueOperand() : VL[0]; + FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size()); + Value *Vec = PoisonValue::get(VecTy); + unsigned InsIndex = 0; + for (Value *Val : VL) { + Vec = Builder.CreateInsertElement(Vec, Val, Builder.getInt32(InsIndex++)); + auto *InsElt = dyn_cast<InsertElementInst>(Vec); + if (!InsElt) + continue; + GatherSeq.insert(InsElt); + CSEBlocks.insert(InsElt->getParent()); + // Add to our 'need-to-extract' list. + if (TreeEntry *Entry = getTreeEntry(Val)) { + // Find which lane we need to extract. + unsigned FoundLane = std::distance(Entry->Scalars.begin(), + find(Entry->Scalars, Val)); + assert(FoundLane < Entry->Scalars.size() && "Couldn't find extract lane"); + if (!Entry->ReuseShuffleIndices.empty()) { + FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), + find(Entry->ReuseShuffleIndices, FoundLane)); } + ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane)); } } @@ -4076,8 +4299,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { for (int Idx : E->ReuseShuffleIndices) if (UsedIdxs.insert(Idx).second) UniqueIdxs.emplace_back(Idx); - V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), - UniqueIdxs); + V = Builder.CreateShuffleVector(V, UniqueIdxs); } } return V; @@ -4085,10 +4307,6 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { } } - Type *ScalarTy = S.OpValue->getType(); - if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) - ScalarTy = SI->getValueOperand()->getType(); - // Check that every instruction appears once in this bundle. 
SmallVector<int, 4> ReuseShuffleIndicies; SmallVector<Value *, 4> UniqueValues; @@ -4108,27 +4326,16 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { else VL = UniqueValues; } - auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); - Value *V = Gather(VL, VecTy); + Value *Vec = gather(VL); if (!ReuseShuffleIndicies.empty()) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - ReuseShuffleIndicies, "shuffle"); - if (auto *I = dyn_cast<Instruction>(V)) { + Vec = Builder.CreateShuffleVector(Vec, ReuseShuffleIndicies, "shuffle"); + if (auto *I = dyn_cast<Instruction>(Vec)) { GatherSeq.insert(I); CSEBlocks.insert(I->getParent()); } } - return V; -} - -static void inversePermutation(ArrayRef<unsigned> Indices, - SmallVectorImpl<int> &Mask) { - Mask.clear(); - const unsigned E = Indices.size(); - Mask.resize(E); - for (unsigned I = 0; I < E; ++I) - Mask[Indices[I]] = I; + return Vec; } Value *BoUpSLP::vectorizeTree(TreeEntry *E) { @@ -4139,32 +4346,31 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } - Instruction *VL0 = E->getMainOp(); - Type *ScalarTy = VL0->getType(); - if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) - ScalarTy = SI->getValueOperand()->getType(); - auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); - bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - if (E->State == TreeEntry::NeedToGather) { setInsertPointAfterBundle(E); - auto *V = Gather(E->Scalars, VecTy); + Value *Vec = gather(E->Scalars); if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast<Instruction>(V)) { + Vec = Builder.CreateShuffleVector(Vec, E->ReuseShuffleIndices, "shuffle"); + if (auto *I = dyn_cast<Instruction>(Vec)) { GatherSeq.insert(I); CSEBlocks.insert(I->getParent()); } } - E->VectorizedValue = V; - return V; + E->VectorizedValue = Vec; + return Vec; } - assert(E->State == TreeEntry::Vectorize && "Unhandled 
state"); + assert((E->State == TreeEntry::Vectorize || + E->State == TreeEntry::ScatterVectorize) && + "Unhandled state"); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); + Instruction *VL0 = E->getMainOp(); + Type *ScalarTy = VL0->getType(); + if (auto *Store = dyn_cast<StoreInst>(VL0)) + ScalarTy = Store->getValueOperand()->getType(); + auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { auto *PH = cast<PHINode>(VL0); @@ -4172,10 +4378,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); Value *V = NewPhi; - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; // PHINodes may have multiple entries from the same block. We want to @@ -4208,37 +4413,33 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector<int, 4> Mask; inversePermutation(E->ReorderIndices, Mask); Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask, - "reorder_shuffle"); + V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle"); } if (NeedToShuffleReuses) { // TODO: Merge this shuffle with the ReorderShuffleMask. 
if (E->ReorderIndices.empty()) Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; return V; } case Instruction::ExtractValue: { - LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0)); + auto *LI = cast<LoadInst>(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); - PointerType *PtrTy = - PointerType::get(VecTy, LI->getPointerAddressSpace()); + auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); Value *NewV = propagateMetadata(V, E->Scalars); if (!E->ReorderIndices.empty()) { SmallVector<int, 4> Mask; inversePermutation(E->ReorderIndices, Mask); - NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask, - "reorder_shuffle"); + NewV = Builder.CreateShuffleVector(NewV, Mask, "reorder_shuffle"); } if (NeedToShuffleReuses) { // TODO: Merge this shuffle with the ReorderShuffleMask. 
- NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); + NewV = Builder.CreateShuffleVector(NewV, E->ReuseShuffleIndices, + "shuffle"); } E->VectorizedValue = NewV; return NewV; @@ -4266,10 +4467,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *CI = cast<CastInst>(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4289,10 +4489,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4310,10 +4509,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateSelect(Cond, True, False); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4334,10 +4532,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue 
= V; ++NumVectorInstructions; @@ -4378,10 +4575,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; @@ -4396,30 +4592,40 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { setInsertPointAfterBundle(E); LoadInst *LI = cast<LoadInst>(VL0); + Instruction *NewLI; unsigned AS = LI->getPointerAddressSpace(); + Value *PO = LI->getPointerOperand(); + if (E->State == TreeEntry::Vectorize) { - Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), - VecTy->getPointerTo(AS)); + Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS)); - // The pointer operand uses an in-tree scalar so we add the new BitCast to - // ExternalUses list to make sure that an extract will be generated in the - // future. - Value *PO = LI->getPointerOperand(); - if (getTreeEntry(PO)) - ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); + // The pointer operand uses an in-tree scalar so we add the new BitCast + // to ExternalUses list to make sure that an extract will be generated + // in the future. + if (getTreeEntry(PO)) + ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0); + + NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); + } else { + assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); + Value *VecPtr = vectorizeTree(E->getOperand(0)); + // Use the minimum alignment of the gathered loads. 
+ Align CommonAlignment = LI->getAlign(); + for (Value *V : E->Scalars) + CommonAlignment = + commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); + NewLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment); + } + Value *V = propagateMetadata(NewLI, E->Scalars); - LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); - Value *V = propagateMetadata(LI, E->Scalars); if (IsReorder) { SmallVector<int, 4> Mask; inversePermutation(E->ReorderIndices, Mask); - V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), - Mask, "reorder_shuffle"); + V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle"); } if (NeedToShuffleReuses) { // TODO: Merge this shuffle with the ReorderShuffleMask. - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; ++NumVectorInstructions; @@ -4437,9 +4643,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (IsReorder) { SmallVector<int, 4> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end()); - VecValue = Builder.CreateShuffleVector( - VecValue, UndefValue::get(VecValue->getType()), Mask, - "reorder_shuffle"); + VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf"); } Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast( @@ -4454,10 +4658,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0)); Value *V = propagateMetadata(ST, E->Scalars); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4494,10 +4697,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (Instruction *I = dyn_cast<Instruction>(V)) V = 
propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; @@ -4537,9 +4739,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Function *CF; if (!UseIntrinsic) { - VFShape Shape = VFShape::get( - *CI, {static_cast<unsigned>(VecTy->getNumElements()), false}, - false /*HasGlobalPred*/); + VFShape Shape = + VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( + VecTy->getNumElements())), + false /*HasGlobalPred*/); CF = VFDatabase(*CI).getVectorizedFunction(Shape); } else { Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())}; @@ -4557,10 +4760,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); propagateIRFlags(V, E->Scalars, VL0); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4625,10 +4827,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *V = Builder.CreateShuffleVector(V0, V1, Mask); if (Instruction *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + E->VectorizedValue = V; ++NumVectorInstructions; @@ -4693,7 +4894,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); - assert(E->State == TreeEntry::Vectorize && 
"Extracting from a gather list"); + assert(E->State != TreeEntry::NeedToGather && + "Extracting from a gather list"); Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -4851,7 +5053,8 @@ void BoUpSLP::optimizeGatherSequence() { // instructions into different buckets based on the insert lane. SmallVector<Instruction *, 16> Visited; for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { - assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && + assert(*I && + (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && "Worklist not sorted properly!"); BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: @@ -4961,8 +5164,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, // cancelScheduling). while (!Bundle->isReady() && !ReadyInsts.empty()) { - ScheduleData *pickedSD = ReadyInsts.back(); - ReadyInsts.pop_back(); + ScheduleData *pickedSD = ReadyInsts.pop_back_val(); if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) { schedule(pickedSD, ReadyInsts); @@ -5106,7 +5308,9 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, if (I->mayReadOrWriteMemory() && (!isa<IntrinsicInst>(I) || - cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) { + (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect && + cast<IntrinsicInst>(I)->getIntrinsicID() != + Intrinsic::pseudoprobe))) { // Update the linked list of memory accessing instructions. 
if (CurrentLoadStore) { CurrentLoadStore->NextLoadStore = SD; @@ -5133,8 +5337,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, WorkList.push_back(SD); while (!WorkList.empty()) { - ScheduleData *SD = WorkList.back(); - WorkList.pop_back(); + ScheduleData *SD = WorkList.pop_back_val(); ScheduleData *BundleMember = SD; while (BundleMember) { @@ -5331,10 +5534,15 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { } unsigned BoUpSLP::getVectorElementSize(Value *V) { - // If V is a store, just return the width of the stored value without - // traversing the expression tree. This is the common case. - if (auto *Store = dyn_cast<StoreInst>(V)) - return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); + // If V is a store, just return the width of the stored value (or value + // truncated just before storing) without traversing the expression tree. + // This is the common case. + if (auto *Store = dyn_cast<StoreInst>(V)) { + if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) + return DL->getTypeSizeInBits(Trunc->getSrcTy()); + else + return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); + } auto E = InstrElementSize.find(V); if (E != InstrElementSize.end()) @@ -5683,7 +5891,7 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, - TargetLibraryInfo *TLI_, AliasAnalysis *AA_, + TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_) { @@ -5783,11 +5991,11 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, R.computeMinimumValueSizes(); - int Cost = R.getTreeCost(); + InstructionCost Cost = R.getTreeCost(); - LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << 
"\n"); if (Cost < -SLPCostThreshold) { - LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n"); using namespace ore; @@ -5860,7 +6068,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, // If a vector register can't hold 1 element, we are done. unsigned MaxVecRegSize = R.getMaxVecRegSize(); - unsigned EltSize = R.getVectorElementSize(Stores[0]); + unsigned EltSize = R.getVectorElementSize(Operands[0]); if (MaxVecRegSize % EltSize != 0) continue; @@ -5911,7 +6119,7 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { continue; if (!isValidElementType(SI->getValueOperand()->getType())) continue; - Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI); + Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI); } // Ignore getelementptr instructions that have more than one index, a @@ -5975,6 +6183,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, unsigned Sz = R.getVectorElementSize(I0); unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); + MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); if (MaxVF < 2) { R.getORE()->emit([&]() { return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) @@ -5986,7 +6195,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool Changed = false; bool CandidateFound = false; - int MinCost = SLPCostThreshold; + InstructionCost MinCost = SLPCostThreshold.getValue(); bool CompensateUseCost = !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) { @@ -6042,7 +6251,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, continue; R.computeMinimumValueSizes(); - int Cost = R.getTreeCost(); + InstructionCost Cost = R.getTreeCost(); CandidateFound = true; if (CompensateUseCost) { // TODO: Use TTI's 
getScalarizationOverhead for sequence of inserts @@ -6052,7 +6261,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, // part should also switch to same interface. // For example, the following case is projected code after SLP: // %4 = extractelement <4 x i64> %3, i32 0 - // %v0 = insertelement <4 x i64> undef, i64 %4, i32 0 + // %v0 = insertelement <4 x i64> poison, i64 %4, i32 0 // %5 = extractelement <4 x i64> %3, i32 1 // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1 // %6 = extractelement <4 x i64> %3, i32 2 @@ -6072,7 +6281,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, // Switching to the TTI interface might help a bit. // Alternative solution could be pattern-match to detect a no-op or // shuffle. - unsigned UserCost = 0; + InstructionCost UserCost = 0; for (unsigned Lane = 0; Lane < OpsWidth; Lane++) { auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]); if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) @@ -6163,50 +6372,20 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { return false; } -/// Generate a shuffle mask to be used in a reduction tree. -/// -/// \param VecLen The length of the vector to be reduced. -/// \param NumEltsToRdx The number of elements that should be reduced in the -/// vector. -/// \param IsPairwise Whether the reduction is a pairwise or splitting -/// reduction. A pairwise reduction will generate a mask of -/// <0,2,...> or <1,3,..> while a splitting reduction will generate -/// <2,3, undef,undef> for a vector of 4 and NumElts = 2. -/// \param IsLeft True will generate a mask of even elements, odd otherwise. -static SmallVector<int, 32> createRdxShuffleMask(unsigned VecLen, - unsigned NumEltsToRdx, - bool IsPairwise, bool IsLeft) { - assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask"); - - SmallVector<int, 32> ShuffleMask(VecLen, -1); - - if (IsPairwise) - // Build a mask of 0, 2, ... (left) or 1, 3, ... 
(right). - for (unsigned i = 0; i != NumEltsToRdx; ++i) - ShuffleMask[i] = 2 * i + !IsLeft; - else - // Move the upper half of the vector to the lower half. - for (unsigned i = 0; i != NumEltsToRdx; ++i) - ShuffleMask[i] = NumEltsToRdx + i; - - return ShuffleMask; -} - namespace { /// Model horizontal reductions. /// -/// A horizontal reduction is a tree of reduction operations (currently add and -/// fadd) that has operations that can be put into a vector as its leaf. -/// For example, this tree: +/// A horizontal reduction is a tree of reduction instructions that has values +/// that can be put into a vector as its leaves. For example: /// /// mul mul mul mul /// \ / \ / /// + + /// \ / /// + -/// This tree has "mul" as its reduced values and "+" as its reduction -/// operations. A reduction might be feeding into a store or a binary operation +/// This tree has "mul" as its leaf values and "+" as its reduction +/// instructions. A reduction can feed into a store or a binary operation /// feeding a phi. /// ... /// \ / @@ -6224,458 +6403,284 @@ namespace { class HorizontalReduction { using ReductionOpsType = SmallVector<Value *, 16>; using ReductionOpsListType = SmallVector<ReductionOpsType, 2>; - ReductionOpsListType ReductionOps; + ReductionOpsListType ReductionOps; SmallVector<Value *, 32> ReducedVals; // Use map vector to make stable output. MapVector<Instruction *, Value *> ExtraArgs; + WeakTrackingVH ReductionRoot; + /// The type of reduction operation. + RecurKind RdxKind; - /// Kind of the reduction data. - enum ReductionKind { - RK_None, /// Not a reduction. - RK_Arithmetic, /// Binary reduction data. - RK_Min, /// Minimum reduction data. - RK_UMin, /// Unsigned minimum reduction data. - RK_Max, /// Maximum reduction data. - RK_UMax, /// Unsigned maximum reduction data. - }; - - /// Contains info about operation, like its opcode, left and right operands. - class OperationData { - /// Opcode of the instruction. 
- unsigned Opcode = 0; - - /// Left operand of the reduction operation. - Value *LHS = nullptr; - - /// Right operand of the reduction operation. - Value *RHS = nullptr; - - /// Kind of the reduction operation. - ReductionKind Kind = RK_None; - - /// True if float point min/max reduction has no NaNs. - bool NoNaN = false; - - /// Checks if the reduction operation can be vectorized. - bool isVectorizable() const { - return LHS && RHS && - // We currently only support add/mul/logical && min/max reductions. - ((Kind == RK_Arithmetic && - (Opcode == Instruction::Add || Opcode == Instruction::FAdd || - Opcode == Instruction::Mul || Opcode == Instruction::FMul || - Opcode == Instruction::And || Opcode == Instruction::Or || - Opcode == Instruction::Xor)) || - ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && - (Kind == RK_Min || Kind == RK_Max)) || - (Opcode == Instruction::ICmp && - (Kind == RK_UMin || Kind == RK_UMax))); - } + /// Checks if instruction is associative and can be vectorized. + static bool isVectorizable(RecurKind Kind, Instruction *I) { + if (Kind == RecurKind::None) + return false; + if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind)) + return true; - /// Creates reduction operation with the current opcode. - Value *createOp(IRBuilder<> &Builder, const Twine &Name) const { - assert(isVectorizable() && - "Expected add|fadd or min/max reduction operation."); - Value *Cmp = nullptr; - switch (Kind) { - case RK_Arithmetic: - return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS, - Name); - case RK_Min: - Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS) - : Builder.CreateFCmpOLT(LHS, RHS); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - case RK_Max: - Cmp = Opcode == Instruction::ICmp ? 
Builder.CreateICmpSGT(LHS, RHS) - : Builder.CreateFCmpOGT(LHS, RHS); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - case RK_UMin: - assert(Opcode == Instruction::ICmp && "Expected integer types."); - Cmp = Builder.CreateICmpULT(LHS, RHS); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - case RK_UMax: - assert(Opcode == Instruction::ICmp && "Expected integer types."); - Cmp = Builder.CreateICmpUGT(LHS, RHS); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - case RK_None: - break; - } - llvm_unreachable("Unknown reduction operation."); + if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) { + // FP min/max are associative except for NaN and -0.0. We do not + // have to rule out -0.0 here because the intrinsic semantics do not + // specify a fixed result for it. + return I->getFastMathFlags().noNaNs(); } - public: - explicit OperationData() = default; - - /// Construction for reduced values. They are identified by opcode only and - /// don't have associated LHS/RHS values. - explicit OperationData(Value *V) { - if (auto *I = dyn_cast<Instruction>(V)) - Opcode = I->getOpcode(); - } + return I->isAssociative(); + } - /// Constructor for reduction operations with opcode and its left and - /// right operands. - OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind, - bool NoNaN = false) - : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) { - assert(Kind != RK_None && "One of the reduction operations is expected."); + /// Checks if the ParentStackElem.first should be marked as a reduction + /// operation with an extra argument or as extra argument itself. + void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, + Value *ExtraArg) { + if (ExtraArgs.count(ParentStackElem.first)) { + ExtraArgs[ParentStackElem.first] = nullptr; + // We ran into something like: + // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. 
+ // The whole ParentStackElem.first should be considered as an extra value + // in this case. + // Do not perform analysis of remaining operands of ParentStackElem.first + // instruction, this whole instruction is an extra argument. + RecurKind ParentRdxKind = getRdxKind(ParentStackElem.first); + ParentStackElem.second = getNumberOfOperands(ParentRdxKind); + } else { + // We ran into something like: + // ParentStackElem.first += ... + ExtraArg + ... + ExtraArgs[ParentStackElem.first] = ExtraArg; } + } - explicit operator bool() const { return Opcode; } + /// Creates reduction operation with the current opcode. + static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, + Value *RHS, const Twine &Name) { + unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); + switch (Kind) { + case RecurKind::Add: + case RecurKind::Mul: + case RecurKind::Or: + case RecurKind::And: + case RecurKind::Xor: + case RecurKind::FAdd: + case RecurKind::FMul: + return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, + Name); + case RecurKind::FMax: + return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS); + case RecurKind::FMin: + return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS); - /// Return true if this operation is any kind of minimum or maximum. - bool isMinMax() const { - switch (Kind) { - case RK_Arithmetic: - return false; - case RK_Min: - case RK_Max: - case RK_UMin: - case RK_UMax: - return true; - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + case RecurKind::SMax: { + Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); } - - /// Get the index of the first operand. - unsigned getFirstOperandIndex() const { - assert(!!*this && "The opcode is not set."); - // We allow calling this before 'Kind' is set, so handle that specially. - if (Kind == RK_None) - return 0; - return isMinMax() ? 
1 : 0; + case RecurKind::SMin: { + Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); } - - /// Total number of operands in the reduction operation. - unsigned getNumberOfOperands() const { - assert(Kind != RK_None && !!*this && LHS && RHS && - "Expected reduction operation."); - return isMinMax() ? 3 : 2; + case RecurKind::UMax: { + Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); } - - /// Checks if the operation has the same parent as \p P. - bool hasSameParent(Instruction *I, Value *P, bool IsRedOp) const { - assert(Kind != RK_None && !!*this && LHS && RHS && - "Expected reduction operation."); - if (!IsRedOp) - return I->getParent() == P; - if (isMinMax()) { - // SelectInst must be used twice while the condition op must have single - // use only. - auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition()); - return I->getParent() == P && Cmp && Cmp->getParent() == P; - } - // Arithmetic reduction operation must be used once only. - return I->getParent() == P; + case RecurKind::UMin: { + Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); } - - /// Expected number of uses for reduction operations/reduced values. - bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const { - assert(Kind != RK_None && !!*this && LHS && RHS && - "Expected reduction operation."); - if (isMinMax()) - return I->hasNUses(2) && - (!IsReductionOp || - cast<SelectInst>(I)->getCondition()->hasOneUse()); - return I->hasOneUse(); + default: + llvm_unreachable("Unknown reduction operation."); } + } - /// Initializes the list of reduction operations. 
- void initReductionOps(ReductionOpsListType &ReductionOps) { - assert(Kind != RK_None && !!*this && LHS && RHS && - "Expected reduction operation."); - if (isMinMax()) - ReductionOps.assign(2, ReductionOpsType()); - else - ReductionOps.assign(1, ReductionOpsType()); + /// Creates reduction operation with the current opcode with the IR flags + /// from \p ReductionOps. + static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, + Value *RHS, const Twine &Name, + const ReductionOpsListType &ReductionOps) { + Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name); + if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { + if (auto *Sel = dyn_cast<SelectInst>(Op)) + propagateIRFlags(Sel->getCondition(), ReductionOps[0]); + propagateIRFlags(Op, ReductionOps[1]); + return Op; } - - /// Add all reduction operations for the reduction instruction \p I. - void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) { - assert(Kind != RK_None && !!*this && LHS && RHS && - "Expected reduction operation."); - if (isMinMax()) { - ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); - ReductionOps[1].emplace_back(I); - } else { - ReductionOps[0].emplace_back(I); + propagateIRFlags(Op, ReductionOps[0]); + return Op; + } + /// Creates reduction operation with the current opcode with the IR flags + /// from \p I. + static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, + Value *RHS, const Twine &Name, Instruction *I) { + Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name); + if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { + if (auto *Sel = dyn_cast<SelectInst>(Op)) { + propagateIRFlags(Sel->getCondition(), + cast<SelectInst>(I)->getCondition()); } } + propagateIRFlags(Op, I); + return Op; + } - /// Checks if instruction is associative and can be vectorized. 
- bool isAssociative(Instruction *I) const { - assert(Kind != RK_None && *this && LHS && RHS && - "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - return I->isAssociative(); - case RK_Min: - case RK_Max: - return Opcode == Instruction::ICmp || - cast<Instruction>(I->getOperand(0))->isFast(); - case RK_UMin: - case RK_UMax: - assert(Opcode == Instruction::ICmp && - "Only integer compare operation is expected."); - return true; - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); - } + static RecurKind getRdxKind(Instruction *I) { + assert(I && "Expected instruction for reduction matching"); + TargetTransformInfo::ReductionFlags RdxFlags; + if (match(I, m_Add(m_Value(), m_Value()))) + return RecurKind::Add; + if (match(I, m_Mul(m_Value(), m_Value()))) + return RecurKind::Mul; + if (match(I, m_And(m_Value(), m_Value()))) + return RecurKind::And; + if (match(I, m_Or(m_Value(), m_Value()))) + return RecurKind::Or; + if (match(I, m_Xor(m_Value(), m_Value()))) + return RecurKind::Xor; + if (match(I, m_FAdd(m_Value(), m_Value()))) + return RecurKind::FAdd; + if (match(I, m_FMul(m_Value(), m_Value()))) + return RecurKind::FMul; - /// Checks if the reduction operation can be vectorized. - bool isVectorizable(Instruction *I) const { - return isVectorizable() && isAssociative(I); - } + if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) + return RecurKind::FMax; + if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) + return RecurKind::FMin; - /// Checks if two operation data are both a reduction op or both a reduced - /// value. 
- bool operator==(const OperationData &OD) const { - assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) && - "One of the comparing operations is incorrect."); - return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode); - } - bool operator!=(const OperationData &OD) const { return !(*this == OD); } - void clear() { - Opcode = 0; - LHS = nullptr; - RHS = nullptr; - Kind = RK_None; - NoNaN = false; - } + if (match(I, m_SMax(m_Value(), m_Value()))) + return RecurKind::SMax; + if (match(I, m_SMin(m_Value(), m_Value()))) + return RecurKind::SMin; + if (match(I, m_UMax(m_Value(), m_Value()))) + return RecurKind::UMax; + if (match(I, m_UMin(m_Value(), m_Value()))) + return RecurKind::UMin; - /// Get the opcode of the reduction operation. - unsigned getOpcode() const { - assert(isVectorizable() && "Expected vectorizable operation."); - return Opcode; - } + if (auto *Select = dyn_cast<SelectInst>(I)) { + // Try harder: look for min/max pattern based on instructions producing + // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). + // During the intermediate stages of SLP, it's very common to have + // pattern like this (since optimizeGatherSequence is run only once + // at the end): + // %1 = extractelement <2 x i32> %a, i32 0 + // %2 = extractelement <2 x i32> %a, i32 1 + // %cond = icmp sgt i32 %1, %2 + // %3 = extractelement <2 x i32> %a, i32 0 + // %4 = extractelement <2 x i32> %a, i32 1 + // %select = select i1 %cond, i32 %3, i32 %4 + CmpInst::Predicate Pred; + Instruction *L1; + Instruction *L2; - /// Get kind of reduction data. - ReductionKind getKind() const { return Kind; } - Value *getLHS() const { return LHS; } - Value *getRHS() const { return RHS; } - Type *getConditionType() const { - return isMinMax() ? 
CmpInst::makeCmpResultType(LHS->getType()) : nullptr; - } + Value *LHS = Select->getTrueValue(); + Value *RHS = Select->getFalseValue(); + Value *Cond = Select->getCondition(); - /// Creates reduction operation with the current opcode with the IR flags - /// from \p ReductionOps. - Value *createOp(IRBuilder<> &Builder, const Twine &Name, - const ReductionOpsListType &ReductionOps) const { - assert(isVectorizable() && - "Expected add|fadd or min/max reduction operation."); - auto *Op = createOp(Builder, Name); - switch (Kind) { - case RK_Arithmetic: - propagateIRFlags(Op, ReductionOps[0]); - return Op; - case RK_Min: - case RK_Max: - case RK_UMin: - case RK_UMax: - if (auto *SI = dyn_cast<SelectInst>(Op)) - propagateIRFlags(SI->getCondition(), ReductionOps[0]); - propagateIRFlags(Op, ReductionOps[1]); - return Op; - case RK_None: - break; - } - llvm_unreachable("Unknown reduction operation."); - } - /// Creates reduction operation with the current opcode with the IR flags - /// from \p I. - Value *createOp(IRBuilder<> &Builder, const Twine &Name, - Instruction *I) const { - assert(isVectorizable() && - "Expected add|fadd or min/max reduction operation."); - auto *Op = createOp(Builder, Name); - switch (Kind) { - case RK_Arithmetic: - propagateIRFlags(Op, I); - return Op; - case RK_Min: - case RK_Max: - case RK_UMin: - case RK_UMax: - if (auto *SI = dyn_cast<SelectInst>(Op)) { - propagateIRFlags(SI->getCondition(), - cast<SelectInst>(I)->getCondition()); - } - propagateIRFlags(Op, I); - return Op; - case RK_None: - break; + // TODO: Support inverse predicates. 
+ if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) { + if (!isa<ExtractElementInst>(RHS) || + !L2->isIdenticalTo(cast<Instruction>(RHS))) + return RecurKind::None; + } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) { + if (!isa<ExtractElementInst>(LHS) || + !L1->isIdenticalTo(cast<Instruction>(LHS))) + return RecurKind::None; + } else { + if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS)) + return RecurKind::None; + if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) || + !L1->isIdenticalTo(cast<Instruction>(LHS)) || + !L2->isIdenticalTo(cast<Instruction>(RHS))) + return RecurKind::None; } - llvm_unreachable("Unknown reduction operation."); - } - TargetTransformInfo::ReductionFlags getFlags() const { - TargetTransformInfo::ReductionFlags Flags; - Flags.NoNaN = NoNaN; - switch (Kind) { - case RK_Arithmetic: - break; - case RK_Min: - Flags.IsSigned = Opcode == Instruction::ICmp; - Flags.IsMaxOp = false; - break; - case RK_Max: - Flags.IsSigned = Opcode == Instruction::ICmp; - Flags.IsMaxOp = true; - break; - case RK_UMin: - Flags.IsSigned = false; - Flags.IsMaxOp = false; - break; - case RK_UMax: - Flags.IsSigned = false; - Flags.IsMaxOp = true; - break; - case RK_None: - llvm_unreachable("Reduction kind is not set"); + TargetTransformInfo::ReductionFlags RdxFlags; + switch (Pred) { + default: + return RecurKind::None; + case CmpInst::ICMP_SGT: + case CmpInst::ICMP_SGE: + return RecurKind::SMax; + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_SLE: + return RecurKind::SMin; + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_UGE: + return RecurKind::UMax; + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_ULE: + return RecurKind::UMin; } - return Flags; } - }; - - WeakTrackingVH ReductionRoot; - - /// The operation data of the reduction operation. - OperationData ReductionData; - - /// The operation data of the values we perform a reduction on. 
- OperationData ReducedValueData; + return RecurKind::None; + } - /// Should we model this reduction as a pairwise reduction tree or a tree that - /// splits the vector in halves and adds those halves. - bool IsPairwiseReduction = false; + /// Return true if this operation is a cmp+select idiom. + static bool isCmpSel(RecurKind Kind) { + return RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind); + } - /// Checks if the ParentStackElem.first should be marked as a reduction - /// operation with an extra argument or as extra argument itself. - void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, - Value *ExtraArg) { - if (ExtraArgs.count(ParentStackElem.first)) { - ExtraArgs[ParentStackElem.first] = nullptr; - // We ran into something like: - // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. - // The whole ParentStackElem.first should be considered as an extra value - // in this case. - // Do not perform analysis of remaining operands of ParentStackElem.first - // instruction, this whole instruction is an extra argument. - ParentStackElem.second = ParentStackElem.first->getNumOperands(); - } else { - // We ran into something like: - // ParentStackElem.first += ... + ExtraArg + ... - ExtraArgs[ParentStackElem.first] = ExtraArg; - } + /// Get the index of the first operand. + static unsigned getFirstOperandIndex(RecurKind Kind) { + // We allow calling this before 'Kind' is set, so handle that specially. + if (Kind == RecurKind::None) + return 0; + return isCmpSel(Kind) ? 1 : 0; } - static OperationData getOperationData(Value *V) { - if (!V) - return OperationData(); + /// Total number of operands in the reduction operation. + static unsigned getNumberOfOperands(RecurKind Kind) { + return isCmpSel(Kind) ? 
3 : 2; + } - Value *LHS; - Value *RHS; - if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) { - return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS, - RK_Arithmetic); + /// Checks if the instruction is in basic block \p BB. + /// For a min/max reduction check that both compare and select are in \p BB. + static bool hasSameParent(RecurKind Kind, Instruction *I, BasicBlock *BB, + bool IsRedOp) { + if (IsRedOp && isCmpSel(Kind)) { + auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition()); + return I->getParent() == BB && Cmp && Cmp->getParent() == BB; } - if (auto *Select = dyn_cast<SelectInst>(V)) { - // Look for a min/max pattern. - if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) { - return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin); - } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) { - return OperationData(Instruction::ICmp, LHS, RHS, RK_Min); - } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) || - m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) { - return OperationData( - Instruction::FCmp, LHS, RHS, RK_Min, - cast<Instruction>(Select->getCondition())->hasNoNaNs()); - } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) { - return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax); - } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) { - return OperationData(Instruction::ICmp, LHS, RHS, RK_Max); - } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) || - m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) { - return OperationData( - Instruction::FCmp, LHS, RHS, RK_Max, - cast<Instruction>(Select->getCondition())->hasNoNaNs()); - } else { - // Try harder: look for min/max pattern based on instructions producing - // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). 
- // During the intermediate stages of SLP, it's very common to have - // pattern like this (since optimizeGatherSequence is run only once - // at the end): - // %1 = extractelement <2 x i32> %a, i32 0 - // %2 = extractelement <2 x i32> %a, i32 1 - // %cond = icmp sgt i32 %1, %2 - // %3 = extractelement <2 x i32> %a, i32 0 - // %4 = extractelement <2 x i32> %a, i32 1 - // %select = select i1 %cond, i32 %3, i32 %4 - CmpInst::Predicate Pred; - Instruction *L1; - Instruction *L2; - - LHS = Select->getTrueValue(); - RHS = Select->getFalseValue(); - Value *Cond = Select->getCondition(); - - // TODO: Support inverse predicates. - if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) { - if (!isa<ExtractElementInst>(RHS) || - !L2->isIdenticalTo(cast<Instruction>(RHS))) - return OperationData(V); - } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) { - if (!isa<ExtractElementInst>(LHS) || - !L1->isIdenticalTo(cast<Instruction>(LHS))) - return OperationData(V); - } else { - if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS)) - return OperationData(V); - if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) || - !L1->isIdenticalTo(cast<Instruction>(LHS)) || - !L2->isIdenticalTo(cast<Instruction>(RHS))) - return OperationData(V); - } - switch (Pred) { - default: - return OperationData(V); - - case CmpInst::ICMP_ULT: - case CmpInst::ICMP_ULE: - return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin); - - case CmpInst::ICMP_SLT: - case CmpInst::ICMP_SLE: - return OperationData(Instruction::ICmp, LHS, RHS, RK_Min); + return I->getParent() == BB; + } - case CmpInst::FCMP_OLT: - case CmpInst::FCMP_OLE: - case CmpInst::FCMP_ULT: - case CmpInst::FCMP_ULE: - return OperationData(Instruction::FCmp, LHS, RHS, RK_Min, - cast<Instruction>(Cond)->hasNoNaNs()); + /// Expected number of uses for reduction operations/reduced values. 
+ static bool hasRequiredNumberOfUses(RecurKind Kind, Instruction *I, + bool IsReductionOp) { + // SelectInst must be used twice while the condition op must have single + // use only. + if (isCmpSel(Kind)) + return I->hasNUses(2) && + (!IsReductionOp || + cast<SelectInst>(I)->getCondition()->hasOneUse()); - case CmpInst::ICMP_UGT: - case CmpInst::ICMP_UGE: - return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax); + // Arithmetic reduction operation must be used once only. + return I->hasOneUse(); + } - case CmpInst::ICMP_SGT: - case CmpInst::ICMP_SGE: - return OperationData(Instruction::ICmp, LHS, RHS, RK_Max); + /// Initializes the list of reduction operations. + void initReductionOps(RecurKind Kind) { + if (isCmpSel(Kind)) + ReductionOps.assign(2, ReductionOpsType()); + else + ReductionOps.assign(1, ReductionOpsType()); + } - case CmpInst::FCMP_OGT: - case CmpInst::FCMP_OGE: - case CmpInst::FCMP_UGT: - case CmpInst::FCMP_UGE: - return OperationData(Instruction::FCmp, LHS, RHS, RK_Max, - cast<Instruction>(Cond)->hasNoNaNs()); - } - } + /// Add all reduction operations for the reduction instruction \p I. + void addReductionOps(RecurKind Kind, Instruction *I) { + assert(Kind != RecurKind::None && "Expected reduction operation."); + if (isCmpSel(Kind)) { + ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); + ReductionOps[1].emplace_back(I); + } else { + ReductionOps[0].emplace_back(I); } - return OperationData(V); + } + + static Value *getLHS(RecurKind Kind, Instruction *I) { + if (Kind == RecurKind::None) + return nullptr; + return I->getOperand(getFirstOperandIndex(Kind)); + } + static Value *getRHS(RecurKind Kind, Instruction *I) { + if (Kind == RecurKind::None) + return nullptr; + return I->getOperand(getFirstOperandIndex(Kind) + 1); } public: @@ -6684,50 +6689,59 @@ public: /// Try to find a reduction tree. 
bool matchAssociativeReduction(PHINode *Phi, Instruction *B) { assert((!Phi || is_contained(Phi->operands(), B)) && - "Thi phi needs to use the binary operator"); + "Phi needs to use the binary operator"); - ReductionData = getOperationData(B); + RdxKind = getRdxKind(B); // We could have a initial reductions that is not an add. // r *= v1 + v2 + v3 + v4 // In such a case start looking for a tree rooted in the first '+'. if (Phi) { - if (ReductionData.getLHS() == Phi) { + if (getLHS(RdxKind, B) == Phi) { Phi = nullptr; - B = dyn_cast<Instruction>(ReductionData.getRHS()); - ReductionData = getOperationData(B); - } else if (ReductionData.getRHS() == Phi) { + B = dyn_cast<Instruction>(getRHS(RdxKind, B)); + if (!B) + return false; + RdxKind = getRdxKind(B); + } else if (getRHS(RdxKind, B) == Phi) { Phi = nullptr; - B = dyn_cast<Instruction>(ReductionData.getLHS()); - ReductionData = getOperationData(B); + B = dyn_cast<Instruction>(getLHS(RdxKind, B)); + if (!B) + return false; + RdxKind = getRdxKind(B); } } - if (!ReductionData.isVectorizable(B)) + if (!isVectorizable(RdxKind, B)) return false; + // Analyze "regular" integer/FP types for reductions - no target-specific + // types or pointers. Type *Ty = B->getType(); - if (!isValidElementType(Ty)) - return false; - if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy()) + if (!isValidElementType(Ty) || Ty->isPointerTy()) return false; - ReducedValueData.clear(); ReductionRoot = B; + // The opcode for leaf values that we perform a reduction on. + // For example: load(x) + load(y) + load(z) + fptoui(w) + // The leaf opcode for 'w' does not match, so we don't include it as a + // potential candidate for the reduction. + unsigned LeafOpcode = 0; + // Post order traverse the reduction tree starting at B. We only handle true // trees containing only binary operators. 
SmallVector<std::pair<Instruction *, unsigned>, 32> Stack; - Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex())); - ReductionData.initReductionOps(ReductionOps); + Stack.push_back(std::make_pair(B, getFirstOperandIndex(RdxKind))); + initReductionOps(RdxKind); while (!Stack.empty()) { Instruction *TreeN = Stack.back().first; - unsigned EdgeToVist = Stack.back().second++; - OperationData OpData = getOperationData(TreeN); - bool IsReducedValue = OpData != ReductionData; + unsigned EdgeToVisit = Stack.back().second++; + const RecurKind TreeRdxKind = getRdxKind(TreeN); + bool IsReducedValue = TreeRdxKind != RdxKind; - // Postorder vist. - if (IsReducedValue || EdgeToVist == OpData.getNumberOfOperands()) { + // Postorder visit. + if (IsReducedValue || EdgeToVisit == getNumberOfOperands(TreeRdxKind)) { if (IsReducedValue) ReducedVals.push_back(TreeN); else { @@ -6745,7 +6759,7 @@ public: markExtraArg(Stack[Stack.size() - 2], TreeN); ExtraArgs.erase(TreeN); } else - ReductionData.addReductionOps(TreeN, ReductionOps); + addReductionOps(RdxKind, TreeN); } // Retract. Stack.pop_back(); @@ -6753,91 +6767,72 @@ public: } // Visit left or right. - Value *NextV = TreeN->getOperand(EdgeToVist); - if (NextV != Phi) { - auto *I = dyn_cast<Instruction>(NextV); - OpData = getOperationData(I); - // Continue analysis if the next operand is a reduction operation or - // (possibly) a reduced value. If the reduced value opcode is not set, - // the first met operation != reduction operation is considered as the - // reduced value class. - if (I && (!ReducedValueData || OpData == ReducedValueData || - OpData == ReductionData)) { - const bool IsReductionOperation = OpData == ReductionData; - // Only handle trees in the current basic block. - if (!ReductionData.hasSameParent(I, B->getParent(), - IsReductionOperation)) { - // I is an extra argument for TreeN (its parent operation). 
- markExtraArg(Stack.back(), I); - continue; - } - - // Each tree node needs to have minimal number of users except for the - // ultimate reduction. - if (!ReductionData.hasRequiredNumberOfUses(I, - OpData == ReductionData) && - I != B) { + Value *EdgeVal = TreeN->getOperand(EdgeToVisit); + auto *I = dyn_cast<Instruction>(EdgeVal); + if (!I) { + // Edge value is not a reduction instruction or a leaf instruction. + // (It may be a constant, function argument, or something else.) + markExtraArg(Stack.back(), EdgeVal); + continue; + } + RecurKind EdgeRdxKind = getRdxKind(I); + // Continue analysis if the next operand is a reduction operation or + // (possibly) a leaf value. If the leaf value opcode is not set, + // the first met operation != reduction operation is considered as the + // leaf opcode. + // Only handle trees in the current basic block. + // Each tree node needs to have minimal number of users except for the + // ultimate reduction. + const bool IsRdxInst = EdgeRdxKind == RdxKind; + if (I != Phi && I != B && + hasSameParent(RdxKind, I, B->getParent(), IsRdxInst) && + hasRequiredNumberOfUses(RdxKind, I, IsRdxInst) && + (!LeafOpcode || LeafOpcode == I->getOpcode() || IsRdxInst)) { + if (IsRdxInst) { + // We need to be able to reassociate the reduction operations. + if (!isVectorizable(EdgeRdxKind, I)) { // I is an extra argument for TreeN (its parent operation). markExtraArg(Stack.back(), I); continue; } - - if (IsReductionOperation) { - // We need to be able to reassociate the reduction operations. - if (!OpData.isAssociative(I)) { - // I is an extra argument for TreeN (its parent operation). - markExtraArg(Stack.back(), I); - continue; - } - } else if (ReducedValueData && - ReducedValueData != OpData) { - // Make sure that the opcodes of the operations that we are going to - // reduce match. - // I is an extra argument for TreeN (its parent operation). 
- markExtraArg(Stack.back(), I); - continue; - } else if (!ReducedValueData) - ReducedValueData = OpData; - - Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex())); - continue; + } else if (!LeafOpcode) { + LeafOpcode = I->getOpcode(); } + Stack.push_back(std::make_pair(I, getFirstOperandIndex(EdgeRdxKind))); + continue; } - // NextV is an extra argument for TreeN (its parent operation). - markExtraArg(Stack.back(), NextV); + // I is an extra argument for TreeN (its parent operation). + markExtraArg(Stack.back(), I); } return true; } - /// Attempt to vectorize the tree found by - /// matchAssociativeReduction. + /// Attempt to vectorize the tree found by matchAssociativeReduction. bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { - if (ReducedVals.empty()) - return false; - - // If there is a sufficient number of reduction values, reduce - // to a nearby power-of-2. Can safely generate oversized + // If there are a sufficient number of reduction values, reduce + // to a nearby power-of-2. We can safely generate oversized // vectors and rely on the backend to split them to legal sizes. unsigned NumReducedVals = ReducedVals.size(); if (NumReducedVals < 4) return false; - unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); - - Value *VectorizedTree = nullptr; + // Intersect the fast-math-flags from all reduction operations. + FastMathFlags RdxFMF; + RdxFMF.set(); + for (ReductionOpsType &RdxOp : ReductionOps) { + for (Value *RdxVal : RdxOp) { + if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal)) + RdxFMF &= FPMO->getFastMathFlags(); + } + } - // FIXME: Fast-math-flags should be set based on the instructions in the - // reduction (not all of 'fast' are required). 
IRBuilder<> Builder(cast<Instruction>(ReductionRoot)); - FastMathFlags Unsafe; - Unsafe.setFast(); - Builder.setFastMathFlags(Unsafe); - unsigned i = 0; + Builder.setFastMathFlags(RdxFMF); BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; - // The same extra argument may be used several time, so log each attempt + // The same extra argument may be used several times, so log each attempt // to use it. - for (auto &Pair : ExtraArgs) { + for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) { assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); } @@ -6857,14 +6852,48 @@ public: // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; SmallVector<Value *, 16> IgnoreList; - for (auto &V : ReductionOps) - IgnoreList.append(V.begin(), V.end()); + for (ReductionOpsType &RdxOp : ReductionOps) + IgnoreList.append(RdxOp.begin(), RdxOp.end()); + + unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); + if (NumReducedVals > ReduxWidth) { + // In the loop below, we are building a tree based on a window of + // 'ReduxWidth' values. + // If the operands of those values have common traits (compare predicate, + // constant operand, etc), then we want to group those together to + // minimize the cost of the reduction. + + // TODO: This should be extended to count common operands for + // compares and binops. + + // Step 1: Count the number of times each compare predicate occurs. + SmallDenseMap<unsigned, unsigned> PredCountMap; + for (Value *RdxVal : ReducedVals) { + CmpInst::Predicate Pred; + if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value()))) + ++PredCountMap[Pred]; + } + // Step 2: Sort the values so the most common predicates come first. 
+ stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) { + CmpInst::Predicate PredA, PredB; + if (match(A, m_Cmp(PredA, m_Value(), m_Value())) && + match(B, m_Cmp(PredB, m_Value(), m_Value()))) { + return PredCountMap[PredA] > PredCountMap[PredB]; + } + return false; + }); + } + + Value *VectorizedTree = nullptr; + unsigned i = 0; while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { - auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); + ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth); V.buildTree(VL, ExternallyUsedValues, IgnoreList); Optional<ArrayRef<unsigned>> Order = V.bestOrder(); - // TODO: Handle orders of size less than number of elements in the vector. - if (Order && Order->size() == VL.size()) { + if (Order) { + assert(Order->size() == VL.size() && + "Order size must be the same as number of vectorized " + "instructions."); // TODO: reorder tree nodes without tree rebuilding. SmallVector<Value *, 4> ReorderedOps(VL.size()); llvm::transform(*Order, ReorderedOps.begin(), @@ -6873,60 +6902,66 @@ public: } if (V.isTreeTinyAndNotFullyVectorizable()) break; - if (V.isLoadCombineReductionCandidate(ReductionData.getOpcode())) + if (V.isLoadCombineReductionCandidate(RdxKind)) break; V.computeMinimumValueSizes(); // Estimate cost. 
- int TreeCost = V.getTreeCost(); - int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth); - int Cost = TreeCost + ReductionCost; + InstructionCost TreeCost = V.getTreeCost(); + InstructionCost ReductionCost = + getReductionCost(TTI, ReducedVals[i], ReduxWidth); + InstructionCost Cost = TreeCost + ReductionCost; + if (!Cost.isValid()) { + LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); + return false; + } if (Cost >= -SLPCostThreshold) { - V.getORE()->emit([&]() { - return OptimizationRemarkMissed( - SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0])) - << "Vectorizing horizontal reduction is possible" - << "but not beneficial with cost " - << ore::NV("Cost", Cost) << " and threshold " - << ore::NV("Threshold", -SLPCostThreshold); - }); - break; + V.getORE()->emit([&]() { + return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", + cast<Instruction>(VL[0])) + << "Vectorizing horizontal reduction is possible" + << "but not beneficial with cost " << ore::NV("Cost", Cost) + << " and threshold " + << ore::NV("Threshold", -SLPCostThreshold); + }); + break; } LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost << ". (HorRdx)\n"); V.getORE()->emit([&]() { - return OptimizationRemark( - SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0])) - << "Vectorized horizontal reduction with cost " - << ore::NV("Cost", Cost) << " and with tree size " - << ore::NV("TreeSize", V.getTreeSize()); + return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", + cast<Instruction>(VL[0])) + << "Vectorized horizontal reduction with cost " + << ore::NV("Cost", Cost) << " and with tree size " + << ore::NV("TreeSize", V.getTreeSize()); }); // Vectorize a tree. DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); - // Emit a reduction. For min/max, the root is a select, but the insertion + // Emit a reduction. 
If the root is a select (min/max idiom), the insert // point is the compare condition of that select. Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); - if (ReductionData.isMinMax()) + if (isCmpSel(RdxKind)) Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); else Builder.SetInsertPoint(RdxRootInst); Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); - if (VectorizedTree) { - Builder.SetCurrentDebugLocation(Loc); - OperationData VectReductionData(ReductionData.getOpcode(), - VectorizedTree, ReducedSubTree, - ReductionData.getKind()); - VectorizedTree = - VectReductionData.createOp(Builder, "op.rdx", ReductionOps); - } else + + if (!VectorizedTree) { + // Initialize the final value in the reduction. VectorizedTree = ReducedSubTree; + } else { + // Update the final value in the reduction. + Builder.SetCurrentDebugLocation(Loc); + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + ReducedSubTree, "op.rdx", ReductionOps); + } i += ReduxWidth; ReduxWidth = PowerOf2Floor(NumReducedVals - i); } @@ -6936,19 +6971,15 @@ public: for (; i < NumReducedVals; ++i) { auto *I = cast<Instruction>(ReducedVals[i]); Builder.SetCurrentDebugLocation(I->getDebugLoc()); - OperationData VectReductionData(ReductionData.getOpcode(), - VectorizedTree, I, - ReductionData.getKind()); - VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps); + VectorizedTree = + createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps); } for (auto &Pair : ExternallyUsedValues) { // Add each externally used value to the final reduction. 
for (auto *I : Pair.second) { Builder.SetCurrentDebugLocation(I->getDebugLoc()); - OperationData VectReductionData(ReductionData.getOpcode(), - VectorizedTree, Pair.first, - ReductionData.getKind()); - VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I); + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + Pair.first, "op.extra", I); } } @@ -6956,7 +6987,7 @@ public: // select, we also have to RAUW for the compare instruction feeding the // reduction root. That's because the original compare may have extra uses // besides the final select of the reduction. - if (ReductionData.isMinMax()) { + if (isCmpSel(RdxKind)) { if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) { Instruction *ScalarCmp = getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot)); @@ -6972,77 +7003,68 @@ public: return VectorizedTree != nullptr; } - unsigned numReductionValues() const { - return ReducedVals.size(); - } + unsigned numReductionValues() const { return ReducedVals.size(); } private: /// Calculate the cost of a reduction. 
- int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal, - unsigned ReduxWidth) { + InstructionCost getReductionCost(TargetTransformInfo *TTI, + Value *FirstReducedVal, + unsigned ReduxWidth) { Type *ScalarTy = FirstReducedVal->getType(); - auto *VecTy = FixedVectorType::get(ScalarTy, ReduxWidth); - - int PairwiseRdxCost; - int SplittingRdxCost; - switch (ReductionData.getKind()) { - case RK_Arithmetic: - PairwiseRdxCost = - TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, - /*IsPairwiseForm=*/true); - SplittingRdxCost = - TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, - /*IsPairwiseForm=*/false); + FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); + InstructionCost VectorCost, ScalarCost; + switch (RdxKind) { + case RecurKind::Add: + case RecurKind::Mul: + case RecurKind::Or: + case RecurKind::And: + case RecurKind::Xor: + case RecurKind::FAdd: + case RecurKind::FMul: { + unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); + VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, + /*IsPairwiseForm=*/false); + ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy); break; - case RK_Min: - case RK_Max: - case RK_UMin: - case RK_UMax: { - auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VecTy)); - bool IsUnsigned = ReductionData.getKind() == RK_UMin || - ReductionData.getKind() == RK_UMax; - PairwiseRdxCost = - TTI->getMinMaxReductionCost(VecTy, VecCondTy, - /*IsPairwiseForm=*/true, IsUnsigned); - SplittingRdxCost = - TTI->getMinMaxReductionCost(VecTy, VecCondTy, - /*IsPairwiseForm=*/false, IsUnsigned); - break; - } - case RK_None: - llvm_unreachable("Expected arithmetic or min/max reduction operation"); } - - IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; - int VecReduxCost = IsPairwiseReduction ? 
PairwiseRdxCost : SplittingRdxCost; - - int ScalarReduxCost = 0; - switch (ReductionData.getKind()) { - case RK_Arithmetic: - ScalarReduxCost = - TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy); + case RecurKind::FMax: + case RecurKind::FMin: { + auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); + VectorCost = + TTI->getMinMaxReductionCost(VectorTy, VecCondTy, + /*pairwise=*/false, /*unsigned=*/false); + ScalarCost = + TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) + + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, + CmpInst::makeCmpResultType(ScalarTy)); break; - case RK_Min: - case RK_Max: - case RK_UMin: - case RK_UMax: - ScalarReduxCost = - TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) + + } + case RecurKind::SMax: + case RecurKind::SMin: + case RecurKind::UMax: + case RecurKind::UMin: { + auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); + bool IsUnsigned = + RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; + VectorCost = + TTI->getMinMaxReductionCost(VectorTy, VecCondTy, + /*IsPairwiseForm=*/false, IsUnsigned); + ScalarCost = + TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, CmpInst::makeCmpResultType(ScalarTy)); break; - case RK_None: + } + default: llvm_unreachable("Expected arithmetic or min/max reduction operation"); } - ScalarReduxCost *= (ReduxWidth - 1); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost + // Scalar cost is repeated for N-1 elements. + ScalarCost *= (ReduxWidth - 1); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost << " for reduction that starts with " << *FirstReducedVal - << " (It is a " - << (IsPairwiseReduction ? 
"pairwise" : "splitting") - << " reduction)\n"); - - return VecReduxCost - ScalarReduxCost; + << " (It is a splitting reduction)\n"); + return VectorCost - ScalarCost; } /// Emit a horizontal reduction of the vectorized value. @@ -7052,92 +7074,142 @@ private: assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); - if (!IsPairwiseReduction) { - // FIXME: The builder should use an FMF guard. It should not be hard-coded - // to 'fast'. - assert(Builder.getFastMathFlags().isFast() && "Expected 'fast' FMF"); - return createSimpleTargetReduction( - Builder, TTI, ReductionData.getOpcode(), VectorizedValue, - ReductionData.getFlags(), ReductionOps.back()); - } + return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, + ReductionOps.back()); + } +}; + +} // end anonymous namespace - Value *TmpVec = VectorizedValue; - for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { - auto LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true); - auto RightMask = createRdxShuffleMask(ReduxWidth, i, true, false); +static Optional<unsigned> getAggregateSize(Instruction *InsertInst) { + if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) + return cast<FixedVectorType>(IE->getType())->getNumElements(); - Value *LeftShuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l"); - Value *RightShuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), - "rdx.shuf.r"); - OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf, - RightShuf, ReductionData.getKind()); - TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps); + unsigned AggregateSize = 1; + auto *IV = cast<InsertValueInst>(InsertInst); + Type *CurrentType = IV->getType(); + do { + if (auto *ST = dyn_cast<StructType>(CurrentType)) { + for (auto *Elt : ST->elements()) + if (Elt != ST->getElementType(0)) // check homogeneity + return None; + AggregateSize *= 
ST->getNumElements(); + CurrentType = ST->getElementType(0); + } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { + AggregateSize *= AT->getNumElements(); + CurrentType = AT->getElementType(); + } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) { + AggregateSize *= VT->getNumElements(); + return AggregateSize; + } else if (CurrentType->isSingleValueType()) { + return AggregateSize; + } else { + return None; } + } while (true); +} - // The result is in the first element of the vector. - return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); +static Optional<unsigned> getOperandIndex(Instruction *InsertInst, + unsigned OperandOffset) { + unsigned OperandIndex = OperandOffset; + if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) { + if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { + auto *VT = cast<FixedVectorType>(IE->getType()); + OperandIndex *= VT->getNumElements(); + OperandIndex += CI->getZExtValue(); + return OperandIndex; + } + return None; } -}; -} // end anonymous namespace + auto *IV = cast<InsertValueInst>(InsertInst); + Type *CurrentType = IV->getType(); + for (unsigned int Index : IV->indices()) { + if (auto *ST = dyn_cast<StructType>(CurrentType)) { + OperandIndex *= ST->getNumElements(); + CurrentType = ST->getElementType(Index); + } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { + OperandIndex *= AT->getNumElements(); + CurrentType = AT->getElementType(); + } else { + return None; + } + OperandIndex += Index; + } + return OperandIndex; +} + +static bool findBuildAggregate_rec(Instruction *LastInsertInst, + TargetTransformInfo *TTI, + SmallVectorImpl<Value *> &BuildVectorOpds, + SmallVectorImpl<Value *> &InsertElts, + unsigned OperandOffset) { + do { + Value *InsertedOperand = LastInsertInst->getOperand(1); + Optional<unsigned> OperandIndex = + getOperandIndex(LastInsertInst, OperandOffset); + if (!OperandIndex) + return false; + if (isa<InsertElementInst>(InsertedOperand) || + 
isa<InsertValueInst>(InsertedOperand)) { + if (!findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI, + BuildVectorOpds, InsertElts, *OperandIndex)) + return false; + } else { + BuildVectorOpds[*OperandIndex] = InsertedOperand; + InsertElts[*OperandIndex] = LastInsertInst; + } + if (isa<UndefValue>(LastInsertInst->getOperand(0))) + return true; + LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0)); + } while (LastInsertInst != nullptr && + (isa<InsertValueInst>(LastInsertInst) || + isa<InsertElementInst>(LastInsertInst)) && + LastInsertInst->hasOneUse()); + return false; +} /// Recognize construction of vectors like -/// %ra = insertelement <4 x float> undef, float %s0, i32 0 +/// %ra = insertelement <4 x float> poison, float %s0, i32 0 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 /// starting from the last insertelement or insertvalue instruction. /// -/// Also recognize aggregates like {<2 x float>, <2 x float>}, +/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>}, /// {{float, float}, {float, float}}, [2 x {float, float}] and so on. /// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. /// /// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. /// /// \return true if it matches. 
-static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI, +static bool findBuildAggregate(Instruction *LastInsertInst, + TargetTransformInfo *TTI, SmallVectorImpl<Value *> &BuildVectorOpds, SmallVectorImpl<Value *> &InsertElts) { + assert((isa<InsertElementInst>(LastInsertInst) || isa<InsertValueInst>(LastInsertInst)) && "Expected insertelement or insertvalue instruction!"); - do { - Value *InsertedOperand; - auto *IE = dyn_cast<InsertElementInst>(LastInsertInst); - if (IE) { - InsertedOperand = IE->getOperand(1); - LastInsertInst = IE->getOperand(0); - } else { - auto *IV = cast<InsertValueInst>(LastInsertInst); - InsertedOperand = IV->getInsertedValueOperand(); - LastInsertInst = IV->getAggregateOperand(); - } - if (isa<InsertElementInst>(InsertedOperand) || - isa<InsertValueInst>(InsertedOperand)) { - SmallVector<Value *, 8> TmpBuildVectorOpds; - SmallVector<Value *, 8> TmpInsertElts; - if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds, - TmpInsertElts)) - return false; - BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(), - TmpBuildVectorOpds.rend()); - InsertElts.append(TmpInsertElts.rbegin(), TmpInsertElts.rend()); - } else { - BuildVectorOpds.push_back(InsertedOperand); - InsertElts.push_back(IE); - } - if (isa<UndefValue>(LastInsertInst)) - break; - if ((!isa<InsertValueInst>(LastInsertInst) && - !isa<InsertElementInst>(LastInsertInst)) || - !LastInsertInst->hasOneUse()) - return false; - } while (true); - std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); - std::reverse(InsertElts.begin(), InsertElts.end()); - return true; + + assert((BuildVectorOpds.empty() && InsertElts.empty()) && + "Expected empty result vectors!"); + + Optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst); + if (!AggregateSize) + return false; + BuildVectorOpds.resize(*AggregateSize); + InsertElts.resize(*AggregateSize); + + if (findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, + 0)) { + 
llvm::erase_value(BuildVectorOpds, nullptr); + llvm::erase_value(InsertElts, nullptr); + if (BuildVectorOpds.size() >= 2) + return true; + } + + return false; } static bool PhiTypeSorterFunc(Value *V, Value *V2) { @@ -7195,6 +7267,16 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P, return nullptr; } +static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { + if (match(I, m_BinOp(m_Value(V0), m_Value(V1)))) + return true; + if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1)))) + return true; + if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1)))) + return true; + return false; +} + /// Attempt to reduce a horizontal reduction. /// If it is legal to match a horizontal reduction feeding the phi node \a P /// with reduction operators \a Root (or one of its operands) in a basic block @@ -7234,9 +7316,10 @@ static bool tryToVectorizeHorReductionOrInstOperands( Instruction *Inst; unsigned Level; std::tie(Inst, Level) = Stack.pop_back_val(); - auto *BI = dyn_cast<BinaryOperator>(Inst); - auto *SI = dyn_cast<SelectInst>(Inst); - if (BI || SI) { + Value *B0, *B1; + bool IsBinop = matchRdxBop(Inst, B0, B1); + bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); + if (IsBinop || IsSelect) { HorizontalReduction HorRdx; if (HorRdx.matchAssociativeReduction(P, Inst)) { if (HorRdx.tryToReduce(R, TTI)) { @@ -7247,10 +7330,10 @@ static bool tryToVectorizeHorReductionOrInstOperands( continue; } } - if (P && BI) { - Inst = dyn_cast<Instruction>(BI->getOperand(0)); + if (P && IsBinop) { + Inst = dyn_cast<Instruction>(B0); if (Inst == P) - Inst = dyn_cast<Instruction>(BI->getOperand(1)); + Inst = dyn_cast<Instruction>(B1); if (!Inst) { // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. 
@@ -7283,9 +7366,7 @@ static bool tryToVectorizeHorReductionOrInstOperands( bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI) { - if (!V) - return false; - auto *I = dyn_cast<Instruction>(V); + auto *I = dyn_cast_or_null<Instruction>(V); if (!I) return false; @@ -7307,8 +7388,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, SmallVector<Value *, 16> BuildVectorOpds; SmallVector<Value *, 16> BuildVectorInsts; - if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts) || - BuildVectorOpds.size() < 2) + if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts)) return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); @@ -7323,7 +7403,6 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, SmallVector<Value *, 16> BuildVectorInsts; SmallVector<Value *, 16> BuildVectorOpds; if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || - BuildVectorOpds.size() < 2 || (llvm::all_of(BuildVectorOpds, [](Value *V) { return isa<ExtractElementInst>(V); }) && isShuffle(BuildVectorOpds))) @@ -7369,7 +7448,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector<Value *, 4> Incoming; SmallPtrSet<Value *, 16> VisitedInstrs; - unsigned MaxVecRegSize = R.getMaxVecRegSize(); bool HaveVectorizedPhiNodes = true; while (HaveVectorizedPhiNodes) { @@ -7396,18 +7474,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Look for the next elements with the same type. SmallVector<Value *, 4>::iterator SameTypeIt = IncIt; - Type *EltTy = (*IncIt)->getType(); - unsigned EltSize = EltTy->isSized() ? 
DL->getTypeSizeInBits(EltTy) - : MaxVecRegSize; - unsigned MaxNumElts = MaxVecRegSize / EltSize; - if (MaxNumElts < 2) { - ++IncIt; - continue; - } - while (SameTypeIt != E && - (*SameTypeIt)->getType() == EltTy && - (SameTypeIt - IncIt) < MaxNumElts) { + (*SameTypeIt)->getType() == (*IncIt)->getType()) { VisitedInstrs.insert(*SameTypeIt); ++SameTypeIt; } @@ -7439,12 +7507,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { SmallVector<Instruction *, 8> PostProcessInstructions; SmallDenseSet<Instruction *, 4> KeyNodes; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + // Skip instructions with scalable type. The num of elements is unknown at + // compile-time for scalable type. + if (isa<ScalableVectorType>(it->getType())) + continue; + // Skip instructions marked for the deletion. if (R.isDeleted(&*it)) continue; // We may go through BB multiple times so skip the one we have checked. if (!VisitedInstrs.insert(&*it).second) { - if (it->use_empty() && KeyNodes.count(&*it) > 0 && + if (it->use_empty() && KeyNodes.contains(&*it) && vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. @@ -7461,16 +7534,29 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Try to vectorize reductions that use PHINodes. if (PHINode *P = dyn_cast<PHINode>(it)) { // Check that the PHI is a reduction PHI. - if (P->getNumIncomingValues() != 2) - return Changed; + if (P->getNumIncomingValues() == 2) { + // Try to match and vectorize a horizontal reduction. + if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R, + TTI)) { + Changed = true; + it = BB->begin(); + e = BB->end(); + continue; + } + } + // Try to vectorize the incoming values of the PHI, to catch reductions + // that feed into PHIs. 
+ for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) { + // Skip if the incoming block is the current BB for now. Also, bypass + // unreachable IR for efficiency and to avoid crashing. + // TODO: Collect the skipped incoming values and try to vectorize them + // after processing BB. + if (BB == P->getIncomingBlock(I) || + !DT->isReachableFromEntry(P->getIncomingBlock(I))) + continue; - // Try to match and vectorize a horizontal reduction. - if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R, - TTI)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; + Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I), + P->getIncomingBlock(I), R, TTI); } continue; } @@ -7534,7 +7620,7 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { unsigned MaxElts = MaxVecRegSize / EltSize; for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) { auto Len = std::min<unsigned>(BE - BI, MaxElts); - auto GEPList = makeArrayRef(&Entry.second[BI], Len); + ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len); // Initialize a set a candidate getelementptrs. Note that we use a // SetVector here to preserve program order. If the index computations diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 6f055ca80ff2..873701676067 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -61,17 +61,19 @@ class VPRecipeBuilder { /// Check if the load or store instruction \p I should widened for \p /// Range.Start and potentially masked. Such instructions are handled by a /// recipe that takes an additional VPInstruction for the mask. 
- VPWidenMemoryInstructionRecipe * - tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan); + VPRecipeBase *tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan); /// Check if an induction recipe should be constructed for \I. If so build and /// return it. If not, return null. - VPWidenIntOrFpInductionRecipe *tryToOptimizeInductionPHI(PHINode *Phi) const; + VPWidenIntOrFpInductionRecipe *tryToOptimizeInductionPHI(PHINode *Phi, + VPlan &Plan) const; /// Optimize the special case where the operand of \p I is a constant integer /// induction variable. VPWidenIntOrFpInductionRecipe * - tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range) const; + tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, + VPlan &Plan) const; /// Handle non-loop phi nodes. Currently all such phi nodes are turned into /// a sequence of select instructions as the vectorizer currently performs diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index f5f28a3bffa1..b26399e0ae58 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -20,8 +20,10 @@ #include "VPlanDominatorTree.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -56,13 +58,69 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) { return OS; } +VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def) + : SubclassID(SC), UnderlyingVal(UV), Def(Def) { + if (Def) + Def->addDefinedValue(this); +} + +VPValue::~VPValue() { + assert(Users.empty() && "trying to delete a VPValue with remaining users"); + if (Def) + Def->removeDefinedValue(this); +} + void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const { - if 
(const VPInstruction *Instr = dyn_cast<VPInstruction>(this)) - Instr->print(OS, SlotTracker); + if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def)) + R->print(OS, "", SlotTracker); else printAsOperand(OS, SlotTracker); } +void VPValue::dump() const { + const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this->Def); + VPSlotTracker SlotTracker( + (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); + print(dbgs(), SlotTracker); + dbgs() << "\n"; +} + +void VPDef::dump() const { + const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this); + VPSlotTracker SlotTracker( + (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); + print(dbgs(), "", SlotTracker); + dbgs() << "\n"; +} + +VPUser *VPRecipeBase::toVPUser() { + if (auto *U = dyn_cast<VPInstruction>(this)) + return U; + if (auto *U = dyn_cast<VPWidenRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPWidenCallRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPWidenSelectRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPWidenGEPRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPBlendRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPInterleaveRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPReplicateRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPBranchOnMaskRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPWidenMemoryInstructionRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPReductionRecipe>(this)) + return U; + if (auto *U = dyn_cast<VPPredInstPHIRecipe>(this)) + return U; + return nullptr; +} + // Get the top-most entry block of \p Start. This is the entry block of the // containing VPlan. 
This function is templated to support both const and non-const blocks template <typename T> static T *getPlanEntry(T *Start) { @@ -142,14 +200,43 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { } void VPBlockBase::deleteCFG(VPBlockBase *Entry) { - SmallVector<VPBlockBase *, 8> Blocks; - for (VPBlockBase *Block : depth_first(Entry)) - Blocks.push_back(Block); + SmallVector<VPBlockBase *, 8> Blocks(depth_first(Entry)); for (VPBlockBase *Block : Blocks) delete Block; } +VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { + iterator It = begin(); + while (It != end() && (isa<VPWidenPHIRecipe>(&*It) || + isa<VPWidenIntOrFpInductionRecipe>(&*It) || + isa<VPPredInstPHIRecipe>(&*It) || + isa<VPWidenCanonicalIVRecipe>(&*It))) + It++; + return It; +} + +Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { + if (!Def->getDef() && OrigLoop->isLoopInvariant(Def->getLiveInIRValue())) + return Def->getLiveInIRValue(); + + if (hasScalarValue(Def, Instance)) + return Data.PerPartScalars[Def][Instance.Part][Instance.Lane]; + + if (hasVectorValue(Def, Instance.Part)) { + assert(Data.PerPartOutput.count(Def)); + auto *VecPart = Data.PerPartOutput[Def][Instance.Part]; + if (!VecPart->getType()->isVectorTy()) { + assert(Instance.Lane == 0 && "cannot get lane > 0 for scalar"); + return VecPart; + } + // TODO: Cache created scalar values. + return Builder.CreateExtractElement(VecPart, + Builder.getInt32(Instance.Lane)); + } + return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance); +} + BasicBlock * VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks. 
@@ -267,6 +354,24 @@ void VPBasicBlock::execute(VPTransformState *State) { LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB); } +void VPBasicBlock::dropAllReferences(VPValue *NewValue) { + for (VPRecipeBase &R : Recipes) { + for (auto *Def : R.definedValues()) + Def->replaceAllUsesWith(NewValue); + + if (auto *User = R.toVPUser()) + for (unsigned I = 0, E = User->getNumOperands(); I != E; I++) + User->setOperand(I, NewValue); + } +} + +void VPRegionBlock::dropAllReferences(VPValue *NewValue) { + for (VPBlockBase *Block : depth_first(Entry)) + // Drop all references in VPBasicBlocks and replace all uses with + // DummyValue. + Block->dropAllReferences(NewValue); +} + void VPRegionBlock::execute(VPTransformState *State) { ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry); @@ -300,7 +405,9 @@ void VPRegionBlock::execute(VPTransformState *State) { for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { State->Instance->Part = Part; - for (unsigned Lane = 0, VF = State->VF; Lane < VF; ++Lane) { + assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); + for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; + ++Lane) { State->Instance->Lane = Lane; // Visit the VPBlocks connected to \p this, starting from it. for (VPBlockBase *Block : RPOT) { @@ -346,6 +453,14 @@ void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { insertAfter(InsertPos); } +void VPRecipeBase::moveBefore(VPBasicBlock &BB, + iplist<VPRecipeBase>::iterator I) { + assert(I == BB.end() || I->getParent() == &BB); + removeFromParent(); + Parent = &BB; + BB.getRecipeList().insert(I, this); +} + void VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilder<> &Builder = State.Builder; @@ -383,14 +498,14 @@ void VPInstruction::generateInstruction(VPTransformState &State, case VPInstruction::ActiveLaneMask: { // Get first lane of vector induction variable. 
Value *VIVElem0 = State.get(getOperand(0), {Part, 0}); - // Get first lane of backedge-taken-count. - Value *ScalarBTC = State.get(getOperand(1), {Part, 0}); + // Get the original loop tripcount. + Value *ScalarTC = State.TripCount; auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = FixedVectorType::get(Int1Ty, State.VF); + auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue()); Instruction *Call = Builder.CreateIntrinsic( - Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()}, - {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask"); + Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, + {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); State.set(this, Call, Part); break; } @@ -405,18 +520,15 @@ void VPInstruction::execute(VPTransformState &State) { generateInstruction(State, Part); } -void VPInstruction::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << "\"EMIT "; - print(O, SlotTracker); -} - -void VPInstruction::print(raw_ostream &O) const { +void VPInstruction::dump() const { VPSlotTracker SlotTracker(getParent()->getPlan()); - print(O, SlotTracker); + print(dbgs(), "", SlotTracker); } -void VPInstruction::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { +void VPInstruction::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "EMIT "; + if (hasResult()) { printAsOperand(O, SlotTracker); O << " = "; @@ -461,7 +573,7 @@ void VPlan::execute(VPTransformState *State) { "trip.count.minus.1"); auto VF = State->VF; Value *VTCMO = - VF == 1 ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast"); + VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast"); for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) State->set(BackedgeTakenCount, VTCMO, Part); } @@ -666,7 +778,7 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { // Dump the block predicate. 
const VPValue *Pred = BasicBlock->getPredicate(); if (Pred) { - OS << " +\n" << Indent << " \"BlockPredicate: "; + OS << " +\n" << Indent << " \"BlockPredicate: \""; if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) { PredI->printAsOperand(OS, SlotTracker); OS << " (" << DOT::EscapeString(PredI->getParent()->getName()) @@ -676,7 +788,7 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { } for (const VPRecipeBase &Recipe : *BasicBlock) { - OS << " +\n" << Indent; + OS << " +\n" << Indent << "\""; Recipe.print(OS, Indent, SlotTracker); OS << "\\l\""; } @@ -715,7 +827,7 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) { dumpEdges(Region); } -void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) { +void VPlanPrinter::printAsIngredient(raw_ostream &O, const Value *V) { std::string IngredientString; raw_string_ostream RSO(IngredientString); if (auto *Inst = dyn_cast<Instruction>(V)) { @@ -738,24 +850,45 @@ void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) { void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "\"WIDEN-CALL " << VPlanIngredient(&Ingredient); + O << "WIDEN-CALL "; + + auto *CI = cast<CallInst>(getUnderlyingInstr()); + if (CI->getType()->isVoidTy()) + O << "void "; + else { + printAsOperand(O, SlotTracker); + O << " = "; + } + + O << "call @" << CI->getCalledFunction()->getName() << "("; + printOperands(O, SlotTracker); + O << ")"; } void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "\"WIDEN-SELECT" << VPlanIngredient(&Ingredient) - << (InvariantCond ? " (condition is loop invariant)" : ""); + O << "WIDEN-SELECT "; + printAsOperand(O, SlotTracker); + O << " = select "; + getOperand(0)->printAsOperand(O, SlotTracker); + O << ", "; + getOperand(1)->printAsOperand(O, SlotTracker); + O << ", "; + getOperand(2)->printAsOperand(O, SlotTracker); + O << (InvariantCond ? 
" (condition is loop invariant)" : ""); } void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "\"WIDEN\\l\""; - O << "\" " << VPlanIngredient(&Ingredient); + O << "WIDEN "; + printAsOperand(O, SlotTracker); + O << " = " << getUnderlyingInstr()->getOpcodeName() << " "; + printOperands(O, SlotTracker); } void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "\"WIDEN-INDUCTION"; + O << "WIDEN-INDUCTION"; if (Trunc) { O << "\\l\""; O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; @@ -766,23 +899,26 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "\"WIDEN-GEP "; + O << "WIDEN-GEP "; O << (IsPtrLoopInvariant ? "Inv" : "Var"); size_t IndicesNumber = IsIndexLoopInvariant.size(); for (size_t I = 0; I < IndicesNumber; ++I) O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; - O << "\\l\""; - O << " +\n" << Indent << "\" " << VPlanIngredient(GEP); + + O << " "; + printAsOperand(O, SlotTracker); + O << " = getelementptr "; + printOperands(O, SlotTracker); } void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "\"WIDEN-PHI " << VPlanIngredient(Phi); + O << "WIDEN-PHI " << VPlanIngredient(Phi); } void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "\"BLEND "; + O << "BLEND "; Phi->printAsOperand(O, false); O << " ="; if (getNumIncomingValues() == 1) { @@ -800,46 +936,75 @@ void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, } } +void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << "REDUCE "; + printAsOperand(O, SlotTracker); + O << " = "; + getChainOp()->printAsOperand(O, SlotTracker); + O << " + reduce." 
<< Instruction::getOpcodeName(RdxDesc->getOpcode()) + << " ("; + getVecOp()->printAsOperand(O, SlotTracker); + if (getCondOp()) { + O << ", "; + getCondOp()->printAsOperand(O, SlotTracker); + } + O << ")"; +} + void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "\"" << (IsUniform ? "CLONE " : "REPLICATE ") - << VPlanIngredient(Ingredient); + O << (IsUniform ? "CLONE " : "REPLICATE "); + + if (!getUnderlyingInstr()->getType()->isVoidTy()) { + printAsOperand(O, SlotTracker); + O << " = "; + } + O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " "; + printOperands(O, SlotTracker); + if (AlsoPack) O << " (S->V)"; } void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst); + O << "PHI-PREDICATED-INSTRUCTION "; + printOperands(O, SlotTracker); } void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "\"WIDEN " << VPlanIngredient(&Instr); - O << ", "; - getAddr()->printAsOperand(O, SlotTracker); - VPValue *Mask = getMask(); - if (Mask) { - O << ", "; - Mask->printAsOperand(O, SlotTracker); + O << "WIDEN "; + + if (!isStore()) { + getVPValue()->printAsOperand(O, SlotTracker); + O << " = "; } + O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; + + printOperands(O, SlotTracker); } void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { Value *CanonicalIV = State.CanonicalIV; Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); - auto VF = State.VF; - Value *VStart = VF == 1 + ElementCount VF = State.VF; + assert(!VF.isScalable() && "the code following assumes non scalables ECs"); + Value *VStart = VF.isScalar() ? 
CanonicalIV - : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); + : Builder.CreateVectorSplat(VF.getKnownMinValue(), + CanonicalIV, "broadcast"); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { SmallVector<Constant *, 8> Indices; - for (unsigned Lane = 0; Lane < VF; ++Lane) - Indices.push_back(ConstantInt::get(STy, Part * VF + Lane)); + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) + Indices.push_back( + ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane)); // If VF == 1, there is only one iteration in the loop above, thus the // element pushed back into Indices is ConstantInt::get(STy, Part) - Constant *VStep = VF == 1 ? Indices.back() : ConstantVector::get(Indices); + Constant *VStep = + VF.isScalar() ? Indices.back() : ConstantVector::get(Indices); // Add the consecutive indices to the vector value. Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); State.set(getVPValue(), CanonicalVectorIV, Part); @@ -848,7 +1013,7 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << "\"EMIT "; + O << "EMIT "; getVPValue()->printAsOperand(O, SlotTracker); O << " = WIDEN-CANONICAL-INDUCTION"; } @@ -856,10 +1021,18 @@ void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT); void VPValue::replaceAllUsesWith(VPValue *New) { - for (VPUser *User : users()) + for (unsigned J = 0; J < getNumUsers();) { + VPUser *User = Users[J]; + unsigned NumUsers = getNumUsers(); for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) if (User->getOperand(I) == this) User->setOperand(I, New); + // If a user got removed after updating the current user, the next user to + // update will be moved to the current position, so we only need to + // increment the index if the number of users did not change. 
+ if (NumUsers == getNumUsers()) + J++; + } } void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const { @@ -877,6 +1050,12 @@ void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const { OS << "vp<%" << Tracker.getSlot(this) << ">"; } +void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const { + interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { + Op->printAsOperand(O, SlotTracker); + }); +} + void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New, InterleavedAccessInfo &IAI) { @@ -925,13 +1104,6 @@ VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan, void VPSlotTracker::assignSlot(const VPValue *V) { assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!"); - const Value *UV = V->getUnderlyingValue(); - if (UV) - return; - const auto *VPI = dyn_cast<VPInstruction>(V); - if (VPI && !VPI->hasResult()) - return; - Slots[V] = NextSlot++; } @@ -950,10 +1122,8 @@ void VPSlotTracker::assignSlots(const VPRegionBlock *Region) { void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) { for (const VPRecipeBase &Recipe : *VPBB) { - if (const auto *VPI = dyn_cast<VPInstruction>(&Recipe)) - assignSlot(VPI); - else if (const auto *VPIV = dyn_cast<VPWidenCanonicalIVRecipe>(&Recipe)) - assignSlot(VPIV->getVPValue()); + for (VPValue *Def : Recipe.definedValues()) + assignSlot(Def); } } @@ -962,10 +1132,6 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) { for (const VPValue *V : Plan.VPExternalDefs) assignSlot(V); - for (auto &E : Plan.Value2VPValue) - if (!isa<VPInstruction>(E.second)) - assignSlot(E.second); - for (const VPValue *V : Plan.VPCBVs) assignSlot(V); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index f07c94e7a3c7..2cce127cd4ce 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -51,13 +51,12 @@ namespace llvm { class BasicBlock; class DominatorTree; 
class InnerLoopVectorizer; -template <class T> class InterleaveGroup; class LoopInfo; class raw_ostream; +class RecurrenceDescriptor; class Value; class VPBasicBlock; class VPRegionBlock; -class VPSlotTracker; class VPlan; class VPlanSlp; @@ -66,10 +65,22 @@ class VPlanSlp; /// [1, 9) = {1, 2, 4, 8} struct VFRange { // A power of 2. - const unsigned Start; + const ElementCount Start; // Need not be a power of 2. If End <= Start range is empty. - unsigned End; + ElementCount End; + + bool isEmpty() const { + return End.getKnownMinValue() <= Start.getKnownMinValue(); + } + + VFRange(const ElementCount &Start, const ElementCount &End) + : Start(Start), End(End) { + assert(Start.isScalable() == End.isScalable() && + "Both Start and End should have the same scalable flag"); + assert(isPowerOf2_32(Start.getKnownMinValue()) && + "Expected Start to be a power of 2"); + } }; using VPlanPtr = std::unique_ptr<VPlan>; @@ -114,7 +125,7 @@ private: /// The vectorization factor. Each entry in the scalar map contains UF x VF /// scalar values. - unsigned VF; + ElementCount VF; /// The vector and scalar map storage. We use std::map and not DenseMap /// because insertions to DenseMap invalidate its iterators. @@ -125,7 +136,7 @@ private: public: /// Construct an empty map with the given unroll and vectorization factors. - VectorizerValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} + VectorizerValueMap(unsigned UF, ElementCount VF) : UF(UF), VF(VF) {} /// \return True if the map has any vector entry for \p Key. bool hasAnyVectorValue(Value *Key) const { @@ -150,12 +161,14 @@ public: /// \return True if the map has a scalar entry for \p Key and \p Instance. 
bool hasScalarValue(Value *Key, const VPIteration &Instance) const { assert(Instance.Part < UF && "Queried Scalar Part is too large."); - assert(Instance.Lane < VF && "Queried Scalar Lane is too large."); + assert(Instance.Lane < VF.getKnownMinValue() && + "Queried Scalar Lane is too large."); + if (!hasAnyScalarValue(Key)) return false; const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); - assert(Entry[Instance.Part].size() == VF && + assert(Entry[Instance.Part].size() == VF.getKnownMinValue() && "ScalarParts has wrong dimensions."); return Entry[Instance.Part][Instance.Lane] != nullptr; } @@ -194,7 +207,7 @@ public: // TODO: Consider storing uniform values only per-part, as they occupy // lane 0 only, keeping the other VF-1 redundant entries null. for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part].resize(VF, nullptr); + Entry[Part].resize(VF.getKnownMinValue(), nullptr); ScalarMapStorage[Key] = Entry; } ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; @@ -233,14 +246,15 @@ struct VPCallback { /// VPTransformState holds information passed down when "executing" a VPlan, /// needed for generating the output IR. struct VPTransformState { - VPTransformState(unsigned VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, - IRBuilder<> &Builder, VectorizerValueMap &ValueMap, - InnerLoopVectorizer *ILV, VPCallback &Callback) - : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), - ValueMap(ValueMap), ILV(ILV), Callback(Callback) {} + VPTransformState(ElementCount VF, unsigned UF, Loop *OrigLoop, LoopInfo *LI, + DominatorTree *DT, IRBuilder<> &Builder, + VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV, + VPCallback &Callback) + : VF(VF), UF(UF), Instance(), OrigLoop(OrigLoop), LI(LI), DT(DT), + Builder(Builder), ValueMap(ValueMap), ILV(ILV), Callback(Callback) {} /// The chosen Vectorization and Unroll Factors of the loop being vectorized. 
- unsigned VF; + ElementCount VF; unsigned UF; /// Hold the indices to generate specific scalar instructions. Null indicates @@ -255,6 +269,9 @@ struct VPTransformState { typedef SmallVector<Value *, 2> PerPartValuesTy; DenseMap<VPValue *, PerPartValuesTy> PerPartOutput; + + using ScalarsPerPartValuesTy = SmallVector<SmallVector<Value *, 4>, 2>; + DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars; } Data; /// Get the generated Value for a given VPValue and a given Part. Note that @@ -271,20 +288,21 @@ struct VPTransformState { } /// Get the generated Value for a given VPValue and given Part and Lane. - Value *get(VPValue *Def, const VPIteration &Instance) { - // If the Def is managed directly by VPTransformState, extract the lane from - // the relevant part. Note that currently only VPInstructions and external - // defs are managed by VPTransformState. Other Defs are still created by ILV - // and managed in its ValueMap. For those this method currently just - // delegates the call to ILV below. - if (Data.PerPartOutput.count(Def)) { - auto *VecPart = Data.PerPartOutput[Def][Instance.Part]; - // TODO: Cache created scalar values. - return Builder.CreateExtractElement(VecPart, - Builder.getInt32(Instance.Lane)); - } + Value *get(VPValue *Def, const VPIteration &Instance); + + bool hasVectorValue(VPValue *Def, unsigned Part) { + auto I = Data.PerPartOutput.find(Def); + return I != Data.PerPartOutput.end() && Part < I->second.size() && + I->second[Part]; + } - return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance); + bool hasScalarValue(VPValue *Def, VPIteration Instance) { + auto I = Data.PerPartScalars.find(Def); + if (I == Data.PerPartScalars.end()) + return false; + return Instance.Part < I->second.size() && + Instance.Lane < I->second[Instance.Part].size() && + I->second[Instance.Part][Instance.Lane]; } /// Set the generated Value for a given VPValue and a given Part. 
@@ -295,6 +313,18 @@ struct VPTransformState { } Data.PerPartOutput[Def][Part] = V; } + void set(VPValue *Def, Value *IRDef, Value *V, unsigned Part); + + void set(VPValue *Def, Value *V, const VPIteration &Instance) { + auto Iter = Data.PerPartScalars.insert({Def, {}}); + auto &PerPartVec = Iter.first->second; + while (PerPartVec.size() <= Instance.Part) + PerPartVec.emplace_back(); + auto &Scalars = PerPartVec[Instance.Part]; + while (Scalars.size() <= Instance.Lane) + Scalars.push_back(nullptr); + Scalars[Instance.Lane] = V; + } /// Hold state information used when constructing the CFG of the output IR, /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. @@ -321,6 +351,9 @@ struct VPTransformState { CFGState() = default; } CFG; + /// Hold a pointer to the original loop. + Loop *OrigLoop; + /// Hold a pointer to LoopInfo to register new basic blocks in the loop. LoopInfo *LI; @@ -394,14 +427,14 @@ class VPBlockBase { /// Remove \p Predecessor from the predecessors of this block. void removePredecessor(VPBlockBase *Predecessor) { - auto Pos = std::find(Predecessors.begin(), Predecessors.end(), Predecessor); + auto Pos = find(Predecessors, Predecessor); assert(Pos && "Predecessor does not exist"); Predecessors.erase(Pos); } /// Remove \p Successor from the successors of this block. void removeSuccessor(VPBlockBase *Successor) { - auto Pos = std::find(Successors.begin(), Successors.end(), Successor); + auto Pos = find(Successors, Successor); assert(Pos && "Successor does not exist"); Successors.erase(Pos); } @@ -594,49 +627,30 @@ public: // hoisted into a VPBlockBase. return true; } + + /// Replace all operands of VPUsers in the block with \p NewValue and also + /// replaces all uses of VPValues defined in the block with NewValue. + virtual void dropAllReferences(VPValue *NewValue) = 0; }; /// VPRecipeBase is a base class modeling a sequence of one or more output IR -/// instructions. 
-class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> { +/// instructions. VPRecipeBase owns the the VPValues it defines through VPDef +/// and is responsible for deleting its defined values. Single-value +/// VPRecipeBases that also inherit from VPValue must make sure to inherit from +/// VPRecipeBase before VPValue. +class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>, + public VPDef { friend VPBasicBlock; friend class VPBlockUtils; - const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). /// Each VPRecipe belongs to a single VPBasicBlock. VPBasicBlock *Parent = nullptr; public: - /// An enumeration for keeping track of the concrete subclass of VPRecipeBase - /// that is actually instantiated. Values of this enumeration are kept in the - /// SubclassID field of the VPRecipeBase objects. They are used for concrete - /// type identification. - using VPRecipeTy = enum { - VPBlendSC, - VPBranchOnMaskSC, - VPInstructionSC, - VPInterleaveSC, - VPPredInstPHISC, - VPReplicateSC, - VPWidenCallSC, - VPWidenCanonicalIVSC, - VPWidenGEPSC, - VPWidenIntOrFpInductionSC, - VPWidenMemoryInstructionSC, - VPWidenPHISC, - VPWidenSC, - VPWidenSelectSC - }; - - VPRecipeBase(const unsigned char SC) : SubclassID(SC) {} + VPRecipeBase(const unsigned char SC) : VPDef(SC) {} virtual ~VPRecipeBase() = default; - /// \return an ID for the concrete type of this object. - /// This is used to implement the classof checks. This should not be used - /// for any other purpose, as the values may change as LLVM evolves. - unsigned getVPRecipeID() const { return SubclassID; } - /// \return the VPBasicBlock which this VPRecipe belongs to. VPBasicBlock *getParent() { return Parent; } const VPBasicBlock *getParent() const { return Parent; } @@ -645,10 +659,6 @@ public: /// this VPRecipe, thereby "executing" the VPlan. virtual void execute(struct VPTransformState &State) = 0; - /// Each recipe prints itself. 
- virtual void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const = 0; - /// Insert an unlinked recipe into a basic block immediately before /// the specified recipe. void insertBefore(VPRecipeBase *InsertPos); @@ -661,6 +671,11 @@ public: /// the VPBasicBlock that MovePos lives in, right after MovePos. void moveAfter(VPRecipeBase *MovePos); + /// Unlink this recipe and insert into BB before I. + /// + /// \pre I is a valid iterator into BB. + void moveBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator I); + /// This method unlinks 'this' from the containing basic block, but does not /// delete it. void removeFromParent(); @@ -669,13 +684,46 @@ public: /// /// \returns an iterator pointing to the element after the erased one iplist<VPRecipeBase>::iterator eraseFromParent(); + + /// Returns a pointer to a VPUser, if the recipe inherits from VPUser or + /// nullptr otherwise. + VPUser *toVPUser(); + + /// Returns the underlying instruction, if the recipe is a VPValue or nullptr + /// otherwise. + Instruction *getUnderlyingInstr() { + return cast<Instruction>(getVPValue()->getUnderlyingValue()); + } + const Instruction *getUnderlyingInstr() const { + return cast<Instruction>(getVPValue()->getUnderlyingValue()); + } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + // All VPDefs are also VPRecipeBases. 
+ return true; + } }; +inline bool VPUser::classof(const VPDef *Def) { + return Def->getVPDefID() == VPRecipeBase::VPInstructionSC || + Def->getVPDefID() == VPRecipeBase::VPWidenSC || + Def->getVPDefID() == VPRecipeBase::VPWidenCallSC || + Def->getVPDefID() == VPRecipeBase::VPWidenSelectSC || + Def->getVPDefID() == VPRecipeBase::VPWidenGEPSC || + Def->getVPDefID() == VPRecipeBase::VPBlendSC || + Def->getVPDefID() == VPRecipeBase::VPInterleaveSC || + Def->getVPDefID() == VPRecipeBase::VPReplicateSC || + Def->getVPDefID() == VPRecipeBase::VPReductionSC || + Def->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC || + Def->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC; +} + /// This is a concrete Recipe that models a single VPlan-level instruction. /// While as any Recipe it may generate a sequence of IR instructions when /// executed, these instructions would always form a single-def expression as /// the VPInstruction is also a single def-use vertex. -class VPInstruction : public VPUser, public VPRecipeBase { +class VPInstruction : public VPRecipeBase, public VPUser, public VPValue { friend class VPlanSlp; public: @@ -697,23 +745,26 @@ private: void generateInstruction(VPTransformState &State, unsigned Part); protected: - Instruction *getUnderlyingInstr() { - return cast_or_null<Instruction>(getUnderlyingValue()); - } - void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } public: VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands) - : VPUser(VPValue::VPInstructionSC, Operands), - VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode) {} + : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser(Operands), + VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} + + VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands) + : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser({}), + VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) { + for (auto *I : Operands) + addOperand(I->getVPValue()); + } 
VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPValue *V) { - return V->getVPValueID() == VPValue::VPInstructionSC; + return V->getVPValueID() == VPValue::VPVInstructionSC; } VPInstruction *clone() const { @@ -722,8 +773,8 @@ public: } /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *R) { - return R->getVPRecipeID() == VPRecipeBase::VPInstructionSC; + static inline bool classof(const VPDef *R) { + return R->getVPDefID() == VPRecipeBase::VPInstructionSC; } unsigned getOpcode() const { return Opcode; } @@ -733,13 +784,12 @@ public: /// provided. void execute(VPTransformState &State) override; - /// Print the Recipe. + /// Print the VPInstruction to \p O. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; - /// Print the VPInstruction. - void print(raw_ostream &O) const; - void print(raw_ostream &O, VPSlotTracker &SlotTracker) const; + /// Print the VPInstruction to dbgs() (for debugging). + void dump() const; /// Return true if this instruction may modify memory. bool mayWriteToMemory() const { @@ -773,23 +823,21 @@ public: /// VPWidenRecipe is a recipe for producing a copy of vector type its /// ingredient. This recipe covers most of the traditional vectorization cases /// where each ingredient transforms into a vectorized version of itself. -class VPWidenRecipe : public VPRecipeBase { - /// Hold the instruction to be widened. - Instruction &Ingredient; - - /// Hold VPValues for the operands of the ingredient. 
- VPUser User; - +class VPWidenRecipe : public VPRecipeBase, public VPValue, public VPUser { public: template <typename IterT> VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands) - : VPRecipeBase(VPWidenSC), Ingredient(I), User(Operands) {} + : VPRecipeBase(VPRecipeBase::VPWidenSC), + VPValue(VPValue::VPVWidenSC, &I, this), VPUser(Operands) {} ~VPWidenRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPWidenSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenSC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVWidenSC; } /// Produce widened copies of all Ingredients. @@ -801,23 +849,19 @@ public: }; /// A recipe for widening Call instructions. -class VPWidenCallRecipe : public VPRecipeBase { - /// Hold the call to be widened. - CallInst &Ingredient; - - /// Hold VPValues for the arguments of the call. - VPUser User; +class VPWidenCallRecipe : public VPRecipeBase, public VPUser, public VPValue { public: template <typename IterT> VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments) - : VPRecipeBase(VPWidenCallSC), Ingredient(I), User(CallArguments) {} + : VPRecipeBase(VPRecipeBase::VPWidenCallSC), VPUser(CallArguments), + VPValue(VPValue::VPVWidenCallSC, &I, this) {} ~VPWidenCallRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPWidenCallSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenCallSC; } /// Produce a widened version of the call instruction. @@ -829,13 +873,7 @@ public: }; /// A recipe for widening select instructions. 
-class VPWidenSelectRecipe : public VPRecipeBase { -private: - /// Hold the select to be widened. - SelectInst &Ingredient; - - /// Hold VPValues for the operands of the select. - VPUser User; +class VPWidenSelectRecipe : public VPRecipeBase, public VPUser, public VPValue { /// Is the condition of the select loop invariant? bool InvariantCond; @@ -844,14 +882,15 @@ public: template <typename IterT> VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands, bool InvariantCond) - : VPRecipeBase(VPWidenSelectSC), Ingredient(I), User(Operands), + : VPRecipeBase(VPRecipeBase::VPWidenSelectSC), VPUser(Operands), + VPValue(VPValue::VPVWidenSelectSC, &I, this), InvariantCond(InvariantCond) {} ~VPWidenSelectRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenSelectSC; } /// Produce a widened version of the select instruction. @@ -863,20 +902,24 @@ public: }; /// A recipe for handling GEP instructions. -class VPWidenGEPRecipe : public VPRecipeBase { - GetElementPtrInst *GEP; - - /// Hold VPValues for the base and indices of the GEP. 
- VPUser User; - +class VPWidenGEPRecipe : public VPRecipeBase, + public VPUser, + public VPValue { bool IsPtrLoopInvariant; SmallBitVector IsIndexLoopInvariant; public: template <typename IterT> + VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands) + : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands), + VPValue(VPWidenGEPSC, GEP, this), + IsIndexLoopInvariant(GEP->getNumIndices(), false) {} + + template <typename IterT> VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands, Loop *OrigLoop) - : VPRecipeBase(VPWidenGEPSC), GEP(GEP), User(Operands), + : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands), + VPValue(VPValue::VPVWidenGEPSC, GEP, this), IsIndexLoopInvariant(GEP->getNumIndices(), false) { IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand()); for (auto Index : enumerate(GEP->indices())) @@ -886,8 +929,8 @@ public: ~VPWidenGEPRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPWidenGEPSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenGEPSC; } /// Generate the gep nodes. @@ -900,18 +943,25 @@ public: /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. 
-class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { +class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPUser { PHINode *IV; TruncInst *Trunc; public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, TruncInst *Trunc = nullptr) - : VPRecipeBase(VPWidenIntOrFpInductionSC), IV(IV), Trunc(Trunc) {} + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + TruncInst *Trunc = nullptr) + : VPRecipeBase(VPWidenIntOrFpInductionSC), VPUser({Start}), IV(IV), + Trunc(Trunc) { + if (Trunc) + new VPValue(Trunc, this); + else + new VPValue(IV, this); + } ~VPWidenIntOrFpInductionRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPWidenIntOrFpInductionSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC; } /// Generate the vectorized and scalarized versions of the phi node as @@ -921,19 +971,38 @@ public: /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; + + /// Returns the start value of the induction. + VPValue *getStartValue() { return getOperand(0); } }; /// A recipe for handling all phi nodes except for integer and FP inductions. -class VPWidenPHIRecipe : public VPRecipeBase { +/// For reduction PHIs, RdxDesc must point to the corresponding recurrence +/// descriptor and the start value is the first operand of the recipe. +class VPWidenPHIRecipe : public VPRecipeBase, public VPUser { PHINode *Phi; + /// Descriptor for a reduction PHI. + RecurrenceDescriptor *RdxDesc = nullptr; + public: - VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {} + /// Create a new VPWidenPHIRecipe for the reduction \p Phi described by \p + /// RdxDesc. 
+ VPWidenPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, VPValue &Start) + : VPWidenPHIRecipe(Phi) { + this->RdxDesc = &RdxDesc; + addOperand(&Start); + } + + /// Create a VPWidenPHIRecipe for \p Phi + VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) { + new VPValue(Phi, this); + } ~VPWidenPHIRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPWidenPHISC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenPHISC; } /// Generate the phi/select nodes. @@ -942,21 +1011,25 @@ public: /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; + + /// Returns the start value of the phi, if it is a reduction. + VPValue *getStartValue() { + return getNumOperands() == 0 ? nullptr : getOperand(0); + } }; /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. -class VPBlendRecipe : public VPRecipeBase { +class VPBlendRecipe : public VPRecipeBase, public VPUser { PHINode *Phi; +public: /// The blend operation is a User of the incoming values and of their /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value /// might be incoming with a full mask for which there is no VPValue. - VPUser User; - -public: VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands) - : VPRecipeBase(VPBlendSC), Phi(Phi), User(Operands) { + : VPRecipeBase(VPBlendSC), VPUser(Operands), Phi(Phi) { + new VPValue(Phi, this); assert(Operands.size() > 0 && ((Operands.size() == 1) || (Operands.size() % 2 == 0)) && "Expected either a single incoming value or a positive even number " @@ -964,23 +1037,19 @@ public: } /// Method to support type inquiry through isa, cast, and dyn_cast. 
- static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPBlendSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPBlendSC; } /// Return the number of incoming values, taking into account that a single /// incoming value has no mask. - unsigned getNumIncomingValues() const { - return (User.getNumOperands() + 1) / 2; - } + unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; } /// Return incoming value number \p Idx. - VPValue *getIncomingValue(unsigned Idx) const { - return User.getOperand(Idx * 2); - } + VPValue *getIncomingValue(unsigned Idx) const { return getOperand(Idx * 2); } /// Return mask number \p Idx. - VPValue *getMask(unsigned Idx) const { return User.getOperand(Idx * 2 + 1); } + VPValue *getMask(unsigned Idx) const { return getOperand(Idx * 2 + 1); } /// Generate the phi/select nodes. void execute(VPTransformState &State) override; @@ -991,35 +1060,58 @@ public: }; /// VPInterleaveRecipe is a recipe for transforming an interleave group of load -/// or stores into one wide load/store and shuffles. -class VPInterleaveRecipe : public VPRecipeBase { +/// or stores into one wide load/store and shuffles. The first operand of a +/// VPInterleave recipe is the address, followed by the stored values, followed +/// by an optional mask. 
+class VPInterleaveRecipe : public VPRecipeBase, public VPUser { const InterleaveGroup<Instruction> *IG; - VPUser User; + + bool HasMask = false; public: VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, - VPValue *Mask) - : VPRecipeBase(VPInterleaveSC), IG(IG), User({Addr}) { - if (Mask) - User.addOperand(Mask); + ArrayRef<VPValue *> StoredValues, VPValue *Mask) + : VPRecipeBase(VPInterleaveSC), VPUser(Addr), IG(IG) { + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (Instruction *I = IG->getMember(i)) { + if (I->getType()->isVoidTy()) + continue; + new VPValue(I, this); + } + + for (auto *SV : StoredValues) + addOperand(SV); + if (Mask) { + HasMask = true; + addOperand(Mask); + } } ~VPInterleaveRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPInterleaveSC; } /// Return the address accessed by this recipe. VPValue *getAddr() const { - return User.getOperand(0); // Address is the 1st, mandatory operand. + return getOperand(0); // Address is the 1st, mandatory operand. } /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { // Mask is optional and therefore the last, currently 2nd operand. - return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr; + return HasMask ? getOperand(getNumOperands() - 1) : nullptr; + } + + /// Return the VPValues stored by this interleave group. If it is a load + /// interleave group, return an empty ArrayRef. + ArrayRef<VPValue *> getStoredValues() const { + // The first operand is the address, followed by the stored values, followed + // by an optional mask. + return ArrayRef<VPValue *>(op_begin(), getNumOperands()) + .slice(1, getNumOperands() - (HasMask ? 
2 : 1)); } /// Generate the wide load or store, and shuffles. @@ -1032,17 +1124,61 @@ public: const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; } }; +/// A recipe to represent inloop reduction operations, performing a reduction on +/// a vector operand into a scalar value, and adding the result to a chain. +/// The Operands are {ChainOp, VecOp, [Condition]}. +class VPReductionRecipe : public VPRecipeBase, public VPUser, public VPValue { + /// The recurrence decriptor for the reduction in question. + RecurrenceDescriptor *RdxDesc; + /// Fast math flags to use for the resulting reduction operation. + bool NoNaN; + /// Pointer to the TTI, needed to create the target reduction + const TargetTransformInfo *TTI; + +public: + VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp, + VPValue *VecOp, VPValue *CondOp, bool NoNaN, + const TargetTransformInfo *TTI) + : VPRecipeBase(VPRecipeBase::VPReductionSC), VPUser({ChainOp, VecOp}), + VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), NoNaN(NoNaN), + TTI(TTI) { + if (CondOp) + addOperand(CondOp); + } + + ~VPReductionRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVReductionSC; + } + + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPReductionSC; + } + + /// Generate the reduction in the loop + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; + + /// The VPValue of the scalar Chain being accumulated. + VPValue *getChainOp() const { return getOperand(0); } + /// The VPValue of the vector value to be reduced. + VPValue *getVecOp() const { return getOperand(1); } + /// The VPValue of the condition for the block. + VPValue *getCondOp() const { + return getNumOperands() > 2 ? 
getOperand(2) : nullptr; + } +}; + /// VPReplicateRecipe replicates a given instruction producing multiple scalar /// copies of the original scalar type, one per lane, instead of producing a /// single copy of widened type for all lanes. If the instruction is known to be /// uniform only one copy, per lane zero, will be generated. -class VPReplicateRecipe : public VPRecipeBase { - /// The instruction being replicated. - Instruction *Ingredient; - - /// Hold VPValues for the operands of the ingredient. - VPUser User; - +class VPReplicateRecipe : public VPRecipeBase, public VPUser, public VPValue { /// Indicator if only a single replica per lane is needed. bool IsUniform; @@ -1056,8 +1192,9 @@ public: template <typename IterT> VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands, bool IsUniform, bool IsPredicated = false) - : VPRecipeBase(VPReplicateSC), Ingredient(I), User(Operands), - IsUniform(IsUniform), IsPredicated(IsPredicated) { + : VPRecipeBase(VPReplicateSC), VPUser(Operands), + VPValue(VPVReplicateSC, I, this), IsUniform(IsUniform), + IsPredicated(IsPredicated) { // Retain the previous behavior of predicateInstructions(), where an // insert-element of a predicated instruction got hoisted into the // predicated basic block iff it was its only user. This is achieved by @@ -1069,8 +1206,12 @@ public: ~VPReplicateRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPReplicateSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPReplicateSC; + } + + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVReplicateSC; } /// Generate replicas of the desired Ingredient. Replicas will be generated @@ -1083,21 +1224,21 @@ public: /// Print the recipe. 
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; + + bool isUniform() const { return IsUniform; } }; /// A recipe for generating conditional branches on the bits of a mask. -class VPBranchOnMaskRecipe : public VPRecipeBase { - VPUser User; - +class VPBranchOnMaskRecipe : public VPRecipeBase, public VPUser { public: VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) { if (BlockInMask) // nullptr means all-one mask. - User.addOperand(BlockInMask); + addOperand(BlockInMask); } /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPBranchOnMaskSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC; } /// Generate the extraction of the appropriate bit from the block mask and the @@ -1109,7 +1250,7 @@ public: VPSlotTracker &SlotTracker) const override { O << " +\n" << Indent << "\"BRANCH-ON-MASK "; if (VPValue *Mask = getMask()) - Mask->print(O, SlotTracker); + Mask->printAsOperand(O, SlotTracker); else O << " All-One"; O << "\\l\""; @@ -1118,9 +1259,9 @@ public: /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { - assert(User.getNumOperands() <= 1 && "should have either 0 or 1 operands"); + assert(getNumOperands() <= 1 && "should have either 0 or 1 operands"); // Mask is optional. - return User.getNumOperands() == 1 ? User.getOperand(0) : nullptr; + return getNumOperands() == 1 ? getOperand(0) : nullptr; } }; @@ -1129,19 +1270,20 @@ public: /// order to merge values that are set under such a branch and feed their uses. /// The phi nodes can be scalar or vector depending on the users of the value. /// This recipe works in concert with VPBranchOnMaskRecipe. 
-class VPPredInstPHIRecipe : public VPRecipeBase { - Instruction *PredInst; +class VPPredInstPHIRecipe : public VPRecipeBase, public VPUser { public: /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi /// nodes after merging back from a Branch-on-Mask. - VPPredInstPHIRecipe(Instruction *PredInst) - : VPRecipeBase(VPPredInstPHISC), PredInst(PredInst) {} + VPPredInstPHIRecipe(VPValue *PredV) + : VPRecipeBase(VPPredInstPHISC), VPUser(PredV) { + new VPValue(PredV->getUnderlyingValue(), this); + } ~VPPredInstPHIRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPPredInstPHISC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPPredInstPHISC; } /// Generates phi nodes for live-outs as needed to retain SSA form. @@ -1158,56 +1300,59 @@ public: /// - For store: Address, stored value, optional mask /// TODO: We currently execute only per-part unless a specific instance is /// provided. -class VPWidenMemoryInstructionRecipe : public VPRecipeBase { - Instruction &Instr; - VPUser User; +class VPWidenMemoryInstructionRecipe : public VPRecipeBase, + public VPUser { + Instruction &Ingredient; void setMask(VPValue *Mask) { if (!Mask) return; - User.addOperand(Mask); + addOperand(Mask); } bool isMasked() const { - return (isa<LoadInst>(Instr) && User.getNumOperands() == 2) || - (isa<StoreInst>(Instr) && User.getNumOperands() == 3); + return isStore() ? 
getNumOperands() == 3 : getNumOperands() == 2; } public: VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask) - : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Load), User({Addr}) { + : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr}), + Ingredient(Load) { + new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this); setMask(Mask); } VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredValue, VPValue *Mask) - : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Store), - User({Addr, StoredValue}) { + : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr, StoredValue}), + Ingredient(Store) { setMask(Mask); } /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC; } /// Return the address accessed by this recipe. VPValue *getAddr() const { - return User.getOperand(0); // Address is the 1st, mandatory operand. + return getOperand(0); // Address is the 1st, mandatory operand. } /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { // Mask is optional and therefore the last operand. - return isMasked() ? User.getOperand(User.getNumOperands() - 1) : nullptr; + return isMasked() ? getOperand(getNumOperands() - 1) : nullptr; } + /// Returns true if this recipe is a store. + bool isStore() const { return isa<StoreInst>(Ingredient); } + /// Return the address accessed by this recipe. VPValue *getStoredValue() const { - assert(isa<StoreInst>(Instr) && - "Stored value only available for store instructions"); - return User.getOperand(1); // Stored value is the 2nd, mandatory operand. 
+ assert(isStore() && "Stored value only available for store instructions"); + return getOperand(1); // Stored value is the 2nd, mandatory operand. } /// Generate the wide load/store. @@ -1220,21 +1365,16 @@ public: /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase { - /// A VPValue representing the canonical vector IV. - VPValue Val; - public: - VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) {} - ~VPWidenCanonicalIVRecipe() override = default; + VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) { + new VPValue(nullptr, this); + } - /// Return the VPValue representing the canonical vector induction variable of - /// the vector loop. - const VPValue *getVPValue() const { return &Val; } - VPValue *getVPValue() { return &Val; } + ~VPWidenCanonicalIVRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPWidenCanonicalIVSC; + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC; } /// Generate a canonical vector induction variable of the vector loop, with @@ -1321,6 +1461,11 @@ public: /// this VPBasicBlock, thereby "executing" the VPlan. void execute(struct VPTransformState *State) override; + /// Return the position of the first non-phi node recipe in the block. + iterator getFirstNonPhi(); + + void dropAllReferences(VPValue *NewValue) override; + private: /// Create an IR BasicBlock to hold the output instructions generated by this /// VPBasicBlock, and return it. Update the CFGState accordingly. 
@@ -1361,8 +1506,11 @@ public: IsReplicator(IsReplicator) {} ~VPRegionBlock() override { - if (Entry) + if (Entry) { + VPValue DummyValue; + Entry->dropAllReferences(&DummyValue); deleteCFG(Entry); + } } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1407,6 +1555,8 @@ public: /// The method which generates the output IR instructions that correspond to /// this VPRegionBlock, thereby "executing" the VPlan. void execute(struct VPTransformState *State) override; + + void dropAllReferences(VPValue *NewValue) override; }; //===----------------------------------------------------------------------===// @@ -1544,7 +1694,7 @@ class VPlan { VPBlockBase *Entry; /// Holds the VFs applicable to this VPlan. - SmallSet<unsigned, 2> VFs; + SmallSetVector<ElementCount, 2> VFs; /// Holds the name of the VPlan, for printing. std::string Name; @@ -1564,6 +1714,10 @@ class VPlan { /// VPlan. Value2VPValueTy Value2VPValue; + /// Contains all VPValues that been allocated by addVPValue directly and need + /// to be free when the plan's destructor is called. + SmallVector<VPValue *, 16> VPValuesToFree; + /// Holds the VPLoopInfo analysis for this VPlan. 
VPLoopInfo VPLInfo; @@ -1577,10 +1731,15 @@ public: } ~VPlan() { - if (Entry) + if (Entry) { + VPValue DummyValue; + for (VPBlockBase *Block : depth_first(Entry)) + Block->dropAllReferences(&DummyValue); + VPBlockBase::deleteCFG(Entry); - for (auto &MapEntry : Value2VPValue) - delete MapEntry.second; + } + for (VPValue *VPV : VPValuesToFree) + delete VPV; if (BackedgeTakenCount) delete BackedgeTakenCount; for (VPValue *Def : VPExternalDefs) @@ -1608,9 +1767,9 @@ public: return BackedgeTakenCount; } - void addVF(unsigned VF) { VFs.insert(VF); } + void addVF(ElementCount VF) { VFs.insert(VF); } - bool hasVF(unsigned VF) { return VFs.count(VF); } + bool hasVF(ElementCount VF) { return VFs.count(VF); } const std::string &getName() const { return Name; } @@ -1630,7 +1789,15 @@ public: void addVPValue(Value *V) { assert(V && "Trying to add a null Value to VPlan"); assert(!Value2VPValue.count(V) && "Value already exists in VPlan"); - Value2VPValue[V] = new VPValue(V); + VPValue *VPV = new VPValue(V); + Value2VPValue[V] = VPV; + VPValuesToFree.push_back(VPV); + } + + void addVPValue(Value *V, VPValue *VPV) { + assert(V && "Trying to add a null Value to VPlan"); + assert(!Value2VPValue.count(V) && "Value already exists in VPlan"); + Value2VPValue[V] = VPV; } VPValue *getVPValue(Value *V) { @@ -1646,6 +1813,8 @@ public: return getVPValue(V); } + void removeVPValueFor(Value *V) { Value2VPValue.erase(V); } + /// Return the VPLoopInfo analysis for this VPlan. 
VPLoopInfo &getVPLoopInfo() { return VPLInfo; } const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } @@ -1723,13 +1892,13 @@ private: void dump(); - static void printAsIngredient(raw_ostream &O, Value *V); + static void printAsIngredient(raw_ostream &O, const Value *V); }; struct VPlanIngredient { - Value *V; + const Value *V; - VPlanIngredient(Value *V) : V(V) {} + VPlanIngredient(const Value *V) : V(V) {} }; inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { @@ -1879,9 +2048,7 @@ public: /// \returns nullptr if doesn't have such group. InterleaveGroup<VPInstruction> * getInterleaveGroup(VPInstruction *Instr) const { - if (InterleaveGroupMap.count(Instr)) - return InterleaveGroupMap.find(Instr)->second; - return nullptr; + return InterleaveGroupMap.lookup(Instr); } }; @@ -1965,10 +2132,7 @@ class VPlanSlp { public: VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {} - ~VPlanSlp() { - for (auto &KV : BundleToCombined) - delete KV.second; - } + ~VPlanSlp() = default; /// Tries to build an SLP tree rooted at \p Operands and returns a /// VPInstruction combining \p Operands, if they can be combined. diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index 7a80f3ff80a5..ac3b3505dc34 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -191,7 +191,7 @@ void VPlanPredicator::predicateRegionRec(VPRegionBlock *Region) { // Generate edge predicates and append them to the block predicate. RPO is // necessary since the predecessor blocks' block predicate needs to be set // before the current block's block predicate can be computed. - for (VPBlockBase *Block : make_range(RPOT.begin(), RPOT.end())) { + for (VPBlockBase *Block : RPOT) { // TODO: Handle nested regions once we start generating the same. 
assert(!isa<VPRegionBlock>(Block) && "Nested region not expected"); createOrPropagatePredicates(Block, Region); @@ -208,7 +208,7 @@ void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) { ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry()); VPBlockBase *PrevBlock = nullptr; - for (VPBlockBase *CurrBlock : make_range(RPOT.begin(), RPOT.end())) { + for (VPBlockBase *CurrBlock : RPOT) { // TODO: Handle nested regions once we start generating the same. assert(!isa<VPRegionBlock>(CurrBlock) && "Nested region not expected"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index 9019ed15ec5f..6f21bf44291a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -124,7 +124,7 @@ bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const { for (auto &I : *Parent) { auto *VPI = cast<VPInstruction>(&I); if (VPI->getOpcode() == Instruction::Load && - std::find(Operands.begin(), Operands.end(), VPI) != Operands.end()) + llvm::is_contained(Operands, VPI)) LoadsSeen++; if (LoadsSeen == Operands.size()) @@ -161,7 +161,8 @@ static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values, unsigned OperandIndex) { SmallVector<VPValue *, 4> Operands; for (VPValue *V : Values) { - auto *U = cast<VPUser>(V); + // Currently we only support VPInstructions. + auto *U = cast<VPInstruction>(V); Operands.push_back(U->getOperand(OperandIndex)); } return Operands; @@ -222,18 +223,20 @@ static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B, /// Traverses and compares operands of V1 and V2 to MaxLevel. static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel, VPInterleavedAccessInfo &IAI) { - if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2)) + auto *I1 = dyn_cast<VPInstruction>(V1); + auto *I2 = dyn_cast<VPInstruction>(V2); + // Currently we only support VPInstructions. 
+ if (!I1 || !I2) return 0; if (MaxLevel == 0) - return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1), - cast<VPInstruction>(V2), IAI); + return (unsigned)areConsecutiveOrMatch(I1, I2, IAI); unsigned Score = 0; - for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I) - for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J) - Score += getLAScore(cast<VPUser>(V1)->getOperand(I), - cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI); + for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I) + for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J) + Score += + getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI); return Score; } @@ -463,8 +466,8 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) { auto *VPI = new VPInstruction(Opcode, CombinedOperands); VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr()); - LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs()); - cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " + << *cast<VPInstruction>(Values[0]) << "\n"); addCombined(Values, VPI); return VPI; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 3a4872a72122..1a54603faf22 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -48,6 +48,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes( VPInstruction *VPInst = cast<VPInstruction>(Ingredient); Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue()); if (DeadInstructions.count(Inst)) { + VPValue DummyValue; + VPInst->replaceAllUsesWith(&DummyValue); Ingredient->eraseFromParent(); continue; } @@ -66,7 +68,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes( InductionDescriptor II = Inductions.lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction 
|| II.getKind() == InductionDescriptor::IK_FpInduction) { - NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi); + VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); + NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start); } else NewRecipe = new VPWidenPHIRecipe(Phi); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { @@ -77,6 +80,11 @@ void VPlanTransforms::VPInstructionsToVPRecipes( new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands())); NewRecipe->insertBefore(Ingredient); + if (NewRecipe->getNumDefinedValues() == 1) + VPInst->replaceAllUsesWith(NewRecipe->getVPValue()); + else + assert(NewRecipe->getNumDefinedValues() == 0 && + "Only recpies with zero or one defined values expected"); Ingredient->eraseFromParent(); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index f73505d0279a..ed572ca36627 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -10,9 +10,9 @@ /// This file contains the declarations of the entities induced by Vectorization /// Plans, e.g. the instructions the VPlan intends to generate if executed. /// VPlan models the following entities: -/// VPValue -/// |-- VPUser -/// | |-- VPInstruction +/// VPValue VPUser VPDef +/// | | +/// VPInstruction /// These are documented in docs/VectorizationPlan.rst. /// //===----------------------------------------------------------------------===// @@ -21,7 +21,9 @@ #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/TinyPtrVector.h" #include "llvm/ADT/iterator_range.h" namespace llvm { @@ -29,8 +31,11 @@ namespace llvm { // Forward declarations. 
class raw_ostream; class Value; +class VPDef; class VPSlotTracker; class VPUser; +class VPRecipeBase; +class VPWidenMemoryInstructionRecipe; // This is the base class of the VPlan Def/Use graph, used for modeling the data // flow into, within and out of the VPlan. VPValues can stand for live-ins @@ -38,10 +43,14 @@ class VPUser; // and live-outs which the VPlan will need to fix accordingly. class VPValue { friend class VPBuilder; + friend class VPDef; + friend class VPInstruction; friend struct VPlanTransforms; friend class VPBasicBlock; friend class VPInterleavedAccessInfo; friend class VPSlotTracker; + friend class VPRecipeBase; + friend class VPWidenMemoryInstructionRecipe; const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -51,8 +60,11 @@ protected: // Hold the underlying Value, if any, attached to this VPValue. Value *UnderlyingVal; - VPValue(const unsigned char SC, Value *UV = nullptr) - : SubclassID(SC), UnderlyingVal(UV) {} + /// Pointer to the VPDef that defines this VPValue. If it is nullptr, the + /// VPValue is not defined by any recipe modeled in VPlan. + VPDef *Def; + + VPValue(const unsigned char SC, Value *UV = nullptr, VPDef *Def = nullptr); // DESIGN PRINCIPLE: Access to the underlying IR must be strictly limited to // the front-end and back-end of VPlan so that the middle-end is as @@ -61,10 +73,6 @@ protected: // for multiple underlying IRs (Polly?) by providing a new VPlan front-end, // back-end and analysis information for the new IR. - /// Return the underlying Value attached to this VPValue. - Value *getUnderlyingValue() { return UnderlyingVal; } - const Value *getUnderlyingValue() const { return UnderlyingVal; } - // Set \p Val as the underlying Value of this VPValue. void setUnderlyingValue(Value *Val) { assert(!UnderlyingVal && "Underlying Value is already set."); @@ -72,16 +80,33 @@ protected: } public: + /// Return the underlying Value attached to this VPValue. 
+ Value *getUnderlyingValue() { return UnderlyingVal; } + const Value *getUnderlyingValue() const { return UnderlyingVal; } + /// An enumeration for keeping track of the concrete subclass of VPValue that /// are actually instantiated. Values of this enumeration are kept in the /// SubclassID field of the VPValue objects. They are used for concrete /// type identification. - enum { VPValueSC, VPUserSC, VPInstructionSC }; + enum { + VPValueSC, + VPVInstructionSC, + VPVMemoryInstructionSC, + VPVReductionSC, + VPVReplicateSC, + VPVWidenSC, + VPVWidenCallSC, + VPVWidenGEPSC, + VPVWidenSelectSC, + }; - VPValue(Value *UV = nullptr) : VPValue(VPValueSC, UV) {} + VPValue(Value *UV = nullptr, VPDef *Def = nullptr) + : VPValue(VPValueSC, UV, Def) {} VPValue(const VPValue &) = delete; VPValue &operator=(const VPValue &) = delete; + virtual ~VPValue(); + /// \return an ID for the concrete type of this object. /// This is used to implement the classof checks. This should not be used /// for any other purpose, as the values may change as LLVM evolves. @@ -90,9 +115,28 @@ public: void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const; void print(raw_ostream &OS, VPSlotTracker &Tracker) const; + /// Dump the value to stderr (for debugging). + void dump() const; + unsigned getNumUsers() const { return Users.size(); } void addUser(VPUser &User) { Users.push_back(&User); } + /// Remove a single \p User from the list of users. + void removeUser(VPUser &User) { + bool Found = false; + // The same user can be added multiple times, e.g. because the same VPValue + // is used twice by the same VPUser. Remove a single one. 
+ erase_if(Users, [&User, &Found](VPUser *Other) { + if (Found) + return false; + if (Other == &User) { + Found = true; + return true; + } + return false; + }); + } + typedef SmallVectorImpl<VPUser *>::iterator user_iterator; typedef SmallVectorImpl<VPUser *>::const_iterator const_user_iterator; typedef iterator_range<user_iterator> user_range; @@ -120,6 +164,17 @@ public: } void replaceAllUsesWith(VPValue *New); + + VPDef *getDef() { return Def; } + + /// Returns the underlying IR value, if this VPValue is defined outside the + /// scope of VPlan. Returns nullptr if the VPValue is defined by a VPDef + /// inside a VPlan. + Value *getLiveInIRValue() { + assert(!getDef() && + "VPValue is not a live-in; it is defined by a VPDef inside a VPlan"); + return getUnderlyingValue(); + } }; typedef DenseMap<Value *, VPValue *> Value2VPValueTy; @@ -129,34 +184,32 @@ raw_ostream &operator<<(raw_ostream &OS, const VPValue &V); /// This class augments VPValue with operands which provide the inverse def-use /// edges from VPValue's users to their defs. -class VPUser : public VPValue { +class VPUser { SmallVector<VPValue *, 2> Operands; protected: - VPUser(const unsigned char SC) : VPValue(SC) {} - VPUser(const unsigned char SC, ArrayRef<VPValue *> Operands) : VPValue(SC) { + /// Print the operands to \p O. 
+ void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const; + +public: + VPUser() {} + VPUser(ArrayRef<VPValue *> Operands) { for (VPValue *Operand : Operands) addOperand(Operand); } -public: - VPUser() : VPValue(VPValue::VPUserSC) {} - VPUser(ArrayRef<VPValue *> Operands) : VPUser(VPValue::VPUserSC, Operands) {} VPUser(std::initializer_list<VPValue *> Operands) : VPUser(ArrayRef<VPValue *>(Operands)) {} - template <typename IterT> - VPUser(iterator_range<IterT> Operands) : VPValue(VPValue::VPUserSC) { + template <typename IterT> VPUser(iterator_range<IterT> Operands) { for (VPValue *Operand : Operands) addOperand(Operand); } VPUser(const VPUser &) = delete; VPUser &operator=(const VPUser &) = delete; - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPValue *V) { - return V->getVPValueID() >= VPUserSC && - V->getVPValueID() <= VPInstructionSC; + virtual ~VPUser() { + for (VPValue *Op : operands()) + Op->removeUser(*this); } void addOperand(VPValue *Operand) { @@ -170,7 +223,11 @@ public: return Operands[N]; } - void setOperand(unsigned I, VPValue *New) { Operands[I] = New; } + void setOperand(unsigned I, VPValue *New) { + Operands[I]->removeUser(*this); + Operands[I] = New; + New->addUser(*this); + } typedef SmallVectorImpl<VPValue *>::iterator operand_iterator; typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator; @@ -185,7 +242,110 @@ public: const_operand_range operands() const { return const_operand_range(op_begin(), op_end()); } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *Recipe); }; + +/// This class augments a recipe with a set of VPValues defined by the recipe. +/// It allows recipes to define zero, one or multiple VPValues. A VPDef owns +/// the VPValues it defines and is responsible for deleting its defined values. 
+/// Single-value VPDefs that also inherit from VPValue must make sure to inherit +/// from VPDef before VPValue. +class VPDef { + friend class VPValue; + + /// Subclass identifier (for isa/dyn_cast). + const unsigned char SubclassID; + + /// The VPValues defined by this VPDef. + TinyPtrVector<VPValue *> DefinedValues; + + /// Add \p V as a defined value by this VPDef. + void addDefinedValue(VPValue *V) { + assert(V->getDef() == this && + "can only add VPValue already linked with this VPDef"); + DefinedValues.push_back(V); + } + + /// Remove \p V from the values defined by this VPDef. \p V must be a defined + /// value of this VPDef. + void removeDefinedValue(VPValue *V) { + assert(V->getDef() == this && + "can only remove VPValue linked with this VPDef"); + assert(is_contained(DefinedValues, V) && + "VPValue to remove must be in DefinedValues"); + erase_value(DefinedValues, V); + V->Def = nullptr; + } + +public: + /// An enumeration for keeping track of the concrete subclass of VPRecipeBase + /// that is actually instantiated. Values of this enumeration are kept in the + /// SubclassID field of the VPRecipeBase objects. They are used for concrete + /// type identification. + using VPRecipeTy = enum { + VPBlendSC, + VPBranchOnMaskSC, + VPInstructionSC, + VPInterleaveSC, + VPPredInstPHISC, + VPReductionSC, + VPReplicateSC, + VPWidenCallSC, + VPWidenCanonicalIVSC, + VPWidenGEPSC, + VPWidenIntOrFpInductionSC, + VPWidenMemoryInstructionSC, + VPWidenPHISC, + VPWidenSC, + VPWidenSelectSC + }; + + VPDef(const unsigned char SC) : SubclassID(SC) {} + + virtual ~VPDef() { + for (VPValue *D : make_early_inc_range(DefinedValues)) { + assert(D->Def == this && + "all defined VPValues should point to the containing VPDef"); + assert(D->getNumUsers() == 0 && + "all defined VPValues should have no more users"); + D->Def = nullptr; + delete D; + } + } + + /// Returns the VPValue with index \p I defined by the VPDef. 
+ VPValue *getVPValue(unsigned I = 0) { + assert(DefinedValues[I] && "defined value must be non-null"); + return DefinedValues[I]; + } + const VPValue *getVPValue(unsigned I = 0) const { + assert(DefinedValues[I] && "defined value must be non-null"); + return DefinedValues[I]; + } + + /// Returns an ArrayRef of the values defined by the VPDef. + ArrayRef<VPValue *> definedValues() { return DefinedValues; } + /// Returns an ArrayRef of the values defined by the VPDef. + ArrayRef<VPValue *> definedValues() const { return DefinedValues; } + + /// Returns the number of values defined by the VPDef. + unsigned getNumDefinedValues() const { return DefinedValues.size(); } + + /// \return an ID for the concrete type of this object. + /// This is used to implement the classof checks. This should not be used + /// for any other purpose, as the values may change as LLVM evolves. + unsigned getVPDefID() const { return SubclassID; } + + /// Dump the VPDef to stderr (for debugging). + void dump() const; + + /// Each concrete VPDef prints itself. + virtual void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const = 0; +}; + class VPlan; class VPBasicBlock; class VPRegionBlock; @@ -205,7 +365,7 @@ class VPSlotTracker { void assignSlots(const VPlan &Plan); public: - VPSlotTracker(const VPlan *Plan) { + VPSlotTracker(const VPlan *Plan = nullptr) { if (Plan) assignSlots(*Plan); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index b384c94121e9..6eec8d14de4a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -65,9 +65,7 @@ static void verifyBlocksInRegion(const VPRegionBlock *Region) { for (const VPBlockBase *Succ : Successors) { // There must be a bi-directional link between block and successor. 
const auto &SuccPreds = Succ->getPredecessors(); - assert(std::find(SuccPreds.begin(), SuccPreds.end(), VPB) != - SuccPreds.end() && - "Missing predecessor link."); + assert(llvm::is_contained(SuccPreds, VPB) && "Missing predecessor link."); (void)SuccPreds; } @@ -86,9 +84,7 @@ static void verifyBlocksInRegion(const VPRegionBlock *Region) { // There must be a bi-directional link between block and predecessor. const auto &PredSuccs = Pred->getSuccessors(); - assert(std::find(PredSuccs.begin(), PredSuccs.end(), VPB) != - PredSuccs.end() && - "Missing successor link."); + assert(llvm::is_contained(PredSuccs, VPB) && "Missing successor link."); (void)PredSuccs; } } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 64b41bf9cefa..787f146bdddc 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" @@ -33,6 +34,7 @@ using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "vector-combine" +STATISTIC(NumVecLoad, "Number of vector loads formed"); STATISTIC(NumVecCmp, "Number of vector compares formed"); STATISTIC(NumVecBO, "Number of vector binops formed"); STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed"); @@ -65,6 +67,7 @@ private: const TargetTransformInfo &TTI; const DominatorTree &DT; + bool vectorizeLoadInsert(Instruction &I); ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0, ExtractElementInst *Ext1, unsigned PreferredExtractIndex) const; @@ -88,6 +91,138 @@ static void replaceValue(Value &Old, Value &New) { New.takeName(&Old); } +bool VectorCombine::vectorizeLoadInsert(Instruction &I) { + // Match 
insert into fixed vector of scalar value. + // TODO: Handle non-zero insert index. + auto *Ty = dyn_cast<FixedVectorType>(I.getType()); + Value *Scalar; + if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) || + !Scalar->hasOneUse()) + return false; + + // Optionally match an extract from another vector. + Value *X; + bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt())); + if (!HasExtract) + X = Scalar; + + // Match source value as load of scalar or vector. + // Do not vectorize scalar load (widening) if atomic/volatile or under + // asan/hwasan/memtag/tsan. The widened load may load data from dirty regions + // or create data races non-existent in the source. + auto *Load = dyn_cast<LoadInst>(X); + if (!Load || !Load->isSimple() || !Load->hasOneUse() || + Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) || + mustSuppressSpeculation(*Load)) + return false; + + const DataLayout &DL = I.getModule()->getDataLayout(); + Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); + assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type"); + + // If original AS != Load's AS, we can't bitcast the original pointer and have + // to use Load's operand instead. Ideally we would want to strip pointer casts + // without changing AS, but there's no API to do that ATM. + unsigned AS = Load->getPointerAddressSpace(); + if (AS != SrcPtr->getType()->getPointerAddressSpace()) + SrcPtr = Load->getPointerOperand(); + + // We are potentially transforming byte-sized (8-bit) memory accesses, so make + // sure we have all of our type-based constraints in place for this target. 
+ Type *ScalarTy = Scalar->getType(); + uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); + unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); + if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 || + ScalarSize % 8 != 0) + return false; + + // Check safety of replacing the scalar load with a larger vector load. + // We use minimal alignment (maximum flexibility) because we only care about + // the dereferenceable region. When calculating cost and creating a new op, + // we may use a larger value based on alignment attributes. + unsigned MinVecNumElts = MinVectorSize / ScalarSize; + auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false); + unsigned OffsetEltIndex = 0; + Align Alignment = Load->getAlign(); + if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) { + // It is not safe to load directly from the pointer, but we can still peek + // through gep offsets and check if it safe to load from a base address with + // updated alignment. If it is, we can shuffle the element(s) into place + // after loading. + unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType()); + APInt Offset(OffsetBitWidth, 0); + SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); + + // We want to shuffle the result down from a high element of a vector, so + // the offset must be positive. + if (Offset.isNegative()) + return false; + + // The offset must be a multiple of the scalar element to shuffle cleanly + // in the element's size. + uint64_t ScalarSizeInBytes = ScalarSize / 8; + if (Offset.urem(ScalarSizeInBytes) != 0) + return false; + + // If we load MinVecNumElts, will our target element still be loaded? + OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue(); + if (OffsetEltIndex >= MinVecNumElts) + return false; + + if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) + return false; + + // Update alignment with offset value. 
Note that the offset could be negated + // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but + // negation does not change the result of the alignment calculation. + Alignment = commonAlignment(Alignment, Offset.getZExtValue()); + } + + // Original pattern: insertelt undef, load [free casts of] PtrOp, 0 + // Use the greater of the alignment on the load or its source pointer. + Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment); + Type *LoadTy = Load->getType(); + InstructionCost OldCost = + TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS); + APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0); + OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts, + /* Insert */ true, HasExtract); + + // New pattern: load VecPtr + InstructionCost NewCost = + TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS); + // Optionally, we are shuffling the loaded vector element(s) into place. + if (OffsetEltIndex) + NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy); + + // We can aggressively convert to the vector form because the backend can + // invert this transform if it does not result in a performance win. + if (OldCost < NewCost || !NewCost.isValid()) + return false; + + // It is safe and potentially profitable to load a vector directly: + // inselt undef, load Scalar, 0 --> load VecPtr + IRBuilder<> Builder(Load); + Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS)); + Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment); + + // Set everything but element 0 to undef to prevent poison from propagating + // from the extra loaded memory. This will also optionally shrink/grow the + // vector from the loaded size to the output size. + // We assume this operation has no cost in codegen if there was no offset. + // Note that we could use freeze to avoid poison problems, but then we might + // still need a shuffle to change the vector size. 
+ unsigned OutputNumElts = Ty->getNumElements(); + SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem); + assert(OffsetEltIndex < MinVecNumElts && "Address offset too big"); + Mask[0] = OffsetEltIndex; + VecLd = Builder.CreateShuffleVector(VecLd, Mask); + + replaceValue(I, *VecLd); + ++NumVecLoad; + return true; +} + /// Determine which, if any, of the inputs should be replaced by a shuffle /// followed by extract from a different index. ExtractElementInst *VectorCombine::getShuffleExtract( @@ -106,8 +241,14 @@ ExtractElementInst *VectorCombine::getShuffleExtract( Type *VecTy = Ext0->getVectorOperand()->getType(); assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types"); - int Cost0 = TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); - int Cost1 = TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1); + InstructionCost Cost0 = + TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); + InstructionCost Cost1 = + TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1); + + // If both costs are invalid no shuffle is needed + if (!Cost0.isValid() && !Cost1.isValid()) + return nullptr; // We are extracting from 2 different indexes, so one operand must be shuffled // before performing a vector operation and/or extract. The more expensive @@ -143,7 +284,7 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, "Expected constant extract indexes"); Type *ScalarTy = Ext0->getType(); auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType()); - int ScalarOpCost, VectorOpCost; + InstructionCost ScalarOpCost, VectorOpCost; // Get cost estimates for scalar and vector versions of the operation. 
bool IsBinOp = Instruction::isBinaryOp(Opcode); @@ -164,9 +305,9 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue(); unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue(); - int Extract0Cost = + InstructionCost Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index); - int Extract1Cost = + InstructionCost Extract1Cost = TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index); // A more expensive extract will always be replaced by a splat shuffle. @@ -176,11 +317,11 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, // TODO: Evaluate whether that always results in lowest cost. Alternatively, // check the cost of creating a broadcast shuffle and shuffling both // operands to element 0. - int CheapExtractCost = std::min(Extract0Cost, Extract1Cost); + InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost); // Extra uses of the extracts mean that we include those costs in the // vector total because those instructions will not be eliminated. - int OldCost, NewCost; + InstructionCost OldCost, NewCost; if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) { // Handle a special case. If the 2 extracts are identical, adjust the // formulas to account for that. 
The extra use charge allows for either the @@ -231,8 +372,7 @@ static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, auto *VecTy = cast<FixedVectorType>(Vec->getType()); SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem); ShufMask[NewIndex] = OldIndex; - Value *Undef = UndefValue::get(VecTy); - return Builder.CreateShuffleVector(Vec, Undef, ShufMask, "shift"); + return Builder.CreateShuffleVector(Vec, ShufMask, "shift"); } /// Given an extract element instruction with constant index operand, shuffle @@ -366,17 +506,23 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) { m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask)))))) return false; - // Disallow non-vector casts and length-changing shuffles. + // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for + // scalable type is unknown; Second, we cannot reason if the narrowed shuffle + // mask for scalable type is a splat or not. + // 2) Disallow non-vector casts and length-changing shuffles. // TODO: We could allow any shuffle. - auto *DestTy = dyn_cast<VectorType>(I.getType()); - auto *SrcTy = cast<VectorType>(V->getType()); - if (!DestTy || I.getOperand(0)->getType() != SrcTy) + auto *DestTy = dyn_cast<FixedVectorType>(I.getType()); + auto *SrcTy = dyn_cast<FixedVectorType>(V->getType()); + if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy) return false; // The new shuffle must not cost more than the old shuffle. The bitcast is // moved ahead of the shuffle, so assume that it has the same cost as before. 
- if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy) > - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy)) + InstructionCost DestCost = + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy); + InstructionCost SrcCost = + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy); + if (DestCost > SrcCost || !DestCost.isValid()) return false; unsigned DestNumElts = DestTy->getNumElements(); @@ -399,8 +545,7 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) { // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' ++NumShufOfBitcast; Value *CastV = Builder.CreateBitCast(V, DestTy); - Value *Shuf = - Builder.CreateShuffleVector(CastV, UndefValue::get(DestTy), NewMask); + Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask); replaceValue(I, *Shuf); return true; } @@ -467,7 +612,7 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { "Unexpected types for insert element into binop or cmp"); unsigned Opcode = I.getOpcode(); - int ScalarOpCost, VectorOpCost; + InstructionCost ScalarOpCost, VectorOpCost; if (IsCmp) { ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy); VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy); @@ -478,16 +623,16 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { // Get cost estimate for the insert element. This cost will factor into // both sequences. - int InsertCost = + InstructionCost InsertCost = TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index); - int OldCost = (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + - VectorOpCost; - int NewCost = ScalarOpCost + InsertCost + - (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) + - (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost); + InstructionCost OldCost = + (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost; + InstructionCost NewCost = ScalarOpCost + InsertCost + + (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) + + (IsConst1 ? 
0 : !Ins1->hasOneUse() * InsertCost); // We want to scalarize unless the vector variant actually has lower cost. - if (OldCost < NewCost) + if (OldCost < NewCost || !NewCost.isValid()) return false; // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) --> @@ -567,7 +712,8 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { if (!VecTy) return false; - int OldCost = TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); + InstructionCost OldCost = + TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1); OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2; OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType()); @@ -578,7 +724,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0; int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1; auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType())); - int NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType()); + InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType()); NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy); NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy); @@ -587,7 +733,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { // Aggressively form vector ops if the cost is equal because the transform // may enable further optimization. // Codegen can reverse this transform (scalarize) if it was not profitable. - if (OldCost < NewCost) + if (OldCost < NewCost || !NewCost.isValid()) return false; // Create a vector constant from the 2 scalar constants. @@ -612,6 +758,10 @@ bool VectorCombine::run() { if (DisableVectorCombine) return false; + // Don't attempt vectorization if the target does not support vectors. 
+ if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true))) + return false; + bool MadeChange = false; for (BasicBlock &BB : F) { // Ignore unreachable basic blocks. @@ -625,6 +775,7 @@ bool VectorCombine::run() { if (isa<DbgInfoIntrinsic>(I)) continue; Builder.SetInsertPoint(&I); + MadeChange |= vectorizeLoadInsert(I); MadeChange |= foldExtractExtract(I); MadeChange |= foldBitcastShuf(I); MadeChange |= scalarizeBinopOrCmp(I); |
