| field | value | |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2023-12-18 20:30:12 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2024-04-06 20:11:55 +0000 |
| commit | 5f757f3ff9144b609b3c433dfd370cc6bdc191ad (patch) | |
| tree | 1b4e980b866cd26a00af34c0a653eb640bd09caf /contrib/llvm-project/llvm/lib/Transforms/Vectorize | |
| parent | 3e1c8a35f741a5d114d0ba670b15191355711fe9 (diff) | |
| parent | 312c0ed19cc5276a17bacf2120097bec4515b0f1 (diff) | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
16 files changed, 5822 insertions, 3382 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 260d7889906b..fa2459d1ca02 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -103,13 +103,11 @@ #include "llvm/Support/ModRef.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Vectorize.h" #include <algorithm> #include <cassert> #include <cstdint> #include <cstdlib> #include <iterator> -#include <limits> #include <numeric> #include <optional> #include <tuple> @@ -900,9 +898,9 @@ bool Vectorizer::vectorizeChain(Chain &C) { // Chain is in offset order, so C[0] is the instr with the lowest offset, // i.e. the root of the vector. - Value *Bitcast = Builder.CreateBitCast( - getLoadStorePointerOperand(C[0].Inst), VecTy->getPointerTo(AS)); - VecInst = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment); + VecInst = Builder.CreateAlignedLoad(VecTy, + getLoadStorePointerOperand(C[0].Inst), + Alignment); unsigned VecIdx = 0; for (const ChainElem &E : C) { @@ -976,8 +974,7 @@ bool Vectorizer::vectorizeChain(Chain &C) { // i.e. the root of the vector. VecInst = Builder.CreateAlignedStore( Vec, - Builder.CreateBitCast(getLoadStorePointerOperand(C[0].Inst), - VecTy->getPointerTo(AS)), + getLoadStorePointerOperand(C[0].Inst), Alignment); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index f923f0be6621..37a356c43e29 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -289,7 +289,7 @@ void LoopVectorizeHints::getHintsFromMetadata() { } void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { - if (!Name.startswith(Prefix())) + if (!Name.starts_with(Prefix())) return; Name = Name.substr(Prefix().size(), StringRef::npos); @@ -943,6 +943,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } } + // If we found a vectorized variant of a function, note that so LV can + // make better decisions about maximum VF. + if (CI && !VFDatabase::getMappings(*CI).empty()) + VecCallVariantsFound = true; + // Check that the instruction return type is vectorizable. // Also, we can't vectorize extractelement instructions. if ((!VectorType::isValidElementType(I.getType()) && @@ -1242,13 +1247,12 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const { bool LoopVectorizationLegality::blockCanBePredicated( BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, - SmallPtrSetImpl<const Instruction *> &MaskedOp, - SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const { + SmallPtrSetImpl<const Instruction *> &MaskedOp) const { for (Instruction &I : *BB) { // We can predicate blocks with calls to assume, as long as we drop them in // case we flatten the CFG via predication. if (match(&I, m_Intrinsic<Intrinsic::assume>())) { - ConditionalAssumes.insert(&I); + MaskedOp.insert(&I); continue; } @@ -1345,16 +1349,13 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { } // We must be able to predicate all blocks that need to be predicated. 
- if (blockNeedsPredication(BB)) { - if (!blockCanBePredicated(BB, SafePointers, MaskedOp, - ConditionalAssumes)) { - reportVectorizationFailure( - "Control flow cannot be substituted for a select", - "control flow cannot be substituted for a select", - "NoCFGForSelect", ORE, TheLoop, - BB->getTerminator()); - return false; - } + if (blockNeedsPredication(BB) && + !blockCanBePredicated(BB, SafePointers, MaskedOp)) { + reportVectorizationFailure( + "Control flow cannot be substituted for a select", + "control flow cannot be substituted for a select", "NoCFGForSelect", + ORE, TheLoop, BB->getTerminator()); + return false; } } @@ -1554,14 +1555,14 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { // The list of pointers that we can safely read and write to remains empty. SmallPtrSet<Value *, 8> SafePointers; + // Collect masked ops in temporary set first to avoid partially populating + // MaskedOp if a block cannot be predicated. SmallPtrSet<const Instruction *, 8> TmpMaskedOp; - SmallPtrSet<Instruction *, 8> TmpConditionalAssumes; // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp, - TmpConditionalAssumes)) { + if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n"); return false; } @@ -1570,9 +1571,6 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end()); - ConditionalAssumes.insert(TmpConditionalAssumes.begin(), - TmpConditionalAssumes.end()); - return true; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 13357cb06c55..577ce8000de2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -31,6 +31,7 @@ namespace llvm { class LoopInfo; +class DominatorTree; class LoopVectorizationLegality; class LoopVectorizationCostModel; class PredicatedScalarEvolution; @@ -45,13 +46,17 @@ class VPBuilder { VPBasicBlock *BB = nullptr; VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator(); + /// Insert \p VPI in BB at InsertPt if BB is set. + VPInstruction *tryInsertInstruction(VPInstruction *VPI) { + if (BB) + BB->insert(VPI, InsertPt); + return VPI; + } + VPInstruction *createInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL, const Twine &Name = "") { - VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL, Name); - if (BB) - BB->insert(Instr, InsertPt); - return Instr; + return tryInsertInstruction(new VPInstruction(Opcode, Operands, DL, Name)); } VPInstruction *createInstruction(unsigned Opcode, @@ -62,6 +67,7 @@ class VPBuilder { public: VPBuilder() = default; + VPBuilder(VPBasicBlock *InsertBB) { setInsertPoint(InsertBB); } /// Clear the insertion point: created instructions will not be inserted into /// a block. @@ -116,10 +122,11 @@ public: InsertPt = IP; } - /// Insert and return the specified instruction. - VPInstruction *insert(VPInstruction *I) const { - BB->insert(I, InsertPt); - return I; + /// This specifies that created instructions should be inserted at the + /// specified point. 
+ void setInsertPoint(VPRecipeBase *IP) { + BB = IP->getParent(); + InsertPt = IP->getIterator(); } /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as @@ -138,6 +145,13 @@ public: return createInstruction(Opcode, Operands, DL, Name); } + VPInstruction *createOverflowingOp(unsigned Opcode, + std::initializer_list<VPValue *> Operands, + VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, + DebugLoc DL, const Twine &Name = "") { + return tryInsertInstruction( + new VPInstruction(Opcode, Operands, WrapFlags, DL, Name)); + } VPValue *createNot(VPValue *Operand, DebugLoc DL, const Twine &Name = "") { return createInstruction(VPInstruction::Not, {Operand}, DL, Name); } @@ -158,6 +172,12 @@ public: Name); } + /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A + /// and \p B. + /// TODO: add createFCmp when needed. + VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, + DebugLoc DL = {}, const Twine &Name = ""); + //===--------------------------------------------------------------------===// // RAII helpers. //===--------------------------------------------------------------------===// @@ -268,6 +288,9 @@ class LoopVectorizationPlanner { /// Loop Info analysis. LoopInfo *LI; + /// The dominator tree. + DominatorTree *DT; + /// Target Library Info. const TargetLibraryInfo *TLI; @@ -298,16 +321,14 @@ class LoopVectorizationPlanner { VPBuilder Builder; public: - LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, - const TargetTransformInfo &TTI, - LoopVectorizationLegality *Legal, - LoopVectorizationCostModel &CM, - InterleavedAccessInfo &IAI, - PredicatedScalarEvolution &PSE, - const LoopVectorizeHints &Hints, - OptimizationRemarkEmitter *ORE) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI), - PSE(PSE), Hints(Hints), ORE(ORE) {} + LoopVectorizationPlanner( + Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo &TTI, LoopVectorizationLegality *Legal, + LoopVectorizationCostModel &CM, InterleavedAccessInfo &IAI, + PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints, + OptimizationRemarkEmitter *ORE) + : OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), + IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {} /// Plan how to best vectorize, return the best VF and its cost, or /// std::nullopt if vectorization and interleaving should be avoided up front. @@ -333,7 +354,7 @@ public: executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, - DenseMap<const SCEV *, Value *> *ExpandedSCEVs = nullptr); + const DenseMap<const SCEV *, Value *> *ExpandedSCEVs = nullptr); #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void printPlans(raw_ostream &O); @@ -377,8 +398,7 @@ private: /// returned VPlan is valid for. If no VPlan can be built for the input range, /// set the largest included VF to the maximum VF for which no plan could be /// built. 
- std::optional<VPlanPtr> tryToBuildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions); + VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b603bbe55dc9..f82e161fb846 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -27,7 +27,7 @@ // // There is a development effort going on to migrate loop vectorizer to the // VPlan infrastructure and to introduce outer loop vectorization support (see -// docs/Proposal/VectorizationPlan.rst and +// docs/VectorizationPlan.rst and // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this // purpose, we temporarily introduced the VPlan-native vectorization path: an // alternative vectorization path that is natively implemented on top of the @@ -57,6 +57,7 @@ #include "LoopVectorizationPlanner.h" #include "VPRecipeBuilder.h" #include "VPlan.h" +#include "VPlanAnalysis.h" #include "VPlanHCFGBuilder.h" #include "VPlanTransforms.h" #include "llvm/ADT/APInt.h" @@ -111,10 +112,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -390,6 +393,21 @@ static cl::opt<cl::boolOrDefault> ForceSafeDivisor( cl::desc( "Override cost based safe divisor widening for div/rem instructions")); +static cl::opt<bool> UseWiderVFIfCallVariantsPresent( + "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), + cl::Hidden, + cl::desc("Try wider VFs if they enable the use of vector variants")); + +// Likelyhood of bypassing the vectorized loop because assumptions about SCEV +// variables not overflowing do not hold. See `emitSCEVChecks`. +static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127}; +// Likelyhood of bypassing the vectorized loop because pointers overlap. See +// `emitMemRuntimeChecks`. +static constexpr uint32_t MemCheckBypassWeights[] = {1, 127}; +// Likelyhood of bypassing the vectorized loop because there are zero trips left +// after prolog. See `emitIterationCountCheck`. +static constexpr uint32_t MinItersBypassWeights[] = {1, 127}; + /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type. @@ -408,13 +426,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) { /// we always assume predicated blocks have a 50% chance of executing. static unsigned getReciprocalPredBlockProb() { return 2; } -/// A helper function that returns an integer or floating-point constant with -/// value C. -static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { - return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) - : ConstantFP::get(Ty, C); -} - /// Returns "best known" trip count for the specified loop \p L as defined by /// the following procedure: /// 1) Returns exact trip count if it is known. 
@@ -556,10 +567,6 @@ public: const VPIteration &Instance, VPTransformState &State); - /// Construct the vector value of a scalarized value \p V one lane at a time. - void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, - VPTransformState &State); - /// Try to vectorize interleaved access group \p Group with the base address /// given in \p Addr, optionally masking the vector operations if \p /// BlockInMask is non-null. Use \p State to translate given VPValues to IR @@ -634,10 +641,6 @@ protected: /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); - /// Shrinks vector element sizes to the smallest bitwidth they can be legally - /// represented as. - void truncateToMinimalBitwidths(VPTransformState &State); - /// Returns (and creates if needed) the trip count of the widened loop. Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); @@ -943,21 +946,21 @@ protected: /// Look for a meaningful debug location on the instruction or it's /// operands. -static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { +static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) { if (!I) - return I; + return DebugLoc(); DebugLoc Empty; if (I->getDebugLoc() != Empty) - return I; + return I->getDebugLoc(); for (Use &Op : I->operands()) { if (Instruction *OpInst = dyn_cast<Instruction>(Op)) if (OpInst->getDebugLoc() != Empty) - return OpInst; + return OpInst->getDebugLoc(); } - return I; + return I->getDebugLoc(); } /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I @@ -1021,14 +1024,6 @@ const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop); } -static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, - ElementCount VF) { - assert(FTy->isFloatingPointTy() && "Expected floating point type!"); - Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); - Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); - return B.CreateUIToFP(RuntimeVF, FTy); -} - void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, @@ -1050,6 +1045,23 @@ void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, << Msg); } +/// Report successful vectorization of the loop. In case an outer loop is +/// vectorized, prepend "outer" to the vectorization remark. +static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, + VectorizationFactor VF, unsigned IC) { + LLVM_DEBUG(debugVectorizationMessage( + "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop", + nullptr)); + StringRef LoopType = TheLoop->isInnermost() ? 
"" : "outer "; + ORE->emit([&]() { + return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "vectorized " << LoopType << "loop (vectorization width: " + << ore::NV("VectorizationFactor", VF.Width) + << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")"; + }); +} + } // end namespace llvm #ifndef NDEBUG @@ -1104,7 +1116,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) { RecWithFlags->dropPoisonGeneratingFlags(); } else { - Instruction *Instr = CurRec->getUnderlyingInstr(); + Instruction *Instr = dyn_cast_or_null<Instruction>( + CurRec->getVPSingleValue()->getUnderlyingValue()); (void)Instr; assert((!Instr || !Instr->hasPoisonGeneratingFlags()) && "found instruction with poison generating flags not covered by " @@ -1247,6 +1260,13 @@ public: /// avoid redundant calculations. void setCostBasedWideningDecision(ElementCount VF); + /// A call may be vectorized in different ways depending on whether we have + /// vectorized variants available and whether the target supports masking. + /// This function analyzes all calls in the function at the supplied VF, + /// makes a decision based on the costs of available options, and stores that + /// decision in a map for use in planning and plan execution. + void setVectorizedCallDecision(ElementCount VF); + /// A struct that represents some properties of the register usage /// of a loop. struct RegisterUsage { @@ -1270,7 +1290,7 @@ public: void collectElementTypesForWidening(); /// Split reductions into those that happen in the loop, and those that happen - /// outside. In loop reductions are collected into InLoopReductionChains. + /// outside. In loop reductions are collected into InLoopReductions. void collectInLoopReductions(); /// Returns true if we should use strict in-order reductions for the given @@ -1358,7 +1378,9 @@ public: CM_Widen_Reverse, // For consecutive accesses with stride -1. CM_Interleave, CM_GatherScatter, - CM_Scalarize + CM_Scalarize, + CM_VectorCall, + CM_IntrinsicCall }; /// Save vectorization decision \p W and \p Cost taken by the cost model for @@ -1414,6 +1436,29 @@ public: return WideningDecisions[InstOnVF].second; } + struct CallWideningDecision { + InstWidening Kind; + Function *Variant; + Intrinsic::ID IID; + std::optional<unsigned> MaskPos; + InstructionCost Cost; + }; + + void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, + Function *Variant, Intrinsic::ID IID, + std::optional<unsigned> MaskPos, + InstructionCost Cost) { + assert(!VF.isScalar() && "Expected vector VF"); + CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID, + MaskPos, Cost}; + } + + CallWideningDecision getCallWideningDecision(CallInst *CI, + ElementCount VF) const { + assert(!VF.isScalar() && "Expected vector VF"); + return CallWideningDecisions.at(std::make_pair(CI, VF)); + } + /// Return True if instruction \p I is an optimizable truncate whose operand /// is an induction variable. Such a truncate will be removed by adding a new /// induction variable with the destination type. @@ -1447,11 +1492,15 @@ public: /// Collect Uniform and Scalar values for the given \p VF. /// The sets depend on CM decision for Load/Store instructions /// that may be vectorized as interleave, gather-scatter or scalarized. + /// Also make a decision on what to do about call instructions in the loop + /// at that VF -- scalarize, call a known vector routine, or call a + /// vector intrinsic. 
void collectUniformsAndScalars(ElementCount VF) { // Do the analysis once. if (VF.isScalar() || Uniforms.contains(VF)) return; setCostBasedWideningDecision(VF); + setVectorizedCallDecision(VF); collectLoopUniforms(VF); collectLoopScalars(VF); } @@ -1606,20 +1655,9 @@ public: return foldTailByMasking() || Legal->blockNeedsPredication(BB); } - /// A SmallMapVector to store the InLoop reduction op chains, mapping phi - /// nodes to the chain of instructions representing the reductions. Uses a - /// MapVector to ensure deterministic iteration order. - using ReductionChainMap = - SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; - - /// Return the chain of instructions representing an inloop reduction. - const ReductionChainMap &getInLoopReductionChains() const { - return InLoopReductionChains; - } - /// Returns true if the Phi is part of an inloop reduction. bool isInLoopReduction(PHINode *Phi) const { - return InLoopReductionChains.count(Phi); + return InLoopReductions.contains(Phi); } /// Estimate cost of an intrinsic call instruction CI if it were vectorized @@ -1629,16 +1667,13 @@ public: /// Estimate cost of a call instruction CI if it were vectorized with factor /// VF. Return the cost of the instruction, including scalarization overhead - /// if it's needed. The flag NeedToScalarize shows if the call needs to be - /// scalarized - - /// i.e. either vector version isn't available, or is too expensive. - InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, - Function **Variant, - bool *NeedsMask = nullptr) const; + /// if it's needed. + InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const; /// Invalidates decisions already taken by the cost model. void invalidateCostModelingDecisions() { WideningDecisions.clear(); + CallWideningDecisions.clear(); Uniforms.clear(); Scalars.clear(); } @@ -1675,14 +1710,14 @@ private: /// elements is a power-of-2 larger than zero. If scalable vectorization is /// disabled or unsupported, then the scalable part will be equal to /// ElementCount::getScalable(0). - FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, + FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking); /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. /// This is a helper function of computeFeasibleMaxVF. - ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, + ElementCount getMaximizedVFForTarget(unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, ElementCount MaxSafeVF, @@ -1705,7 +1740,7 @@ private: /// part of that pattern. std::optional<InstructionCost> getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, - TTI::TargetCostKind CostKind); + TTI::TargetCostKind CostKind) const; /// Calculate vectorization cost of memory instruction \p I. InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); @@ -1783,15 +1818,12 @@ private: /// scalarized. DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; - /// PHINodes of the reductions that should be expanded in-loop along with - /// their associated chains of reduction operations, in program order from top - /// (PHI) to bottom - ReductionChainMap InLoopReductionChains; + /// PHINodes of the reductions that should be expanded in-loop. + SmallPtrSet<PHINode *, 4> InLoopReductions; /// A Map of inloop reduction operations and their immediate chain operand. 
/// FIXME: This can be removed once reductions can be costed correctly in - /// vplan. This was added to allow quick lookup to the inloop operations, - /// without having to loop through InLoopReductionChains. + /// VPlan. This was added to allow quick lookup of the inloop operations. DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; /// Returns the expected difference in cost from scalarizing the expression @@ -1830,6 +1862,11 @@ private: DecisionList WideningDecisions; + using CallDecisionList = + DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>; + + CallDecisionList CallWideningDecisions; + /// Returns true if \p V is expected to be vectorized and it needs to be /// extracted. bool needsExtract(Value *V, ElementCount VF) const { @@ -1933,12 +1970,14 @@ class GeneratedRTChecks { SCEVExpander MemCheckExp; bool CostTooHigh = false; + const bool AddBranchWeights; public: GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, - TargetTransformInfo *TTI, const DataLayout &DL) + TargetTransformInfo *TTI, const DataLayout &DL, + bool AddBranchWeights) : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), - MemCheckExp(SE, DL, "scev.check") {} + MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can /// accurately estimate the cost of the runtime checks. The blocks are @@ -1990,9 +2029,9 @@ public: }, IC); } else { - MemRuntimeCheckCond = - addRuntimeChecks(MemCheckBlock->getTerminator(), L, - RtPtrChecking.getChecks(), MemCheckExp); + MemRuntimeCheckCond = addRuntimeChecks( + MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(), + MemCheckExp, VectorizerParams::HoistRuntimeChecks); } assert(MemRuntimeCheckCond && "no RT checks generated although RtPtrChecking " @@ -2131,8 +2170,10 @@ public: DT->addNewBlock(SCEVCheckBlock, Pred); DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); - ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); + BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond); + if (AddBranchWeights) + setBranchWeights(BI, SCEVCheckBypassWeights); + ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); return SCEVCheckBlock; } @@ -2156,9 +2197,12 @@ public: if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) PL->addBasicBlockToLoop(MemCheckBlock, *LI); - ReplaceInstWithInst( - MemCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); + if (AddBranchWeights) { + setBranchWeights(BI, MemCheckBypassWeights); + } + ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI); MemCheckBlock->getTerminator()->setDebugLoc( Pred->getTerminator()->getDebugLoc()); @@ -2252,157 +2296,17 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI, // LoopVectorizationCostModel and LoopVectorizationPlanner. //===----------------------------------------------------------------------===// -/// This function adds -/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) -/// to each vector element of Val. The sequence starts at StartIndex. -/// \p Opcode is relevant for FP induction variable. 
-static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, - Instruction::BinaryOps BinOp, ElementCount VF, - IRBuilderBase &Builder) { - assert(VF.isVector() && "only vector VFs are supported"); - - // Create and check the types. - auto *ValVTy = cast<VectorType>(Val->getType()); - ElementCount VLen = ValVTy->getElementCount(); - - Type *STy = Val->getType()->getScalarType(); - assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && - "Induction Step must be an integer or FP"); - assert(Step->getType() == STy && "Step has wrong type"); - - SmallVector<Constant *, 8> Indices; - - // Create a vector of consecutive numbers from zero to VF. - VectorType *InitVecValVTy = ValVTy; - if (STy->isFloatingPointTy()) { - Type *InitVecValSTy = - IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); - InitVecValVTy = VectorType::get(InitVecValSTy, VLen); - } - Value *InitVec = Builder.CreateStepVector(InitVecValVTy); - - // Splat the StartIdx - Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); - - if (STy->isIntegerTy()) { - InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); - Step = Builder.CreateVectorSplat(VLen, Step); - assert(Step->getType() == Val->getType() && "Invalid step vec"); - // FIXME: The newly created binary instructions should contain nsw/nuw - // flags, which can be found from the original scalar operations. - Step = Builder.CreateMul(InitVec, Step); - return Builder.CreateAdd(Val, Step, "induction"); - } - - // Floating point induction. - assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && - "Binary Opcode should be specified for FP induction"); - InitVec = Builder.CreateUIToFP(InitVec, ValVTy); - InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); - - Step = Builder.CreateVectorSplat(VLen, Step); - Value *MulOp = Builder.CreateFMul(InitVec, Step); - return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); -} - -/// Compute scalar induction steps. \p ScalarIV is the scalar induction -/// variable on which to base the steps, \p Step is the size of the step. -static void buildScalarSteps(Value *ScalarIV, Value *Step, - const InductionDescriptor &ID, VPValue *Def, - VPTransformState &State) { - IRBuilderBase &Builder = State.Builder; - - // Ensure step has the same type as that of scalar IV. - Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); - if (ScalarIVTy != Step->getType()) { - // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to - // avoid separate truncate here. - assert(Step->getType()->isIntegerTy() && - "Truncation requires an integer step"); - Step = State.Builder.CreateTrunc(Step, ScalarIVTy); - } - - // We build scalar steps for both integer and floating-point induction - // variables. Here, we determine the kind of arithmetic we will perform. - Instruction::BinaryOps AddOp; - Instruction::BinaryOps MulOp; - if (ScalarIVTy->isIntegerTy()) { - AddOp = Instruction::Add; - MulOp = Instruction::Mul; - } else { - AddOp = ID.getInductionOpcode(); - MulOp = Instruction::FMul; - } - - // Determine the number of scalars we need to generate for each unroll - // iteration. - bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); - // Compute the scalar steps and save the results in State. 
- Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), - ScalarIVTy->getScalarSizeInBits()); - Type *VecIVTy = nullptr; - Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; - if (!FirstLaneOnly && State.VF.isScalable()) { - VecIVTy = VectorType::get(ScalarIVTy, State.VF); - UnitStepVec = - Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); - SplatStep = Builder.CreateVectorSplat(State.VF, Step); - SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); - } - - unsigned StartPart = 0; - unsigned EndPart = State.UF; - unsigned StartLane = 0; - unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); - if (State.Instance) { - StartPart = State.Instance->Part; - EndPart = StartPart + 1; - StartLane = State.Instance->Lane.getKnownLane(); - EndLane = StartLane + 1; - } - for (unsigned Part = StartPart; Part < EndPart; ++Part) { - Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); - - if (!FirstLaneOnly && State.VF.isScalable()) { - auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); - auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); - if (ScalarIVTy->isFloatingPointTy()) - InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); - auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); - auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); - State.set(Def, Add, Part); - // It's useful to record the lane values too for the known minimum number - // of elements so we do those below. This improves the code quality when - // trying to extract the first element, for example. - } - - if (ScalarIVTy->isFloatingPointTy()) - StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); - - for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { - Value *StartIdx = Builder.CreateBinOp( - AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); - // The step returned by `createStepForVF` is a runtime-evaluated value - // when VF is scalable. Otherwise, it should be folded into a Constant. - assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && - "Expected StartIdx to be folded to a constant when VF is not " - "scalable"); - auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); - auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); - State.set(Def, Add, VPIteration(Part, Lane)); - } - } -} - /// Compute the transformed value of Index at offset StartValue using step /// StepValue. /// For integer induction, returns StartValue + Index * StepValue. /// For pointer induction, returns StartValue[Index * StepValue]. /// FIXME: The newly created binary instructions should contain nsw/nuw /// flags, which can be found from the original scalar operations. -static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, - Value *StartValue, Value *Step, - const InductionDescriptor &ID) { +static Value * +emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, + Value *Step, + InductionDescriptor::InductionKind InductionKind, + const BinaryOperator *InductionBinOp) { Type *StepTy = Step->getType(); Value *CastedIndex = StepTy->isIntegerTy() ? 
B.CreateSExtOrTrunc(Index, StepTy) @@ -2446,7 +2350,7 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, return B.CreateMul(X, Y); }; - switch (ID.getKind()) { + switch (InductionKind) { case InductionDescriptor::IK_IntInduction: { assert(!isa<VectorType>(Index->getType()) && "Vector indices not supported for integer inductions yet"); @@ -2464,7 +2368,6 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, assert(!isa<VectorType>(Index->getType()) && "Vector indices not supported for FP inductions yet"); assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); - auto InductionBinOp = ID.getInductionBinOp(); assert(InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && @@ -2524,17 +2427,6 @@ static bool isIndvarOverflowCheckKnownFalse( return false; } -void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, - const VPIteration &Instance, - VPTransformState &State) { - Value *ScalarInst = State.get(Def, Instance); - Value *VectorValue = State.get(Def, Instance.Part); - VectorValue = Builder.CreateInsertElement( - VectorValue, ScalarInst, - Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); - State.set(Def, VectorValue, Instance.Part); -} - // Return whether we allow using masked interleave-groups (for dealing with // strided loads/stores that reside in predicated blocks, or for dealing // with gaps). @@ -2612,7 +2504,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); - State.setDebugLocFromInst(AddrPart); + if (auto *I = dyn_cast<Instruction>(AddrPart)) + State.setDebugLocFrom(I->getDebugLoc()); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -2630,14 +2523,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) InBounds = gep->isInBounds(); AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); - - // Cast to the vector pointer type. - unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); - Type *PtrTy = VecTy->getPointerTo(AddressSpace); - AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); + AddrParts.push_back(AddrPart); } - State.setDebugLocFromInst(Instr); + State.setDebugLocFrom(Instr->getDebugLoc()); Value *PoisonVec = PoisonValue::get(VecTy); auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor]( @@ -2835,13 +2724,20 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, bool IsVoidRetTy = Instr->getType()->isVoidTy(); Instruction *Cloned = Instr->clone(); - if (!IsVoidRetTy) + if (!IsVoidRetTy) { Cloned->setName(Instr->getName() + ".cloned"); +#if !defined(NDEBUG) + // Verify that VPlan type inference results agree with the type of the + // generated values. + assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && + "inferred type and type from generated instructions do not match"); +#endif + } RepRecipe->setFlags(Cloned); - if (Instr->getDebugLoc()) - State.setDebugLocFromInst(Instr); + if (auto DL = Instr->getDebugLoc()) + State.setDebugLocFrom(DL); // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. @@ -3019,9 +2915,11 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { // dominator of the exit blocks. 
DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); - ReplaceInstWithInst( - TCCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); + if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) + setBranchWeights(BI, MinItersBypassWeights); + ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); LoopBypassBlocks.push_back(TCCheckBlock); } @@ -3151,15 +3049,17 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); - EndValue = - emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II); + EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(), + Step, II.getKind(), II.getInductionBinOp()); EndValue->setName("ind.end"); // Compute the end value for the additional bypass (if applicable). if (AdditionalBypass.first) { - B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); - EndValueFromAdditionalBypass = emitTransformedIndex( - B, AdditionalBypass.second, II.getStartValue(), Step, II); + B.SetInsertPoint(AdditionalBypass.first, + AdditionalBypass.first->getFirstInsertionPt()); + EndValueFromAdditionalBypass = + emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(), + Step, II.getKind(), II.getInductionBinOp()); EndValueFromAdditionalBypass->setName("ind.end"); } } @@ -3240,16 +3140,25 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { // 3) Otherwise, construct a runtime check. if (!Cost->requiresScalarEpilogue(VF.isVector()) && !Cost->foldTailByMasking()) { - Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - Count, VectorTripCount, "cmp.n", - LoopMiddleBlock->getTerminator()); - // Here we use the same DebugLoc as the scalar loop latch terminator instead // of the corresponding compare because they may have ended up with // different line numbers and we want to avoid awkward line stepping while // debugging. Eg. if the compare has got a line number inside the loop. - CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); - cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); + // TODO: At the moment, CreateICmpEQ will simplify conditions with constant + // operands. Perform simplification directly on VPlan once the branch is + // modeled there. + IRBuilder<> B(LoopMiddleBlock->getTerminator()); + B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc()); + Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n"); + BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator()); + BI.setCondition(CmpN); + if (hasBranchWeightMD(*ScalarLatchTerm)) { + // Assume that `Count % VectorTripCount` is equally distributed. + unsigned TripCount = UF * VF.getKnownMinValue(); + assert(TripCount > 0 && "trip count should not be zero"); + const uint32_t Weights[] = {1, TripCount - 1}; + setBranchWeights(BI, Weights); + } } #ifdef EXPENSIVE_CHECKS @@ -3373,7 +3282,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, Value *Step = StepVPV->isLiveIn() ? 
StepVPV->getLiveInIRValue() : State.get(StepVPV, {0, 0}); Value *Escape = - emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II); + emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, + II.getKind(), II.getInductionBinOp()); Escape->setName("ind.escape"); MissingVals[UI] = Escape; } @@ -3445,76 +3355,33 @@ static void cse(BasicBlock *BB) { } } -InstructionCost LoopVectorizationCostModel::getVectorCallCost( - CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const { - Function *F = CI->getCalledFunction(); - Type *ScalarRetTy = CI->getType(); - SmallVector<Type *, 4> Tys, ScalarTys; - bool MaskRequired = Legal->isMaskRequired(CI); - for (auto &ArgOp : CI->args()) - ScalarTys.push_back(ArgOp->getType()); +InstructionCost +LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, + ElementCount VF) const { + // We only need to calculate a cost if the VF is scalar; for actual vectors + // we should already have a pre-calculated cost at each VF. + if (!VF.isScalar()) + return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost; - // Estimate cost of scalarized vector call. The source operands are assumed - // to be vectors, so we need to extract individual elements from there, - // execute VF scalar calls, and then gather the result into the vector return - // value. TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - InstructionCost ScalarCallCost = - TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind); - if (VF.isScalar()) - return ScalarCallCost; - - // Compute corresponding vector type for return value and arguments. - Type *RetTy = ToVectorTy(ScalarRetTy, VF); - for (Type *ScalarTy : ScalarTys) - Tys.push_back(ToVectorTy(ScalarTy, VF)); - - // Compute costs of unpacking argument values for the scalar calls and - // packing the return values to a vector. - InstructionCost ScalarizationCost = - getScalarizationOverhead(CI, VF, CostKind); + Type *RetTy = CI->getType(); + if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) + if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) + return *RedCost; - InstructionCost Cost = - ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; - - // If we can't emit a vector call for this function, then the currently found - // cost is the cost we need to return. - InstructionCost MaskCost = 0; - VFShape Shape = VFShape::get(*CI, VF, MaskRequired); - if (NeedsMask) - *NeedsMask = MaskRequired; - Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); - // If we want an unmasked vector function but can't find one matching the VF, - // maybe we can find vector function that does use a mask and synthesize - // an all-true mask. - if (!VecFunc && !MaskRequired) { - Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true); - VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); - // If we found one, add in the cost of creating a mask - if (VecFunc) { - if (NeedsMask) - *NeedsMask = true; - MaskCost = TTI.getShuffleCost( - TargetTransformInfo::SK_Broadcast, - VectorType::get( - IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()), - VF)); - } - } + SmallVector<Type *, 4> Tys; + for (auto &ArgOp : CI->args()) + Tys.push_back(ArgOp->getType()); - // We don't support masked function calls yet, but we can scalarize a - // masked call with branches (unless VF is scalable). - if (!TLI || CI->isNoBuiltin() || !VecFunc) - return VF.isScalable() ? 
InstructionCost::getInvalid() : Cost; + InstructionCost ScalarCallCost = + TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind); - // If the corresponding vector cost is cheaper, return its cost. - InstructionCost VectorCallCost = - TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; - if (VectorCallCost < Cost) { - *Variant = VecFunc; - Cost = VectorCallCost; + // If this is an intrinsic we may have a lower cost for it. + if (getVectorIntrinsicIDForCall(CI, TLI)) { + InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); + return std::min(ScalarCallCost, IntrinsicCost); } - return Cost; + return ScalarCallCost; } static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { @@ -3558,146 +3425,8 @@ static Type *largestIntegerVectorType(Type *T1, Type *T2) { return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; } -void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { - // For every instruction `I` in MinBWs, truncate the operands, create a - // truncated version of `I` and reextend its result. InstCombine runs - // later and will remove any ext/trunc pairs. - SmallPtrSet<Value *, 4> Erased; - for (const auto &KV : Cost->getMinimalBitwidths()) { - // If the value wasn't vectorized, we must maintain the original scalar - // type. The absence of the value from State indicates that it - // wasn't vectorized. - // FIXME: Should not rely on getVPValue at this point. - VPValue *Def = State.Plan->getVPValue(KV.first, true); - if (!State.hasAnyVectorValue(Def)) - continue; - for (unsigned Part = 0; Part < UF; ++Part) { - Value *I = State.get(Def, Part); - if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) - continue; - Type *OriginalTy = I->getType(); - Type *ScalarTruncatedTy = - IntegerType::get(OriginalTy->getContext(), KV.second); - auto *TruncatedTy = VectorType::get( - ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); - if (TruncatedTy == OriginalTy) - continue; - - IRBuilder<> B(cast<Instruction>(I)); - auto ShrinkOperand = [&](Value *V) -> Value * { - if (auto *ZI = dyn_cast<ZExtInst>(V)) - if (ZI->getSrcTy() == TruncatedTy) - return ZI->getOperand(0); - return B.CreateZExtOrTrunc(V, TruncatedTy); - }; - - // The actual instruction modification depends on the instruction type, - // unfortunately. - Value *NewI = nullptr; - if (auto *BO = dyn_cast<BinaryOperator>(I)) { - NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), - ShrinkOperand(BO->getOperand(1))); - - // Any wrapping introduced by shrinking this operation shouldn't be - // considered undefined behavior. So, we can't unconditionally copy - // arithmetic wrapping flags to NewI. 
- cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); - } else if (auto *CI = dyn_cast<ICmpInst>(I)) { - NewI = - B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), - ShrinkOperand(CI->getOperand(1))); - } else if (auto *SI = dyn_cast<SelectInst>(I)) { - NewI = B.CreateSelect(SI->getCondition(), - ShrinkOperand(SI->getTrueValue()), - ShrinkOperand(SI->getFalseValue())); - } else if (auto *CI = dyn_cast<CastInst>(I)) { - switch (CI->getOpcode()) { - default: - llvm_unreachable("Unhandled cast!"); - case Instruction::Trunc: - NewI = ShrinkOperand(CI->getOperand(0)); - break; - case Instruction::SExt: - NewI = B.CreateSExtOrTrunc( - CI->getOperand(0), - smallestIntegerVectorType(OriginalTy, TruncatedTy)); - break; - case Instruction::ZExt: - NewI = B.CreateZExtOrTrunc( - CI->getOperand(0), - smallestIntegerVectorType(OriginalTy, TruncatedTy)); - break; - } - } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { - auto Elements0 = - cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); - auto *O0 = B.CreateZExtOrTrunc( - SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); - auto Elements1 = - cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); - auto *O1 = B.CreateZExtOrTrunc( - SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); - - NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); - } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { - // Don't do anything with the operands, just extend the result. - continue; - } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { - auto Elements = - cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); - auto *O0 = B.CreateZExtOrTrunc( - IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); - auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); - NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); - } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { - auto Elements = - cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); - auto *O0 = B.CreateZExtOrTrunc( - EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); - NewI = B.CreateExtractElement(O0, EE->getOperand(2)); - } else { - // If we don't know what to do, be conservative and don't do anything. - continue; - } - - // Lastly, extend the result. - NewI->takeName(cast<Instruction>(I)); - Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); - I->replaceAllUsesWith(Res); - cast<Instruction>(I)->eraseFromParent(); - Erased.insert(I); - State.reset(Def, Res, Part); - } - } - - // We'll have created a bunch of ZExts that are now parentless. Clean up. - for (const auto &KV : Cost->getMinimalBitwidths()) { - // If the value wasn't vectorized, we must maintain the original scalar - // type. The absence of the value from State indicates that it - // wasn't vectorized. - // FIXME: Should not rely on getVPValue at this point. - VPValue *Def = State.Plan->getVPValue(KV.first, true); - if (!State.hasAnyVectorValue(Def)) - continue; - for (unsigned Part = 0; Part < UF; ++Part) { - Value *I = State.get(Def, Part); - ZExtInst *Inst = dyn_cast<ZExtInst>(I); - if (Inst && Inst->use_empty()) { - Value *NewI = Inst->getOperand(0); - Inst->eraseFromParent(); - State.reset(Def, NewI, Part); - } - } - } -} - void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, VPlan &Plan) { - // Insert truncates and extends for any truncated instructions as hints to - // InstCombine. 
- if (VF.isVector()) - truncateToMinimalBitwidths(State); - // Fix widened non-induction PHIs by setting up the PHI operands. if (EnableVPlanNativePath) fixNonInductionPHIs(Plan, State); @@ -3710,6 +3439,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); + PSE.getSE()->forgetBlockAndLoopDispositions(); // After vectorization, the exit blocks of the original loop will have // additional predecessors. Invalidate SCEVs for the exit phis in case SE @@ -3718,7 +3448,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, OrigLoop->getExitBlocks(ExitBlocks); for (BasicBlock *Exit : ExitBlocks) for (PHINode &PN : Exit->phis()) - PSE.getSE()->forgetValue(&PN); + PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); @@ -3744,7 +3474,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated // in the exit block, so update the builder. - State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); + State.Builder.SetInsertPoint(State.CFG.ExitBB, + State.CFG.ExitBB->getFirstNonPHIIt()); for (const auto &KV : Plan.getLiveOuts()) KV.second->fixPhi(Plan, State); @@ -3782,40 +3513,10 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { VPBasicBlock *Header = State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); - // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores - // sank outside of the loop would keep the same order as they had in the - // original loop. - SmallVector<VPReductionPHIRecipe *> ReductionPHIList; for (VPRecipeBase &R : Header->phis()) { if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) - ReductionPHIList.emplace_back(ReductionPhi); + fixReduction(ReductionPhi, State); } - stable_sort(ReductionPHIList, [this](const VPReductionPHIRecipe *R1, - const VPReductionPHIRecipe *R2) { - auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; - auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; - - // If neither of the recipes has an intermediate store, keep the order the - // same. - if (!IS1 && !IS2) - return false; - - // If only one of the recipes has an intermediate store, then move it - // towards the beginning of the list. - if (IS1 && !IS2) - return true; - - if (!IS1 && IS2) - return false; - - // If both recipes have an intermediate store, then the recipe with the - // later store should be processed earlier. So it should go to the beginning - // of the list. - return DT->dominates(IS2, IS1); - }); - - for (VPReductionPHIRecipe *ReductionPhi : ReductionPHIList) - fixReduction(ReductionPhi, State); for (VPRecipeBase &R : Header->phis()) { if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) @@ -3929,7 +3630,7 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence( } // Fix the initial value of the original recurrence in the scalar loop. 
- Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); + Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin()); PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); @@ -3953,90 +3654,56 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, RecurKind RK = RdxDesc.getRecurrenceKind(); TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - State.setDebugLocFromInst(ReductionStartValue); + if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue)) + State.setDebugLocFrom(I->getDebugLoc()); VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); - // This is the vector-clone of the value that leaves the loop. - Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); // Before each round, move the insertion point right between // the PHIs and the values we are going to write. // This allows us to write both PHINodes and the extractelement // instructions. - Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); + Builder.SetInsertPoint(LoopMiddleBlock, + LoopMiddleBlock->getFirstInsertionPt()); - State.setDebugLocFromInst(LoopExitInst); + State.setDebugLocFrom(LoopExitInst->getDebugLoc()); Type *PhiTy = OrigPhi->getType(); - - VPBasicBlock *LatchVPBB = - PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock(); - BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; // If tail is folded by masking, the vector value to leave the loop should be // a Select choosing between the vectorized LoopExitInst and vectorized Phi, // instead of the former. For an inloop reduction the reduction will already // be predicated, and does not need to be handled here. if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { - for (unsigned Part = 0; Part < UF; ++Part) { - Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); - SelectInst *Sel = nullptr; - for (User *U : VecLoopExitInst->users()) { - if (isa<SelectInst>(U)) { - assert(!Sel && "Reduction exit feeding two selects"); - Sel = cast<SelectInst>(U); - } else - assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); - } - assert(Sel && "Reduction exit feeds no select"); - State.reset(LoopExitInstDef, Sel, Part); - - if (isa<FPMathOperator>(Sel)) - Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); - - // If the target can create a predicated operator for the reduction at no - // extra cost in the loop (for example a predicated vadd), it can be - // cheaper for the select to remain in the loop than be sunk out of it, - // and so use the select value for the phi instead of the old - // LoopExitValue. 
- if (PreferPredicatedReductionSelect || - TTI->preferPredicatedReductionSelect( - RdxDesc.getOpcode(), PhiTy, - TargetTransformInfo::ReductionFlags())) { - auto *VecRdxPhi = - cast<PHINode>(State.get(PhiR, Part)); - VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); + VPValue *Def = nullptr; + for (VPUser *U : LoopExitInstDef->users()) { + auto *S = dyn_cast<VPInstruction>(U); + if (S && S->getOpcode() == Instruction::Select) { + Def = S; + break; } } + if (Def) + LoopExitInstDef = Def; } + VectorParts RdxParts(UF); + for (unsigned Part = 0; Part < UF; ++Part) + RdxParts[Part] = State.get(LoopExitInstDef, Part); + // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { - assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); + Builder.SetInsertPoint(LoopMiddleBlock, + LoopMiddleBlock->getFirstInsertionPt()); Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); - Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); - VectorParts RdxParts(UF); - for (unsigned Part = 0; Part < UF; ++Part) { - RdxParts[Part] = State.get(LoopExitInstDef, Part); - Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); - Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) - : Builder.CreateZExt(Trunc, VecTy); - for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) - if (U != Trunc) { - U->replaceUsesOfWith(RdxParts[Part], Extnd); - RdxParts[Part] = Extnd; - } - } - Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); for (unsigned Part = 0; Part < UF; ++Part) { RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); - State.reset(LoopExitInstDef, RdxParts[Part], Part); } } // Reduce all of the unrolled parts into a single vector. - Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); + Value *ReducedPartRdx = RdxParts[0]; unsigned Op = RecurrenceDescriptor::getOpcode(RK); // The middle block terminator has already been assigned a DebugLoc here (the @@ -4046,21 +3713,21 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // conditional branch, and (c) other passes may add new predecessors which // terminate on this line. This is the easiest way to ensure we don't // accidentally cause an extra step back into the loop while debugging. - State.setDebugLocFromInst(LoopMiddleBlock->getTerminator()); + State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc()); if (PhiR->isOrdered()) - ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); + ReducedPartRdx = RdxParts[UF - 1]; else { // Floating-point operations should have some FMF to enable the reduction. 
IRBuilderBase::FastMathFlagGuard FMFG(Builder); Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); for (unsigned Part = 1; Part < UF; ++Part) { - Value *RdxPart = State.get(LoopExitInstDef, Part); - if (Op != Instruction::ICmp && Op != Instruction::FCmp) { + Value *RdxPart = RdxParts[Part]; + if (Op != Instruction::ICmp && Op != Instruction::FCmp) ReducedPartRdx = Builder.CreateBinOp( (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); - } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) - ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, - ReducedPartRdx, RdxPart); + else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) + ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK, + ReducedPartRdx, RdxPart); else ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); } @@ -4070,7 +3737,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // target reduction in the loop using a Reduction recipe. if (VF.isVector() && !PhiR->isInLoop()) { ReducedPartRdx = - createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); + createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (PhiTy != RdxDesc.getRecurrenceType()) @@ -4107,7 +3774,8 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // inside the loop, create the final store here. if (StoreInst *SI = RdxDesc.IntermediateStore) { StoreInst *NewSI = - Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); + Builder.CreateAlignedStore(ReducedPartRdx, SI->getPointerOperand(), + SI->getAlign()); propagateMetadata(NewSI, SI); // If the reduction value is used in other places, @@ -4436,7 +4104,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication( default: return true; case Instruction::Call: - return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF); + if (VF.isScalar()) + return true; + return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF)) + .Kind == CM_Scalarize; case Instruction::Load: case Instruction::Store: { auto *Ptr = getLoadStorePointerOperand(I); @@ -4988,7 +4659,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { } FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( - unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { + unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5076,12 +4747,12 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( FixedScalableVFPair Result(ElementCount::getFixed(1), ElementCount::getScalable(0)); if (auto MaxVF = - getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, + getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, MaxSafeFixedVF, FoldTailByMasking)) Result.FixedVF = MaxVF; if (auto MaxVF = - getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, + getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, MaxSafeScalableVF, FoldTailByMasking)) if (MaxVF.isScalable()) { Result.ScalableVF = MaxVF; @@ -5105,6 +4776,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); + unsigned 
MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); if (TC == 1) { reportVectorizationFailure("Single iteration (non) loop", @@ -5115,7 +4787,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return computeFeasibleMaxVF(TC, UserVF, false); + return computeFeasibleMaxVF(MaxTC, UserVF, false); case CM_ScalarEpilogueNotAllowedUsePredicate: [[fallthrough]]; case CM_ScalarEpilogueNotNeededUsePredicate: @@ -5153,7 +4825,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return computeFeasibleMaxVF(TC, UserVF, false); + return computeFeasibleMaxVF(MaxTC, UserVF, false); } return FixedScalableVFPair::getNone(); } @@ -5170,7 +4842,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); + FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true); // Avoid tail folding if the trip count is known to be a multiple of any VF // we choose. @@ -5246,7 +4918,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( - unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, + unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, ElementCount MaxSafeVF, bool FoldTailByMasking) { bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); const TypeSize WidestRegister = TTI.getRegisterBitWidth( @@ -5285,31 +4957,35 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( } // When a scalar epilogue is required, at least one iteration of the scalar - // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a + // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a // max VF that results in a dead vector loop. - if (ConstTripCount > 0 && requiresScalarEpilogue(true)) - ConstTripCount -= 1; - - if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC && - (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { - // If loop trip count (TC) is known at compile time there is no point in - // choosing VF greater than TC (as done in the loop below). Select maximum - // power of two which doesn't exceed TC. - // If MaxVectorElementCount is scalable, we only fall back on a fixed VF - // when the TC is less than or equal to the known number of lanes. - auto ClampedConstTripCount = llvm::bit_floor(ConstTripCount); + if (MaxTripCount > 0 && requiresScalarEpilogue(true)) + MaxTripCount -= 1; + + if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC && + (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) { + // If upper bound loop trip count (TC) is known at compile time there is no + // point in choosing VF greater than TC (as done in the loop below). Select + // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is + // scalable, we only fall back on a fixed VF when the TC is less than or + // equal to the known number of lanes. 
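The clamping logic above picks the largest power of two that does not exceed the (maximum) trip count, after reserving one iteration when a scalar epilogue must run. A small illustration of that arithmetic in plain C++ (bitFloor stands in for llvm::bit_floor; the real code additionally requires the trip count to be no larger than the widest register's element count):

#include <cstdio>

// Largest power of two <= x (x > 0); stands in for llvm::bit_floor.
unsigned bitFloor(unsigned x) {
  unsigned p = 1;
  while (p * 2 <= x)
    p *= 2;
  return p;
}

unsigned clampVF(unsigned maxTripCount, bool requiresScalarEpilogue) {
  // Keep one iteration for the scalar epilogue so the vector loop is not dead.
  if (maxTripCount > 0 && requiresScalarEpilogue)
    maxTripCount -= 1;
  return bitFloor(maxTripCount);
}

int main() {
  // Trip count 17 with a required scalar epilogue: clamp to bit_floor(16) = 16.
  std::printf("%u\n", clampVF(17, true));  // 16
  // Trip count 6, no epilogue needed: clamp to bit_floor(6) = 4.
  std::printf("%u\n", clampVF(6, false));  // 4
}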
+ auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount); LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " "exceeding the constant trip count: " - << ClampedConstTripCount << "\n"); - return ElementCount::getFixed(ClampedConstTripCount); + << ClampedUpperTripCount << "\n"); + return ElementCount::get( + ClampedUpperTripCount, + FoldTailByMasking ? MaxVectorElementCount.isScalable() : false); } TargetTransformInfo::RegisterKind RegKind = ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector : TargetTransformInfo::RGK_FixedWidthVector; ElementCount MaxVF = MaxVectorElementCount; - if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && - TTI.shouldMaximizeVectorBandwidth(RegKind))) { + if (MaximizeBandwidth || + (MaximizeBandwidth.getNumOccurrences() == 0 && + (TTI.shouldMaximizeVectorBandwidth(RegKind) || + (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) { auto MaxVectorElementCountMaxBW = ElementCount::get( llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), ComputeScalableMaxVF); @@ -5981,7 +5657,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, HasReductions && any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { const RecurrenceDescriptor &RdxDesc = Reduction.second; - return RecurrenceDescriptor::isSelectCmpRecurrenceKind( + return RecurrenceDescriptor::isAnyOfRecurrenceKind( RdxDesc.getRecurrenceKind()); }); if (HasSelectCmpReductions) { @@ -6149,6 +5825,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { if (ValuesToIgnore.count(I)) continue; + collectInLoopReductions(); + // For each VF find the maximum usage of registers. for (unsigned j = 0, e = VFs.size(); j < e; ++j) { // Count the number of registers used, per register class, given all open @@ -6668,10 +6346,11 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, std::optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( - Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { + Instruction *I, ElementCount VF, Type *Ty, + TTI::TargetCostKind CostKind) const { using namespace llvm::PatternMatch; // Early exit for no inloop reductions - if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) + if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty)) return std::nullopt; auto *VectorTy = cast<VectorType>(Ty); @@ -6706,10 +6385,10 @@ LoopVectorizationCostModel::getReductionPatternCost( // Find the reduction this chain is a part of and calculate the basic cost of // the reduction on its own. - Instruction *LastChain = InLoopReductionImmediateChains[RetI]; + Instruction *LastChain = InLoopReductionImmediateChains.at(RetI); Instruction *ReductionPhi = LastChain; while (!isa<PHINode>(ReductionPhi)) - ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; + ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi); const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; @@ -7127,6 +6806,168 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { } } +void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { + assert(!VF.isScalar() && + "Trying to set a vectorization decision for a scalar VF"); + + for (BasicBlock *BB : TheLoop->blocks()) { + // For each instruction in the old loop. 
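setVectorizedCallDecision, which continues below, ends by comparing three costs per call site (scalarize, call a vector library variant, use a vector intrinsic) and recording the cheapest as the widening decision. A hedged standalone sketch of that final comparison, with unavailable options modelled as infinite cost:

#include <cstdio>
#include <limits>

enum class WideningKind { Scalarize, VectorCall, IntrinsicCall };

// Stand-in for InstructionCost::getInvalid(): an unavailable option never wins.
constexpr double Invalid = std::numeric_limits<double>::infinity();

WideningKind pickCallWidening(double scalarCost, double vectorCost,
                              double intrinsicCost) {
  double cost = scalarCost;
  WideningKind decision = WideningKind::Scalarize;
  if (vectorCost <= cost) {
    cost = vectorCost;
    decision = WideningKind::VectorCall;
  }
  if (intrinsicCost <= cost) {
    cost = intrinsicCost;
    decision = WideningKind::IntrinsicCall;
  }
  return decision;
}

int main() {
  // No vector variant available, but a cheap intrinsic exists.
  auto d = pickCallWidening(/*scalar=*/40, /*vector=*/Invalid, /*intrinsic=*/6);
  std::printf("%d\n", static_cast<int>(d)); // 2 == IntrinsicCall
}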
+ for (Instruction &I : *BB) { + CallInst *CI = dyn_cast<CallInst>(&I); + + if (!CI) + continue; + + InstructionCost ScalarCost = InstructionCost::getInvalid(); + InstructionCost VectorCost = InstructionCost::getInvalid(); + InstructionCost IntrinsicCost = InstructionCost::getInvalid(); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + Function *ScalarFunc = CI->getCalledFunction(); + Type *ScalarRetTy = CI->getType(); + SmallVector<Type *, 4> Tys, ScalarTys; + bool MaskRequired = Legal->isMaskRequired(CI); + for (auto &ArgOp : CI->args()) + ScalarTys.push_back(ArgOp->getType()); + + // Compute corresponding vector type for return value and arguments. + Type *RetTy = ToVectorTy(ScalarRetTy, VF); + for (Type *ScalarTy : ScalarTys) + Tys.push_back(ToVectorTy(ScalarTy, VF)); + + // An in-loop reduction using an fmuladd intrinsic is a special case; + // we don't want the normal cost for that intrinsic. + if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) + if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) { + setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr, + getVectorIntrinsicIDForCall(CI, TLI), + std::nullopt, *RedCost); + continue; + } + + // Estimate cost of scalarized vector call. The source operands are + // assumed to be vectors, so we need to extract individual elements from + // there, execute VF scalar calls, and then gather the result into the + // vector return value. + InstructionCost ScalarCallCost = + TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind); + + // Compute costs of unpacking argument values for the scalar calls and + // packing the return values to a vector. + InstructionCost ScalarizationCost = + getScalarizationOverhead(CI, VF, CostKind); + + ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; + + // Find the cost of vectorizing the call, if we can find a suitable + // vector variant of the function. + bool UsesMask = false; + VFInfo FuncInfo; + Function *VecFunc = nullptr; + // Search through any available variants for one we can use at this VF. + for (VFInfo &Info : VFDatabase::getMappings(*CI)) { + // Must match requested VF. + if (Info.Shape.VF != VF) + continue; + + // Must take a mask argument if one is required + if (MaskRequired && !Info.isMasked()) + continue; + + // Check that all parameter kinds are supported + bool ParamsOk = true; + for (VFParameter Param : Info.Shape.Parameters) { + switch (Param.ParamKind) { + case VFParamKind::Vector: + break; + case VFParamKind::OMP_Uniform: { + Value *ScalarParam = CI->getArgOperand(Param.ParamPos); + // Make sure the scalar parameter in the loop is invariant. + if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam), + TheLoop)) + ParamsOk = false; + break; + } + case VFParamKind::OMP_Linear: { + Value *ScalarParam = CI->getArgOperand(Param.ParamPos); + // Find the stride for the scalar parameter in this loop and see if + // it matches the stride for the variant. + // TODO: do we need to figure out the cost of an extract to get the + // first lane? Or do we hope that it will be folded away? 
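The parameter check above only accepts a variant whose OMP_Linear argument really advances by the declared step each iteration (an affine add-recurrence with a matching constant stride). A small sketch of that check using a made-up affine summary instead of ScalarEvolution:

#include <cstdio>
#include <optional>

// Hypothetical summary of what ScalarEvolution would report for an argument:
// value = Start + Step * iteration, if it is affine in the current loop.
struct AffineArg {
  long long Start;
  long long Step;
  bool AffineInThisLoop;
};

// The variant declared the parameter as linear with this step; accept only if
// the scalar argument really advances by exactly that amount per iteration.
bool linearParamMatches(std::optional<AffineArg> arg, long long declaredStep) {
  if (!arg || !arg->AffineInThisLoop)
    return false;
  return arg->Step == declaredStep;
}

int main() {
  AffineArg idx{/*Start=*/0, /*Step=*/4, /*AffineInThisLoop=*/true};
  std::printf("%d\n", linearParamMatches(idx, 4)); // 1: variant usable
  std::printf("%d\n", linearParamMatches(idx, 1)); // 0: stride mismatch
}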
+ ScalarEvolution *SE = PSE.getSE(); + const auto *SAR = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam)); + + if (!SAR || SAR->getLoop() != TheLoop) { + ParamsOk = false; + break; + } + + const SCEVConstant *Step = + dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE)); + + if (!Step || + Step->getAPInt().getSExtValue() != Param.LinearStepOrPos) + ParamsOk = false; + + break; + } + case VFParamKind::GlobalPredicate: + UsesMask = true; + break; + default: + ParamsOk = false; + break; + } + } + + if (!ParamsOk) + continue; + + // Found a suitable candidate, stop here. + VecFunc = CI->getModule()->getFunction(Info.VectorName); + FuncInfo = Info; + break; + } + + // Add in the cost of synthesizing a mask if one wasn't required. + InstructionCost MaskCost = 0; + if (VecFunc && UsesMask && !MaskRequired) + MaskCost = TTI.getShuffleCost( + TargetTransformInfo::SK_Broadcast, + VectorType::get(IntegerType::getInt1Ty( + VecFunc->getFunctionType()->getContext()), + VF)); + + if (TLI && VecFunc && !CI->isNoBuiltin()) + VectorCost = + TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; + + // Find the cost of an intrinsic; some targets may have instructions that + // perform the operation without needing an actual call. + Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI); + if (IID != Intrinsic::not_intrinsic) + IntrinsicCost = getVectorIntrinsicCost(CI, VF); + + InstructionCost Cost = ScalarCost; + InstWidening Decision = CM_Scalarize; + + if (VectorCost <= Cost) { + Cost = VectorCost; + Decision = CM_VectorCall; + } + + if (IntrinsicCost <= Cost) { + Cost = IntrinsicCost; + Decision = CM_IntrinsicCall; + } + + setCallWideningDecision(CI, VF, Decision, VecFunc, IID, + FuncInfo.getParamIndexForOptionalMask(), Cost); + } + } +} + InstructionCost LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy) { @@ -7156,7 +6997,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // With the exception of GEPs and PHIs, after scalarization there should // only be one copy of the instruction generated in the loop. This is // because the VF is either 1, or any instructions that need scalarizing - // have already been dealt with by the the time we get here. As a result, + // have already been dealt with by the time we get here. As a result, // it means we don't have to multiply the instruction cost by VF. 
assert(I->getOpcode() == Instruction::GetElementPtr || I->getOpcode() == Instruction::PHI || @@ -7384,6 +7225,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, return TTI::CastContextHint::Reversed; case LoopVectorizationCostModel::CM_Unknown: llvm_unreachable("Instr did not go through cost modelling?"); + case LoopVectorizationCostModel::CM_VectorCall: + case LoopVectorizationCostModel::CM_IntrinsicCall: + llvm_unreachable_internal("Instr has invalid widening decision"); } llvm_unreachable("Unhandled case!"); @@ -7441,19 +7285,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } - case Instruction::Call: { - if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) - if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) - return *RedCost; - Function *Variant; - CallInst *CI = cast<CallInst>(I); - InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant); - if (getVectorIntrinsicIDForCall(CI, TLI)) { - InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); - return std::min(CallCost, IntrinsicCost); - } - return CallCost; - } + case Instruction::Call: + return getVectorCallCost(cast<CallInst>(I), VF); case Instruction::ExtractValue: return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); case Instruction::Alloca: @@ -7521,8 +7354,9 @@ void LoopVectorizationCostModel::collectInLoopReductions() { SmallVector<Instruction *, 4> ReductionOperations = RdxDesc.getReductionOpChain(Phi, TheLoop); bool InLoop = !ReductionOperations.empty(); + if (InLoop) { - InLoopReductionChains[Phi] = ReductionOperations; + InLoopReductions.insert(Phi); // Add the elements to InLoopReductionImmediateChains for cost modelling. Instruction *LastChain = Phi; for (auto *I : ReductionOperations) { @@ -7535,21 +7369,38 @@ void LoopVectorizationCostModel::collectInLoopReductions() { } } +VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, + DebugLoc DL, const Twine &Name) { + assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && + Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate"); + return tryInsertInstruction( + new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name)); +} + +// This function will select a scalable VF if the target supports scalable +// vectors and a fixed one otherwise. // TODO: we could return a pair of values that specify the max VF and // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment // doesn't have a cost model that can choose which plan to execute if // more than one is generated. -static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, - LoopVectorizationCostModel &CM) { +static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, + LoopVectorizationCostModel &CM) { unsigned WidestType; std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); - return WidestVectorRegBits / WidestType; + + TargetTransformInfo::RegisterKind RegKind = + TTI.enableScalableVectorization() + ? 
TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector; + + TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); + unsigned N = RegSize.getKnownMinValue() / WidestType; + return ElementCount::get(N, RegSize.isScalable()); } VectorizationFactor LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { - assert(!UserVF.isScalable() && "scalable vectors not yet supported"); ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. @@ -7559,10 +7410,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { // If the user doesn't provide a vectorization factor, determine a // reasonable one. if (UserVF.isZero()) { - VF = ElementCount::getFixed(determineVPlanVF( - TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) - .getFixedValue(), - CM)); + VF = determineVPlanVF(TTI, CM); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. @@ -7571,6 +7419,17 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { << "overriding computed VF.\n"); VF = ElementCount::getFixed(4); } + } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() && + !ForceTargetSupportsScalableVectors) { + LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but " + << "not supported by the target.\n"); + reportVectorizationFailure( + "Scalable vectorization requested but not supported by the target", + "the scalable user-specified vectorization width for outer-loop " + "vectorization cannot be used because the target does not support " + "scalable vectors.", + "ScalableVFUnfeasible", ORE, OrigLoop); + return VectorizationFactor::Disabled(); } assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); assert(isPowerOf2_32(VF.getKnownMinValue()) && @@ -7624,9 +7483,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. + CM.collectInLoopReductions(); if (CM.selectUserVectorizationFactor(UserVF)) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); - CM.collectInLoopReductions(); buildVPlansWithVPRecipes(UserVF, UserVF); if (!hasPlanWithVF(UserVF)) { LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF @@ -7650,6 +7509,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) VFCandidates.insert(VF); + CM.collectInLoopReductions(); for (const auto &VF : VFCandidates) { // Collect Uniform and Scalar instructions after vectorization with VF. 
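determineVPlanVF above divides the known minimum register width by the widest scalar type in the loop and keeps the scalable flag of the chosen register kind. Illustrative arithmetic with assumed register widths and element sizes:

#include <cstdio>

struct RegInfo {
  unsigned KnownMinBits; // e.g. 128 for a fixed SIMD register, or the minimum
                         // width of a scalable register
  bool Scalable;
};

// VF = (known minimum register width) / (widest scalar type in the loop),
// carrying over whether the register kind is scalable.
void printVPlanVF(RegInfo reg, unsigned widestTypeBits) {
  unsigned n = reg.KnownMinBits / widestTypeBits;
  std::printf("VF = %s%u\n", reg.Scalable ? "vscale x " : "", n);
}

int main() {
  printVPlanVF({128, false}, 32); // "VF = 4"            (128-bit fixed, i32)
  printVPlanVF({128, true}, 64);  // "VF = vscale x 2"   (scalable, i64/double)
}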
CM.collectUniformsAndScalars(VF); @@ -7660,7 +7520,6 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { CM.collectInstsToScalarize(VF); } - CM.collectInLoopReductions(); buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); @@ -7705,7 +7564,7 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { if (MD) { const auto *S = dyn_cast<MDString>(MD->getOperand(0)); IsUnrollMetadata = - S && S->getString().startswith("llvm.loop.unroll.disable"); + S && S->getString().starts_with("llvm.loop.unroll.disable"); } MDs.push_back(LoopID->getOperand(i)); } @@ -7729,7 +7588,7 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { SCEV2ValueTy LoopVectorizationPlanner::executePlan( ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, - DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { + const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { assert(BestVPlan.hasVF(BestVF) && "Trying to execute plan with unsupported VF"); assert(BestVPlan.hasUF(BestUF) && @@ -7745,7 +7604,8 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan( VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); // Perform the actual loop transformation. - VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; + VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, + OrigLoop->getHeader()->getContext()); // 0. Generate SCEV-dependent code into the preheader, including TripCount, // before making any changes to the CFG. @@ -7798,9 +7658,9 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan( //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. 
- BestVPlan.prepareToExecute( - ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr), - CanonicalIVStartValue, State, IsEpilogueVectorization); + BestVPlan.prepareToExecute(ILV.getTripCount(), + ILV.getOrCreateVectorTripCount(nullptr), + CanonicalIVStartValue, State); BestVPlan.execute(&State); @@ -7964,9 +7824,11 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, EPI.TripCount = Count; } - ReplaceInstWithInst( - TCCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); + if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) + setBranchWeights(BI, MinItersBypassWeights); + ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); return TCCheckBlock; } @@ -8064,8 +7926,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( // Generate a resume induction for the vector epilogue and put it in the // vector epilogue preheader Type *IdxTy = Legal->getWidestInductionType(); - PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", - LoopVectorPreHeader->getFirstNonPHI()); + PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val"); + EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt()); EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), EPI.MainLoopIterationCountCheck); @@ -8110,9 +7972,22 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( EPI.EpilogueVF, EPI.EpilogueUF), "min.epilog.iters.check"); - ReplaceInstWithInst( - Insert->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); + if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { + unsigned MainLoopStep = UF * VF.getKnownMinValue(); + unsigned EpilogueLoopStep = + EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); + // We assume the remaining `Count` is equally distributed in + // [0, MainLoopStep) + // So the probability for `Count < EpilogueLoopStep` should be + // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep + unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); + const uint32_t Weights[] = {EstimatedSkipCount, + MainLoopStep - EstimatedSkipCount}; + setBranchWeights(BI, Weights); + } + ReplaceInstWithInst(Insert->getTerminator(), &BI); LoopBypassBlocks.push_back(Insert); return Insert; @@ -8206,6 +8081,33 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, return EdgeMaskCache[Edge] = EdgeMask; } +void VPRecipeBuilder::createHeaderMask(VPlan &Plan) { + BasicBlock *Header = OrigLoop->getHeader(); + + // When not folding the tail, use nullptr to model all-true mask. + if (!CM.foldTailByMasking()) { + BlockMaskCache[Header] = nullptr; + return; + } + + // Introduce the early-exit compare IV <= BTC to form header block mask. + // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by + // constructing the desired canonical IV in the header block as its first + // non-phi instructions. 
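The new branch-weight code above assumes the remaining iteration count is uniformly distributed in [0, MainLoopStep), so the probability of skipping the epilogue vector loop is min(MainLoopStep, EpilogueLoopStep) / MainLoopStep. A worked example with assumed factors:

#include <algorithm>
#include <cstdio>

int main() {
  // Assume main loop VF * UF = 16 and epilogue VF * UF = 4 (made-up values).
  unsigned MainLoopStep = 16;
  unsigned EpilogueLoopStep = 4;

  // Remaining iterations are assumed uniform in [0, MainLoopStep), so the
  // chance of taking the bypass (Count < EpilogueLoopStep, epilogue skipped)
  // is min(Main, Epilogue) / Main.
  unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
  unsigned Weights[2] = {EstimatedSkipCount, MainLoopStep - EstimatedSkipCount};

  // Here that is a 4:12 split, i.e. a 25% chance the epilogue loop is skipped.
  std::printf("weights = {%u, %u}\n", Weights[0], Weights[1]);
}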
+ + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); + auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); + HeaderVPBB->insert(IV, NewInsertionPoint); + + VPBuilder::InsertPointGuard Guard(Builder); + Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); + VPValue *BlockMask = nullptr; + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); + BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); + BlockMaskCache[Header] = BlockMask; +} + VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); @@ -8214,45 +8116,12 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { if (BCEntryIt != BlockMaskCache.end()) return BCEntryIt->second; + assert(OrigLoop->getHeader() != BB && + "Loop header must have cached block mask"); + // All-one mask is modelled as no-mask following the convention for masked // load/store/gather/scatter. Initialize BlockMask to no-mask. VPValue *BlockMask = nullptr; - - if (OrigLoop->getHeader() == BB) { - if (!CM.blockNeedsPredicationForAnyReason(BB)) - return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - - assert(CM.foldTailByMasking() && "must fold the tail"); - - // If we're using the active lane mask for control flow, then we get the - // mask from the active lane mask PHI that is cached in the VPlan. - TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); - if (useActiveLaneMaskForControlFlow(TFStyle)) - return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi(); - - // Introduce the early-exit compare IV <= BTC to form header block mask. - // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by - // constructing the desired canonical IV in the header block as its first - // non-phi instructions. - - VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); - auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); - HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); - - VPBuilder::InsertPointGuard Guard(Builder); - Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (useActiveLaneMask(TFStyle)) { - VPValue *TC = Plan.getTripCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, - nullptr, "active.lane.mask"); - } else { - VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); - } - return BlockMaskCache[BB] = BlockMask; - } - // This is the block mask. We OR all incoming edges. for (auto *Predecessor : predecessors(BB)) { VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); @@ -8458,22 +8327,15 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, bool ShouldUseVectorIntrinsic = ID && LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) -> bool { - Function *Variant; - // Is it beneficial to perform intrinsic call compared to lib - // call? 
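createHeaderMask above compares the widened canonical IV against the backedge-taken count (IV <= BTC) instead of IV < TC, because the trip count itself can wrap to zero when the loop runs the maximum number of iterations representable in the IV width, while BTC = TC - 1 still fits. An 8-bit illustration of the difference:

#include <cstdint>
#include <cstdio>

int main() {
  // 8-bit model: a loop that executes 256 iterations has a trip count that
  // wraps to 0 in 8 bits, but its backedge-taken count (255) still fits.
  uint8_t BTC = 255;        // backedge-taken count = TC - 1
  uint8_t TC = BTC + 1;     // wraps to 0

  for (unsigned iv = 0; iv < 4; ++iv) {
    bool ltTC = (uint8_t)iv < TC;    // always false: TC wrapped to 0
    bool leBTC = (uint8_t)iv <= BTC; // correct lane-active test
    std::printf("iv=%u  iv<TC=%d  iv<=BTC=%d\n", iv, ltTC, leBTC);
  }
}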
- InstructionCost CallCost = - CM.getVectorCallCost(CI, VF, &Variant); - InstructionCost IntrinsicCost = - CM.getVectorIntrinsicCost(CI, VF); - return IntrinsicCost <= CallCost; + return CM.getCallWideningDecision(CI, VF).Kind == + LoopVectorizationCostModel::CM_IntrinsicCall; }, Range); if (ShouldUseVectorIntrinsic) return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID); Function *Variant = nullptr; - ElementCount VariantVF; - bool NeedsMask = false; + std::optional<unsigned> MaskPos; // Is better to call a vectorized version of the function than to to scalarize // the call? auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( @@ -8492,16 +8354,19 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, // finds a valid variant. if (Variant) return false; - CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask); - // If we found a valid vector variant at this VF, then store the VF - // in case we need to generate a mask. - if (Variant) - VariantVF = VF; - return Variant != nullptr; + LoopVectorizationCostModel::CallWideningDecision Decision = + CM.getCallWideningDecision(CI, VF); + if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) { + Variant = Decision.Variant; + MaskPos = Decision.MaskPos; + return true; + } + + return false; }, Range); if (ShouldUseVectorCall) { - if (NeedsMask) { + if (MaskPos.has_value()) { // We have 2 cases that would require a mask: // 1) The block needs to be predicated, either due to a conditional // in the scalar loop or use of an active lane mask with @@ -8516,17 +8381,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( IntegerType::getInt1Ty(Variant->getFunctionType()->getContext()))); - VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true); - unsigned MaskPos = 0; - - for (const VFInfo &Info : VFDatabase::getMappings(*CI)) - if (Info.Shape == Shape) { - assert(Info.isMasked() && "Vector function info shape mismatch"); - MaskPos = Info.getParamIndexForOptionalMask().value(); - break; - } - - Ops.insert(Ops.begin() + MaskPos, Mask); + Ops.insert(Ops.begin() + *MaskPos, Mask); } return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), @@ -8747,8 +8602,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, } if (auto *CI = dyn_cast<CastInst>(Instr)) { - return toVPRecipeResult( - new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI)); + return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0], + CI->getType(), *CI)); } return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); @@ -8758,27 +8613,26 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); - // Add assume instructions we need to drop to DeadInstructions, to prevent - // them from being added to the VPlan. - // TODO: We only need to drop assumes in blocks that get flattend. If the - // control flow is preserved, we should keep them. 
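When a masked vector variant is chosen, the recipe construction above splices the mask into the widened call operands at the position recorded in the widening decision (MaskPos), falling back to an all-true mask if the block itself needs no predicate. A toy sketch of that operand splice with placeholder strings (the names are invented, not VPlan values):

#include <cstdio>
#include <string>
#include <vector>

int main() {
  // Hypothetical widened call operands and the variant's declared mask slot.
  std::vector<std::string> Ops = {"%a.vec", "%b.vec"};
  unsigned MaskPos = 1; // made-up: the variant takes its mask as operand 1

  // If the block itself needs no predicate, an all-true mask is used instead.
  std::string Mask = "%all.true.mask";
  Ops.insert(Ops.begin() + MaskPos, Mask);

  for (const auto &Op : Ops)
    std::printf("%s\n", Op.c_str()); // %a.vec, %all.true.mask, %b.vec
}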
- SmallPtrSet<Instruction *, 4> DeadInstructions; - auto &ConditionalAssumes = Legal->getConditionalAssumes(); - DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); - auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; - if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions)) - VPlans.push_back(std::move(*Plan)); + if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) { + // Now optimize the initial VPlan. + if (!Plan->hasVF(ElementCount::getFixed(1))) + VPlanTransforms::truncateToMinimalBitwidths( + *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext()); + VPlanTransforms::optimize(*Plan, *PSE.getSE()); + assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); + VPlans.push_back(std::move(Plan)); + } VF = SubRange.End; } } // Add the necessary canonical IV and branch recipes required to control the // loop. -static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - TailFoldingStyle Style) { +static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, + DebugLoc DL) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); @@ -8790,102 +8644,24 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar // IV by VF * UF. - bool HasNUW = Style == TailFoldingStyle::None; auto *CanonicalIVIncrement = - new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW - : VPInstruction::CanonicalIVIncrement, - {CanonicalIVPHI}, DL, "index.next"); + new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, + {HasNUW, false}, DL, "index.next"); CanonicalIVPHI->addOperand(CanonicalIVIncrement); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); - if (useActiveLaneMaskForControlFlow(Style)) { - // Create the active lane mask instruction in the vplan preheader. - VPBasicBlock *VecPreheader = - cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()); - - // We can't use StartV directly in the ActiveLaneMask VPInstruction, since - // we have to take unrolling into account. Each part needs to start at - // Part * VF - auto *CanonicalIVIncrementParts = - new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW - : VPInstruction::CanonicalIVIncrementForPart, - {StartV}, DL, "index.part.next"); - VecPreheader->appendRecipe(CanonicalIVIncrementParts); - - // Create the ActiveLaneMask instruction using the correct start values. - VPValue *TC = Plan.getTripCount(); - - VPValue *TripCount, *IncrementValue; - if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { - // When avoiding a runtime check, the active.lane.mask inside the loop - // uses a modified trip count and the induction variable increment is - // done after the active.lane.mask intrinsic is called. - auto *TCMinusVF = - new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL); - VecPreheader->appendRecipe(TCMinusVF); - IncrementValue = CanonicalIVPHI; - TripCount = TCMinusVF; - } else { - // When the loop is guarded by a runtime overflow check for the loop - // induction variable increment by VF, we can increment the value before - // the get.active.lane mask and use the unmodified tripcount. 
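With the lane-mask handling moved out of addCanonicalIVRecipes, the canonical IV logic that remains is simply: start at zero, add VF * UF per iteration, and branch on reaching the vector trip count. A scalar model of that skeleton with assumed factors:

#include <cstdio>

int main() {
  unsigned VF = 4, UF = 2;            // assumed vectorization and unroll factors
  unsigned TripCount = 21;
  // The vector loop only runs whole VF * UF chunks; the remainder is left to
  // the scalar epilogue (the real code guards against a zero vector trip count).
  unsigned VectorTripCount = TripCount - (TripCount % (VF * UF));

  unsigned Index = 0;                 // canonical IV, starts at 0
  do {
    // ...vector body for lanes [Index, Index + VF * UF)...
    Index += VF * UF;                 // index.next = index + VF * UF
  } while (Index != VectorTripCount); // BranchOnCount: exit when equal

  std::printf("vector loop covered %u of %u iterations\n", Index, TripCount);
}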
- EB->appendRecipe(CanonicalIVIncrement); - IncrementValue = CanonicalIVIncrement; - TripCount = TC; - } - - auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, - {CanonicalIVIncrementParts, TC}, DL, - "active.lane.mask.entry"); - VecPreheader->appendRecipe(EntryALM); - - // Now create the ActiveLaneMaskPhi recipe in the main loop using the - // preheader ActiveLaneMask instruction. - auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); - Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); - - // Create the active lane mask for the next iteration of the loop. - CanonicalIVIncrementParts = - new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW - : VPInstruction::CanonicalIVIncrementForPart, - {IncrementValue}, DL); - EB->appendRecipe(CanonicalIVIncrementParts); - - auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, - {CanonicalIVIncrementParts, TripCount}, DL, - "active.lane.mask.next"); - EB->appendRecipe(ALM); - LaneMaskPhi->addOperand(ALM); - - if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { - // Do the increment of the canonical IV after the active.lane.mask, because - // that value is still based off %CanonicalIVPHI - EB->appendRecipe(CanonicalIVIncrement); - } - - // We have to invert the mask here because a true condition means jumping - // to the exit block. - auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); - EB->appendRecipe(NotMask); - - VPInstruction *BranchBack = - new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); - EB->appendRecipe(BranchBack); - } else { - EB->appendRecipe(CanonicalIVIncrement); + EB->appendRecipe(CanonicalIVIncrement); - // Add the BranchOnCount VPInstruction to the latch. - VPInstruction *BranchBack = new VPInstruction( - VPInstruction::BranchOnCount, - {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); - EB->appendRecipe(BranchBack); - } + // Add the BranchOnCount VPInstruction to the latch. + VPInstruction *BranchBack = + new VPInstruction(VPInstruction::BranchOnCount, + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); + EB->appendRecipe(BranchBack); } // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the // original exit block. -static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, - VPBasicBlock *MiddleVPBB, Loop *OrigLoop, +static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, VPlan &Plan) { BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); @@ -8902,8 +8678,8 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, } } -std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions) { +VPlanPtr +LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; @@ -8914,24 +8690,6 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // process after constructing the initial VPlan. 
// --------------------------------------------------------------------------- - for (const auto &Reduction : CM.getInLoopReductionChains()) { - PHINode *Phi = Reduction.first; - RecurKind Kind = - Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); - const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; - - RecipeBuilder.recordRecipeOf(Phi); - for (const auto &R : ReductionOperations) { - RecipeBuilder.recordRecipeOf(R); - // For min/max reductions, where we have a pair of icmp/select, we also - // need to record the ICmp recipe, so it can be removed later. - assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && - "Only min/max recurrences allowed for inloop reductions"); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) - RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); - } - } - // For each interleave group which is relevant for this (possibly trimmed) // Range, add it to the set of groups to be later applied to the VPlan and add // placeholders for its members' Recipes which we'll be replacing with a @@ -8972,23 +8730,27 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); - auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); - VPBlockUtils::insertBlockAfter(TopRegion, Plan->getEntry()); - VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); - VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); + Plan->getVectorLoopRegion()->setEntry(HeaderVPBB); + Plan->getVectorLoopRegion()->setExiting(LatchVPBB); // Don't use getDecisionAndClampRange here, because we don't know the UF // so this function is better to be conservative, rather than to split // it up into different VPlans. + // TODO: Consider using getDecisionAndClampRange here to split up VPlans. bool IVUpdateMayOverflow = false; for (ElementCount VF : Range) IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); - Instruction *DLInst = - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); - addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), - DLInst ? DLInst->getDebugLoc() : DebugLoc(), - CM.getTailFoldingStyle(IVUpdateMayOverflow)); + DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); + TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); + // When not folding the tail, we know that the induction increment will not + // overflow. + bool HasNUW = Style == TailFoldingStyle::None; + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); + + // Proactively create header mask. Masks for other blocks are created on + // demand. + RecipeBuilder.createHeaderMask(*Plan); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -9005,14 +8767,8 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Introduce each ingredient into VPlan. // TODO: Model and preserve debug intrinsics in VPlan. - for (Instruction &I : BB->instructionsWithoutDebug(false)) { + for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) { Instruction *Instr = &I; - - // First filter out irrelevant instructions, to ensure no recipes are - // built for them. 
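The traversal mentioned above visits the loop's basic blocks in a topological (reverse post-order) order, so each block is processed only after all of its predecessors inside the loop body. A minimal standalone RPO walk over a hypothetical if-then-else body (the block names are made up):

#include <algorithm>
#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

// Reverse post-order over an acyclic CFG (the loop body without its backedge):
// DFS post-order, then reverse, guarantees predecessors come first.
void rpoVisit(const std::string &bb,
              const std::map<std::string, std::vector<std::string>> &succs,
              std::set<std::string> &seen,
              std::vector<std::string> &postOrder) {
  if (!seen.insert(bb).second)
    return;
  for (const auto &s : succs.at(bb))
    rpoVisit(s, succs, seen, postOrder);
  postOrder.push_back(bb);
}

int main() {
  // Hypothetical loop body: header branches to then/else, which join in latch.
  std::map<std::string, std::vector<std::string>> succs = {
      {"header", {"then", "else"}}, {"then", {"latch"}},
      {"else", {"latch"}},          {"latch", {}}};
  std::set<std::string> seen;
  std::vector<std::string> postOrder;
  rpoVisit("header", succs, seen, postOrder);
  std::reverse(postOrder.begin(), postOrder.end());
  for (const auto &bb : postOrder)
    std::printf("%s\n", bb.c_str()); // header, else, then, latch (a valid RPO)
}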
- if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) - continue; - SmallVector<VPValue *, 4> Operands; auto *Phi = dyn_cast<PHINode>(Instr); if (Phi && Phi->getParent() == OrigLoop->getHeader()) { @@ -9052,11 +8808,18 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( } RecipeBuilder.setRecipe(Instr, Recipe); - if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && - HeaderVPBB->getFirstNonPhi() != VPBB->end()) { - // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the - // phi section of HeaderVPBB. - assert(isa<TruncInst>(Instr)); + if (isa<VPHeaderPHIRecipe>(Recipe)) { + // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In + // the following cases, VPHeaderPHIRecipes may be created after non-phi + // recipes and need to be moved to the phi section of HeaderVPBB: + // * tail-folding (non-phi recipes computing the header mask are + // introduced earlier than regular header phi recipes, and should appear + // after them) + // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. + + assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || + CM.foldTailByMasking() || isa<TruncInst>(Instr)) && + "unexpected recipe needs moving"); Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); } else VPBB->appendRecipe(Recipe); @@ -9074,7 +8837,7 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. } else - addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); + addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan); assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && @@ -9088,8 +8851,7 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // --------------------------------------------------------------------------- // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, - RecipeBuilder, Range.Start); + adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start); // Interleave memory: for each Interleave Group we marked earlier as relevant // for this VPlan, replace the Recipes widening its memory instructions with a @@ -9150,21 +8912,18 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Sink users of fixed-order recurrence past the recipe defining the previous // value and introduce FirstOrderRecurrenceSplice VPInstructions. if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) - return std::nullopt; - - VPlanTransforms::removeRedundantCanonicalIVs(*Plan); - VPlanTransforms::removeRedundantInductionCasts(*Plan); - - VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); - VPlanTransforms::removeDeadRecipes(*Plan); - - VPlanTransforms::createAndOptimizeReplicateRegions(*Plan); - - VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); - VPlanTransforms::mergeBlocksIntoPredecessors(*Plan); + return nullptr; - assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); - return std::make_optional(std::move(Plan)); + if (useActiveLaneMask(Style)) { + // TODO: Move checks to VPlanTransforms::addActiveLaneMask once + // TailFoldingStyle is visible there. 
+ bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); + bool WithoutRuntimeCheck = + Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; + VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, + WithoutRuntimeCheck); + } + return Plan; } VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { @@ -9198,8 +8957,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); Term->eraseFromParent(); - addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - CM.getTailFoldingStyle()); + // Tail folding is not supported for outer loops, so the induction increment + // is guaranteed to not wrap. + bool HasNUW = true; + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, + DebugLoc()); return Plan; } @@ -9211,105 +8973,211 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { void LoopVectorizationPlanner::adjustRecipesForReductions( VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { - for (const auto &Reduction : CM.getInLoopReductionChains()) { - PHINode *Phi = Reduction.first; - const RecurrenceDescriptor &RdxDesc = - Legal->getReductionVars().find(Phi)->second; - const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; - - if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) + VPBasicBlock *Header = Plan->getVectorLoopRegion()->getEntryBasicBlock(); + // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores + // sank outside of the loop would keep the same order as they had in the + // original loop. + SmallVector<VPReductionPHIRecipe *> ReductionPHIList; + for (VPRecipeBase &R : Header->phis()) { + if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) + ReductionPHIList.emplace_back(ReductionPhi); + } + bool HasIntermediateStore = false; + stable_sort(ReductionPHIList, + [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1, + const VPReductionPHIRecipe *R2) { + auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; + auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; + HasIntermediateStore |= IS1 || IS2; + + // If neither of the recipes has an intermediate store, keep the + // order the same. + if (!IS1 && !IS2) + return false; + + // If only one of the recipes has an intermediate store, then + // move it towards the beginning of the list. + if (IS1 && !IS2) + return true; + + if (!IS1 && IS2) + return false; + + // If both recipes have an intermediate store, then the recipe + // with the later store should be processed earlier. So it + // should go to the beginning of the list. 
+ return DT->dominates(IS2, IS1); + }); + + if (HasIntermediateStore && ReductionPHIList.size() > 1) + for (VPRecipeBase *R : ReductionPHIList) + R->moveBefore(*Header, Header->getFirstNonPhi()); + + SmallVector<VPReductionPHIRecipe *> InLoopReductionPhis; + for (VPRecipeBase &R : Header->phis()) { + auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); + if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) continue; + InLoopReductionPhis.push_back(PhiR); + } + + for (VPReductionPHIRecipe *PhiR : InLoopReductionPhis) { + const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); + RecurKind Kind = RdxDesc.getRecurrenceKind(); + assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && + "AnyOf reductions are not allowed for in-loop reductions"); + + // Collect the chain of "link" recipes for the reduction starting at PhiR. + SetVector<VPRecipeBase *> Worklist; + Worklist.insert(PhiR); + for (unsigned I = 0; I != Worklist.size(); ++I) { + VPRecipeBase *Cur = Worklist[I]; + for (VPUser *U : Cur->getVPSingleValue()->users()) { + auto *UserRecipe = dyn_cast<VPRecipeBase>(U); + if (!UserRecipe) + continue; + assert(UserRecipe->getNumDefinedValues() == 1 && + "recipes must define exactly one result value"); + Worklist.insert(UserRecipe); + } + } + + // Visit operation "Links" along the reduction chain top-down starting from + // the phi until LoopExitValue. We keep track of the previous item + // (PreviousLink) to tell which of the two operands of a Link will remain + // scalar and which will be reduced. For minmax by select(cmp), Link will be + // the select instructions. + VPRecipeBase *PreviousLink = PhiR; // Aka Worklist[0]. + for (VPRecipeBase *CurrentLink : Worklist.getArrayRef().drop_front()) { + VPValue *PreviousLinkV = PreviousLink->getVPSingleValue(); + + Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); - // ReductionOperations are orders top-down from the phi's use to the - // LoopExitValue. We keep a track of the previous item (the Chain) to tell - // which of the two operands will remain scalar and which will be reduced. - // For minmax the chain will be the select instructions. - Instruction *Chain = Phi; - for (Instruction *R : ReductionOperations) { - VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); - RecurKind Kind = RdxDesc.getRecurrenceKind(); - - VPValue *ChainOp = Plan->getVPValue(Chain); - unsigned FirstOpId; - assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && - "Only min/max recurrences allowed for inloop reductions"); + // Index of the first operand which holds a non-mask vector operand. + unsigned IndexOfFirstOperand; // Recognize a call to the llvm.fmuladd intrinsic. 
bool IsFMulAdd = (Kind == RecurKind::FMulAdd); - assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && - "Expected instruction to be a call to the llvm.fmuladd intrinsic"); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - assert(isa<VPWidenSelectRecipe>(WidenRecipe) && - "Expected to replace a VPWidenSelectSC"); - FirstOpId = 1; + VPValue *VecOp; + VPBasicBlock *LinkVPBB = CurrentLink->getParent(); + if (IsFMulAdd) { + assert( + RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) && + "Expected instruction to be a call to the llvm.fmuladd intrinsic"); + assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) || + isa<VPWidenCallRecipe>(CurrentLink)) && + CurrentLink->getOperand(2) == PreviousLinkV && + "expected a call where the previous link is the added operand"); + + // If the instruction is a call to the llvm.fmuladd intrinsic then we + // need to create an fmul recipe (multiplying the first two operands of + // the fmuladd together) to use as the vector operand for the fadd + // reduction. + VPInstruction *FMulRecipe = new VPInstruction( + Instruction::FMul, + {CurrentLink->getOperand(0), CurrentLink->getOperand(1)}, + CurrentLinkI->getFastMathFlags()); + LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); + VecOp = FMulRecipe; } else { - assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || - (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && - "Expected to replace a VPWidenSC"); - FirstOpId = 0; + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { + if (isa<VPWidenRecipe>(CurrentLink)) { + assert(isa<CmpInst>(CurrentLinkI) && + "need to have the compare of the select"); + continue; + } + assert(isa<VPWidenSelectRecipe>(CurrentLink) && + "must be a select recipe"); + IndexOfFirstOperand = 1; + } else { + assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) && + "Expected to replace a VPWidenSC"); + IndexOfFirstOperand = 0; + } + // Note that for non-commutable operands (cmp-selects), the semantics of + // the cmp-select are captured in the recurrence kind. + unsigned VecOpId = + CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLinkV + ? IndexOfFirstOperand + 1 + : IndexOfFirstOperand; + VecOp = CurrentLink->getOperand(VecOpId); + assert(VecOp != PreviousLinkV && + CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - + (VecOpId - IndexOfFirstOperand)) == + PreviousLinkV && + "PreviousLinkV must be the operand other than VecOp"); } - unsigned VecOpId = - R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; - VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); + BasicBlock *BB = CurrentLinkI->getParent(); VPValue *CondOp = nullptr; - if (CM.blockNeedsPredicationForAnyReason(R->getParent())) { + if (CM.blockNeedsPredicationForAnyReason(BB)) { VPBuilder::InsertPointGuard Guard(Builder); - Builder.setInsertPoint(WidenRecipe->getParent(), - WidenRecipe->getIterator()); - CondOp = RecipeBuilder.createBlockInMask(R->getParent(), *Plan); + Builder.setInsertPoint(CurrentLink); + CondOp = RecipeBuilder.createBlockInMask(BB, *Plan); } - if (IsFMulAdd) { - // If the instruction is a call to the llvm.fmuladd intrinsic then we - // need to create an fmul recipe to use as the vector operand for the - // fadd reduction. 
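As the comment above explains, an in-loop fadd reduction fed by llvm.fmuladd is handled by splitting out the multiply and feeding its result to the reduction add, which is a legal unfused lowering of fmuladd. A scalar sketch of that rewrite with made-up inputs:

#include <cstdio>

int main() {
  float a[] = {1.5f, 2.0f, 0.5f};
  float b[] = {4.0f, 3.0f, 8.0f};

  float acc = 0.0f;
  for (int i = 0; i < 3; ++i) {
    // The scalar loop computes acc = fmuladd(a[i], b[i], acc). For the in-loop
    // reduction this becomes a separate multiply feeding the fadd reduction:
    float mul = a[i] * b[i]; // the new FMul recipe
    acc = acc + mul;         // the fadd reduction link
  }
  std::printf("%f\n", acc); // 16.000000
}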
- VPInstruction *FMulRecipe = new VPInstruction( - Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); - FMulRecipe->setFastMathFlags(R->getFastMathFlags()); - WidenRecipe->getParent()->insert(FMulRecipe, - WidenRecipe->getIterator()); - VecOp = FMulRecipe; - } - VPReductionRecipe *RedRecipe = - new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, &TTI); - WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); - Plan->removeVPValueFor(R); - Plan->addVPValue(R, RedRecipe); + VPReductionRecipe *RedRecipe = new VPReductionRecipe( + RdxDesc, CurrentLinkI, PreviousLinkV, VecOp, CondOp); // Append the recipe to the end of the VPBasicBlock because we need to // ensure that it comes after all of it's inputs, including CondOp. - WidenRecipe->getParent()->appendRecipe(RedRecipe); - WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); - WidenRecipe->eraseFromParent(); - - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - VPRecipeBase *CompareRecipe = - RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); - assert(isa<VPWidenRecipe>(CompareRecipe) && - "Expected to replace a VPWidenSC"); - assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && - "Expected no remaining users"); - CompareRecipe->eraseFromParent(); - } - Chain = R; + // Note that this transformation may leave over dead recipes (including + // CurrentLink), which will be cleaned by a later VPlan transform. + LinkVPBB->appendRecipe(RedRecipe); + CurrentLink->getVPSingleValue()->replaceAllUsesWith(RedRecipe); + PreviousLink = RedRecipe; } } - - // If tail is folded by masking, introduce selects between the phi - // and the live-out instruction of each reduction, at the beginning of the - // dedicated latch block. - if (CM.foldTailByMasking()) { - Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); + Builder.setInsertPoint(&*LatchVPBB->begin()); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { - VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); - if (!PhiR || PhiR->isInLoop()) - continue; + VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); + if (!PhiR || PhiR->isInLoop()) + continue; + + const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); + auto *Result = PhiR->getBackedgeValue()->getDefiningRecipe(); + // If tail is folded by masking, introduce selects between the phi + // and the live-out instruction of each reduction, at the beginning of the + // dedicated latch block. + if (CM.foldTailByMasking()) { VPValue *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan); VPValue *Red = PhiR->getBackedgeValue(); assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && "reduction recipe must be defined before latch"); - Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); + FastMathFlags FMFs = RdxDesc.getFastMathFlags(); + Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType(); + Result = + PhiTy->isFloatingPointTy() + ? 
new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs) + : new VPInstruction(Instruction::Select, {Cond, Red, PhiR}); + Result->insertBefore(&*Builder.getInsertPoint()); + Red->replaceUsesWithIf( + Result->getVPSingleValue(), + [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); }); + if (PreferPredicatedReductionSelect || + TTI.preferPredicatedReductionSelect( + PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy, + TargetTransformInfo::ReductionFlags())) + PhiR->setOperand(1, Result->getVPSingleValue()); + } + // If the vector reduction can be performed in a smaller type, we truncate + // then extend the loop exit value to enable InstCombine to evaluate the + // entire expression in the smaller type. + Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); + if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { + assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); + Type *RdxTy = RdxDesc.getRecurrenceType(); + auto *Trunc = new VPWidenCastRecipe(Instruction::Trunc, + Result->getVPSingleValue(), RdxTy); + auto *Extnd = + RdxDesc.isSigned() + ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) + : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); + + Trunc->insertAfter(Result); + Extnd->insertAfter(Trunc); + Result->getVPSingleValue()->replaceAllUsesWith(Extnd); + Trunc->setOperand(0, Result->getVPSingleValue()); } } @@ -9347,107 +9215,6 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { - assert(!State.Instance && "Int or FP induction being replicated."); - - Value *Start = getStartValue()->getLiveInIRValue(); - const InductionDescriptor &ID = getInductionDescriptor(); - TruncInst *Trunc = getTruncInst(); - IRBuilderBase &Builder = State.Builder; - assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); - assert(State.VF.isVector() && "must have vector VF"); - - // The value from the original loop to which we are mapping the new induction - // variable. - Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; - - // Fast-math-flags propagate from the original induction instruction. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) - Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); - - // Now do the actual transformations, and start with fetching the step value. - Value *Step = State.get(getStepValue(), VPIteration(0, 0)); - - assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && - "Expected either an induction phi-node or a truncate of it!"); - - // Construct the initial value of the vector IV in the vector loop preheader - auto CurrIP = Builder.saveIP(); - BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); - Builder.SetInsertPoint(VectorPH->getTerminator()); - if (isa<TruncInst>(EntryVal)) { - assert(Start->getType()->isIntegerTy() && - "Truncation requires an integer type"); - auto *TruncType = cast<IntegerType>(EntryVal->getType()); - Step = Builder.CreateTrunc(Step, TruncType); - Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); - } - - Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); - Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); - Value *SteppedStart = getStepVector( - SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); - - // We create vector phi nodes for both integer and floating-point induction - // variables. 
Here, we determine the kind of arithmetic we will perform. - Instruction::BinaryOps AddOp; - Instruction::BinaryOps MulOp; - if (Step->getType()->isIntegerTy()) { - AddOp = Instruction::Add; - MulOp = Instruction::Mul; - } else { - AddOp = ID.getInductionOpcode(); - MulOp = Instruction::FMul; - } - - // Multiply the vectorization factor by the step using integer or - // floating-point arithmetic as appropriate. - Type *StepType = Step->getType(); - Value *RuntimeVF; - if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); - else - RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); - Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); - - // Create a vector splat to use in the induction update. - // - // FIXME: If the step is non-constant, we create the vector splat with - // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't - // handle a constant vector splat. - Value *SplatVF = isa<Constant>(Mul) - ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) - : Builder.CreateVectorSplat(State.VF, Mul); - Builder.restoreIP(CurrIP); - - // We may need to add the step a number of times, depending on the unroll - // factor. The last of those goes into the PHI. - PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", - &*State.CFG.PrevBB->getFirstInsertionPt()); - VecInd->setDebugLoc(EntryVal->getDebugLoc()); - Instruction *LastInduction = VecInd; - for (unsigned Part = 0; Part < State.UF; ++Part) { - State.set(this, LastInduction, Part); - - if (isa<TruncInst>(EntryVal)) - State.addMetadata(LastInduction, EntryVal); - - LastInduction = cast<Instruction>( - Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); - LastInduction->setDebugLoc(EntryVal->getDebugLoc()); - } - - LastInduction->setName("vec.ind.next"); - VecInd->addIncoming(SteppedStart, VectorPH); - // Add induction update using an incorrect block temporarily. The phi node - // will be fixed after VPlan execution. Note that at this point the latch - // block cannot be used, as it does not exist yet. - // TODO: Model increment value in VPlan, by turning the recipe into a - // multi-def and a subclass of VPHeaderPHIRecipe. - VecInd->addIncoming(LastInduction, VectorPH); -} - void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && "Not a pointer induction according to InductionDescriptor!"); @@ -9480,7 +9247,8 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { Value *Step = State.get(getOperand(1), VPIteration(Part, Lane)); Value *SclrGep = emitTransformedIndex( - State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); + State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, + IndDesc.getKind(), IndDesc.getInductionBinOp()); SclrGep->setName("next.gep"); State.set(this, SclrGep, VPIteration(Part, Lane)); } @@ -9547,41 +9315,26 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) { // Fast-math-flags propagate from the original induction instruction. 
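For context on the VPWidenIntOrFpInductionRecipe::execute body being removed above: it builds the vector induction as a stepped start vector (start plus {0,1,...,VF-1}*step) and then advances each unrolled part by a splat of VF*step ("step.add"). A standalone sketch of the lane values this produces, assuming a fixed VF and UF (plain C++, not LLVM code):
// --- editorial sketch, not part of the patch ---
#include <cstdio>
#include <vector>
int main() {
  const int Start = 5, Step = 3, VF = 4, UF = 2;
  std::vector<int> VecInd(VF);
  for (int L = 0; L < VF; ++L)
    VecInd[L] = Start + L * Step;        // stepped start vector ("vec.ind")
  for (int Part = 0; Part < UF; ++Part) {
    for (int L = 0; L < VF; ++L)
      std::printf("part %d lane %d: %d\n", Part, L, VecInd[L]);
    for (int L = 0; L < VF; ++L)
      VecInd[L] += VF * Step;            // add splat(VF * step) per part
  }
  return 0;
}
The last incremented vector plays the role of "vec.ind.next", the value fed back into the induction phi.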
IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); - if (IndDesc.getInductionBinOp() && - isa<FPMathOperator>(IndDesc.getInductionBinOp())) - State.Builder.setFastMathFlags( - IndDesc.getInductionBinOp()->getFastMathFlags()); + if (FPBinOp) + State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); Value *Step = State.get(getStepValue(), VPIteration(0, 0)); Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0)); - Value *DerivedIV = - emitTransformedIndex(State.Builder, CanonicalIV, - getStartValue()->getLiveInIRValue(), Step, IndDesc); + Value *DerivedIV = emitTransformedIndex( + State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, + Kind, cast_if_present<BinaryOperator>(FPBinOp)); DerivedIV->setName("offset.idx"); - if (ResultTy != DerivedIV->getType()) { - assert(Step->getType()->isIntegerTy() && + if (TruncResultTy) { + assert(TruncResultTy != DerivedIV->getType() && + Step->getType()->isIntegerTy() && "Truncation requires an integer step"); - DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy); + DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy); } assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); State.set(this, DerivedIV, VPIteration(0, 0)); } -void VPScalarIVStepsRecipe::execute(VPTransformState &State) { - // Fast-math-flags propagate from the original induction instruction. - IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); - if (IndDesc.getInductionBinOp() && - isa<FPMathOperator>(IndDesc.getInductionBinOp())) - State.Builder.setFastMathFlags( - IndDesc.getInductionBinOp()->getFastMathFlags()); - - Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); - Value *Step = State.get(getStepValue(), VPIteration(0, 0)); - - buildScalarSteps(BaseIV, Step, IndDesc, this, State); -} - void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), @@ -9592,48 +9345,51 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { void VPReductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Reduction being replicated."); Value *PrevInChain = State.get(getChainOp(), 0); - RecurKind Kind = RdxDesc->getRecurrenceKind(); - bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); + RecurKind Kind = RdxDesc.getRecurrenceKind(); + bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc); // Propagate the fast-math flags carried by the underlying instruction. IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); - State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); + State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *NewVecOp = State.get(getVecOp(), Part); if (VPValue *Cond = getCondOp()) { - Value *NewCond = State.get(Cond, Part); - VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); - Value *Iden = RdxDesc->getRecurrenceIdentity( - Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); - Value *IdenVec = - State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); - Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); + Value *NewCond = State.VF.isVector() ? State.get(Cond, Part) + : State.get(Cond, {Part, 0}); + VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); + Type *ElementTy = VecTy ? 
VecTy->getElementType() : NewVecOp->getType(); + Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy, + RdxDesc.getFastMathFlags()); + if (State.VF.isVector()) { + Iden = + State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); + } + + Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden); NewVecOp = Select; } Value *NewRed; Value *NextInChain; if (IsOrdered) { if (State.VF.isVector()) - NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, + NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain); else NewRed = State.Builder.CreateBinOp( - (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, + (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, NewVecOp); PrevInChain = NewRed; } else { PrevInChain = State.get(getChainOp(), Part); - NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); + NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); } if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - NextInChain = - createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), - NewRed, PrevInChain); + NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), + NewRed, PrevInChain); } else if (IsOrdered) NextInChain = NewRed; else NextInChain = State.Builder.CreateBinOp( - (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, - PrevInChain); + (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); State.set(this, NextInChain, Part); } } @@ -9652,7 +9408,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { VectorType::get(UI->getType(), State.VF)); State.set(this, Poison, State.Instance->Part); } - State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); + State.packScalarIntoVectorValue(this, *State.Instance); } return; } @@ -9718,9 +9474,16 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); bool isMaskRequired = getMask(); - if (isMaskRequired) - for (unsigned Part = 0; Part < State.UF; ++Part) - BlockInMaskParts[Part] = State.get(getMask(), Part); + if (isMaskRequired) { + // Mask reversal is only neede for non-all-one (null) masks, as reverse of a + // null all-one mask is a null mask. + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Mask = State.get(getMask(), Part); + if (isReverse()) + Mask = Builder.CreateVectorReverse(Mask, "reverse"); + BlockInMaskParts[Part] = Mask; + } + } const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { // Calculate the pointer for the specific unroll-part. @@ -9731,7 +9494,8 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout(); Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0) - ? DL.getIndexType(ScalarDataTy->getPointerTo()) + ? DL.getIndexType(PointerType::getUnqual( + ScalarDataTy->getContext())) : Builder.getInt32Ty(); bool InBounds = false; if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) @@ -9751,21 +9515,17 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds); PartPtr = Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds); - if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
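The conditional-reduction path in VPReductionRecipe::execute above neutralizes masked-off lanes by selecting the recurrence identity for them before reducing, so a single unconditional reduction still yields the masked result. A standalone sketch of that idea for an add reduction (plain C++, not LLVM code; identity 0 is specific to addition, other recurrence kinds use their own identity):
// --- editorial sketch, not part of the patch ---
#include <cassert>
int main() {
  const int Vec[4]   = {7, 9, 11, 13};
  const bool Mask[4] = {true, false, true, false};
  int Selected[4];
  for (int L = 0; L < 4; ++L)
    Selected[L] = Mask[L] ? Vec[L] : 0;  // select(cond, vecop, identity)
  int Sum = 0;
  for (int L = 0; L < 4; ++L)
    Sum += Selected[L];                  // unconditional reduction
  assert(Sum == 7 + 11);                 // only active lanes contribute
  return 0;
}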
- BlockInMaskParts[Part] = - Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); } else { Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds); } - unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); - return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); + return PartPtr; }; // Handle Stores: if (SI) { - State.setDebugLocFromInst(SI); + State.setDebugLocFrom(SI->getDebugLoc()); for (unsigned Part = 0; Part < State.UF; ++Part) { Instruction *NewSI = nullptr; @@ -9798,7 +9558,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { // Handle loads. assert(LI && "Must have a load instruction"); - State.setDebugLocFromInst(LI); + State.setDebugLocFrom(LI->getDebugLoc()); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *NewLI; if (CreateGatherScatter) { @@ -9877,95 +9637,6 @@ static ScalarEpilogueLowering getScalarEpilogueLowering( return CM_ScalarEpilogueAllowed; } -Value *VPTransformState::get(VPValue *Def, unsigned Part) { - // If Values have been set for this Def return the one relevant for \p Part. - if (hasVectorValue(Def, Part)) - return Data.PerPartOutput[Def][Part]; - - auto GetBroadcastInstrs = [this, Def](Value *V) { - bool SafeToHoist = Def->isDefinedOutsideVectorRegions(); - if (VF.isScalar()) - return V; - // Place the code for broadcasting invariant variables in the new preheader. - IRBuilder<>::InsertPointGuard Guard(Builder); - if (SafeToHoist) { - BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>( - Plan->getVectorLoopRegion()->getSinglePredecessor())]; - if (LoopVectorPreHeader) - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - } - - // Place the code for broadcasting invariant variables in the new preheader. - // Broadcast the scalar into all locations in the vector. - Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); - - return Shuf; - }; - - if (!hasScalarValue(Def, {Part, 0})) { - Value *IRV = Def->getLiveInIRValue(); - Value *B = GetBroadcastInstrs(IRV); - set(Def, B, Part); - return B; - } - - Value *ScalarValue = get(Def, {Part, 0}); - // If we aren't vectorizing, we can just copy the scalar map values over - // to the vector map. - if (VF.isScalar()) { - set(Def, ScalarValue, Part); - return ScalarValue; - } - - bool IsUniform = vputils::isUniformAfterVectorization(Def); - - unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; - // Check if there is a scalar value for the selected lane. - if (!hasScalarValue(Def, {Part, LastLane})) { - // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and - // VPExpandSCEVRecipes can also be uniform. - assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) || - isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) || - isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) && - "unexpected recipe found to be invariant"); - IsUniform = true; - LastLane = 0; - } - - auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); - // Set the insert point after the last scalarized instruction or after the - // last PHI, if LastInst is a PHI. This ensures the insertelement sequence - // will directly follow the scalar definitions. - auto OldIP = Builder.saveIP(); - auto NewIP = - isa<PHINode>(LastInst) - ? 
BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) - : std::next(BasicBlock::iterator(LastInst)); - Builder.SetInsertPoint(&*NewIP); - - // However, if we are vectorizing, we need to construct the vector values. - // If the value is known to be uniform after vectorization, we can just - // broadcast the scalar value corresponding to lane zero for each unroll - // iteration. Otherwise, we construct the vector values using - // insertelement instructions. Since the resulting vectors are stored in - // State, we will only generate the insertelements once. - Value *VectorValue = nullptr; - if (IsUniform) { - VectorValue = GetBroadcastInstrs(ScalarValue); - set(Def, VectorValue, Part); - } else { - // Initialize packing with insertelements to start from undef. - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); - set(Def, Undef, Part); - for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) - ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); - VectorValue = get(Def, Part); - } - Builder.restoreIP(OldIP); - return VectorValue; -} - // Process the loop in the VPlan-native vectorization path. This path builds // VPlan upfront in the vectorization pipeline, which allows to apply // VPlan-to-VPlan transformations from the very beginning without modifying the @@ -9994,7 +9665,8 @@ static bool processLoopInVPlanNativePath( // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints, + ORE); // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); @@ -10013,8 +9685,10 @@ static bool processLoopInVPlanNativePath( VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); { + bool AddBranchWeights = + hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getParent()->getDataLayout()); + F->getParent()->getDataLayout(), AddBranchWeights); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, VF.Width, 1, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" @@ -10022,6 +9696,8 @@ static bool processLoopInVPlanNativePath( LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); } + reportVectorization(ORE, L, VF, 1); + // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(); assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); @@ -10076,7 +9752,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, std::optional<unsigned> VScale, Loop *L, - ScalarEvolution &SE) { + ScalarEvolution &SE, + ScalarEpilogueLowering SEL) { InstructionCost CheckCost = Checks.getCost(); if (!CheckCost.isValid()) return false; @@ -10146,11 +9823,13 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC double MinTC2 = RtC * 10 / ScalarC; - // Now pick the larger minimum. If it is not a multiple of VF, choose the - // next closest multiple of VF. This should partly compensate for ignoring - // the epilogue cost. + // Now pick the larger minimum. 
If it is not a multiple of VF and a scalar + // epilogue is allowed, choose the next closest multiple of VF. This should + // partly compensate for ignoring the epilogue cost. uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); - VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF)); + if (SEL == CM_ScalarEpilogueAllowed) + MinTC = alignTo(MinTC, IntVF); + VF.MinProfitableTripCount = ElementCount::getFixed(MinTC); LLVM_DEBUG( dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" @@ -10270,7 +9949,14 @@ bool LoopVectorizePass::processLoop(Loop *L) { else { if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) { LLVM_DEBUG(dbgs() << "\n"); - SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; + // Predicate tail-folded loops are efficient even when the loop + // iteration count is low. However, setting the epilogue policy to + // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops + // with runtime checks. It's more effective to let + // `areRuntimeChecksProfitable` determine if vectorization is beneficial + // for the loop. + if (SEL != CM_ScalarEpilogueNotNeededUsePredicate) + SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; } else { LLVM_DEBUG(dbgs() << " But the target considers the trip count too " "small to consider vectorizing.\n"); @@ -10334,7 +10020,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, + LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, ORE); // Get user vectorization factor and interleave count. @@ -10347,8 +10033,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; + bool AddBranchWeights = + hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getParent()->getDataLayout()); + F->getParent()->getDataLayout(), AddBranchWeights); if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. @@ -10365,7 +10053,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (!ForceVectorization && !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, - *PSE.getSE())) { + *PSE.getSE(), SEL)) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), @@ -10587,13 +10275,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { DisableRuntimeUnroll = true; } // Report the vectorization decision. - ORE->emit([&]() { - return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), - L->getHeader()) - << "vectorized loop (vectorization width: " - << NV("VectorizationFactor", VF.Width) - << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; - }); + reportVectorization(ORE, L, VF, IC); } if (ORE->allowExtraAnalysis(LV_NAME)) @@ -10676,8 +10358,14 @@ LoopVectorizeResult LoopVectorizePass::runImpl( Changed |= CFGChanged |= processLoop(L); - if (Changed) + if (Changed) { LAIs->clear(); + +#ifndef NDEBUG + if (VerifySCEV) + SE->verify(); +#endif + } } // Process each loop nest in the function. 
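To make the trip-count bound above concrete: the second minimum requires the runtime-check cost to stay below one tenth of the total scalar work, RtC < ScalarC * TC / 10, i.e. TC > RtC * 10 / ScalarC; the larger of the two minima is then rounded up to a multiple of VF only when a scalar epilogue is allowed. A small standalone sketch of that final step (plain C++, not LLVM code; the first bound MinTC1 is taken as given and all costs are made-up values):
// --- editorial sketch, not part of the patch ---
#include <cmath>
#include <cstdint>
#include <cstdio>
int main() {
  const double RtC = 24.0, ScalarC = 4.0;    // runtime-check and scalar-body cost
  const double MinTC1 = 14.0;                // first bound, assumed given
  const double MinTC2 = RtC * 10 / ScalarC;  // 60: checks <= 10% of scalar work
  uint64_t MinTC = (uint64_t)std::ceil(std::max(MinTC1, MinTC2));
  const uint64_t IntVF = 8;
  const bool ScalarEpilogueAllowed = true;   // mirrors SEL == CM_ScalarEpilogueAllowed
  if (ScalarEpilogueAllowed)
    MinTC = (MinTC + IntVF - 1) / IntVF * IntVF; // alignTo(MinTC, IntVF) -> 64
  std::printf("minimum profitable trip count: %llu\n", (unsigned long long)MinTC);
  return 0;
}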
@@ -10725,10 +10413,6 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, PA.preserve<LoopAnalysis>(); PA.preserve<DominatorTreeAnalysis>(); PA.preserve<ScalarEvolutionAnalysis>(); - -#ifdef EXPENSIVE_CHECKS - SE.verify(); -#endif } if (Result.MadeCFGChange) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 9870ffbb586c..9d799124074c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -19,7 +19,6 @@ #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/PriorityQueue.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" @@ -34,6 +33,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/IVDescriptors.h" @@ -97,7 +97,6 @@ #include <string> #include <tuple> #include <utility> -#include <vector> using namespace llvm; using namespace llvm::PatternMatch; @@ -108,8 +107,9 @@ using namespace slpvectorizer; STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); -cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, - cl::desc("Run the SLP vectorization passes")); +static cl::opt<bool> + RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, + cl::desc("Run the SLP vectorization passes")); static cl::opt<int> SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, @@ -140,10 +140,6 @@ static cl::opt<unsigned> MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)")); -static cl::opt<int> -MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, - cl::desc("Maximum depth of the lookup for consecutive stores.")); - /// Limits the size of scheduling regions in a block. /// It avoid long compile times for _very_ large blocks where vector /// instructions are spread over a wide range. @@ -232,6 +228,17 @@ static bool isVectorLikeInstWithConstOps(Value *V) { return isConstant(I->getOperand(2)); } +#if !defined(NDEBUG) +/// Print a short descriptor of the instruction bundle suitable for debug output. +static std::string shortBundleName(ArrayRef<Value *> VL) { + std::string Result; + raw_string_ostream OS(Result); + OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]"; + OS.flush(); + return Result; +} +#endif + /// \returns true if all of the instructions in \p VL are in the same block or /// false otherwise. 
static bool allSameBlock(ArrayRef<Value *> VL) { @@ -384,8 +391,10 @@ static SmallBitVector isUndefVector(const Value *V, if (isa<T>(II->getOperand(1))) continue; std::optional<unsigned> Idx = getInsertIndex(II); - if (!Idx) - continue; + if (!Idx) { + Res.reset(); + return Res; + } if (*Idx < UseMask.size() && !UseMask.test(*Idx)) Res.reset(*Idx); } @@ -429,26 +438,6 @@ static SmallBitVector isUndefVector(const Value *V, /// i32 6> /// %2 = mul <4 x i8> %1, %1 /// ret <4 x i8> %2 -/// We convert this initially to something like: -/// %x0 = extractelement <4 x i8> %x, i32 0 -/// %x3 = extractelement <4 x i8> %x, i32 3 -/// %y1 = extractelement <4 x i8> %y, i32 1 -/// %y2 = extractelement <4 x i8> %y, i32 2 -/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 -/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 -/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 -/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 -/// %5 = mul <4 x i8> %4, %4 -/// %6 = extractelement <4 x i8> %5, i32 0 -/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 -/// %7 = extractelement <4 x i8> %5, i32 1 -/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 -/// %8 = extractelement <4 x i8> %5, i32 2 -/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 -/// %9 = extractelement <4 x i8> %5, i32 3 -/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 -/// ret <4 x i8> %ins4 -/// InstCombiner transforms this into a shuffle and vector mul /// Mask will return the Shuffle Mask equivalent to the extracted elements. /// TODO: Can we split off and reuse the shuffle mask detection from /// ShuffleVectorInst/getShuffleCost? @@ -539,117 +528,6 @@ static std::optional<unsigned> getExtractIndex(Instruction *E) { return *EI->idx_begin(); } -/// Tries to find extractelement instructions with constant indices from fixed -/// vector type and gather such instructions into a bunch, which highly likely -/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was -/// successful, the matched scalars are replaced by poison values in \p VL for -/// future analysis. -static std::optional<TTI::ShuffleKind> -tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, - SmallVectorImpl<int> &Mask) { - // Scan list of gathered scalars for extractelements that can be represented - // as shuffles. - MapVector<Value *, SmallVector<int>> VectorOpToIdx; - SmallVector<int> UndefVectorExtracts; - for (int I = 0, E = VL.size(); I < E; ++I) { - auto *EI = dyn_cast<ExtractElementInst>(VL[I]); - if (!EI) { - if (isa<UndefValue>(VL[I])) - UndefVectorExtracts.push_back(I); - continue; - } - auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType()); - if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand())) - continue; - std::optional<unsigned> Idx = getExtractIndex(EI); - // Undefined index. - if (!Idx) { - UndefVectorExtracts.push_back(I); - continue; - } - SmallBitVector ExtractMask(VecTy->getNumElements(), true); - ExtractMask.reset(*Idx); - if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) { - UndefVectorExtracts.push_back(I); - continue; - } - VectorOpToIdx[EI->getVectorOperand()].push_back(I); - } - // Sort the vector operands by the maximum number of uses in extractelements. 
- MapVector<unsigned, SmallVector<Value *>> VFToVector; - for (const auto &Data : VectorOpToIdx) - VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()] - .push_back(Data.first); - for (auto &Data : VFToVector) { - stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) { - return VectorOpToIdx.find(V1)->second.size() > - VectorOpToIdx.find(V2)->second.size(); - }); - } - // Find the best pair of the vectors with the same number of elements or a - // single vector. - const int UndefSz = UndefVectorExtracts.size(); - unsigned SingleMax = 0; - Value *SingleVec = nullptr; - unsigned PairMax = 0; - std::pair<Value *, Value *> PairVec(nullptr, nullptr); - for (auto &Data : VFToVector) { - Value *V1 = Data.second.front(); - if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) { - SingleMax = VectorOpToIdx[V1].size() + UndefSz; - SingleVec = V1; - } - Value *V2 = nullptr; - if (Data.second.size() > 1) - V2 = *std::next(Data.second.begin()); - if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + - UndefSz) { - PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz; - PairVec = std::make_pair(V1, V2); - } - } - if (SingleMax == 0 && PairMax == 0 && UndefSz == 0) - return std::nullopt; - // Check if better to perform a shuffle of 2 vectors or just of a single - // vector. - SmallVector<Value *> SavedVL(VL.begin(), VL.end()); - SmallVector<Value *> GatheredExtracts( - VL.size(), PoisonValue::get(VL.front()->getType())); - if (SingleMax >= PairMax && SingleMax) { - for (int Idx : VectorOpToIdx[SingleVec]) - std::swap(GatheredExtracts[Idx], VL[Idx]); - } else { - for (Value *V : {PairVec.first, PairVec.second}) - for (int Idx : VectorOpToIdx[V]) - std::swap(GatheredExtracts[Idx], VL[Idx]); - } - // Add extracts from undefs too. - for (int Idx : UndefVectorExtracts) - std::swap(GatheredExtracts[Idx], VL[Idx]); - // Check that gather of extractelements can be represented as just a - // shuffle of a single/two vectors the scalars are extracted from. - std::optional<TTI::ShuffleKind> Res = - isFixedVectorShuffle(GatheredExtracts, Mask); - if (!Res) { - // TODO: try to check other subsets if possible. - // Restore the original VL if attempt was not successful. - VL.swap(SavedVL); - return std::nullopt; - } - // Restore unused scalars from mask, if some of the extractelements were not - // selected for shuffle. - for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) { - auto *EI = dyn_cast<ExtractElementInst>(VL[I]); - if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) || - !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) || - is_contained(UndefVectorExtracts, I)) - continue; - if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I])) - std::swap(VL[I], GatheredExtracts[I]); - } - return Res; -} - namespace { /// Main data required for vectorization of instructions. @@ -695,7 +573,7 @@ static Value *isOneOf(const InstructionsState &S, Value *Op) { return S.OpValue; } -/// \returns true if \p Opcode is allowed as part of of the main/alternate +/// \returns true if \p Opcode is allowed as part of the main/alternate /// instruction for SLP vectorization. /// /// Example of unsupported opcode is SDIV that can potentially cause UB if the @@ -889,18 +767,14 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, /// \returns true if all of the values in \p VL have the same type or false /// otherwise. 
static bool allSameType(ArrayRef<Value *> VL) { - Type *Ty = VL[0]->getType(); - for (int i = 1, e = VL.size(); i < e; i++) - if (VL[i]->getType() != Ty) - return false; - - return true; + Type *Ty = VL.front()->getType(); + return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; }); } /// \returns True if in-tree use also needs extract. This refers to /// possible scalar operand in vectorized instruction. -static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, - TargetLibraryInfo *TLI) { +static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, + TargetLibraryInfo *TLI) { unsigned Opcode = UserInst->getOpcode(); switch (Opcode) { case Instruction::Load: { @@ -914,11 +788,10 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, case Instruction::Call: { CallInst *CI = cast<CallInst>(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { - if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) - return (CI->getArgOperand(i) == Scalar); - } - [[fallthrough]]; + return any_of(enumerate(CI->args()), [&](auto &&Arg) { + return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) && + Arg.value().get() == Scalar; + }); } default: return false; @@ -1181,6 +1054,7 @@ public: void deleteTree() { VectorizableTree.clear(); ScalarToTreeEntry.clear(); + MultiNodeScalars.clear(); MustGather.clear(); EntryToLastInstruction.clear(); ExternalUses.clear(); @@ -1273,7 +1147,7 @@ public: /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. /// /// \returns number of elements in vector if isomorphism exists, 0 otherwise. - unsigned canMapToVector(Type *T, const DataLayout &DL) const; + unsigned canMapToVector(Type *T) const; /// \returns True if the VectorizableTree is both tiny and not fully /// vectorizable. We do not vectorize such trees. @@ -1324,6 +1198,9 @@ public: } LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } #endif + bool operator == (const EdgeInfo &Other) const { + return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx; + } }; /// A helper class used for scoring candidates for two consecutive lanes. @@ -1764,7 +1641,7 @@ public: auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV); if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV)) return 0; - return R.areAllUsersVectorized(IdxLaneI, std::nullopt) + return R.areAllUsersVectorized(IdxLaneI) ? LookAheadHeuristics::ScoreAllUserVectorized : 0; } @@ -1941,7 +1818,7 @@ public: HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); } else if (NumFreeOpsHash.NumOfAPOs == Min && NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) { - auto It = HashMap.find(NumFreeOpsHash.Hash); + auto *It = HashMap.find(NumFreeOpsHash.Hash); if (It == HashMap.end()) HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); else @@ -2203,7 +2080,7 @@ public: for (int Pass = 0; Pass != 2; ++Pass) { // Check if no need to reorder operands since they're are perfect or // shuffled diamond match. - // Need to to do it to avoid extra external use cost counting for + // Need to do it to avoid extra external use cost counting for // shuffled matches, which may cause regressions. if (SkipReordering()) break; @@ -2388,6 +2265,18 @@ public: ~BoUpSLP(); private: + /// Determine if a vectorized value \p V in can be demoted to + /// a smaller type with a truncation. We collect the values that will be + /// demoted in ToDemote and additional roots that require investigating in + /// Roots. 
+ /// \param DemotedConsts list of Instruction/OperandIndex pairs that are + /// constant and to be demoted. Required to correctly identify constant nodes + /// to be demoted. + bool collectValuesToDemote( + Value *V, SmallVectorImpl<Value *> &ToDemote, + DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts, + SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const; + /// Check if the operands on the edges \p Edges of the \p UserTE allows /// reordering (i.e. the operands can be reordered because they have only one /// user and reordarable). @@ -2410,12 +2299,25 @@ private: TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) { ArrayRef<Value *> VL = UserTE->getOperand(OpIdx); TreeEntry *TE = nullptr; - const auto *It = find_if(VL, [this, &TE](Value *V) { + const auto *It = find_if(VL, [&](Value *V) { TE = getTreeEntry(V); - return TE; + if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) + return true; + auto It = MultiNodeScalars.find(V); + if (It != MultiNodeScalars.end()) { + for (TreeEntry *E : It->second) { + if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) { + TE = E; + return true; + } + } + } + return false; }); - if (It != VL.end() && TE->isSame(VL)) + if (It != VL.end()) { + assert(TE->isSame(VL) && "Expected same scalars."); return TE; + } return nullptr; } @@ -2428,13 +2330,16 @@ private: } /// Checks if all users of \p I are the part of the vectorization tree. - bool areAllUsersVectorized(Instruction *I, - ArrayRef<Value *> VectorizedVals) const; + bool areAllUsersVectorized( + Instruction *I, + const SmallDenseSet<Value *> *VectorizedVals = nullptr) const; /// Return information about the vector formed for the specified index /// of a vector of (the same) instruction. - TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> VL, - unsigned OpIdx); + TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops); + + /// \ returns the graph entry for the \p Idx operand of the \p E entry. + const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const; /// \returns the cost of the vectorizable entry. InstructionCost getEntryCost(const TreeEntry *E, @@ -2450,15 +2355,22 @@ private: /// vector) and sets \p CurrentOrder to the identity permutation; otherwise /// returns false, setting \p CurrentOrder to either an empty vector or a /// non-identity permutation that allows to reuse extract instructions. + /// \param ResizeAllowed indicates whether it is allowed to handle subvector + /// extract order. bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, - SmallVectorImpl<unsigned> &CurrentOrder) const; + SmallVectorImpl<unsigned> &CurrentOrder, + bool ResizeAllowed = false) const; /// Vectorize a single entry in the tree. - Value *vectorizeTree(TreeEntry *E); + /// \param PostponedPHIs true, if need to postpone emission of phi nodes to + /// avoid issues with def-use order. + Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs); /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry /// \p E. - Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx); + /// \param PostponedPHIs true, if need to postpone emission of phi nodes to + /// avoid issues with def-use order. + Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs); /// Create a new vector from a list of scalar values. Produces a sequence /// which exploits values reused across lanes, and arranges the inserts @@ -2477,17 +2389,50 @@ private: /// instruction in the list). 
Instruction &getLastInstructionInBundle(const TreeEntry *E); - /// Checks if the gathered \p VL can be represented as shuffle(s) of previous - /// tree entries. + /// Tries to find extractelement instructions with constant indices from fixed + /// vector type and gather such instructions into a bunch, which highly likely + /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt + /// was successful, the matched scalars are replaced by poison values in \p VL + /// for future analysis. + std::optional<TargetTransformInfo::ShuffleKind> + tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL, + SmallVectorImpl<int> &Mask) const; + + /// Tries to find extractelement instructions with constant indices from fixed + /// vector type and gather such instructions into a bunch, which highly likely + /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt + /// was successful, the matched scalars are replaced by poison values in \p VL + /// for future analysis. + SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> + tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, + SmallVectorImpl<int> &Mask, + unsigned NumParts) const; + + /// Checks if the gathered \p VL can be represented as a single register + /// shuffle(s) of previous tree entries. /// \param TE Tree entry checked for permutation. /// \param VL List of scalars (a subset of the TE scalar), checked for - /// permutations. + /// permutations. Must form single-register vector. /// \returns ShuffleKind, if gathered values can be represented as shuffles of - /// previous tree entries. \p Mask is filled with the shuffle mask. + /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. std::optional<TargetTransformInfo::ShuffleKind> - isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, - SmallVectorImpl<int> &Mask, - SmallVectorImpl<const TreeEntry *> &Entries); + isGatherShuffledSingleRegisterEntry( + const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask, + SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part); + + /// Checks if the gathered \p VL can be represented as multi-register + /// shuffle(s) of previous tree entries. + /// \param TE Tree entry checked for permutation. + /// \param VL List of scalars (a subset of the TE scalar), checked for + /// permutations. + /// \returns per-register series of ShuffleKind, if gathered values can be + /// represented as shuffles of previous tree entries. \p Mask is filled with + /// the shuffle mask (also on per-register base). + SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> + isGatherShuffledEntry( + const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, + SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, + unsigned NumParts); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -2517,14 +2462,14 @@ private: /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the /// users of \p TE and collects the stores. It returns the map from the store /// pointers to the collected stores. - DenseMap<Value *, SmallVector<StoreInst *, 4>> + DenseMap<Value *, SmallVector<StoreInst *>> collectUserStores(const BoUpSLP::TreeEntry *TE) const; /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the - /// stores in \p StoresVec can form a vector instruction. 
If so it returns true - /// and populates \p ReorderIndices with the shuffle indices of the the stores - /// when compared to the sorted vector. - bool canFormVector(const SmallVector<StoreInst *, 4> &StoresVec, + /// stores in \p StoresVec can form a vector instruction. If so it returns + /// true and populates \p ReorderIndices with the shuffle indices of the + /// stores when compared to the sorted vector. + bool canFormVector(ArrayRef<StoreInst *> StoresVec, OrdersType &ReorderIndices) const; /// Iterates through the users of \p TE, looking for scalar stores that can be @@ -2621,10 +2566,18 @@ private: /// The Scalars are vectorized into this value. It is initialized to Null. WeakTrackingVH VectorizedValue = nullptr; + /// New vector phi instructions emitted for the vectorized phi nodes. + PHINode *PHI = nullptr; + /// Do we need to gather this sequence or vectorize it /// (either with vector instruction or with scatter/gather /// intrinsics for store/load)? - enum EntryState { Vectorize, ScatterVectorize, NeedToGather }; + enum EntryState { + Vectorize, + ScatterVectorize, + PossibleStridedVectorize, + NeedToGather + }; EntryState State; /// Does this sequence require some shuffling? @@ -2772,6 +2725,14 @@ private: return FoundLane; } + /// Build a shuffle mask for graph entry which represents a merge of main + /// and alternate operations. + void + buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp, + SmallVectorImpl<int> &Mask, + SmallVectorImpl<Value *> *OpScalars = nullptr, + SmallVectorImpl<Value *> *AltScalars = nullptr) const; + #ifndef NDEBUG /// Debug printer. LLVM_DUMP_METHOD void dump() const { @@ -2792,6 +2753,9 @@ private: case ScatterVectorize: dbgs() << "ScatterVectorize\n"; break; + case PossibleStridedVectorize: + dbgs() << "PossibleStridedVectorize\n"; + break; case NeedToGather: dbgs() << "NeedToGather\n"; break; @@ -2892,7 +2856,14 @@ private: } if (Last->State != TreeEntry::NeedToGather) { for (Value *V : VL) { - assert(!getTreeEntry(V) && "Scalar already in tree!"); + const TreeEntry *TE = getTreeEntry(V); + assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) && + "Scalar already in tree!"); + if (TE) { + if (TE != Last) + MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last); + continue; + } ScalarToTreeEntry[V] = Last; } // Update the scheduler bundle to point to this TreeEntry. @@ -2905,7 +2876,8 @@ private: for (Value *V : VL) { if (doesNotNeedToBeScheduled(V)) continue; - assert(BundleMember && "Unexpected end of bundle."); + if (!BundleMember) + continue; BundleMember->TE = Last; BundleMember = BundleMember->NextInBundle; } @@ -2913,6 +2885,10 @@ private: assert(!BundleMember && "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); + // Build a map for gathered scalars to the nodes where they are used. + for (Value *V : VL) + if (!isConstant(V)) + ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); } if (UserTreeIdx.UserTE) @@ -2950,6 +2926,10 @@ private: /// Maps a specific scalar to its tree entry. SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry; + /// List of scalars, used in several vectorize nodes, and the list of the + /// nodes. + SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars; + /// Maps a value to the proposed vectorizable size. SmallDenseMap<Value *, unsigned> InstrElementSize; @@ -2995,25 +2975,25 @@ private: /// is invariant in the calling loop. 
bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, Instruction *Inst2) { + if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2)) + return true; // First check if the result is already in the cache. - AliasCacheKey key = std::make_pair(Inst1, Inst2); - std::optional<bool> &result = AliasCache[key]; - if (result) { - return *result; - } - bool aliased = true; - if (Loc1.Ptr && isSimple(Inst1)) - aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1)); + AliasCacheKey Key = std::make_pair(Inst1, Inst2); + auto It = AliasCache.find(Key); + if (It != AliasCache.end()) + return It->second; + bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1)); // Store the result in the cache. - result = aliased; - return aliased; + AliasCache.try_emplace(Key, Aliased); + AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased); + return Aliased; } using AliasCacheKey = std::pair<Instruction *, Instruction *>; /// Cache for alias results. /// TODO: consider moving this to the AliasAnalysis itself. - DenseMap<AliasCacheKey, std::optional<bool>> AliasCache; + DenseMap<AliasCacheKey, bool> AliasCache; // Cache for pointerMayBeCaptured calls inside AA. This is preserved // globally through SLP because we don't perform any action which @@ -3047,7 +3027,7 @@ private: SetVector<Instruction *> GatherShuffleExtractSeq; /// A list of blocks that we are going to CSE. - SetVector<BasicBlock *> CSEBlocks; + DenseSet<BasicBlock *> CSEBlocks; /// Contains all scheduling relevant data for an instruction. /// A ScheduleData either represents a single instruction or a member of an @@ -3497,7 +3477,7 @@ private: BasicBlock *BB; /// Simple memory allocation for ScheduleData. - std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; + SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; /// The size of a ScheduleData array in ScheduleDataChunks. int ChunkSize; @@ -3607,7 +3587,7 @@ private: /// where "width" indicates the minimum bit width and "signed" is True if the /// value must be signed-extended, rather than zero-extended, back to its /// original width. 
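The isAliased change above bails out early for queries it cannot analyze and stores the computed answer under both key orders, so the symmetric query (Inst2, Inst1) is also a cache hit. A standalone sketch of that symmetric memoization pattern (plain C++, with std::map standing in for the DenseMap and a fake expensive query):
// --- editorial sketch, not part of the patch ---
#include <cstdio>
#include <map>
#include <utility>
using Key = std::pair<int, int>;
static std::map<Key, bool> Cache;
static int ExpensiveCalls = 0;
static bool expensiveQuery(int A, int B) { ++ExpensiveCalls; return (A + B) % 2 == 0; }
static bool cachedQuery(int A, int B) {
  auto It = Cache.find({A, B});
  if (It != Cache.end())
    return It->second;   // hit regardless of which order was inserted
  bool R = expensiveQuery(A, B);
  Cache[{A, B}] = R;     // store the result...
  Cache[{B, A}] = R;     // ...and its mirror
  return R;
}
int main() {
  cachedQuery(1, 3);
  cachedQuery(3, 1);                                     // served from the mirror
  std::printf("expensive calls: %d\n", ExpensiveCalls);  // prints 1
  return 0;
}
Caching the mirrored key trades a little memory for avoiding a second alias query on the swapped pair, which the old code would have recomputed.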
- MapVector<Value *, std::pair<uint64_t, bool>> MinBWs; + DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs; }; } // end namespace slpvectorizer @@ -3676,7 +3656,7 @@ template <> struct GraphTraits<BoUpSLP *> { template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { using TreeEntry = BoUpSLP::TreeEntry; - DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { std::string Str; @@ -3699,7 +3679,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { const BoUpSLP *) { if (Entry->State == TreeEntry::NeedToGather) return "color=red"; - if (Entry->State == TreeEntry::ScatterVectorize) + if (Entry->State == TreeEntry::ScatterVectorize || + Entry->State == TreeEntry::PossibleStridedVectorize) return "color=blue"; return ""; } @@ -3761,7 +3742,7 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) { inversePermutation(Order, MaskOrder); } reorderReuses(MaskOrder, Mask); - if (ShuffleVectorInst::isIdentityMask(MaskOrder)) { + if (ShuffleVectorInst::isIdentityMask(MaskOrder, MaskOrder.size())) { Order.clear(); return; } @@ -3779,7 +3760,40 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { OrdersType CurrentOrder(NumScalars, NumScalars); SmallVector<int> Positions; SmallBitVector UsedPositions(NumScalars); - const TreeEntry *STE = nullptr; + DenseMap<const TreeEntry *, unsigned> UsedEntries; + DenseMap<Value *, std::pair<const TreeEntry *, unsigned>> ValueToEntryPos; + for (Value *V : TE.Scalars) { + if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V)) + continue; + const auto *LocalSTE = getTreeEntry(V); + if (!LocalSTE) + continue; + unsigned Lane = + std::distance(LocalSTE->Scalars.begin(), find(LocalSTE->Scalars, V)); + if (Lane >= NumScalars) + continue; + ++UsedEntries.try_emplace(LocalSTE, 0).first->getSecond(); + ValueToEntryPos.try_emplace(V, LocalSTE, Lane); + } + if (UsedEntries.empty()) + return std::nullopt; + const TreeEntry &BestSTE = + *std::max_element(UsedEntries.begin(), UsedEntries.end(), + [](const std::pair<const TreeEntry *, unsigned> &P1, + const std::pair<const TreeEntry *, unsigned> &P2) { + return P1.second < P2.second; + }) + ->first; + UsedEntries.erase(&BestSTE); + const TreeEntry *SecondBestSTE = nullptr; + if (!UsedEntries.empty()) + SecondBestSTE = + std::max_element(UsedEntries.begin(), UsedEntries.end(), + [](const std::pair<const TreeEntry *, unsigned> &P1, + const std::pair<const TreeEntry *, unsigned> &P2) { + return P1.second < P2.second; + }) + ->first; // Try to find all gathered scalars that are gets vectorized in other // vectorize node. Here we can have only one single tree vector node to // correctly identify order of the gathered scalars. @@ -3787,58 +3801,56 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { Value *V = TE.Scalars[I]; if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V)) continue; - if (const auto *LocalSTE = getTreeEntry(V)) { - if (!STE) - STE = LocalSTE; - else if (STE != LocalSTE) - // Take the order only from the single vector node. 
- return std::nullopt; - unsigned Lane = - std::distance(STE->Scalars.begin(), find(STE->Scalars, V)); - if (Lane >= NumScalars) - return std::nullopt; - if (CurrentOrder[Lane] != NumScalars) { - if (Lane != I) - continue; - UsedPositions.reset(CurrentOrder[Lane]); - } - // The partial identity (where only some elements of the gather node are - // in the identity order) is good. - CurrentOrder[Lane] = I; - UsedPositions.set(I); + const auto [LocalSTE, Lane] = ValueToEntryPos.lookup(V); + if (!LocalSTE || (LocalSTE != &BestSTE && LocalSTE != SecondBestSTE)) + continue; + if (CurrentOrder[Lane] != NumScalars) { + if ((CurrentOrder[Lane] >= BestSTE.Scalars.size() || + BestSTE.Scalars[CurrentOrder[Lane]] == V) && + (Lane != I || LocalSTE == SecondBestSTE)) + continue; + UsedPositions.reset(CurrentOrder[Lane]); } + // The partial identity (where only some elements of the gather node are + // in the identity order) is good. + CurrentOrder[Lane] = I; + UsedPositions.set(I); } // Need to keep the order if we have a vector entry and at least 2 scalars or // the vectorized entry has just 2 scalars. - if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) { - auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) { - for (unsigned I = 0; I < NumScalars; ++I) - if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars) - return false; - return true; - }; - if (IsIdentityOrder(CurrentOrder)) - return OrdersType(); - auto *It = CurrentOrder.begin(); - for (unsigned I = 0; I < NumScalars;) { - if (UsedPositions.test(I)) { - ++I; - continue; - } - if (*It == NumScalars) { - *It = I; - ++I; - } - ++It; + if (BestSTE.Scalars.size() != 2 && UsedPositions.count() <= 1) + return std::nullopt; + auto IsIdentityOrder = [&](ArrayRef<unsigned> CurrentOrder) { + for (unsigned I = 0; I < NumScalars; ++I) + if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars) + return false; + return true; + }; + if (IsIdentityOrder(CurrentOrder)) + return OrdersType(); + auto *It = CurrentOrder.begin(); + for (unsigned I = 0; I < NumScalars;) { + if (UsedPositions.test(I)) { + ++I; + continue; } - return std::move(CurrentOrder); + if (*It == NumScalars) { + *It = I; + ++I; + } + ++It; } - return std::nullopt; + return std::move(CurrentOrder); } namespace { /// Tracks the state we can represent the loads in the given sequence. -enum class LoadsState { Gather, Vectorize, ScatterVectorize }; +enum class LoadsState { + Gather, + Vectorize, + ScatterVectorize, + PossibleStridedVectorize +}; } // anonymous namespace static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, @@ -3898,6 +3910,7 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, if (IsSorted || all_of(PointerOps, [&](Value *P) { return arePointersCompatible(P, PointerOps.front(), TLI); })) { + bool IsPossibleStrided = false; if (IsSorted) { Value *Ptr0; Value *PtrN; @@ -3913,6 +3926,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, // Check that the sorted loads are consecutive. if (static_cast<unsigned>(*Diff) == VL.size() - 1) return LoadsState::Vectorize; + // Simple check if not a strided access - clear order. 
+ IsPossibleStrided = *Diff % (VL.size() - 1) == 0; } // TODO: need to improve analysis of the pointers, if not all of them are // GEPs or have > 2 operands, we end up with a gather node, which just @@ -3934,7 +3949,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) && !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment)) - return LoadsState::ScatterVectorize; + return IsPossibleStrided ? LoadsState::PossibleStridedVectorize + : LoadsState::ScatterVectorize; } } @@ -4050,7 +4066,8 @@ static bool areTwoInsertFromSameBuildVector( // Go through the vector operand of insertelement instructions trying to find // either VU as the original vector for IE2 or V as the original vector for // IE1. - SmallSet<int, 8> ReusedIdx; + SmallBitVector ReusedIdx( + cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue()); bool IsReusedIdx = false; do { if (IE2 == VU && !IE1) @@ -4058,16 +4075,18 @@ static bool areTwoInsertFromSameBuildVector( if (IE1 == V && !IE2) return V->hasOneUse(); if (IE1 && IE1 != V) { - IsReusedIdx |= - !ReusedIdx.insert(getInsertIndex(IE1).value_or(*Idx2)).second; + unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2); + IsReusedIdx |= ReusedIdx.test(Idx1); + ReusedIdx.set(Idx1); if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx) IE1 = nullptr; else IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1)); } if (IE2 && IE2 != VU) { - IsReusedIdx |= - !ReusedIdx.insert(getInsertIndex(IE2).value_or(*Idx1)).second; + unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1); + IsReusedIdx |= ReusedIdx.test(Idx2); + ReusedIdx.set(Idx2); if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx) IE2 = nullptr; else @@ -4135,13 +4154,16 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return std::nullopt; // No need to reorder. 
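The load-classification change above distinguishes its cases from the element distance Diff between the first and last sorted pointer: a distance of VL.size()-1 means the loads are consecutive, a multiple of VL.size()-1 hints at a constant stride, and anything else falls back to a scatter/gather (or plain gather if masked gathers are not legal). A standalone sketch of that classification (plain C++, not LLVM code; TTI legality is reduced to a single flag):
// --- editorial sketch, not part of the patch ---
#include <cstdio>
enum class LoadsState { Gather, Vectorize, ScatterVectorize, PossibleStridedVectorize };
static LoadsState classify(long Diff, unsigned NumLoads, bool MaskedGatherLegal) {
  if (Diff == (long)NumLoads - 1)
    return LoadsState::Vectorize;                       // consecutive elements
  bool IsPossibleStrided = Diff % ((long)NumLoads - 1) == 0;
  if (!MaskedGatherLegal)
    return LoadsState::Gather;
  return IsPossibleStrided ? LoadsState::PossibleStridedVectorize
                           : LoadsState::ScatterVectorize;
}
int main() {
  std::printf("%d\n", (int)classify(3, 4, true)); // 1: stride-1, consecutive
  std::printf("%d\n", (int)classify(9, 4, true)); // 3: stride-3 candidate
  std::printf("%d\n", (int)classify(7, 4, true)); // 2: irregular -> scatter
  return 0;
}
For example, four loads at elements 0, 3, 6, 9 give Diff == 9, which is divisible by 3, so they are flagged as a possible strided access rather than a plain scatter.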
return std::move(ResOrder); } - if (TE.State == TreeEntry::Vectorize && + if ((TE.State == TreeEntry::Vectorize || + TE.State == TreeEntry::PossibleStridedVectorize) && (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) || (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) && !TE.isAltShuffle()) return TE.ReorderIndices; if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { - auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) { + auto PHICompare = [&](unsigned I1, unsigned I2) { + Value *V1 = TE.Scalars[I1]; + Value *V2 = TE.Scalars[I2]; if (V1 == V2) return false; if (!V1->hasOneUse() || !V2->hasOneUse()) @@ -4180,14 +4202,13 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { }; if (!TE.ReorderIndices.empty()) return TE.ReorderIndices; - DenseMap<Value *, unsigned> PhiToId; - SmallVector<Value *, 4> Phis; + DenseMap<unsigned, unsigned> PhiToId; + SmallVector<unsigned> Phis(TE.Scalars.size()); + std::iota(Phis.begin(), Phis.end(), 0); OrdersType ResOrder(TE.Scalars.size()); - for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) { - PhiToId[TE.Scalars[Id]] = Id; - Phis.push_back(TE.Scalars[Id]); - } - llvm::stable_sort(Phis, PHICompare); + for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) + PhiToId[Id] = Id; + stable_sort(Phis, PHICompare); for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id) ResOrder[Id] = PhiToId[Phis[Id]]; if (IsIdentityOrder(ResOrder)) @@ -4214,7 +4235,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // Check that gather of extractelements can be represented as // just a shuffle of a single vector. OrdersType CurrentOrder; - bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder); + bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder, + /*ResizeAllowed=*/true); if (Reuse || !CurrentOrder.empty()) { if (!CurrentOrder.empty()) fixupOrderingIndices(CurrentOrder); @@ -4270,7 +4292,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask, unsigned Sz) { ArrayRef<int> FirstCluster = Mask.slice(0, Sz); - if (ShuffleVectorInst::isIdentityMask(FirstCluster)) + if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz)) return false; for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) { ArrayRef<int> Cluster = Mask.slice(I, Sz); @@ -4386,7 +4408,9 @@ void BoUpSLP::reorderTopToBottom() { ++Cnt; } VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); - if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty()) + if (!(TE->State == TreeEntry::Vectorize || + TE->State == TreeEntry::PossibleStridedVectorize) || + !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); if (TE->State == TreeEntry::Vectorize && TE->getOpcode() == Instruction::PHI) @@ -4409,6 +4433,9 @@ void BoUpSLP::reorderTopToBottom() { MapVector<OrdersType, unsigned, DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> OrdersUses; + // Last chance orders - scatter vectorize. Try to use their orders if no + // other orders or the order is counted already. + SmallVector<OrdersType> StridedVectorizeOrders; SmallPtrSet<const TreeEntry *, 4> VisitedOps; for (const TreeEntry *OpTE : OrderedEntries) { // No need to reorder this nodes, still need to extend and to use shuffle, @@ -4455,6 +4482,11 @@ void BoUpSLP::reorderTopToBottom() { if (Order.empty()) continue; } + // Postpone scatter orders. 
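// The PHI reordering shown earlier in this hunk now sorts lane indices rather
// than the scalar values themselves: indices are seeded with std::iota and
// permuted with a stable sort whose comparator looks the lanes up in the
// scalar list. A minimal standalone sketch of that argsort pattern follows
// (illustrative names and element type, not the LLVM code).
#include <algorithm>
#include <numeric>
#include <string>
#include <vector>

std::vector<unsigned> sortedLaneOrder(const std::vector<std::string> &Scalars) {
  std::vector<unsigned> Lanes(Scalars.size());
  std::iota(Lanes.begin(), Lanes.end(), 0u); // 0, 1, 2, ...
  // A stable sort keeps the original relative order of equal lanes, which
  // makes the resulting permutation deterministic.
  std::stable_sort(Lanes.begin(), Lanes.end(), [&](unsigned I1, unsigned I2) {
    return Scalars[I1] < Scalars[I2];
  });
  return Lanes;
}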
+ if (OpTE->State == TreeEntry::PossibleStridedVectorize) { + StridedVectorizeOrders.push_back(Order); + continue; + } // Stores actually store the mask, not the order, need to invert. if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && OpTE->getOpcode() == Instruction::Store && !Order.empty()) { @@ -4472,8 +4504,21 @@ void BoUpSLP::reorderTopToBottom() { } } // Set order of the user node. - if (OrdersUses.empty()) - continue; + if (OrdersUses.empty()) { + if (StridedVectorizeOrders.empty()) + continue; + // Add (potentially!) strided vectorize orders. + for (OrdersType &Order : StridedVectorizeOrders) + ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; + } else { + // Account (potentially!) strided vectorize orders only if it was used + // already. + for (OrdersType &Order : StridedVectorizeOrders) { + auto *It = OrdersUses.find(Order); + if (It != OrdersUses.end()) + ++It->second; + } + } // Choose the most used order. ArrayRef<unsigned> BestOrder = OrdersUses.front().first; unsigned Cnt = OrdersUses.front().second; @@ -4514,7 +4559,8 @@ void BoUpSLP::reorderTopToBottom() { } continue; } - if (TE->State == TreeEntry::Vectorize && + if ((TE->State == TreeEntry::Vectorize || + TE->State == TreeEntry::PossibleStridedVectorize) && isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst, InsertElementInst>(TE->getMainOp()) && !TE->isAltShuffle()) { @@ -4555,6 +4601,10 @@ bool BoUpSLP::canReorderOperands( })) continue; if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) { + // FIXME: Do not reorder (possible!) strided vectorized nodes, they + // require reordering of the operands, which is not implemented yet. + if (TE->State == TreeEntry::PossibleStridedVectorize) + return false; // Do not reorder if operand node is used by many user nodes. if (any_of(TE->UserTreeIndices, [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; })) @@ -4567,7 +4617,8 @@ bool BoUpSLP::canReorderOperands( // simply add to the list of gathered ops. // If there are reused scalars, process this node as a regular vectorize // node, just reorder reuses mask. - if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty()) + if (TE->State != TreeEntry::Vectorize && + TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) GatherOps.push_back(TE); continue; } @@ -4602,18 +4653,19 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // Currently the are vectorized loads,extracts without alternate operands + // some gathering of extracts. SmallVector<TreeEntry *> NonVectorized; - for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders, - &NonVectorized]( - const std::unique_ptr<TreeEntry> &TE) { - if (TE->State != TreeEntry::Vectorize) + for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { + if (TE->State != TreeEntry::Vectorize && + TE->State != TreeEntry::PossibleStridedVectorize) NonVectorized.push_back(TE.get()); if (std::optional<OrdersType> CurrentOrder = getReorderingData(*TE, /*TopToBottom=*/false)) { OrderedEntries.insert(TE.get()); - if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty()) + if (!(TE->State == TreeEntry::Vectorize || + TE->State == TreeEntry::PossibleStridedVectorize) || + !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); } - }); + } // 1. Propagate order to the graph nodes, which use only reordered nodes. 
// I.e., if the node has operands, that are reordered, try to make at least @@ -4627,6 +4679,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SmallVector<TreeEntry *> Filtered; for (TreeEntry *TE : OrderedEntries) { if (!(TE->State == TreeEntry::Vectorize || + TE->State == TreeEntry::PossibleStridedVectorize || (TE->State == TreeEntry::NeedToGather && GathersToOrders.count(TE))) || TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || @@ -4649,8 +4702,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } } // Erase filtered entries. - for_each(Filtered, - [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); }); + for (TreeEntry *TE : Filtered) + OrderedEntries.remove(TE); SmallVector< std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>> UsersVec(Users.begin(), Users.end()); @@ -4662,10 +4715,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SmallVector<TreeEntry *> GatherOps; if (!canReorderOperands(Data.first, Data.second, NonVectorized, GatherOps)) { - for_each(Data.second, - [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { - OrderedEntries.remove(Op.second); - }); + for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) + OrderedEntries.remove(Op.second); continue; } // All operands are reordered and used only in this node - propagate the @@ -4673,6 +4724,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { MapVector<OrdersType, unsigned, DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> OrdersUses; + // Last chance orders - scatter vectorize. Try to use their orders if no + // other orders or the order is counted already. + SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders; // Do the analysis for each tree entry only once, otherwise the order of // the same node my be considered several times, though might be not // profitable. @@ -4694,6 +4748,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) { return P.second == OpTE; }); + // Postpone scatter orders. + if (OpTE->State == TreeEntry::PossibleStridedVectorize) { + StridedVectorizeOrders.emplace_back(Order, NumOps); + continue; + } // Stores actually store the mask, not the order, need to invert. if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && OpTE->getOpcode() == Instruction::Store && !Order.empty()) { @@ -4754,11 +4813,27 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } // If no orders - skip current nodes and jump to the next one, if any. if (OrdersUses.empty()) { - for_each(Data.second, - [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { - OrderedEntries.remove(Op.second); - }); - continue; + if (StridedVectorizeOrders.empty() || + (Data.first->ReorderIndices.empty() && + Data.first->ReuseShuffleIndices.empty() && + !(IgnoreReorder && + Data.first == VectorizableTree.front().get()))) { + for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) + OrderedEntries.remove(Op.second); + continue; + } + // Add (potentially!) strided vectorize orders. + for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) + OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second += + Pair.second; + } else { + // Account (potentially!) strided vectorize orders only if it was used + // already. 
+ for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) { + auto *It = OrdersUses.find(Pair.first); + if (It != OrdersUses.end()) + It->second += Pair.second; + } } // Choose the best order. ArrayRef<unsigned> BestOrder = OrdersUses.front().first; @@ -4771,10 +4846,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } // Set order of the user node (reordering of operands and user nodes). if (BestOrder.empty()) { - for_each(Data.second, - [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { - OrderedEntries.remove(Op.second); - }); + for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) + OrderedEntries.remove(Op.second); continue; } // Erase operands from OrderedEntries list and adjust their orders. @@ -4796,7 +4869,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { continue; } // Gathers are processed separately. - if (TE->State != TreeEntry::Vectorize) + if (TE->State != TreeEntry::Vectorize && + TE->State != TreeEntry::PossibleStridedVectorize && + (TE->State != TreeEntry::ScatterVectorize || + TE->ReorderIndices.empty())) continue; assert((BestOrder.size() == TE->ReorderIndices.size() || TE->ReorderIndices.empty()) && @@ -4825,7 +4901,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { Data.first->isAltShuffle()) Data.first->reorderOperands(Mask); if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) || - Data.first->isAltShuffle()) { + Data.first->isAltShuffle() || + Data.first->State == TreeEntry::PossibleStridedVectorize) { reorderScalars(Data.first->Scalars, Mask); reorderOrder(Data.first->ReorderIndices, MaskOrder); if (Data.first->ReuseShuffleIndices.empty() && @@ -4859,10 +4936,12 @@ void BoUpSLP::buildExternalUses( // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (!isa<Instruction>(Scalar)) + continue; int FoundLane = Entry->findLaneForValue(Scalar); // Check if the scalar is externally used as an extra arg. - auto ExtI = ExternallyUsedValues.find(Scalar); + const auto *ExtI = ExternallyUsedValues.find(Scalar); if (ExtI != ExternallyUsedValues.end()) { LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " << Lane << " from " << *Scalar << ".\n"); @@ -4886,7 +4965,8 @@ void BoUpSLP::buildExternalUses( // be used. if (UseScalar != U || UseEntry->State == TreeEntry::ScatterVectorize || - !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { + UseEntry->State == TreeEntry::PossibleStridedVectorize || + !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); @@ -4906,9 +4986,9 @@ void BoUpSLP::buildExternalUses( } } -DenseMap<Value *, SmallVector<StoreInst *, 4>> +DenseMap<Value *, SmallVector<StoreInst *>> BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { - DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap; + DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap; for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) { Value *V = TE->Scalars[Lane]; // To save compilation time we don't visit if we have too many users. 
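// The two branches above implement a simple two-stage vote: orders coming
// from regular nodes are tallied first, and orders coming from
// possibly-strided nodes are only allowed to reinforce an order that is
// already in the tally (or to seed the tally when it would otherwise be
// empty). A standalone sketch of that merging rule, using std::map in place
// of the LLVM map types (illustrative only):
#include <map>
#include <utility>
#include <vector>

using Order = std::vector<unsigned>;

void mergeStridedOrderVotes(
    std::map<Order, unsigned> &Tally,
    const std::vector<std::pair<Order, unsigned>> &Strided) {
  if (Tally.empty()) {
    // No regular votes at all: let the postponed strided orders seed the tally.
    for (const auto &[O, Weight] : Strided)
      Tally[O] += Weight;
    return;
  }
  // Otherwise only reinforce orders that regular nodes already voted for.
  for (const auto &[O, Weight] : Strided) {
    auto It = Tally.find(O);
    if (It != Tally.end())
      It->second += Weight;
  }
}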
@@ -4947,14 +5027,14 @@ BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { return PtrToStoresMap; } -bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec, +bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec, OrdersType &ReorderIndices) const { // We check whether the stores in StoreVec can form a vector by sorting them // and checking whether they are consecutive. // To avoid calling getPointersDiff() while sorting we create a vector of // pairs {store, offset from first} and sort this instead. - SmallVector<std::pair<StoreInst *, int>, 4> StoreOffsetVec(StoresVec.size()); + SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size()); StoreInst *S0 = StoresVec[0]; StoreOffsetVec[0] = {S0, 0}; Type *S0Ty = S0->getValueOperand()->getType(); @@ -5023,7 +5103,7 @@ SmallVector<BoUpSLP::OrdersType, 1> BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { unsigned NumLanes = TE->Scalars.size(); - DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap = + DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap = collectUserStores(TE); // Holds the reorder indices for each candidate store vector that is a user of @@ -5244,6 +5324,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( return TreeEntry::Vectorize; case LoadsState::ScatterVectorize: return TreeEntry::ScatterVectorize; + case LoadsState::PossibleStridedVectorize: + return TreeEntry::PossibleStridedVectorize; case LoadsState::Gather: #ifndef NDEBUG Type *ScalarTy = VL0->getType(); @@ -5416,7 +5498,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); VFShape Shape = VFShape::get( - *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())), + CI->getFunctionType(), + ElementCount::getFixed(static_cast<unsigned int>(VL.size())), false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); @@ -5488,9 +5571,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, SmallVector<int> ReuseShuffleIndicies; SmallVector<Value *> UniqueValues; - auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues, - &UserTreeIdx, - this](const InstructionsState &S) { + SmallVector<Value *> NonUniqueValueVL; + auto TryToFindDuplicates = [&](const InstructionsState &S, + bool DoNotFail = false) { // Check that every instruction appears once in this bundle. 
DenseMap<Value *, unsigned> UniquePositions(VL.size()); for (Value *V : VL) { @@ -5517,6 +5600,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, !isConstant(V); })) || !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) { + if (DoNotFail && UniquePositions.size() > 1 && + NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() && + all_of(UniqueValues, [=](Value *V) { + return isa<ExtractElementInst>(V) || + areAllUsersVectorized(cast<Instruction>(V), + UserIgnoreList); + })) { + unsigned PWSz = PowerOf2Ceil(UniqueValues.size()); + if (PWSz == VL.size()) { + ReuseShuffleIndicies.clear(); + } else { + NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end()); + NonUniqueValueVL.append(PWSz - UniqueValues.size(), + UniqueValues.back()); + VL = NonUniqueValueVL; + } + return true; + } LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return false; @@ -5528,6 +5629,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, InstructionsState S = getSameOpcode(VL, *TLI); + // Don't vectorize ephemeral values. + if (!EphValues.empty()) { + for (Value *V : VL) { + if (EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V + << ") is ephemeral.\n"); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); + return; + } + } + } + // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of // a load), in which case peek through to include it in the tree, without // ballooning over-budget. @@ -5633,7 +5746,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, BasicBlock *BB = nullptr; bool IsScatterVectorizeUserTE = UserTreeIdx.UserTE && - UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; + (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize || + UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize); bool AreAllSameInsts = (S.getOpcode() && allSameBlock(VL)) || (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE && @@ -5665,39 +5779,44 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // We now know that this is a vector of instructions of the same type from // the same block. - // Don't vectorize ephemeral values. - if (!EphValues.empty()) { - for (Value *V : VL) { - if (EphValues.count(V)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V - << ") is ephemeral.\n"); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); - return; - } - } - } - // Check if this is a duplicate of another entry. if (TreeEntry *E = getTreeEntry(S.OpValue)) { LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - if (TryToFindDuplicates(S)) - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + auto It = MultiNodeScalars.find(S.OpValue); + if (It != MultiNodeScalars.end()) { + auto *TEIt = find_if(It->getSecond(), + [&](TreeEntry *ME) { return ME->isSame(VL); }); + if (TEIt != It->getSecond().end()) + E = *TEIt; + else + E = nullptr; + } else { + E = nullptr; + } + } + if (!E) { + if (!doesNotNeedToBeScheduled(S.OpValue)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + return; + } + } else { + // Record the reuse of the tree node. 
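// TryToFindDuplicates above records, for every lane, which unique scalar it
// maps to, and on the DoNotFail path pads the unique list up to the next
// power of two by repeating the last value so the bundle keeps a
// power-of-two width. A minimal standalone sketch of that shaping step,
// written over plain ints (illustrative names, not the LLVM code):
#include <cstddef>
#include <unordered_map>
#include <vector>

struct DedupResult {
  std::vector<int> Unique;     // distinct values, padded to a power-of-two size
  std::vector<unsigned> Reuse; // for each original lane, index into Unique
};

DedupResult deduplicateBundle(const std::vector<int> &Bundle) {
  DedupResult Res;
  std::unordered_map<int, unsigned> Pos;
  for (int V : Bundle) {
    auto [It, Inserted] =
        Pos.try_emplace(V, static_cast<unsigned>(Res.Unique.size()));
    if (Inserted)
      Res.Unique.push_back(V);
    Res.Reuse.push_back(It->second);
  }
  // Pad to the next power of two by repeating the last unique value; the
  // padded lanes are simply shuffled out later.
  std::size_t PowerOfTwo = 1;
  while (PowerOfTwo < Res.Unique.size())
    PowerOfTwo <<= 1;
  while (Res.Unique.size() < PowerOfTwo)
    Res.Unique.push_back(Res.Unique.back());
  return Res;
}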
FIXME, currently this is only used + // to properly draw the graph rather than for the actual vectorization. + E->UserTreeIndices.push_back(UserTreeIdx); + LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue + << ".\n"); return; } - // Record the reuse of the tree node. FIXME, currently this is only used to - // properly draw the graph rather than for the actual vectorization. - E->UserTreeIndices.push_back(UserTreeIdx); - LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue - << ".\n"); - return; } // Check that none of the instructions in the bundle are already in the tree. for (Value *V : VL) { - if (!IsScatterVectorizeUserTE && !isa<Instruction>(V)) + if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) || + doesNotNeedToBeScheduled(V)) continue; if (getTreeEntry(V)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V @@ -5725,7 +5844,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Special processing for sorted pointers for ScatterVectorize node with // constant indeces only. if (AreAllSameInsts && UserTreeIdx.UserTE && - UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize && + (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize || + UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize) && !(S.getOpcode() && allSameBlock(VL))) { assert(S.OpValue->getType()->isPointerTy() && count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >= @@ -5760,7 +5880,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // Check that every instruction appears once in this bundle. - if (!TryToFindDuplicates(S)) + if (!TryToFindDuplicates(S, /*DoNotFail=*/true)) return; // Perform specific checks for each particular instruction kind. @@ -5780,7 +5900,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, BlockScheduling &BS = *BSRef; - std::optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S); + std::optional<ScheduleData *> Bundle = + BS.tryScheduleBundle(UniqueValues, this, S); #ifdef EXPENSIVE_CHECKS // Make sure we didn't break any internal invariants BS.verify(); @@ -5905,6 +6026,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. TreeEntry *TE = nullptr; + fixupOrderingIndices(CurrentOrder); switch (State) { case TreeEntry::Vectorize: if (CurrentOrder.empty()) { @@ -5913,7 +6035,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { - fixupOrderingIndices(CurrentOrder); // Need to reorder. TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); @@ -5921,6 +6042,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } TE->setOperandsInOrder(); break; + case TreeEntry::PossibleStridedVectorize: + // Vectorizing non-consecutive loads with `llvm.masked.gather`. + if (CurrentOrder.empty()) { + TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S, + UserTreeIdx, ReuseShuffleIndicies); + } else { + TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S, + UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); + } + TE->setOperandsInOrder(); + buildTree_rec(PointerOps, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); + break; case TreeEntry::ScatterVectorize: // Vectorizing non-consecutive loads with `llvm.masked.gather`. 
TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, @@ -5951,13 +6085,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); TE->setOperandsInOrder(); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) { ValueList Operands; // Prepare the operand vector. for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(i)); + Operands.push_back(cast<Instruction>(V)->getOperand(I)); - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, I}); } return; } @@ -6031,13 +6165,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } TE->setOperandsInOrder(); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) { ValueList Operands; // Prepare the operand vector. for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(i)); + Operands.push_back(cast<Instruction>(V)->getOperand(I)); - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, I}); } return; } @@ -6087,8 +6221,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (!CI) Operands.back().push_back(Op); else - Operands.back().push_back(ConstantExpr::getIntegerCast( - CI, Ty, CI->getValue().isSignBitSet())); + Operands.back().push_back(ConstantFoldIntegerCast( + CI, Ty, CI->getValue().isSignBitSet(), *DL)); } TE->setOperand(IndexIdx, Operands.back()); @@ -6132,18 +6266,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); TE->setOperandsInOrder(); - for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { - // For scalar operands no need to to create an entry since no need to + for (unsigned I : seq<unsigned>(0, CI->arg_size())) { + // For scalar operands no need to create an entry since no need to // vectorize it. - if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, I)) continue; ValueList Operands; // Prepare the operand vector. for (Value *V : VL) { auto *CI2 = cast<CallInst>(V); - Operands.push_back(CI2->getArgOperand(i)); + Operands.push_back(CI2->getArgOperand(I)); } - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, I}); } return; } @@ -6194,13 +6328,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } TE->setOperandsInOrder(); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) { ValueList Operands; // Prepare the operand vector. 
for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(i)); + Operands.push_back(cast<Instruction>(V)->getOperand(I)); - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, I}); } return; } @@ -6210,7 +6344,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, llvm_unreachable("Unexpected vectorization of the instructions."); } -unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { +unsigned BoUpSLP::canMapToVector(Type *T) const { unsigned N = 1; Type *EltTy = T; @@ -6234,15 +6368,16 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { if (!isValidElementType(EltTy)) return 0; - uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N)); + uint64_t VTSize = DL->getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N)); if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || - VTSize != DL.getTypeStoreSizeInBits(T)) + VTSize != DL->getTypeStoreSizeInBits(T)) return 0; return N; } bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, - SmallVectorImpl<unsigned> &CurrentOrder) const { + SmallVectorImpl<unsigned> &CurrentOrder, + bool ResizeAllowed) const { const auto *It = find_if(VL, [](Value *V) { return isa<ExtractElementInst, ExtractValueInst>(V); }); @@ -6263,8 +6398,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, // We have to extract from a vector/aggregate with the same number of elements. unsigned NElts; if (E0->getOpcode() == Instruction::ExtractValue) { - const DataLayout &DL = E0->getModule()->getDataLayout(); - NElts = canMapToVector(Vec->getType(), DL); + NElts = canMapToVector(Vec->getType()); if (!NElts) return false; // Check if load can be rewritten as load of vector. @@ -6275,46 +6409,55 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, NElts = cast<FixedVectorType>(Vec->getType())->getNumElements(); } - if (NElts != VL.size()) - return false; - - // Check that all of the indices extract from the correct offset. - bool ShouldKeepOrder = true; unsigned E = VL.size(); - // Assign to all items the initial value E + 1 so we can check if the extract - // instruction index was used already. - // Also, later we can check that all the indices are used and we have a - // consecutive access in the extract instructions, by checking that no - // element of CurrentOrder still has value E + 1. 
- CurrentOrder.assign(E, E); - unsigned I = 0; - for (; I < E; ++I) { - auto *Inst = dyn_cast<Instruction>(VL[I]); + if (!ResizeAllowed && NElts != E) + return false; + SmallVector<int> Indices(E, PoisonMaskElem); + unsigned MinIdx = NElts, MaxIdx = 0; + for (auto [I, V] : enumerate(VL)) { + auto *Inst = dyn_cast<Instruction>(V); if (!Inst) continue; if (Inst->getOperand(0) != Vec) - break; + return false; if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) if (isa<UndefValue>(EE->getIndexOperand())) continue; std::optional<unsigned> Idx = getExtractIndex(Inst); if (!Idx) - break; + return false; const unsigned ExtIdx = *Idx; - if (ExtIdx != I) { - if (ExtIdx >= E || CurrentOrder[ExtIdx] != E) - break; - ShouldKeepOrder = false; - CurrentOrder[ExtIdx] = I; - } else { - if (CurrentOrder[I] != E) - break; - CurrentOrder[I] = I; - } + if (ExtIdx >= NElts) + continue; + Indices[I] = ExtIdx; + if (MinIdx > ExtIdx) + MinIdx = ExtIdx; + if (MaxIdx < ExtIdx) + MaxIdx = ExtIdx; } - if (I < E) { - CurrentOrder.clear(); + if (MaxIdx - MinIdx + 1 > E) return false; + if (MaxIdx + 1 <= E) + MinIdx = 0; + + // Check that all of the indices extract from the correct offset. + bool ShouldKeepOrder = true; + // Assign to all items the initial value E + 1 so we can check if the extract + // instruction index was used already. + // Also, later we can check that all the indices are used and we have a + // consecutive access in the extract instructions, by checking that no + // element of CurrentOrder still has value E + 1. + CurrentOrder.assign(E, E); + for (unsigned I = 0; I < E; ++I) { + if (Indices[I] == PoisonMaskElem) + continue; + const unsigned ExtIdx = Indices[I] - MinIdx; + if (CurrentOrder[ExtIdx] != E) { + CurrentOrder.clear(); + return false; + } + ShouldKeepOrder &= ExtIdx == I; + CurrentOrder[ExtIdx] = I; } if (ShouldKeepOrder) CurrentOrder.clear(); @@ -6322,9 +6465,9 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, return ShouldKeepOrder; } -bool BoUpSLP::areAllUsersVectorized(Instruction *I, - ArrayRef<Value *> VectorizedVals) const { - return (I->hasOneUse() && is_contained(VectorizedVals, I)) || +bool BoUpSLP::areAllUsersVectorized( + Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const { + return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) || all_of(I->users(), [this](User *U) { return ScalarToTreeEntry.count(U) > 0 || isVectorLikeInstWithConstOps(U) || @@ -6351,8 +6494,8 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, auto IntrinsicCost = TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput); - auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( - VecTy->getNumElements())), + auto Shape = VFShape::get(CI->getFunctionType(), + ElementCount::getFixed(VecTy->getNumElements()), false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); auto LibCost = IntrinsicCost; @@ -6365,16 +6508,11 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, return {IntrinsicCost, LibCost}; } -/// Build shuffle mask for shuffle graph entries and lists of main and alternate -/// operations operands. 
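// The rewritten canReuseExtract above tolerates a source vector wider than
// the bundle: it normalizes the extract indices by the smallest used index
// (unless the used window already fits at offset 0), rejects duplicates, and
// reports whether the extracts are already in identity order. A compact
// standalone sketch of that check; an index of -1 means "unknown/undef", and
// the names are illustrative, not the LLVM code.
#include <vector>

// Returns true if the extracts are already in order; Order holds the
// permutation otherwise, or is cleared if the source cannot be reused.
bool checkExtractOrder(const std::vector<int> &Indices, // one per lane
                       std::vector<unsigned> &Order) {
  const unsigned E = Indices.size();
  int MinIdx = -1, MaxIdx = -1;
  for (int Idx : Indices) {
    if (Idx < 0)
      continue;
    if (MinIdx < 0 || Idx < MinIdx) MinIdx = Idx;
    if (Idx > MaxIdx) MaxIdx = Idx;
  }
  if (MinIdx < 0 || MaxIdx - MinIdx + 1 > static_cast<int>(E)) {
    Order.clear();
    return false; // no usable indices, or the used window is too wide
  }
  if (MaxIdx < static_cast<int>(E)) // the window already fits at offset 0
    MinIdx = 0;
  bool InOrder = true;
  Order.assign(E, E); // the value E marks an unused slot
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] < 0)
      continue;
    unsigned ExtIdx = static_cast<unsigned>(Indices[I] - MinIdx);
    if (Order[ExtIdx] != E) {
      Order.clear();
      return false; // two lanes extract the same element
    }
    InOrder &= ExtIdx == I;
    Order[ExtIdx] = I;
  }
  if (InOrder)
    Order.clear(); // identity order needs no reordering data
  return InOrder;
}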
-static void -buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, - ArrayRef<int> ReusesIndices, - const function_ref<bool(Instruction *)> IsAltOp, - SmallVectorImpl<int> &Mask, - SmallVectorImpl<Value *> *OpScalars = nullptr, - SmallVectorImpl<Value *> *AltScalars = nullptr) { - unsigned Sz = VL.size(); +void BoUpSLP::TreeEntry::buildAltOpShuffleMask( + const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask, + SmallVectorImpl<Value *> *OpScalars, + SmallVectorImpl<Value *> *AltScalars) const { + unsigned Sz = Scalars.size(); Mask.assign(Sz, PoisonMaskElem); SmallVector<int> OrderMask; if (!ReorderIndices.empty()) @@ -6383,7 +6521,7 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, unsigned Idx = I; if (!ReorderIndices.empty()) Idx = OrderMask[I]; - auto *OpInst = cast<Instruction>(VL[Idx]); + auto *OpInst = cast<Instruction>(Scalars[Idx]); if (IsAltOp(OpInst)) { Mask[I] = Sz + Idx; if (AltScalars) @@ -6394,9 +6532,9 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, OpScalars->push_back(OpInst); } } - if (!ReusesIndices.empty()) { - SmallVector<int> NewMask(ReusesIndices.size(), PoisonMaskElem); - transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) { + if (!ReuseShuffleIndices.empty()) { + SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem); + transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) { return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem; }); Mask.swap(NewMask); @@ -6429,52 +6567,27 @@ static bool isAlternateInstruction(const Instruction *I, return I->getOpcode() == AltOp->getOpcode(); } -TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> VL, - unsigned OpIdx) { - assert(!VL.empty()); - const auto *I0 = cast<Instruction>(*find_if(VL, Instruction::classof)); - const auto *Op0 = I0->getOperand(OpIdx); +TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) { + assert(!Ops.empty()); + const auto *Op0 = Ops.front(); - const bool IsConstant = all_of(VL, [&](Value *V) { + const bool IsConstant = all_of(Ops, [](Value *V) { // TODO: We should allow undef elements here - const auto *I = dyn_cast<Instruction>(V); - if (!I) - return true; - auto *Op = I->getOperand(OpIdx); - return isConstant(Op) && !isa<UndefValue>(Op); + return isConstant(V) && !isa<UndefValue>(V); }); - const bool IsUniform = all_of(VL, [&](Value *V) { + const bool IsUniform = all_of(Ops, [=](Value *V) { // TODO: We should allow undef elements here - const auto *I = dyn_cast<Instruction>(V); - if (!I) - return false; - return I->getOperand(OpIdx) == Op0; + return V == Op0; }); - const bool IsPowerOfTwo = all_of(VL, [&](Value *V) { + const bool IsPowerOfTwo = all_of(Ops, [](Value *V) { // TODO: We should allow undef elements here - const auto *I = dyn_cast<Instruction>(V); - if (!I) { - assert((isa<UndefValue>(V) || - I0->getOpcode() == Instruction::GetElementPtr) && - "Expected undef or GEP."); - return true; - } - auto *Op = I->getOperand(OpIdx); - if (auto *CI = dyn_cast<ConstantInt>(Op)) + if (auto *CI = dyn_cast<ConstantInt>(V)) return CI->getValue().isPowerOf2(); return false; }); - const bool IsNegatedPowerOfTwo = all_of(VL, [&](Value *V) { + const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) { // TODO: We should allow undef elements here - const auto *I = dyn_cast<Instruction>(V); - if (!I) { - assert((isa<UndefValue>(V) || - I0->getOpcode() == Instruction::GetElementPtr) && - "Expected undef or GEP."); - return true; - } - 
const auto *Op = I->getOperand(OpIdx); - if (auto *CI = dyn_cast<ConstantInt>(Op)) + if (auto *CI = dyn_cast<ConstantInt>(V)) return CI->getValue().isNegatedPowerOf2(); return false; }); @@ -6505,9 +6618,24 @@ protected: bool IsStrict) { int Limit = Mask.size(); int VF = VecTy->getNumElements(); - return (VF == Limit || !IsStrict) && - all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && - ShuffleVectorInst::isIdentityMask(Mask); + int Index = -1; + if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit)) + return true; + if (!IsStrict) { + // Consider extract subvector starting from index 0. + if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) && + Index == 0) + return true; + // All VF-size submasks are identity (e.g. + // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4). + if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) { + ArrayRef<int> Slice = Mask.slice(Idx * VF, VF); + return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) || + ShuffleVectorInst::isIdentityMask(Slice, VF); + })) + return true; + } + return false; } /// Tries to combine 2 different masks into single one. @@ -6577,7 +6705,8 @@ protected: if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) { if (!IdentityOp || !SinglePermute || (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) && - !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask))) { + !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask, + IdentityMask.size()))) { IdentityOp = SV; // Store current mask in the IdentityMask so later we did not lost // this info if IdentityOp is selected as the best candidate for the @@ -6647,7 +6776,7 @@ protected: } if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType()); !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) || - ShuffleVectorInst::isZeroEltSplatMask(Mask)) { + ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) { if (IdentityOp) { V = IdentityOp; assert(Mask.size() == IdentityMask.size() && @@ -6663,7 +6792,7 @@ protected: /*IsStrict=*/true) || (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() && Shuffle->isZeroEltSplat() && - ShuffleVectorInst::isZeroEltSplatMask(Mask))); + ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size()))); } V = Op; return false; @@ -6768,11 +6897,9 @@ protected: CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF); } } - const int Limit = CombinedMask1.size() * 2; - if (Op1 == Op2 && Limit == 2 * VF && - all_of(CombinedMask1, [=](int Idx) { return Idx < Limit; }) && - (ShuffleVectorInst::isIdentityMask(CombinedMask1) || - (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1) && + if (Op1 == Op2 && + (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) || + (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) && isa<ShuffleVectorInst>(Op1) && cast<ShuffleVectorInst>(Op1)->getShuffleMask() == ArrayRef(CombinedMask1)))) @@ -6807,10 +6934,29 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors; const TargetTransformInfo &TTI; InstructionCost Cost = 0; - ArrayRef<Value *> VectorizedVals; + SmallDenseSet<Value *> VectorizedVals; BoUpSLP &R; SmallPtrSetImpl<Value *> &CheckedExtracts; constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + /// While set, still trying to estimate the cost for the same nodes and we + /// can delay actual cost estimation (virtual shuffle instruction emission). 
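// The getOperandInfo change above receives the operand list directly and
// classifies it as constant / uniform / power-of-two. A standalone sketch of
// the same classification over plain integers, where std::nullopt stands for
// a non-constant operand (illustrative names, not the LLVM code):
#include <cstdint>
#include <optional>
#include <vector>

struct OperandInfo {
  bool IsConstant;
  bool IsUniform;
  bool IsPowerOfTwo;
};

OperandInfo classifyOperands(const std::vector<std::optional<int64_t>> &Ops) {
  OperandInfo Info{true, true, true};
  if (Ops.empty()) // the caller above asserts a non-empty operand list
    return Info;
  for (const auto &Op : Ops) {
    Info.IsConstant &= Op.has_value();
    Info.IsUniform &= Op == Ops.front();
    Info.IsPowerOfTwo &=
        Op.has_value() && *Op > 0 && (*Op & (*Op - 1)) == 0;
  }
  return Info;
}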
+ /// May help better estimate the cost if same nodes must be permuted + allows + /// to move most of the long shuffles cost estimation to TTI. + bool SameNodesEstimated = true; + + static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) { + if (Ty->getScalarType()->isPointerTy()) { + Constant *Res = ConstantExpr::getIntToPtr( + ConstantInt::getAllOnesValue( + IntegerType::get(Ty->getContext(), + DL.getTypeStoreSizeInBits(Ty->getScalarType()))), + Ty->getScalarType()); + if (auto *VTy = dyn_cast<VectorType>(Ty)) + Res = ConstantVector::getSplat(VTy->getElementCount(), Res); + return Res; + } + return Constant::getAllOnesValue(Ty); + } InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) { if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof)) @@ -6821,20 +6967,35 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { // Improve gather cost for gather of loads, if we can group some of the // loads into vector loads. InstructionsState S = getSameOpcode(VL, *R.TLI); - if (VL.size() > 2 && S.getOpcode() == Instruction::Load && - !S.isAltShuffle() && + const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType()); + unsigned MinVF = R.getMinVF(2 * Sz); + if (VL.size() > 2 && + ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) || + (InVectors.empty() && + any_of(seq<unsigned>(0, VL.size() / MinVF), + [&](unsigned Idx) { + ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF); + InstructionsState S = getSameOpcode(SubVL, *R.TLI); + return S.getOpcode() == Instruction::Load && + !S.isAltShuffle(); + }))) && !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) && !isSplat(Gathers)) { - BoUpSLP::ValueSet VectorizedLoads; + SetVector<Value *> VectorizedLoads; + SmallVector<LoadInst *> VectorizedStarts; + SmallVector<std::pair<unsigned, unsigned>> ScatterVectorized; unsigned StartIdx = 0; unsigned VF = VL.size() / 2; - unsigned VectorizedCnt = 0; - unsigned ScatterVectorizeCnt = 0; - const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType()); - for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { + for (; VF >= MinVF; VF /= 2) { for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; Cnt += VF) { ArrayRef<Value *> Slice = VL.slice(Cnt, VF); + if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) { + InstructionsState SliceS = getSameOpcode(Slice, *R.TLI); + if (SliceS.getOpcode() != Instruction::Load || + SliceS.isAltShuffle()) + continue; + } if (!VectorizedLoads.count(Slice.front()) && !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { SmallVector<Value *> PointerOps; @@ -6845,12 +7006,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { switch (LS) { case LoadsState::Vectorize: case LoadsState::ScatterVectorize: + case LoadsState::PossibleStridedVectorize: // Mark the vectorized loads so that we don't vectorize them // again. - if (LS == LoadsState::Vectorize) - ++VectorizedCnt; + // TODO: better handling of loads with reorders. + if (LS == LoadsState::Vectorize && CurrentOrder.empty()) + VectorizedStarts.push_back(cast<LoadInst>(Slice.front())); else - ++ScatterVectorizeCnt; + ScatterVectorized.emplace_back(Cnt, VF); VectorizedLoads.insert(Slice.begin(), Slice.end()); // If we vectorized initial block, no need to try to vectorize // it again. @@ -6881,8 +7044,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } // Exclude potentially vectorized loads from list of gathered // scalars. 
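// The gather-cost code above greedily searches for sub-groups of the
// gathered scalars that could still be vectorized as loads: it tries slices
// of width VL.size()/2, keeps halving the width down to the minimum VF, and
// skips slices whose endpoints were already claimed. A standalone sketch of
// that search loop, with the actual legality check abstracted into a
// callback (illustrative names and types, not the LLVM code):
#include <algorithm>
#include <functional>
#include <utility>
#include <vector>

// Returns the (offset, width) pairs of slices accepted by CanVectorize.
std::vector<std::pair<unsigned, unsigned>> findVectorizableSlices(
    unsigned NumScalars, unsigned MinVF,
    const std::function<bool(unsigned Off, unsigned Width)> &CanVectorize) {
  std::vector<std::pair<unsigned, unsigned>> Slices;
  std::vector<bool> Claimed(NumScalars, false);
  unsigned StartIdx = 0;
  for (unsigned VF = NumScalars / 2; VF >= MinVF && VF >= 2; VF /= 2) {
    for (unsigned Cnt = StartIdx; Cnt + VF <= NumScalars; Cnt += VF) {
      if (Claimed[Cnt] || Claimed[Cnt + VF - 1] || !CanVectorize(Cnt, VF))
        continue;
      Slices.emplace_back(Cnt, VF);
      std::fill(Claimed.begin() + Cnt, Claimed.begin() + Cnt + VF, true);
      if (Cnt == StartIdx)
        StartIdx += VF; // the leading block is done, never revisit it
    }
    if (StartIdx >= NumScalars)
      break; // everything is claimed
  }
  return Slices;
}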
- auto *LI = cast<LoadInst>(S.MainOp); - Gathers.assign(Gathers.size(), PoisonValue::get(LI->getType())); + Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType())); // The cost for vectorized loads. InstructionCost ScalarsCost = 0; for (Value *V : VectorizedLoads) { @@ -6892,17 +7054,24 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { LI->getAlign(), LI->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo(), LI); } - auto *LoadTy = FixedVectorType::get(LI->getType(), VF); - Align Alignment = LI->getAlign(); - GatherCost += - VectorizedCnt * - TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, - LI->getPointerAddressSpace(), CostKind, - TTI::OperandValueInfo(), LI); - GatherCost += ScatterVectorizeCnt * - TTI.getGatherScatterOpCost( - Instruction::Load, LoadTy, LI->getPointerOperand(), - /*VariableMask=*/false, Alignment, CostKind, LI); + auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF); + for (LoadInst *LI : VectorizedStarts) { + Align Alignment = LI->getAlign(); + GatherCost += + TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, + LI->getPointerAddressSpace(), CostKind, + TTI::OperandValueInfo(), LI); + } + for (std::pair<unsigned, unsigned> P : ScatterVectorized) { + auto *LI0 = cast<LoadInst>(VL[P.first]); + Align CommonAlignment = LI0->getAlign(); + for (Value *V : VL.slice(P.first + 1, VF - 1)) + CommonAlignment = + std::min(CommonAlignment, cast<LoadInst>(V)->getAlign()); + GatherCost += TTI.getGatherScatterOpCost( + Instruction::Load, LoadTy, LI0->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, CostKind, LI0); + } if (NeedInsertSubvectorAnalysis) { // Add the cost for the subvectors insert. for (int I = VF, E = VL.size(); I < E; I += VF) @@ -6938,77 +7107,137 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { : R.getGatherCost(Gathers, !Root && VL.equals(Gathers))); }; - /// Compute the cost of creating a vector of type \p VecTy containing the - /// extracted values from \p VL. - InstructionCost computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask, - TTI::ShuffleKind ShuffleKind) { - auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); - unsigned NumOfParts = TTI.getNumberOfParts(VecTy); - - if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || - !NumOfParts || VecTy->getNumElements() < NumOfParts) - return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); - - bool AllConsecutive = true; - unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts; - unsigned Idx = -1; + /// Compute the cost of creating a vector containing the extracted values from + /// \p VL. + InstructionCost + computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask, + ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, + unsigned NumParts) { + assert(VL.size() > NumParts && "Unexpected scalarized shuffle."); + unsigned NumElts = + std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) { + auto *EE = dyn_cast<ExtractElementInst>(V); + if (!EE) + return Sz; + auto *VecTy = cast<FixedVectorType>(EE->getVectorOperandType()); + return std::max(Sz, VecTy->getNumElements()); + }); + unsigned NumSrcRegs = TTI.getNumberOfParts( + FixedVectorType::get(VL.front()->getType(), NumElts)); + if (NumSrcRegs == 0) + NumSrcRegs = 1; + // FIXME: this must be moved to TTI for better estimation. 
+ unsigned EltsPerVector = PowerOf2Ceil(std::max( + divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs))); + auto CheckPerRegistersShuffle = + [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> { + DenseSet<int> RegIndices; + // Check that if trying to permute same single/2 input vectors. + TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc; + int FirstRegId = -1; + for (int &I : Mask) { + if (I == PoisonMaskElem) + continue; + int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector; + if (FirstRegId < 0) + FirstRegId = RegId; + RegIndices.insert(RegId); + if (RegIndices.size() > 2) + return std::nullopt; + if (RegIndices.size() == 2) + ShuffleKind = TTI::SK_PermuteTwoSrc; + I = (I % NumElts) % EltsPerVector + + (RegId == FirstRegId ? 0 : EltsPerVector); + } + return ShuffleKind; + }; InstructionCost Cost = 0; // Process extracts in blocks of EltsPerVector to check if the source vector // operand can be re-used directly. If not, add the cost of creating a // shuffle to extract the values into a vector register. - SmallVector<int> RegMask(EltsPerVector, PoisonMaskElem); - for (auto *V : VL) { - ++Idx; - - // Reached the start of a new vector registers. - if (Idx % EltsPerVector == 0) { - RegMask.assign(EltsPerVector, PoisonMaskElem); - AllConsecutive = true; + for (unsigned Part = 0; Part < NumParts; ++Part) { + if (!ShuffleKinds[Part]) continue; - } - - // Need to exclude undefs from analysis. - if (isa<UndefValue>(V) || Mask[Idx] == PoisonMaskElem) + ArrayRef<int> MaskSlice = + Mask.slice(Part * EltsPerVector, + (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0) + ? Mask.size() % EltsPerVector + : EltsPerVector); + SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem); + copy(MaskSlice, SubMask.begin()); + std::optional<TTI::ShuffleKind> RegShuffleKind = + CheckPerRegistersShuffle(SubMask); + if (!RegShuffleKind) { + Cost += TTI.getShuffleCost( + *ShuffleKinds[Part], + FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice); continue; - - // Check all extracts for a vector register on the target directly - // extract values in order. - unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); - if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != PoisonMaskElem) { - unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); - AllConsecutive &= PrevIdx + 1 == CurrentIdx && - CurrentIdx % EltsPerVector == Idx % EltsPerVector; - RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; } - - if (AllConsecutive) - continue; - - // Skip all indices, except for the last index per vector block. - if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size()) - continue; - - // If we have a series of extracts which are not consecutive and hence - // cannot re-use the source vector register directly, compute the shuffle - // cost to extract the vector with EltsPerVector elements. - Cost += TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(VecTy->getElementType(), EltsPerVector), - RegMask); + if (*RegShuffleKind != TTI::SK_PermuteSingleSrc || + !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) { + Cost += TTI.getShuffleCost( + *RegShuffleKind, + FixedVectorType::get(VL.front()->getType(), EltsPerVector), + SubMask); + } } return Cost; } + /// Transforms mask \p CommonMask per given \p Mask to make proper set after + /// shuffle emission. 
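// CheckPerRegistersShuffle above asks whether one register-sized slice of a
// combined two-input mask can be done as a single per-register shuffle: it
// maps every mask element to the physical register it lives in, gives up if
// more than two registers are touched, and renumbers the lanes relative to
// those registers. A standalone sketch of that classification; indices in
// [0, NumElts) select the first input and [NumElts, 2*NumElts) the second
// (illustrative names, not the LLVM code).
#include <optional>
#include <set>
#include <vector>

enum class SliceKind { SingleSource, TwoSource };

std::optional<SliceKind> classifyRegisterSlice(std::vector<int> &MaskSlice,
                                               int NumElts, int NumParts,
                                               int EltsPerVector) {
  std::set<int> Regs;
  int FirstReg = -1;
  for (int &I : MaskSlice) {
    if (I < 0) // poison lane, ignore
      continue;
    int Reg = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
    if (FirstReg < 0)
      FirstReg = Reg;
    Regs.insert(Reg);
    if (Regs.size() > 2)
      return std::nullopt; // would need more than two physical registers
    // Renumber the lane so the first register occupies [0, EltsPerVector)
    // and the other one [EltsPerVector, 2 * EltsPerVector).
    I = (I % NumElts) % EltsPerVector + (Reg == FirstReg ? 0 : EltsPerVector);
  }
  return Regs.size() == 2 ? SliceKind::TwoSource : SliceKind::SingleSource;
}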
+ static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask, + ArrayRef<int> Mask) { + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != PoisonMaskElem) + CommonMask[Idx] = Idx; + } + /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given + /// mask \p Mask, register number \p Part, that includes \p SliceSize + /// elements. + void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2, + ArrayRef<int> Mask, unsigned Part, + unsigned SliceSize) { + if (SameNodesEstimated) { + // Delay the cost estimation if the same nodes are reshuffling. + // If we already requested the cost of reshuffling of E1 and E2 before, no + // need to estimate another cost with the sub-Mask, instead include this + // sub-Mask into the CommonMask to estimate it later and avoid double cost + // estimation. + if ((InVectors.size() == 2 && + InVectors.front().get<const TreeEntry *>() == &E1 && + InVectors.back().get<const TreeEntry *>() == E2) || + (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) { + assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize), + [](int Idx) { return Idx == PoisonMaskElem; }) && + "Expected all poisoned elements."); + ArrayRef<int> SubMask = + ArrayRef(Mask).slice(Part * SliceSize, SliceSize); + copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part)); + return; + } + // Found non-matching nodes - need to estimate the cost for the matched + // and transform mask. + Cost += createShuffle(InVectors.front(), + InVectors.size() == 1 ? nullptr : InVectors.back(), + CommonMask); + transformMaskAfterShuffle(CommonMask, CommonMask); + } + SameNodesEstimated = false; + Cost += createShuffle(&E1, E2, Mask); + transformMaskAfterShuffle(CommonMask, Mask); + } class ShuffleCostBuilder { const TargetTransformInfo &TTI; static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) { - int Limit = 2 * VF; + int Index = -1; return Mask.empty() || (VF == Mask.size() && - all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && - ShuffleVectorInst::isIdentityMask(Mask)); + ShuffleVectorInst::isIdentityMask(Mask, VF)) || + (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) && + Index == 0); } public: @@ -7021,21 +7250,17 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); if (isEmptyOrIdentity(Mask, VF)) return TTI::TCC_Free; - return TTI.getShuffleCost( - TTI::SK_PermuteTwoSrc, - FixedVectorType::get( - cast<VectorType>(V1->getType())->getElementType(), Mask.size()), - Mask); + return TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, + cast<VectorType>(V1->getType()), Mask); } InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const { // Empty mask or identity mask are free. 
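// ShuffleCostBuilder::isEmptyOrIdentity above now also treats an
// extract-subvector mask that starts at element 0 as free. Concretely, a
// mask costs nothing when every defined lane selects its own position, which
// covers both the full identity (mask size == source VF) and a leading
// subvector extract (mask size <= source VF). A standalone sketch of that
// predicate, where -1 marks an undefined lane (illustrative, not the LLVM
// code):
#include <vector>

bool isFreeShuffleMask(const std::vector<int> &Mask, unsigned SourceVF) {
  if (Mask.empty())
    return true;
  if (Mask.size() > SourceVF)
    return false; // wider than the source: not an identity or a sub-extract
  for (unsigned I = 0, E = Mask.size(); I < E; ++I)
    if (Mask[I] >= 0 && Mask[I] != static_cast<int>(I))
      return false;
  return true;
}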
- if (isEmptyOrIdentity(Mask, Mask.size())) + unsigned VF = + cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); + if (isEmptyOrIdentity(Mask, VF)) return TTI::TCC_Free; - return TTI.getShuffleCost( - TTI::SK_PermuteSingleSrc, - FixedVectorType::get( - cast<VectorType>(V1->getType())->getElementType(), Mask.size()), - Mask); + return TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, + cast<VectorType>(V1->getType()), Mask); } InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; } InstructionCost createPoison(Type *Ty, unsigned VF) const { @@ -7052,139 +7277,226 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { const PointerUnion<Value *, const TreeEntry *> &P2, ArrayRef<int> Mask) { ShuffleCostBuilder Builder(TTI); + SmallVector<int> CommonMask(Mask.begin(), Mask.end()); Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>(); - unsigned CommonVF = 0; - if (!V1) { + unsigned CommonVF = Mask.size(); + if (!V1 && !V2 && !P2.isNull()) { + // Shuffle 2 entry nodes. const TreeEntry *E = P1.get<const TreeEntry *>(); unsigned VF = E->getVectorFactor(); - if (V2) { - unsigned V2VF = cast<FixedVectorType>(V2->getType())->getNumElements(); - if (V2VF != VF && V2VF == E->Scalars.size()) - VF = E->Scalars.size(); - } else if (!P2.isNull()) { - const TreeEntry *E2 = P2.get<const TreeEntry *>(); - if (E->Scalars.size() == E2->Scalars.size()) - CommonVF = VF = E->Scalars.size(); - } else { - // P2 is empty, check that we have same node + reshuffle (if any). - if (E->Scalars.size() == Mask.size() && VF != Mask.size()) { - VF = E->Scalars.size(); - SmallVector<int> CommonMask(Mask.begin(), Mask.end()); - ::addMask(CommonMask, E->getCommonMask()); - V1 = Constant::getNullValue( - FixedVectorType::get(E->Scalars.front()->getType(), VF)); - return BaseShuffleAnalysis::createShuffle<InstructionCost>( - V1, nullptr, CommonMask, Builder); + const TreeEntry *E2 = P2.get<const TreeEntry *>(); + CommonVF = std::max(VF, E2->getVectorFactor()); + assert(all_of(Mask, + [=](int Idx) { + return Idx < 2 * static_cast<int>(CommonVF); + }) && + "All elements in mask must be less than 2 * CommonVF."); + if (E->Scalars.size() == E2->Scalars.size()) { + SmallVector<int> EMask = E->getCommonMask(); + SmallVector<int> E2Mask = E2->getCommonMask(); + if (!EMask.empty() || !E2Mask.empty()) { + for (int &Idx : CommonMask) { + if (Idx == PoisonMaskElem) + continue; + if (Idx < static_cast<int>(CommonVF) && !EMask.empty()) + Idx = EMask[Idx]; + else if (Idx >= static_cast<int>(CommonVF)) + Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) + + E->Scalars.size(); + } } + CommonVF = E->Scalars.size(); } V1 = Constant::getNullValue( - FixedVectorType::get(E->Scalars.front()->getType(), VF)); - } - if (!V2 && !P2.isNull()) { - const TreeEntry *E = P2.get<const TreeEntry *>(); + FixedVectorType::get(E->Scalars.front()->getType(), CommonVF)); + V2 = getAllOnesValue( + *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF)); + } else if (!V1 && P2.isNull()) { + // Shuffle single entry node. 
+ const TreeEntry *E = P1.get<const TreeEntry *>(); unsigned VF = E->getVectorFactor(); - unsigned V1VF = cast<FixedVectorType>(V1->getType())->getNumElements(); - if (!CommonVF && V1VF == E->Scalars.size()) + CommonVF = VF; + assert( + all_of(Mask, + [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) && + "All elements in mask must be less than CommonVF."); + if (E->Scalars.size() == Mask.size() && VF != Mask.size()) { + SmallVector<int> EMask = E->getCommonMask(); + assert(!EMask.empty() && "Expected non-empty common mask."); + for (int &Idx : CommonMask) { + if (Idx != PoisonMaskElem) + Idx = EMask[Idx]; + } CommonVF = E->Scalars.size(); - if (CommonVF) - VF = CommonVF; - V2 = Constant::getNullValue( - FixedVectorType::get(E->Scalars.front()->getType(), VF)); - } - return BaseShuffleAnalysis::createShuffle<InstructionCost>(V1, V2, Mask, - Builder); + } + V1 = Constant::getNullValue( + FixedVectorType::get(E->Scalars.front()->getType(), CommonVF)); + } else if (V1 && P2.isNull()) { + // Shuffle single vector. + CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements(); + assert( + all_of(Mask, + [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) && + "All elements in mask must be less than CommonVF."); + } else if (V1 && !V2) { + // Shuffle vector and tree node. + unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements(); + const TreeEntry *E2 = P2.get<const TreeEntry *>(); + CommonVF = std::max(VF, E2->getVectorFactor()); + assert(all_of(Mask, + [=](int Idx) { + return Idx < 2 * static_cast<int>(CommonVF); + }) && + "All elements in mask must be less than 2 * CommonVF."); + if (E2->Scalars.size() == VF && VF != CommonVF) { + SmallVector<int> E2Mask = E2->getCommonMask(); + assert(!E2Mask.empty() && "Expected non-empty common mask."); + for (int &Idx : CommonMask) { + if (Idx == PoisonMaskElem) + continue; + if (Idx >= static_cast<int>(CommonVF)) + Idx = E2Mask[Idx - CommonVF] + VF; + } + CommonVF = VF; + } + V1 = Constant::getNullValue( + FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF)); + V2 = getAllOnesValue( + *R.DL, + FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF)); + } else if (!V1 && V2) { + // Shuffle vector and tree node. 
+ unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements(); + const TreeEntry *E1 = P1.get<const TreeEntry *>(); + CommonVF = std::max(VF, E1->getVectorFactor()); + assert(all_of(Mask, + [=](int Idx) { + return Idx < 2 * static_cast<int>(CommonVF); + }) && + "All elements in mask must be less than 2 * CommonVF."); + if (E1->Scalars.size() == VF && VF != CommonVF) { + SmallVector<int> E1Mask = E1->getCommonMask(); + assert(!E1Mask.empty() && "Expected non-empty common mask."); + for (int &Idx : CommonMask) { + if (Idx == PoisonMaskElem) + continue; + if (Idx >= static_cast<int>(CommonVF)) + Idx = E1Mask[Idx - CommonVF] + VF; + } + CommonVF = VF; + } + V1 = Constant::getNullValue( + FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF)); + V2 = getAllOnesValue( + *R.DL, + FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF)); + } else { + assert(V1 && V2 && "Expected both vectors."); + unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements(); + CommonVF = + std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements()); + assert(all_of(Mask, + [=](int Idx) { + return Idx < 2 * static_cast<int>(CommonVF); + }) && + "All elements in mask must be less than 2 * CommonVF."); + if (V1->getType() != V2->getType()) { + V1 = Constant::getNullValue(FixedVectorType::get( + cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF)); + V2 = getAllOnesValue( + *R.DL, FixedVectorType::get( + cast<FixedVectorType>(V1->getType())->getElementType(), + CommonVF)); + } + } + InVectors.front() = Constant::getNullValue(FixedVectorType::get( + cast<FixedVectorType>(V1->getType())->getElementType(), + CommonMask.size())); + if (InVectors.size() == 2) + InVectors.pop_back(); + return BaseShuffleAnalysis::createShuffle<InstructionCost>( + V1, V2, CommonMask, Builder); } public: ShuffleCostEstimator(TargetTransformInfo &TTI, ArrayRef<Value *> VectorizedVals, BoUpSLP &R, SmallPtrSetImpl<Value *> &CheckedExtracts) - : TTI(TTI), VectorizedVals(VectorizedVals), R(R), - CheckedExtracts(CheckedExtracts) {} - Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask, - TTI::ShuffleKind ShuffleKind) { + : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), + R(R), CheckedExtracts(CheckedExtracts) {} + Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask, + ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, + unsigned NumParts, bool &UseVecBaseAsInput) { + UseVecBaseAsInput = false; if (Mask.empty()) return nullptr; Value *VecBase = nullptr; ArrayRef<Value *> VL = E->Scalars; - auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); // If the resulting type is scalarized, do not adjust the cost. - unsigned VecNumParts = TTI.getNumberOfParts(VecTy); - if (VecNumParts == VecTy->getNumElements()) + if (NumParts == VL.size()) return nullptr; - DenseMap<Value *, int> ExtractVectorsTys; - for (auto [I, V] : enumerate(VL)) { - // Ignore non-extractelement scalars. - if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem)) - continue; - // If all users of instruction are going to be vectorized and this - // instruction itself is not going to be vectorized, consider this - // instruction as dead and remove its cost from the final cost of the - // vectorized tree. - // Also, avoid adjusting the cost for extractelements with multiple uses - // in different graph entries. 
- const TreeEntry *VE = R.getTreeEntry(V); - if (!CheckedExtracts.insert(V).second || - !R.areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || - (VE && VE != E)) - continue; - auto *EE = cast<ExtractElementInst>(V); - VecBase = EE->getVectorOperand(); - std::optional<unsigned> EEIdx = getExtractIndex(EE); - if (!EEIdx) - continue; - unsigned Idx = *EEIdx; - if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) { - auto It = - ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; - It->getSecond() = std::min<int>(It->second, Idx); - } - // Take credit for instruction that will become dead. - if (EE->hasOneUse()) { - Instruction *Ext = EE->user_back(); - if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) { - return isa<GetElementPtrInst>(U); - })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), - EE->getVectorOperandType(), Idx); - // Add back the cost of s|zext which is subtracted separately. - Cost += TTI.getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EE->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); + // Check if it can be considered reused if same extractelements were + // vectorized already. + bool PrevNodeFound = any_of( + ArrayRef(R.VectorizableTree).take_front(E->Idx), + [&](const std::unique_ptr<TreeEntry> &TE) { + return ((!TE->isAltShuffle() && + TE->getOpcode() == Instruction::ExtractElement) || + TE->State == TreeEntry::NeedToGather) && + all_of(enumerate(TE->Scalars), [&](auto &&Data) { + return VL.size() > Data.index() && + (Mask[Data.index()] == PoisonMaskElem || + isa<UndefValue>(VL[Data.index()]) || + Data.value() == VL[Data.index()]); + }); + }); + SmallPtrSet<Value *, 4> UniqueBases; + unsigned SliceSize = VL.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize); + for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) { + // Ignore non-extractelement scalars. + if (isa<UndefValue>(V) || + (!SubMask.empty() && SubMask[I] == PoisonMaskElem)) continue; - } - } - Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, - Idx); - } - // Add a cost for subvector extracts/inserts if required. - for (const auto &Data : ExtractVectorsTys) { - auto *EEVTy = cast<FixedVectorType>(Data.first->getType()); - unsigned NumElts = VecTy->getNumElements(); - if (Data.second % NumElts == 0) - continue; - if (TTI.getNumberOfParts(EEVTy) > VecNumParts) { - unsigned Idx = (Data.second / NumElts) * NumElts; - unsigned EENumElts = EEVTy->getNumElements(); - if (Idx % NumElts == 0) + // If all users of instruction are going to be vectorized and this + // instruction itself is not going to be vectorized, consider this + // instruction as dead and remove its cost from the final cost of the + // vectorized tree. + // Also, avoid adjusting the cost for extractelements with multiple uses + // in different graph entries. 
+ auto *EE = cast<ExtractElementInst>(V); + VecBase = EE->getVectorOperand(); + UniqueBases.insert(VecBase); + const TreeEntry *VE = R.getTreeEntry(V); + if (!CheckedExtracts.insert(V).second || + !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) || + (VE && VE != E)) continue; - if (Idx + NumElts <= EENumElts) { - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, VecTy); - } else { - // Need to round up the subvector type vectorization factor to avoid a - // crash in cost model functions. Make SubVT so that Idx + VF of SubVT - // <= EENumElts. - auto *SubVT = - FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, SubVT); + std::optional<unsigned> EEIdx = getExtractIndex(EE); + if (!EEIdx) + continue; + unsigned Idx = *EEIdx; + // Take credit for instruction that will become dead. + if (EE->hasOneUse() || !PrevNodeFound) { + Instruction *Ext = EE->user_back(); + if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) { + return isa<GetElementPtrInst>(U); + })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + Cost -= + TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), + EE->getVectorOperandType(), Idx); + // Add back the cost of s|zext which is subtracted separately. + Cost += TTI.getCastInstrCost( + Ext->getOpcode(), Ext->getType(), EE->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); + continue; + } } - } else { - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, - VecTy, std::nullopt, CostKind, 0, EEVTy); + Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), + CostKind, Idx); } } // Check that gather of extractelements can be represented as just a @@ -7192,31 +7504,152 @@ public: // Found the bunch of extractelement instructions that must be gathered // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. - Cost += computeExtractCost(VL, Mask, ShuffleKind); + // Done for reused if same extractelements were vectorized already. + if (!PrevNodeFound) + Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts); + InVectors.assign(1, E); + CommonMask.assign(Mask.begin(), Mask.end()); + transformMaskAfterShuffle(CommonMask, CommonMask); + SameNodesEstimated = false; + if (NumParts != 1 && UniqueBases.size() != 1) { + UseVecBaseAsInput = true; + VecBase = Constant::getNullValue( + FixedVectorType::get(VL.front()->getType(), CommonMask.size())); + } return VecBase; } - void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) { - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign({E1, E2}); + /// Checks if the specified entry \p E needs to be delayed because of its + /// dependency nodes. + std::optional<InstructionCost> + needToDelay(const TreeEntry *, + ArrayRef<SmallVector<const TreeEntry *>>) const { + // No need to delay the cost estimation during analysis. 
+ return std::nullopt; } - void add(const TreeEntry *E1, ArrayRef<int> Mask) { - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, E1); + void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) { + if (&E1 == &E2) { + assert(all_of(Mask, + [&](int Idx) { + return Idx < static_cast<int>(E1.getVectorFactor()); + }) && + "Expected single vector shuffle mask."); + add(E1, Mask); + return; + } + if (InVectors.empty()) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign({&E1, &E2}); + return; + } + assert(!CommonMask.empty() && "Expected non-empty common mask."); + auto *MaskVecTy = + FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + if (NumParts == 0 || NumParts >= Mask.size()) + NumParts = 1; + unsigned SliceSize = Mask.size() / NumParts; + const auto *It = + find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); + unsigned Part = std::distance(Mask.begin(), It) / SliceSize; + estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize); + } + void add(const TreeEntry &E1, ArrayRef<int> Mask) { + if (InVectors.empty()) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, &E1); + return; + } + assert(!CommonMask.empty() && "Expected non-empty common mask."); + auto *MaskVecTy = + FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + if (NumParts == 0 || NumParts >= Mask.size()) + NumParts = 1; + unsigned SliceSize = Mask.size() / NumParts; + const auto *It = + find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); + unsigned Part = std::distance(Mask.begin(), It) / SliceSize; + estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize); + if (!SameNodesEstimated && InVectors.size() == 1) + InVectors.emplace_back(&E1); + } + /// Adds 2 input vectors and the mask for their shuffling. + void add(Value *V1, Value *V2, ArrayRef<int> Mask) { + // May come only for shuffling of 2 vectors with extractelements, already + // handled in adjustExtracts. + assert(InVectors.size() == 1 && + all_of(enumerate(CommonMask), + [&](auto P) { + if (P.value() == PoisonMaskElem) + return Mask[P.index()] == PoisonMaskElem; + auto *EI = + cast<ExtractElementInst>(InVectors.front() + .get<const TreeEntry *>() + ->Scalars[P.index()]); + return EI->getVectorOperand() == V1 || + EI->getVectorOperand() == V2; + }) && + "Expected extractelement vectors."); } /// Adds another one input vector and the mask for the shuffling. - void add(Value *V1, ArrayRef<int> Mask) { - assert(CommonMask.empty() && InVectors.empty() && - "Expected empty input mask/vectors."); - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, V1); + void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) { + if (InVectors.empty()) { + assert(CommonMask.empty() && !ForExtracts && + "Expected empty input mask/vectors."); + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, V1); + return; + } + if (ForExtracts) { + // No need to add vectors here, already handled them in adjustExtracts. 
+ assert(InVectors.size() == 1 && + InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() && + all_of(enumerate(CommonMask), + [&](auto P) { + Value *Scalar = InVectors.front() + .get<const TreeEntry *>() + ->Scalars[P.index()]; + if (P.value() == PoisonMaskElem) + return P.value() == Mask[P.index()] || + isa<UndefValue>(Scalar); + if (isa<Constant>(V1)) + return true; + auto *EI = cast<ExtractElementInst>(Scalar); + return EI->getVectorOperand() == V1; + }) && + "Expected only tree entry for extractelement vectors."); + return; + } + assert(!InVectors.empty() && !CommonMask.empty() && + "Expected only tree entries from extracts/reused buildvectors."); + unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements(); + if (InVectors.size() == 2) { + Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask); + transformMaskAfterShuffle(CommonMask, CommonMask); + VF = std::max<unsigned>(VF, CommonMask.size()); + } else if (const auto *InTE = + InVectors.front().dyn_cast<const TreeEntry *>()) { + VF = std::max(VF, InTE->getVectorFactor()); + } else { + VF = std::max( + VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType()) + ->getNumElements()); + } + InVectors.push_back(V1); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) + CommonMask[Idx] = Mask[Idx] + VF; } - Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) { + Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0, + Value *Root = nullptr) { Cost += getBuildVectorCost(VL, Root); if (!Root) { - assert(InVectors.empty() && "Unexpected input vectors for buildvector."); // FIXME: Need to find a way to avoid use of getNullValue here. SmallVector<Constant *> Vals; - for (Value *V : VL) { + unsigned VF = VL.size(); + if (MaskVF != 0) + VF = std::min(VF, MaskVF); + for (Value *V : VL.take_front(VF)) { if (isa<UndefValue>(V)) { Vals.push_back(cast<Constant>(V)); continue; @@ -7226,9 +7659,11 @@ public: return ConstantVector::get(Vals); } return ConstantVector::getSplat( - ElementCount::getFixed(VL.size()), - Constant::getNullValue(VL.front()->getType())); + ElementCount::getFixed( + cast<FixedVectorType>(Root->getType())->getNumElements()), + getAllOnesValue(*R.DL, VL.front()->getType())); } + InstructionCost createFreeze(InstructionCost Cost) { return Cost; } /// Finalize emission of the shuffles. 
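The add(V1, Mask) overload above folds another input vector into the running CommonMask by offsetting the new vector's lane indices with the accumulated vector factor (CommonMask[Idx] = Mask[Idx] + VF). That matches the usual two-source shuffle convention: mask entries below VF address lanes of the first source, entries in [VF, 2*VF) address the second source, and a poison entry (-1) leaves the lane unspecified. Below is a minimal standalone sketch of that convention in plain C++; kPoison and applyShuffle are illustrative names, not LLVM APIs.

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Sentinel mirroring PoisonMaskElem: the lane's value is unspecified.
    constexpr int kPoison = -1;

    // Apply a two-source shuffle mask: indices [0, VF) pick from A,
    // indices [VF, 2*VF) pick from B, kPoison yields a "don't care" lane.
    std::vector<int> applyShuffle(const std::vector<int> &A,
                                  const std::vector<int> &B,
                                  const std::vector<int> &Mask) {
      assert(A.size() == B.size() && "sources assumed to have equal VF");
      const int VF = static_cast<int>(A.size());
      std::vector<int> Out;
      Out.reserve(Mask.size());
      for (int Idx : Mask) {
        if (Idx == kPoison)
          Out.push_back(0);           // arbitrary: lane is undefined
        else if (Idx < VF)
          Out.push_back(A[Idx]);      // lane comes from the first source
        else
          Out.push_back(B[Idx - VF]); // lane comes from the second source
      }
      return Out;
    }

    int main() {
      std::vector<int> A = {10, 11, 12, 13};
      std::vector<int> B = {20, 21, 22, 23};
      // Take lanes 0 and 2 of A, then lanes 1 and 3 of B (offset by VF == 4).
      std::vector<int> Mask = {0, 2, 4 + 1, 4 + 3};
      for (int V : applyShuffle(A, B, Mask))
        std::printf("%d ", V);        // prints: 10 12 21 23
      std::printf("\n");
      return 0;
    }

With that convention, the Mask[Idx] + VF adjustment simply re-targets lanes of the newly appended vector so they select from the second source of the combined shuffle.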
InstructionCost finalize(ArrayRef<int> ExtMask, unsigned VF = 0, @@ -7236,31 +7671,24 @@ public: IsFinalized = true; if (Action) { const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front(); - if (InVectors.size() == 2) { + if (InVectors.size() == 2) Cost += createShuffle(Vec, InVectors.back(), CommonMask); - InVectors.pop_back(); - } else { + else Cost += createShuffle(Vec, nullptr, CommonMask); - } for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) if (CommonMask[Idx] != PoisonMaskElem) CommonMask[Idx] = Idx; assert(VF > 0 && "Expected vector length for the final value before action."); - Value *V = Vec.dyn_cast<Value *>(); - if (!Vec.isNull() && !V) - V = Constant::getNullValue(FixedVectorType::get( - Vec.get<const TreeEntry *>()->Scalars.front()->getType(), - CommonMask.size())); + Value *V = Vec.get<Value *>(); Action(V, CommonMask); + InVectors.front() = V; } ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true); - if (CommonMask.empty()) - return Cost; - int Limit = CommonMask.size() * 2; - if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) && - ShuffleVectorInst::isIdentityMask(CommonMask)) + if (CommonMask.empty()) { + assert(InVectors.size() == 1 && "Expected only one vector with no mask"); return Cost; + } return Cost + createShuffle(InVectors.front(), InVectors.size() == 2 ? InVectors.back() : nullptr, @@ -7273,28 +7701,63 @@ public: } }; +const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E, + unsigned Idx) const { + Value *Op = E->getOperand(Idx).front(); + if (const TreeEntry *TE = getTreeEntry(Op)) { + if (find_if(E->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.EdgeIdx == Idx && EI.UserTE == E; + }) != TE->UserTreeIndices.end()) + return TE; + auto MIt = MultiNodeScalars.find(Op); + if (MIt != MultiNodeScalars.end()) { + for (const TreeEntry *TE : MIt->second) { + if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.EdgeIdx == Idx && EI.UserTE == E; + }) != TE->UserTreeIndices.end()) + return TE; + } + } + } + const auto *It = + find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { + return TE->State == TreeEntry::NeedToGather && + find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.EdgeIdx == Idx && EI.UserTE == E; + }) != TE->UserTreeIndices.end(); + }); + assert(It != VectorizableTree.end() && "Expected vectorizable entry."); + return It->get(); +} + InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, SmallPtrSetImpl<Value *> &CheckedExtracts) { ArrayRef<Value *> VL = E->Scalars; Type *ScalarTy = VL[0]->getType(); - if (auto *SI = dyn_cast<StoreInst>(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - else if (auto *CI = dyn_cast<CmpInst>(VL[0])) - ScalarTy = CI->getOperand(0)->getType(); - else if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) - ScalarTy = IE->getOperand(1)->getType(); + if (E->State != TreeEntry::NeedToGather) { + if (auto *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + else if (auto *CI = dyn_cast<CmpInst>(VL[0])) + ScalarTy = CI->getOperand(0)->getType(); + else if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) + ScalarTy = IE->getOperand(1)->getType(); + } + if (!FixedVectorType::isValidElementType(ScalarTy)) + return InstructionCost::getInvalid(); auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; // If we have computed a smaller type for the expression, update VecTy so // that the costs 
will be accurate. - if (MinBWs.count(VL[0])) - VecTy = FixedVectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + auto It = MinBWs.find(E); + if (It != MinBWs.end()) { + ScalarTy = IntegerType::get(F->getContext(), It->second.first); + VecTy = FixedVectorType::get(ScalarTy, VL.size()); + } unsigned EntryVF = E->getVectorFactor(); - auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); + auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->State == TreeEntry::NeedToGather) { @@ -7302,121 +7765,13 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, return 0; if (isa<InsertElementInst>(VL[0])) return InstructionCost::getInvalid(); - ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this, - CheckedExtracts); - unsigned VF = E->getVectorFactor(); - SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), - E->ReuseShuffleIndices.end()); - SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end()); - // Build a mask out of the reorder indices and reorder scalars per this - // mask. - SmallVector<int> ReorderMask; - inversePermutation(E->ReorderIndices, ReorderMask); - if (!ReorderMask.empty()) - reorderScalars(GatheredScalars, ReorderMask); - SmallVector<int> Mask; - SmallVector<int> ExtractMask; - std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle; - std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle; - SmallVector<const TreeEntry *> Entries; - Type *ScalarTy = GatheredScalars.front()->getType(); - // Check for gathered extracts. - ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); - SmallVector<Value *> IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); - - bool Resized = false; - if (Value *VecBase = Estimator.adjustExtracts( - E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) - if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } - - // Do not try to look for reshuffled loads for gathered loads (they will be - // handled later), for vectorized scalars, and cases, which are definitely - // not profitable (splats and small gather nodes.) - if (ExtractShuffle || E->getOpcode() != Instruction::Load || - E->isAltShuffle() || - all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || - isSplat(E->Scalars) || - (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); - if (GatherShuffle) { - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && - Entries.front()->isSame(E->Scalars)) { - // Perfect match in the graph, will reuse the previously vectorized - // node. Cost is 0. - LLVM_DEBUG( - dbgs() - << "SLP: perfect diamond match for gather bundle that starts with " - << *VL.front() << ".\n"); - // Restore the mask for previous partially matched values. 
- for (auto [I, V] : enumerate(E->Scalars)) { - if (isa<PoisonValue>(V)) { - Mask[I] = PoisonMaskElem; - continue; - } - if (Mask[I] == PoisonMaskElem) - Mask[I] = Entries.front()->findLaneForValue(V); - } - Estimator.add(Entries.front(), Mask); - return Estimator.finalize(E->ReuseShuffleIndices); - } - if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } - // Remove shuffled elements from list of gathers. - for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { - if (Mask[I] != PoisonMaskElem) - GatheredScalars[I] = PoisonValue::get(ScalarTy); - } - LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() - << " entries for bundle that starts with " - << *VL.front() << ".\n";); - if (Entries.size() == 1) - Estimator.add(Entries.front(), Mask); - else - Estimator.add(Entries.front(), Entries.back(), Mask); - if (all_of(GatheredScalars, PoisonValue ::classof)) - return Estimator.finalize(E->ReuseShuffleIndices); - return Estimator.finalize( - E->ReuseShuffleIndices, E->Scalars.size(), - [&](Value *&Vec, SmallVectorImpl<int> &Mask) { - Vec = Estimator.gather(GatheredScalars, - Constant::getNullValue(FixedVectorType::get( - GatheredScalars.front()->getType(), - GatheredScalars.size()))); - }); - } - if (!all_of(GatheredScalars, PoisonValue::classof)) { - auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size()); - bool SameGathers = VL.equals(Gathers); - Value *BV = Estimator.gather( - Gathers, SameGathers ? nullptr - : Constant::getNullValue(FixedVectorType::get( - GatheredScalars.front()->getType(), - GatheredScalars.size()))); - SmallVector<int> ReuseMask(Gathers.size(), PoisonMaskElem); - std::iota(ReuseMask.begin(), ReuseMask.end(), 0); - Estimator.add(BV, ReuseMask); - } - if (ExtractShuffle) - Estimator.add(E, std::nullopt); - return Estimator.finalize(E->ReuseShuffleIndices); + return processBuildVector<ShuffleCostEstimator, InstructionCost>( + E, *TTI, VectorizedVals, *this, CheckedExtracts); } InstructionCost CommonCost = 0; SmallVector<int> Mask; - if (!E->ReorderIndices.empty()) { + if (!E->ReorderIndices.empty() && + E->State != TreeEntry::PossibleStridedVectorize) { SmallVector<int> NewMask; if (E->getOpcode() == Instruction::Store) { // For stores the order is actually a mask. @@ -7429,11 +7784,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } if (NeedToShuffleReuses) ::addMask(Mask, E->ReuseShuffleIndices); - if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask)) + if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) CommonCost = TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask); assert((E->State == TreeEntry::Vectorize || - E->State == TreeEntry::ScatterVectorize) && + E->State == TreeEntry::ScatterVectorize || + E->State == TreeEntry::PossibleStridedVectorize) && "Unhandled state"); assert(E->getOpcode() && ((allSameType(VL) && allSameBlock(VL)) || @@ -7443,7 +7799,34 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = E->isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : E->getOpcode(); - const unsigned Sz = VL.size(); + SetVector<Value *> UniqueValues(VL.begin(), VL.end()); + const unsigned Sz = UniqueValues.size(); + SmallBitVector UsedScalars(Sz, false); + for (unsigned I = 0; I < Sz; ++I) { + if (getTreeEntry(UniqueValues[I]) == E) + continue; + UsedScalars.set(I); + } + auto GetCastContextHint = [&](Value *V) { + if (const TreeEntry *OpTE = getTreeEntry(V)) { + if (OpTE->State == TreeEntry::ScatterVectorize) + return TTI::CastContextHint::GatherScatter; + if (OpTE->State == TreeEntry::Vectorize && + OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) { + if (OpTE->ReorderIndices.empty()) + return TTI::CastContextHint::Normal; + SmallVector<int> Mask; + inversePermutation(OpTE->ReorderIndices, Mask); + if (ShuffleVectorInst::isReverseMask(Mask, Mask.size())) + return TTI::CastContextHint::Reversed; + } + } else { + InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI); + if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle()) + return TTI::CastContextHint::GatherScatter; + } + return TTI::CastContextHint::None; + }; auto GetCostDiff = [=](function_ref<InstructionCost(unsigned)> ScalarEltCost, function_ref<InstructionCost(InstructionCost)> VectorCost) { @@ -7453,13 +7836,49 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, // For some of the instructions no need to calculate cost for each // particular instruction, we can use the cost of the single // instruction x total number of scalar instructions. - ScalarCost = Sz * ScalarEltCost(0); + ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0); } else { - for (unsigned I = 0; I < Sz; ++I) + for (unsigned I = 0; I < Sz; ++I) { + if (UsedScalars.test(I)) + continue; ScalarCost += ScalarEltCost(I); + } } InstructionCost VecCost = VectorCost(CommonCost); + // Check if the current node must be resized, if the parent node is not + // resized. + if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) { + const EdgeInfo &EI = E->UserTreeIndices.front(); + if ((EI.UserTE->getOpcode() != Instruction::Select || + EI.EdgeIdx != 0) && + It != MinBWs.end()) { + auto UserBWIt = MinBWs.find(EI.UserTE); + Type *UserScalarTy = + EI.UserTE->getOperand(EI.EdgeIdx).front()->getType(); + if (UserBWIt != MinBWs.end()) + UserScalarTy = IntegerType::get(ScalarTy->getContext(), + UserBWIt->second.first); + if (ScalarTy != UserScalarTy) { + unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); + unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy); + unsigned VecOpcode; + auto *SrcVecTy = + FixedVectorType::get(UserScalarTy, E->getVectorFactor()); + if (BWSz > SrcBWSz) + VecOpcode = Instruction::Trunc; + else + VecOpcode = + It->second.second ? Instruction::SExt : Instruction::ZExt; + TTI::CastContextHint CCH = GetCastContextHint(VL0); + VecCost += TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, + CostKind); + ScalarCost += + Sz * TTI->getCastInstrCost(VecOpcode, ScalarTy, UserScalarTy, + CCH, CostKind); + } + } + } LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost, "Calculated costs for Tree")); return VecCost - ScalarCost; @@ -7550,7 +7969,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, // Count reused scalars. 
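The resize handling above decides which cast to cost when a node's minimum bit width (MinBWs) differs from its user's: a wider source is truncated, a narrower one is sign- or zero-extended according to the recorded signedness, and equal widths degenerate into a bitcast whose cost is not counted (the cast case that follows applies the same rule). A small sketch of that selection, using hypothetical names (pickCast, CastOp) rather than LLVM's Instruction opcodes:

    #include <cstdio>

    enum class CastOp { BitCast, Trunc, SExt, ZExt };

    // Pick the cast needed to bring a value of SrcBits into DstBits, given
    // whether the demoted value must keep its sign. Mirrors, in spirit, the
    // BWSz/SrcBWSz comparison in the hunks above; names are illustrative.
    CastOp pickCast(unsigned DstBits, unsigned SrcBits, bool IsSigned) {
      if (DstBits == SrcBits)
        return CastOp::BitCast;                       // same width: a no-op
      if (DstBits < SrcBits)
        return CastOp::Trunc;                         // narrowing
      return IsSigned ? CastOp::SExt : CastOp::ZExt;  // widening
    }

    const char *name(CastOp Op) {
      switch (Op) {
      case CastOp::BitCast: return "bitcast";
      case CastOp::Trunc:   return "trunc";
      case CastOp::SExt:    return "sext";
      case CastOp::ZExt:    return "zext";
      }
      return "?";
    }

    int main() {
      std::printf("%s\n", name(pickCast(8, 32, /*IsSigned=*/false)));  // trunc
      std::printf("%s\n", name(pickCast(32, 8, /*IsSigned=*/true)));   // sext
      std::printf("%s\n", name(pickCast(16, 16, /*IsSigned=*/false))); // bitcast
      return 0;
    }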
InstructionCost ScalarCost = 0; SmallPtrSet<const TreeEntry *, 4> CountedOps; - for (Value *V : VL) { + for (Value *V : UniqueValues) { auto *PHI = dyn_cast<PHINode>(V); if (!PHI) continue; @@ -7571,8 +7990,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } case Instruction::ExtractValue: case Instruction::ExtractElement: { - auto GetScalarCost = [=](unsigned Idx) { - auto *I = cast<Instruction>(VL[Idx]); + auto GetScalarCost = [&](unsigned Idx) { + auto *I = cast<Instruction>(UniqueValues[Idx]); VectorType *SrcVecTy; if (ShuffleOrOp == Instruction::ExtractElement) { auto *EE = cast<ExtractElementInst>(I); @@ -7680,8 +8099,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, // need to shift the vector. // Do not calculate the cost if the actual size is the register size and // we can merge this shuffle with the following SK_Select. - auto *InsertVecTy = - FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz); + auto *InsertVecTy = FixedVectorType::get(ScalarTy, InsertVecSz); if (!IsIdentity) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, InsertVecTy, Mask); @@ -7697,8 +8115,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask)); if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) { if (InsertVecSz != VecSz) { - auto *ActualVecTy = - FixedVectorType::get(SrcVecTy->getElementType(), VecSz); + auto *ActualVecTy = FixedVectorType::get(ScalarTy, VecSz); Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, std::nullopt, CostKind, OffsetBeg - Offset, InsertVecTy); @@ -7729,22 +8146,52 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - auto GetScalarCost = [=](unsigned Idx) { - auto *VI = cast<Instruction>(VL[Idx]); - return TTI->getCastInstrCost(E->getOpcode(), ScalarTy, - VI->getOperand(0)->getType(), + auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); + Type *SrcScalarTy = VL0->getOperand(0)->getType(); + auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size()); + unsigned Opcode = ShuffleOrOp; + unsigned VecOpcode = Opcode; + if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() && + (SrcIt != MinBWs.end() || It != MinBWs.end())) { + // Check if the values are candidates to demote. + unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy); + if (SrcIt != MinBWs.end()) { + SrcBWSz = SrcIt->second.first; + SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz); + SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size()); + } + unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); + if (BWSz == SrcBWSz) { + VecOpcode = Instruction::BitCast; + } else if (BWSz < SrcBWSz) { + VecOpcode = Instruction::Trunc; + } else if (It != MinBWs.end()) { + assert(BWSz > SrcBWSz && "Invalid cast!"); + VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt; + } + } + auto GetScalarCost = [&](unsigned Idx) -> InstructionCost { + // Do not count cost here if minimum bitwidth is in effect and it is just + // a bitcast (here it is just a noop). + if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast) + return TTI::TCC_Free; + auto *VI = VL0->getOpcode() == Opcode + ? 
cast<Instruction>(UniqueValues[Idx]) + : nullptr; + return TTI->getCastInstrCost(Opcode, VL0->getType(), + VL0->getOperand(0)->getType(), TTI::getCastContextHint(VI), CostKind, VI); }; auto GetVectorCost = [=](InstructionCost CommonCost) { - Type *SrcTy = VL0->getOperand(0)->getType(); - auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); - InstructionCost VecCost = CommonCost; - // Check if the values are candidates to demote. - if (!MinBWs.count(VL0) || VecTy != SrcVecTy) - VecCost += - TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, - TTI::getCastContextHint(VL0), CostKind, VL0); - return VecCost; + // Do not count cost here if minimum bitwidth is in effect and it is just + // a bitcast (here it is just a noop). + if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast) + return CommonCost; + auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr; + TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0)); + return CommonCost + + TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind, + VecOpcode == Opcode ? VI : nullptr); }; return GetCostDiff(GetScalarCost, GetVectorCost); } @@ -7761,7 +8208,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, ? CmpInst::BAD_FCMP_PREDICATE : CmpInst::BAD_ICMP_PREDICATE; auto GetScalarCost = [&](unsigned Idx) { - auto *VI = cast<Instruction>(VL[Idx]); + auto *VI = cast<Instruction>(UniqueValues[Idx]); CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy() ? CmpInst::BAD_FCMP_PREDICATE : CmpInst::BAD_ICMP_PREDICATE; @@ -7821,8 +8268,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, case Instruction::And: case Instruction::Or: case Instruction::Xor: { - auto GetScalarCost = [=](unsigned Idx) { - auto *VI = cast<Instruction>(VL[Idx]); + auto GetScalarCost = [&](unsigned Idx) { + auto *VI = cast<Instruction>(UniqueValues[Idx]); unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1; TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0)); TTI::OperandValueInfo Op2Info = @@ -7833,8 +8280,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, }; auto GetVectorCost = [=](InstructionCost CommonCost) { unsigned OpIdx = isa<UnaryOperator>(VL0) ? 
0 : 1; - TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0); - TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx); + TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0)); + TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx)); return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info, Op2Info) + CommonCost; @@ -7845,23 +8292,25 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, return CommonCost + GetGEPCostDiff(VL, VL0); } case Instruction::Load: { - auto GetScalarCost = [=](unsigned Idx) { - auto *VI = cast<LoadInst>(VL[Idx]); + auto GetScalarCost = [&](unsigned Idx) { + auto *VI = cast<LoadInst>(UniqueValues[Idx]); return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(), VI->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo(), VI); }; auto *LI0 = cast<LoadInst>(VL0); - auto GetVectorCost = [=](InstructionCost CommonCost) { + auto GetVectorCost = [&](InstructionCost CommonCost) { InstructionCost VecLdCost; if (E->State == TreeEntry::Vectorize) { VecLdCost = TTI->getMemoryOpCost( Instruction::Load, VecTy, LI0->getAlign(), LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo()); } else { - assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); + assert((E->State == TreeEntry::ScatterVectorize || + E->State == TreeEntry::PossibleStridedVectorize) && + "Unknown EntryState"); Align CommonAlignment = LI0->getAlign(); - for (Value *V : VL) + for (Value *V : UniqueValues) CommonAlignment = std::min(CommonAlignment, cast<LoadInst>(V)->getAlign()); VecLdCost = TTI->getGatherScatterOpCost( @@ -7874,7 +8323,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost); // If this node generates masked gather load then it is not a terminal node. // Hence address operand cost is estimated separately. - if (E->State == TreeEntry::ScatterVectorize) + if (E->State == TreeEntry::ScatterVectorize || + E->State == TreeEntry::PossibleStridedVectorize) return Cost; // Estimate cost of GEPs since this tree node is a terminator. @@ -7887,7 +8337,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, bool IsReorder = !E->ReorderIndices.empty(); auto GetScalarCost = [=](unsigned Idx) { auto *VI = cast<StoreInst>(VL[Idx]); - TTI::OperandValueInfo OpInfo = getOperandInfo(VI, 0); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand()); return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(), VI->getPointerAddressSpace(), CostKind, OpInfo, VI); @@ -7896,7 +8346,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); auto GetVectorCost = [=](InstructionCost CommonCost) { // We know that we can merge the stores. Calculate the cost. 
- TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0); + TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0)); return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind, OpInfo) + @@ -7912,8 +8362,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand()); } case Instruction::Call: { - auto GetScalarCost = [=](unsigned Idx) { - auto *CI = cast<CallInst>(VL[Idx]); + auto GetScalarCost = [&](unsigned Idx) { + auto *CI = cast<CallInst>(UniqueValues[Idx]); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (ID != Intrinsic::not_intrinsic) { IntrinsicCostAttributes CostAttrs(ID, *CI, 1); @@ -7954,8 +8404,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } return false; }; - auto GetScalarCost = [=](unsigned Idx) { - auto *VI = cast<Instruction>(VL[Idx]); + auto GetScalarCost = [&](unsigned Idx) { + auto *VI = cast<Instruction>(UniqueValues[Idx]); assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode"); (void)E; return TTI->getInstructionCost(VI, CostKind); @@ -7995,21 +8445,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, TTI::CastContextHint::None, CostKind); } - if (E->ReuseShuffleIndices.empty()) { - VecCost += - TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy); - } else { - SmallVector<int> Mask; - buildShuffleEntryMask( - E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, - [E](Instruction *I) { - assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - return I->getOpcode() == E->getAltOpcode(); - }, - Mask); - VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, - FinalVecTy, Mask); - } + SmallVector<int> Mask; + E->buildAltOpShuffleMask( + [E](Instruction *I) { + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + return I->getOpcode() == E->getAltOpcode(); + }, + Mask); + VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, + FinalVecTy, Mask); return VecCost; }; return GetCostDiff(GetScalarCost, GetVectorCost); @@ -8065,7 +8509,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { // Gathering cost would be too much for tiny trees. if (VectorizableTree[0]->State == TreeEntry::NeedToGather || (VectorizableTree[1]->State == TreeEntry::NeedToGather && - VectorizableTree[0]->State != TreeEntry::ScatterVectorize)) + VectorizableTree[0]->State != TreeEntry::ScatterVectorize && + VectorizableTree[0]->State != TreeEntry::PossibleStridedVectorize)) return false; return true; @@ -8144,6 +8589,23 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { allConstant(VectorizableTree[1]->Scalars)))) return true; + // If the graph includes only PHI nodes and gathers, it is defnitely not + // profitable for the vectorization, we can skip it, if the cost threshold is + // default. The cost of vectorized PHI nodes is almost always 0 + the cost of + // gathers/buildvectors. 
+ constexpr int Limit = 4; + if (!ForReduction && !SLPCostThreshold.getNumOccurrences() && + !VectorizableTree.empty() && + all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { + return (TE->State == TreeEntry::NeedToGather && + TE->getOpcode() != Instruction::ExtractElement && + count_if(TE->Scalars, + [](Value *V) { return isa<ExtractElementInst>(V); }) <= + Limit) || + TE->getOpcode() == Instruction::PHI; + })) + return true; + // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. if (VectorizableTree.size() >= MinTreeSize) @@ -8435,16 +8897,6 @@ static T *performExtractsShuffleAction( } InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { - // Build a map for gathered scalars to the nodes where they are used. - ValueToGatherNodes.clear(); - for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) { - if (EntryPtr->State != TreeEntry::NeedToGather) - continue; - for (Value *V : EntryPtr->Scalars) - if (!isConstant(V)) - ValueToGatherNodes.try_emplace(V).first->getSecond().insert( - EntryPtr.get()); - } InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); @@ -8460,8 +8912,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { E->isSame(TE.Scalars)) { // Some gather nodes might be absolutely the same as some vectorizable // nodes after reordering, need to handle it. - LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle that starts with " - << *TE.Scalars[0] << ".\n" + LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle " + << shortBundleName(TE.Scalars) << ".\n" << "SLP: Current total cost = " << Cost << "\n"); continue; } @@ -8469,9 +8921,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); Cost += C; - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for bundle that starts with " << *TE.Scalars[0] - << ".\n" + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle " + << shortBundleName(TE.Scalars) << ".\n" << "SLP: Current total cost = " << Cost << "\n"); } @@ -8480,6 +8931,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks; SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers; SmallVector<APInt> DemandedElts; + SmallDenseSet<Value *, 4> UsedInserts; + DenseSet<Value *> VectorCasts; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. if (!isa_and_nonnull<InsertElementInst>(EU.User) && @@ -8500,6 +8953,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // to detect it as a final shuffled/identity match. 
if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) { if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { + if (!UsedInserts.insert(VU).second) + continue; std::optional<unsigned> InsertIdx = getInsertIndex(VU); if (InsertIdx) { const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar); @@ -8546,6 +9001,28 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { FirstUsers.emplace_back(VU, ScalarTE); DemandedElts.push_back(APInt::getZero(FTy->getNumElements())); VecId = FirstUsers.size() - 1; + auto It = MinBWs.find(ScalarTE); + if (It != MinBWs.end() && VectorCasts.insert(EU.Scalar).second) { + unsigned BWSz = It->second.second; + unsigned SrcBWSz = DL->getTypeSizeInBits(FTy->getElementType()); + unsigned VecOpcode; + if (BWSz < SrcBWSz) + VecOpcode = Instruction::Trunc; + else + VecOpcode = + It->second.second ? Instruction::SExt : Instruction::ZExt; + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + InstructionCost C = TTI->getCastInstrCost( + VecOpcode, FTy, + FixedVectorType::get( + IntegerType::get(FTy->getContext(), It->second.first), + FTy->getNumElements()), + TTI::CastContextHint::None, CostKind); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for extending externally used vector with " + "non-equal minimum bitwidth.\n"); + Cost += C; + } } else { if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first))) It->first = VU; @@ -8567,11 +9044,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // for the extract and the added cost of the sign extend if needed. auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto Extend = - MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; + auto It = MinBWs.find(getTreeEntry(EU.Scalar)); + if (It != MinBWs.end()) { + auto *MinTy = IntegerType::get(F->getContext(), It->second.first); + unsigned Extend = + It->second.second ? Instruction::SExt : Instruction::ZExt; VecTy = FixedVectorType::get(MinTy, BundleWidth); ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), VecTy, EU.Lane); @@ -8580,6 +9057,21 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { CostKind, EU.Lane); } } + // Add reduced value cost, if resized. + if (!VectorizedVals.empty()) { + auto BWIt = MinBWs.find(VectorizableTree.front().get()); + if (BWIt != MinBWs.end()) { + Type *DstTy = VectorizableTree.front()->Scalars.front()->getType(); + unsigned OriginalSz = DL->getTypeSizeInBits(DstTy); + unsigned Opcode = Instruction::Trunc; + if (OriginalSz < BWIt->second.first) + Opcode = BWIt->second.second ? 
Instruction::SExt : Instruction::ZExt; + Type *SrcTy = IntegerType::get(DstTy->getContext(), BWIt->second.first); + Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy, + TTI::CastContextHint::None, + TTI::TCK_RecipThroughput); + } + } InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; @@ -8590,9 +9082,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { unsigned VecVF = TE->getVectorFactor(); if (VF != VecVF && (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) || - (all_of(Mask, - [VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) && - !ShuffleVectorInst::isIdentityMask(Mask)))) { + !ShuffleVectorInst::isIdentityMask(Mask, VF))) { SmallVector<int> OrigMask(VecVF, PoisonMaskElem); std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), OrigMask.begin()); @@ -8611,19 +9101,23 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // Calculate the cost of the reshuffled vectors, if any. for (int I = 0, E = FirstUsers.size(); I < E; ++I) { Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0); - unsigned VF = ShuffleMasks[I].begin()->second.size(); - auto *FTy = FixedVectorType::get( - cast<VectorType>(FirstUsers[I].first->getType())->getElementType(), VF); auto Vector = ShuffleMasks[I].takeVector(); - auto &&EstimateShufflesCost = [this, FTy, - &Cost](ArrayRef<int> Mask, - ArrayRef<const TreeEntry *> TEs) { + unsigned VF = 0; + auto EstimateShufflesCost = [&](ArrayRef<int> Mask, + ArrayRef<const TreeEntry *> TEs) { assert((TEs.size() == 1 || TEs.size() == 2) && "Expected exactly 1 or 2 tree entries."); if (TEs.size() == 1) { - int Limit = 2 * Mask.size(); - if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) || - !ShuffleVectorInst::isIdentityMask(Mask)) { + if (VF == 0) + VF = TEs.front()->getVectorFactor(); + auto *FTy = + FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF); + if (!ShuffleVectorInst::isIdentityMask(Mask, VF) && + !all_of(enumerate(Mask), [=](const auto &Data) { + return Data.value() == PoisonMaskElem || + (Data.index() < VF && + static_cast<int>(Data.index()) == Data.value()); + })) { InstructionCost C = TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C @@ -8634,6 +9128,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { Cost += C; } } else { + if (VF == 0) { + if (TEs.front() && + TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor()) + VF = TEs.front()->getVectorFactor(); + else + VF = Mask.size(); + } + auto *FTy = + FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF); InstructionCost C = TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C @@ -8643,6 +9146,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { dbgs() << "SLP: Current total cost = " << Cost << "\n"); Cost += C; } + VF = Mask.size(); return TEs.back(); }; (void)performExtractsShuffleAction<const TreeEntry>( @@ -8671,54 +9175,198 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { return Cost; } -std::optional<TargetTransformInfo::ShuffleKind> -BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, - SmallVectorImpl<int> &Mask, - SmallVectorImpl<const TreeEntry *> &Entries) { - Entries.clear(); - // No need to check for the topmost gather node. 
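Several of the reshuffle checks above (for externally used vectors and for resized tree entries) now treat a mask as free when it is an identity up to poison: every element either is PoisonMaskElem or keeps its own lane below the vector factor. A minimal sketch of that predicate, with an illustrative helper name:

    #include <cstdio>
    #include <vector>

    constexpr int kPoison = -1;

    // A mask of width VF needs no shuffle if every element either is poison
    // or simply keeps its own lane, i.e. Mask[I] == I.
    bool isIdentityOrPoison(const std::vector<int> &Mask, unsigned VF) {
      for (unsigned I = 0; I < Mask.size(); ++I) {
        int Elt = Mask[I];
        if (Elt == kPoison)
          continue;                    // undefined lane, costs nothing
        if (I >= VF || Elt != static_cast<int>(I))
          return false;                // lane actually moves: shuffle needed
      }
      return true;
    }

    int main() {
      std::printf("%d\n", isIdentityOrPoison({0, kPoison, 2, 3}, 4)); // 1
      std::printf("%d\n", isIdentityOrPoison({1, 0, 2, 3}, 4));       // 0
      return 0;
    }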
- if (TE == VectorizableTree.front().get()) +/// Tries to find extractelement instructions with constant indices from fixed +/// vector type and gather such instructions into a bunch, which highly likely +/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was +/// successful, the matched scalars are replaced by poison values in \p VL for +/// future analysis. +std::optional<TTI::ShuffleKind> +BoUpSLP::tryToGatherSingleRegisterExtractElements( + MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const { + // Scan list of gathered scalars for extractelements that can be represented + // as shuffles. + MapVector<Value *, SmallVector<int>> VectorOpToIdx; + SmallVector<int> UndefVectorExtracts; + for (int I = 0, E = VL.size(); I < E; ++I) { + auto *EI = dyn_cast<ExtractElementInst>(VL[I]); + if (!EI) { + if (isa<UndefValue>(VL[I])) + UndefVectorExtracts.push_back(I); + continue; + } + auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType()); + if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand())) + continue; + std::optional<unsigned> Idx = getExtractIndex(EI); + // Undefined index. + if (!Idx) { + UndefVectorExtracts.push_back(I); + continue; + } + SmallBitVector ExtractMask(VecTy->getNumElements(), true); + ExtractMask.reset(*Idx); + if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) { + UndefVectorExtracts.push_back(I); + continue; + } + VectorOpToIdx[EI->getVectorOperand()].push_back(I); + } + // Sort the vector operands by the maximum number of uses in extractelements. + MapVector<unsigned, SmallVector<Value *>> VFToVector; + for (const auto &Data : VectorOpToIdx) + VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()] + .push_back(Data.first); + for (auto &Data : VFToVector) { + stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) { + return VectorOpToIdx.find(V1)->second.size() > + VectorOpToIdx.find(V2)->second.size(); + }); + } + // Find the best pair of the vectors with the same number of elements or a + // single vector. + const int UndefSz = UndefVectorExtracts.size(); + unsigned SingleMax = 0; + Value *SingleVec = nullptr; + unsigned PairMax = 0; + std::pair<Value *, Value *> PairVec(nullptr, nullptr); + for (auto &Data : VFToVector) { + Value *V1 = Data.second.front(); + if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) { + SingleMax = VectorOpToIdx[V1].size() + UndefSz; + SingleVec = V1; + } + Value *V2 = nullptr; + if (Data.second.size() > 1) + V2 = *std::next(Data.second.begin()); + if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + + UndefSz) { + PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz; + PairVec = std::make_pair(V1, V2); + } + } + if (SingleMax == 0 && PairMax == 0 && UndefSz == 0) + return std::nullopt; + // Check if better to perform a shuffle of 2 vectors or just of a single + // vector. + SmallVector<Value *> SavedVL(VL.begin(), VL.end()); + SmallVector<Value *> GatheredExtracts( + VL.size(), PoisonValue::get(VL.front()->getType())); + if (SingleMax >= PairMax && SingleMax) { + for (int Idx : VectorOpToIdx[SingleVec]) + std::swap(GatheredExtracts[Idx], VL[Idx]); + } else { + for (Value *V : {PairVec.first, PairVec.second}) + for (int Idx : VectorOpToIdx[V]) + std::swap(GatheredExtracts[Idx], VL[Idx]); + } + // Add extracts from undefs too. 
+ for (int Idx : UndefVectorExtracts) + std::swap(GatheredExtracts[Idx], VL[Idx]); + // Check that gather of extractelements can be represented as just a + // shuffle of a single/two vectors the scalars are extracted from. + std::optional<TTI::ShuffleKind> Res = + isFixedVectorShuffle(GatheredExtracts, Mask); + if (!Res) { + // TODO: try to check other subsets if possible. + // Restore the original VL if attempt was not successful. + copy(SavedVL, VL.begin()); return std::nullopt; + } + // Restore unused scalars from mask, if some of the extractelements were not + // selected for shuffle. + for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) { + if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) && + isa<UndefValue>(GatheredExtracts[I])) { + std::swap(VL[I], GatheredExtracts[I]); + continue; + } + auto *EI = dyn_cast<ExtractElementInst>(VL[I]); + if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) || + !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) || + is_contained(UndefVectorExtracts, I)) + continue; + } + return Res; +} + +/// Tries to find extractelement instructions with constant indices from fixed +/// vector type and gather such instructions into a bunch, which highly likely +/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was +/// successful, the matched scalars are replaced by poison values in \p VL for +/// future analysis. +SmallVector<std::optional<TTI::ShuffleKind>> +BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, + SmallVectorImpl<int> &Mask, + unsigned NumParts) const { + assert(NumParts > 0 && "NumParts expected be greater than or equal to 1."); + SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts); Mask.assign(VL.size(), PoisonMaskElem); - assert(TE->UserTreeIndices.size() == 1 && - "Expected only single user of the gather node."); + unsigned SliceSize = VL.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + // Scan list of gathered scalars for extractelements that can be represented + // as shuffles. + MutableArrayRef<Value *> SubVL = + MutableArrayRef(VL).slice(Part * SliceSize, SliceSize); + SmallVector<int> SubMask; + std::optional<TTI::ShuffleKind> Res = + tryToGatherSingleRegisterExtractElements(SubVL, SubMask); + ShufflesRes[Part] = Res; + copy(SubMask, std::next(Mask.begin(), Part * SliceSize)); + } + if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) { + return Res.has_value(); + })) + ShufflesRes.clear(); + return ShufflesRes; +} + +std::optional<TargetTransformInfo::ShuffleKind> +BoUpSLP::isGatherShuffledSingleRegisterEntry( + const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask, + SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part) { + Entries.clear(); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. - Instruction &UserInst = - getLastInstructionInBundle(TE->UserTreeIndices.front().UserTE); - BasicBlock *ParentBB = nullptr; + const EdgeInfo &TEUseEI = TE->UserTreeIndices.front(); + const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE); + const BasicBlock *TEInsertBlock = nullptr; // Main node of PHI entries keeps the correct order of operands/incoming // blocks. 
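tryToGatherSingleRegisterExtractElements above scans the gathered scalars for extractelement instructions with constant lanes, groups them by source vector, and keeps the one or two vectors covering the most positions so the gather can be modelled as a shuffle; the matched scalars are then replaced by poison for later analysis. The sketch below shows only the grouping-and-mask core for a single best source, using a plain Extract struct as a stand-in for an extractelement with a constant index (all names are hypothetical):

    #include <cstdio>
    #include <map>
    #include <optional>
    #include <vector>

    constexpr int kPoison = -1;

    // Stand-in for "extractelement <src vector>, <constant lane>"; a scalar
    // that is not such an extract is modelled as std::nullopt.
    struct Extract {
      int SrcVec; // identifier of the source vector
      int Lane;   // constant lane being read
    };

    // Find the single source vector that supplies the most scalars and build
    // a shuffle mask selecting those lanes; other positions stay poison and
    // must still be gathered element by element.
    std::vector<int>
    maskForBestSource(const std::vector<std::optional<Extract>> &Scalars) {
      std::map<int, int> CoveragePerVec; // SrcVec -> #lanes it covers
      for (const auto &E : Scalars)
        if (E)
          ++CoveragePerVec[E->SrcVec];

      int BestVec = -1, BestCoverage = 0;
      for (const auto &[Vec, Count] : CoveragePerVec)
        if (Count > BestCoverage) {
          BestVec = Vec;
          BestCoverage = Count;
        }

      std::vector<int> Mask(Scalars.size(), kPoison);
      for (size_t I = 0; I < Scalars.size(); ++I)
        if (Scalars[I] && Scalars[I]->SrcVec == BestVec)
          Mask[I] = Scalars[I]->Lane;  // representable as a shuffle lane
      return Mask;
    }

    int main() {
      // Three of four scalars read lanes 3, 1, 0 of vector #7; one does not.
      std::vector<std::optional<Extract>> Scalars = {
          Extract{7, 3}, Extract{7, 1}, std::nullopt, Extract{7, 0}};
      for (int M : maskForBestSource(Scalars))
        std::printf("%d ", M);         // prints: 3 1 -1 0
      std::printf("\n");
      return 0;
    }

The real routine additionally considers a pair of source vectors with the same element count and accounts for undef extracts, but the bookkeeping follows the same idea.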
- if (auto *PHI = - dyn_cast<PHINode>(TE->UserTreeIndices.front().UserTE->getMainOp())) { - ParentBB = PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx); + if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) { + TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx); + TEInsertPt = TEInsertBlock->getTerminator(); } else { - ParentBB = UserInst.getParent(); + TEInsertBlock = TEInsertPt->getParent(); } - auto *NodeUI = DT->getNode(ParentBB); + auto *NodeUI = DT->getNode(TEInsertBlock); assert(NodeUI && "Should only process reachable instructions"); SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end()); - auto CheckOrdering = [&](Instruction *LastEI) { - // Check if the user node of the TE comes after user node of EntryPtr, - // otherwise EntryPtr depends on TE. - // Gather nodes usually are not scheduled and inserted before their first - // user node. So, instead of checking dependency between the gather nodes - // themselves, we check the dependency between their user nodes. - // If one user node comes before the second one, we cannot use the second - // gather node as the source vector for the first gather node, because in - // the list of instructions it will be emitted later. - auto *EntryParent = LastEI->getParent(); - auto *NodeEUI = DT->getNode(EntryParent); + auto CheckOrdering = [&](const Instruction *InsertPt) { + // Argument InsertPt is an instruction where vector code for some other + // tree entry (one that shares one or more scalars with TE) is going to be + // generated. This lambda returns true if insertion point of vector code + // for the TE dominates that point (otherwise dependency is the other way + // around). The other node is not limited to be of a gather kind. Gather + // nodes are not scheduled and their vector code is inserted before their + // first user. If user is PHI, that is supposed to be at the end of a + // predecessor block. Otherwise it is the last instruction among scalars of + // the user node. So, instead of checking dependency between instructions + // themselves, we check dependency between their insertion points for vector + // code (since each scalar instruction ends up as a lane of a vector + // instruction). + const BasicBlock *InsertBlock = InsertPt->getParent(); + auto *NodeEUI = DT->getNode(InsertBlock); if (!NodeEUI) return false; assert((NodeUI == NodeEUI) == (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) && "Different nodes should have different DFS numbers"); // Check the order of the gather nodes users. - if (UserInst.getParent() != EntryParent && + if (TEInsertPt->getParent() != InsertBlock && (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI))) return false; - if (UserInst.getParent() == EntryParent && UserInst.comesBefore(LastEI)) + if (TEInsertPt->getParent() == InsertBlock && + TEInsertPt->comesBefore(InsertPt)) return false; return true; }; @@ -8743,43 +9391,42 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, [&](Value *V) { return GatheredScalars.contains(V); }) && "Must contain at least single gathered value."); assert(TEPtr->UserTreeIndices.size() == 1 && - "Expected only single user of the gather node."); - PHINode *EntryPHI = - dyn_cast<PHINode>(TEPtr->UserTreeIndices.front().UserTE->getMainOp()); - Instruction *EntryUserInst = - EntryPHI ? 
nullptr - : &getLastInstructionInBundle( - TEPtr->UserTreeIndices.front().UserTE); - if (&UserInst == EntryUserInst) { - assert(!EntryPHI && "Unexpected phi node entry."); - // If 2 gathers are operands of the same entry, compare operands - // indices, use the earlier one as the base. - if (TE->UserTreeIndices.front().UserTE == - TEPtr->UserTreeIndices.front().UserTE && - TE->UserTreeIndices.front().EdgeIdx < - TEPtr->UserTreeIndices.front().EdgeIdx) + "Expected only single user of a gather node."); + const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front(); + + PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp()); + const Instruction *InsertPt = + UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator() + : &getLastInstructionInBundle(UseEI.UserTE); + if (TEInsertPt == InsertPt) { + // If 2 gathers are operands of the same entry (regardless of whether + // user is PHI or else), compare operands indices, use the earlier one + // as the base. + if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx) + continue; + // If the user instruction is used for some reason in different + // vectorized nodes - make it depend on index. + if (TEUseEI.UserTE != UseEI.UserTE && + TEUseEI.UserTE->Idx < UseEI.UserTE->Idx) continue; } - // Check if the user node of the TE comes after user node of EntryPtr, - // otherwise EntryPtr depends on TE. - auto *EntryI = - EntryPHI - ? EntryPHI - ->getIncomingBlock(TEPtr->UserTreeIndices.front().EdgeIdx) - ->getTerminator() - : EntryUserInst; - if ((ParentBB != EntryI->getParent() || - TE->UserTreeIndices.front().EdgeIdx < - TEPtr->UserTreeIndices.front().EdgeIdx || - TE->UserTreeIndices.front().UserTE != - TEPtr->UserTreeIndices.front().UserTE) && - !CheckOrdering(EntryI)) + + // Check if the user node of the TE comes after user node of TEPtr, + // otherwise TEPtr depends on TE. + if ((TEInsertBlock != InsertPt->getParent() || + TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) && + !CheckOrdering(InsertPt)) continue; VToTEs.insert(TEPtr); } if (const TreeEntry *VTE = getTreeEntry(V)) { - Instruction &EntryUserInst = getLastInstructionInBundle(VTE); - if (&EntryUserInst == &UserInst || !CheckOrdering(&EntryUserInst)) + Instruction &LastBundleInst = getLastInstructionInBundle(VTE); + if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) + continue; + auto It = MinBWs.find(VTE); + // If vectorize node is demoted - do not match. 
+ if (It != MinBWs.end() && + It->second.first != DL->getTypeSizeInBits(V->getType())) continue; VToTEs.insert(VTE); } @@ -8823,8 +9470,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, } } - if (UsedTEs.empty()) + if (UsedTEs.empty()) { + Entries.clear(); return std::nullopt; + } unsigned VF = 0; if (UsedTEs.size() == 1) { @@ -8838,9 +9487,19 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) { return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars); }); - if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) { + if (It != FirstEntries.end() && + ((*It)->getVectorFactor() == VL.size() || + ((*It)->getVectorFactor() == TE->Scalars.size() && + TE->ReuseShuffleIndices.size() == VL.size() && + (*It)->isSame(TE->Scalars)))) { Entries.push_back(*It); - std::iota(Mask.begin(), Mask.end(), 0); + if ((*It)->getVectorFactor() == VL.size()) { + std::iota(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size()), 0); + } else { + SmallVector<int> CommonMask = TE->getCommonMask(); + copy(CommonMask, Mask.begin()); + } // Clear undef scalars. for (int I = 0, Sz = VL.size(); I < Sz; ++I) if (isa<PoisonValue>(VL[I])) @@ -8923,12 +9582,9 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, // by extractelements processing) or may form vector node in future. auto MightBeIgnored = [=](Value *V) { auto *I = dyn_cast<Instruction>(V); - SmallVector<Value *> IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) && !isVectorLikeInstWithConstOps(I) && - !areAllUsersVectorized(I, IgnoredVals) && isSimple(I); + !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I); }; // Check that the neighbor instruction may form a full vector node with the // current instruction V. It is possible, if they have same/alternate opcode @@ -8980,7 +9636,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, TempEntries.push_back(Entries[I]); } Entries.swap(TempEntries); - if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) { + if (EntryLanes.size() == Entries.size() && + !VL.equals(ArrayRef(TE->Scalars) + .slice(Part * VL.size(), + std::min<int>(VL.size(), TE->Scalars.size())))) { // We may have here 1 or 2 entries only. If the number of scalars is equal // to the number of entries, no need to do the analysis, it is not very // profitable. Since VL is not the same as TE->Scalars, it means we already @@ -8993,9 +9652,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, // Pair.first is the offset to the vector, while Pair.second is the index of // scalar in the list. for (const std::pair<unsigned, int> &Pair : EntryLanes) { - Mask[Pair.second] = Pair.first * VF + - Entries[Pair.first]->findLaneForValue(VL[Pair.second]); - IsIdentity &= Mask[Pair.second] == Pair.second; + unsigned Idx = Part * VL.size() + Pair.second; + Mask[Idx] = Pair.first * VF + + Entries[Pair.first]->findLaneForValue(VL[Pair.second]); + IsIdentity &= Mask[Idx] == Pair.second; } switch (Entries.size()) { case 1: @@ -9010,9 +9670,64 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, break; } Entries.clear(); + // Clear the corresponding mask elements. 
+ std::fill(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem); return std::nullopt; } +SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> +BoUpSLP::isGatherShuffledEntry( + const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, + SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, + unsigned NumParts) { + assert(NumParts > 0 && NumParts < VL.size() && + "Expected positive number of registers."); + Entries.clear(); + // No need to check for the topmost gather node. + if (TE == VectorizableTree.front().get()) + return {}; + Mask.assign(VL.size(), PoisonMaskElem); + assert(TE->UserTreeIndices.size() == 1 && + "Expected only single user of the gather node."); + assert(VL.size() % NumParts == 0 && + "Number of scalars must be divisible by NumParts."); + unsigned SliceSize = VL.size() / NumParts; + SmallVector<std::optional<TTI::ShuffleKind>> Res; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize); + SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back(); + std::optional<TTI::ShuffleKind> SubRes = + isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part); + if (!SubRes) + SubEntries.clear(); + Res.push_back(SubRes); + if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc && + SubEntries.front()->getVectorFactor() == VL.size() && + (SubEntries.front()->isSame(TE->Scalars) || + SubEntries.front()->isSame(VL))) { + SmallVector<const TreeEntry *> LocalSubEntries; + LocalSubEntries.swap(SubEntries); + Entries.clear(); + Res.clear(); + std::iota(Mask.begin(), Mask.end(), 0); + // Clear undef scalars. + for (int I = 0, Sz = VL.size(); I < Sz; ++I) + if (isa<PoisonValue>(VL[I])) + Mask[I] = PoisonMaskElem; + Entries.emplace_back(1, LocalSubEntries.front()); + Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc); + return Res; + } + } + if (all_of(Res, + [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) { + Entries.clear(); + return {}; + } + return Res; +} + InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const { // Find the type of the operands in VL. @@ -9224,18 +9939,20 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { auto *Front = E->getMainOp(); Instruction *LastInst = &getLastInstructionInBundle(E); assert(LastInst && "Failed to find last instruction in bundle"); + BasicBlock::iterator LastInstIt = LastInst->getIterator(); // If the instruction is PHI, set the insert point after all the PHIs. bool IsPHI = isa<PHINode>(LastInst); if (IsPHI) - LastInst = LastInst->getParent()->getFirstNonPHI(); + LastInstIt = LastInst->getParent()->getFirstNonPHIIt(); if (IsPHI || (E->State != TreeEntry::NeedToGather && doesNotNeedToSchedule(E->Scalars))) { - Builder.SetInsertPoint(LastInst); + Builder.SetInsertPoint(LastInst->getParent(), LastInstIt); } else { // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. - Builder.SetInsertPoint(LastInst->getParent(), - std::next(LastInst->getIterator())); + Builder.SetInsertPoint( + LastInst->getParent(), + LastInst->getNextNonDebugInstruction()->getIterator()); } Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } @@ -9271,10 +9988,12 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) { GatherShuffleExtractSeq.insert(InsElt); CSEBlocks.insert(InsElt->getParent()); // Add to our 'need-to-extract' list. 
- if (TreeEntry *Entry = getTreeEntry(V)) { - // Find which lane we need to extract. - unsigned FoundLane = Entry->findLaneForValue(V); - ExternalUses.emplace_back(V, InsElt, FoundLane); + if (isa<Instruction>(V)) { + if (TreeEntry *Entry = getTreeEntry(V)) { + // Find which lane we need to extract. + unsigned FoundLane = Entry->findLaneForValue(V); + ExternalUses.emplace_back(V, InsElt, FoundLane); + } } return Vec; }; @@ -9367,12 +10086,12 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { /// Holds all of the instructions that we gathered. SetVector<Instruction *> &GatherShuffleExtractSeq; /// A list of blocks that we are going to CSE. - SetVector<BasicBlock *> &CSEBlocks; + DenseSet<BasicBlock *> &CSEBlocks; public: ShuffleIRBuilder(IRBuilderBase &Builder, SetVector<Instruction *> &GatherShuffleExtractSeq, - SetVector<BasicBlock *> &CSEBlocks) + DenseSet<BasicBlock *> &CSEBlocks) : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq), CSEBlocks(CSEBlocks) {} ~ShuffleIRBuilder() = default; @@ -9392,7 +10111,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { return V1; unsigned VF = Mask.size(); unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements(); - if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask)) + if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF)) return V1; Value *Vec = Builder.CreateShuffleVector(V1, Mask); if (auto *I = dyn_cast<Instruction>(Vec)) { @@ -9455,7 +10174,11 @@ public: : Builder(Builder), R(R) {} /// Adjusts extractelements after reusing them. - Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask) { + Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask, + ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, + unsigned NumParts, bool &UseVecBaseAsInput) { + UseVecBaseAsInput = false; + SmallPtrSet<Value *, 4> UniqueBases; Value *VecBase = nullptr; for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { int Idx = Mask[I]; @@ -9463,6 +10186,10 @@ public: continue; auto *EI = cast<ExtractElementInst>(E->Scalars[I]); VecBase = EI->getVectorOperand(); + if (const TreeEntry *TE = R.getTreeEntry(VecBase)) + VecBase = TE->VectorizedValue; + assert(VecBase && "Expected vectorized value."); + UniqueBases.insert(VecBase); // If the only one use is vectorized - can delete the extractelement // itself. if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) { @@ -9471,14 +10198,97 @@ public: continue; R.eraseInstruction(EI); } - return VecBase; + if (NumParts == 1 || UniqueBases.size() == 1) + return VecBase; + UseVecBaseAsInput = true; + auto TransformToIdentity = [](MutableArrayRef<int> Mask) { + for (auto [I, Idx] : enumerate(Mask)) + if (Idx != PoisonMaskElem) + Idx = I; + }; + // Perform multi-register vector shuffle, joining them into a single virtual + // long vector. + // Need to shuffle each part independently and then insert all this parts + // into a long virtual vector register, forming the original vector. 
+ Value *Vec = nullptr; + SmallVector<int> VecMask(Mask.size(), PoisonMaskElem); + unsigned SliceSize = E->Scalars.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef<Value *> VL = + ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize); + MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize); + constexpr int MaxBases = 2; + SmallVector<Value *, MaxBases> Bases(MaxBases); +#ifndef NDEBUG + int PrevSize = 0; +#endif // NDEBUG + for (const auto [I, V]: enumerate(VL)) { + if (SubMask[I] == PoisonMaskElem) + continue; + Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand(); + if (const TreeEntry *TE = R.getTreeEntry(VecOp)) + VecOp = TE->VectorizedValue; + assert(VecOp && "Expected vectorized value."); + const int Size = + cast<FixedVectorType>(VecOp->getType())->getNumElements(); +#ifndef NDEBUG + assert((PrevSize == Size || PrevSize == 0) && + "Expected vectors of the same size."); + PrevSize = Size; +#endif // NDEBUG + Bases[SubMask[I] < Size ? 0 : 1] = VecOp; + } + if (!Bases.front()) + continue; + Value *SubVec; + if (Bases.back()) { + SubVec = createShuffle(Bases.front(), Bases.back(), SubMask); + TransformToIdentity(SubMask); + } else { + SubVec = Bases.front(); + } + if (!Vec) { + Vec = SubVec; + assert((Part == 0 || all_of(seq<unsigned>(0, Part), + [&](unsigned P) { + ArrayRef<int> SubMask = + Mask.slice(P * SliceSize, SliceSize); + return all_of(SubMask, [](int Idx) { + return Idx == PoisonMaskElem; + }); + })) && + "Expected first part or all previous parts masked."); + copy(SubMask, std::next(VecMask.begin(), Part * SliceSize)); + } else { + unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements(); + if (Vec->getType() != SubVec->getType()) { + unsigned SubVecVF = + cast<FixedVectorType>(SubVec->getType())->getNumElements(); + VF = std::max(VF, SubVecVF); + } + // Adjust SubMask. + for (auto [I, Idx] : enumerate(SubMask)) + if (Idx != PoisonMaskElem) + Idx += VF; + copy(SubMask, std::next(VecMask.begin(), Part * SliceSize)); + Vec = createShuffle(Vec, SubVec, VecMask); + TransformToIdentity(VecMask); + } + } + copy(VecMask, Mask.begin()); + return Vec; } /// Checks if the specified entry \p E needs to be delayed because of its /// dependency nodes. - Value *needToDelay(const TreeEntry *E, ArrayRef<const TreeEntry *> Deps) { + std::optional<Value *> + needToDelay(const TreeEntry *E, + ArrayRef<SmallVector<const TreeEntry *>> Deps) const { // No need to delay emission if all deps are ready. - if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; })) - return nullptr; + if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) { + return all_of( + TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; }); + })) + return std::nullopt; // Postpone gather emission, will be emitted after the end of the // process to keep correct order. auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(), @@ -9487,6 +10297,16 @@ public: VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())), MaybeAlign()); } + /// Adds 2 input vectors (in form of tree entries) and the mask for their + /// shuffling. + void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) { + add(E1.VectorizedValue, E2.VectorizedValue, Mask); + } + /// Adds single input vector (in form of tree entry) and the mask for its + /// shuffling. + void add(const TreeEntry &E1, ArrayRef<int> Mask) { + add(E1.VectorizedValue, Mask); + } /// Adds 2 input vectors and the mask for their shuffling. 
void add(Value *V1, Value *V2, ArrayRef<int> Mask) { assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors."); @@ -9516,7 +10336,7 @@ public: InVectors.push_back(V1); } /// Adds another one input vector and the mask for the shuffling. - void add(Value *V1, ArrayRef<int> Mask) { + void add(Value *V1, ArrayRef<int> Mask, bool = false) { if (InVectors.empty()) { if (!isa<FixedVectorType>(V1->getType())) { V1 = createShuffle(V1, nullptr, CommonMask); @@ -9578,7 +10398,8 @@ public: inversePermutation(Order, NewMask); add(V1, NewMask); } - Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) { + Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0, + Value *Root = nullptr) { return R.gather(VL, Root); } Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); } @@ -9639,8 +10460,14 @@ public: } }; -Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { - ArrayRef<Value *> VL = E->getOperand(NodeIdx); +Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, + bool PostponedPHIs) { + ValueList &VL = E->getOperand(NodeIdx); + if (E->State == TreeEntry::PossibleStridedVectorize && + !E->ReorderIndices.empty()) { + SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end()); + reorderScalars(VL, Mask); + } const unsigned VF = VL.size(); InstructionsState S = getSameOpcode(VL, *TLI); // Special processing for GEPs bundle, which may include non-gep values. @@ -9651,23 +10478,39 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { S = getSameOpcode(*It, *TLI); } if (S.getOpcode()) { - if (TreeEntry *VE = getTreeEntry(S.OpValue); - VE && VE->isSame(VL) && - (any_of(VE->UserTreeIndices, - [E, NodeIdx](const EdgeInfo &EI) { - return EI.UserTE == E && EI.EdgeIdx == NodeIdx; - }) || - any_of(VectorizableTree, - [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) { - return TE->isOperandGatherNode({E, NodeIdx}) && - VE->isSame(TE->Scalars); - }))) { + auto CheckSameVE = [&](const TreeEntry *VE) { + return VE->isSame(VL) && + (any_of(VE->UserTreeIndices, + [E, NodeIdx](const EdgeInfo &EI) { + return EI.UserTE == E && EI.EdgeIdx == NodeIdx; + }) || + any_of(VectorizableTree, + [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) { + return TE->isOperandGatherNode({E, NodeIdx}) && + VE->isSame(TE->Scalars); + })); + }; + TreeEntry *VE = getTreeEntry(S.OpValue); + bool IsSameVE = VE && CheckSameVE(VE); + if (!IsSameVE) { + auto It = MultiNodeScalars.find(S.OpValue); + if (It != MultiNodeScalars.end()) { + auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) { + return TE != VE && CheckSameVE(TE); + }); + if (I != It->getSecond().end()) { + VE = *I; + IsSameVE = true; + } + } + } + if (IsSameVE) { auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) { ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); ShuffleBuilder.add(V, Mask); return ShuffleBuilder.finalize(std::nullopt); }; - Value *V = vectorizeTree(VE); + Value *V = vectorizeTree(VE, PostponedPHIs); if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) { if (!VE->ReuseShuffleIndices.empty()) { // Reshuffle to get only unique values. 
@@ -9740,14 +10583,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { assert(I->get()->UserTreeIndices.size() == 1 && "Expected only single user for the gather node."); assert(I->get()->isSame(VL) && "Expected same list of scalars."); - IRBuilder<>::InsertPointGuard Guard(Builder); - if (E->getOpcode() != Instruction::InsertElement && - E->getOpcode() != Instruction::PHI) { - Instruction *LastInst = &getLastInstructionInBundle(E); - assert(LastInst && "Failed to find last instruction in bundle"); - Builder.SetInsertPoint(LastInst); - } - return vectorizeTree(I->get()); + return vectorizeTree(I->get(), PostponedPHIs); } template <typename BVTy, typename ResTy, typename... Args> @@ -9765,7 +10601,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { inversePermutation(E->ReorderIndices, ReorderMask); if (!ReorderMask.empty()) reorderScalars(GatheredScalars, ReorderMask); - auto FindReusedSplat = [&](SmallVectorImpl<int> &Mask) { + auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF) { if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) { return isa<UndefValue>(V) && !isa<PoisonValue>(V); })) @@ -9782,70 +10618,102 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { }); if (It == VectorizableTree.end()) return false; - unsigned I = - *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; }); - int Sz = Mask.size(); - if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) && - ShuffleVectorInst::isIdentityMask(Mask)) + int Idx; + if ((Mask.size() < InputVF && + ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) && + Idx == 0) || + (Mask.size() == InputVF && + ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) { std::iota(Mask.begin(), Mask.end(), 0); - else + } else { + unsigned I = + *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; }); std::fill(Mask.begin(), Mask.end(), I); + } return true; }; BVTy ShuffleBuilder(Params...); ResTy Res = ResTy(); SmallVector<int> Mask; - SmallVector<int> ExtractMask; - std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle; - std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle; - SmallVector<const TreeEntry *> Entries; + SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem); + SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles; + Value *ExtractVecBase = nullptr; + bool UseVecBaseAsInput = false; + SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles; + SmallVector<SmallVector<const TreeEntry *>> Entries; Type *ScalarTy = GatheredScalars.front()->getType(); + auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size()); + unsigned NumParts = TTI->getNumberOfParts(VecTy); + if (NumParts == 0 || NumParts >= GatheredScalars.size()) + NumParts = 1; if (!all_of(GatheredScalars, UndefValue::classof)) { // Check for gathered extracts. 
- ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); - SmallVector<Value *> IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); bool Resized = false; - if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask)) - if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } + ExtractShuffles = + tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); + if (!ExtractShuffles.empty()) { + SmallVector<const TreeEntry *> ExtractEntries; + for (auto [Idx, I] : enumerate(ExtractMask)) { + if (I == PoisonMaskElem) + continue; + if (const auto *TE = getTreeEntry( + cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand())) + ExtractEntries.push_back(TE); + } + if (std::optional<ResTy> Delayed = + ShuffleBuilder.needToDelay(E, ExtractEntries)) { + // Delay emission of gathers which are not ready yet. + PostponedGathers.insert(E); + // Postpone gather emission, will be emitted after the end of the + // process to keep correct order. + return *Delayed; + } + if (Value *VecBase = ShuffleBuilder.adjustExtracts( + E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) { + ExtractVecBase = VecBase; + if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType())) + if (VF == VecBaseTy->getNumElements() && + GatheredScalars.size() != VF) { + Resized = true; + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); + } + } + } // Gather extracts after we check for full matched gathers only. - if (ExtractShuffle || E->getOpcode() != Instruction::Load || + if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || E->isAltShuffle() || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); + GatherShuffles = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); } - if (GatherShuffle) { - if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) { + if (!GatherShuffles.empty()) { + if (std::optional<ResTy> Delayed = + ShuffleBuilder.needToDelay(E, Entries)) { // Delay emission of gathers which are not ready yet. PostponedGathers.insert(E); // Postpone gather emission, will be emitted after the end of the // process to keep correct order. - return Delayed; + return *Delayed; } - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && - Entries.front()->isSame(E->Scalars)) { + if (GatherShuffles.size() == 1 && + *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && + Entries.front().front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. LLVM_DEBUG( dbgs() - << "SLP: perfect diamond match for gather bundle that starts with " - << *E->Scalars.front() << ".\n"); + << "SLP: perfect diamond match for gather bundle " + << shortBundleName(E->Scalars) << ".\n"); // Restore the mask for previous partially matched values. 
- if (Entries.front()->ReorderIndices.empty() && - ((Entries.front()->ReuseShuffleIndices.empty() && - E->Scalars.size() == Entries.front()->Scalars.size()) || - (E->Scalars.size() == - Entries.front()->ReuseShuffleIndices.size()))) { + Mask.resize(E->Scalars.size()); + const TreeEntry *FrontTE = Entries.front().front(); + if (FrontTE->ReorderIndices.empty() && + ((FrontTE->ReuseShuffleIndices.empty() && + E->Scalars.size() == FrontTE->Scalars.size()) || + (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) { std::iota(Mask.begin(), Mask.end(), 0); } else { for (auto [I, V] : enumerate(E->Scalars)) { @@ -9853,17 +10721,20 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { Mask[I] = PoisonMaskElem; continue; } - Mask[I] = Entries.front()->findLaneForValue(V); + Mask[I] = FrontTE->findLaneForValue(V); } } - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); + ShuffleBuilder.add(*FrontTE, Mask); Res = ShuffleBuilder.finalize(E->getCommonMask()); return Res; } if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) + if (GatheredScalars.size() != VF && + any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) { + return any_of(TEs, [&](const TreeEntry *TE) { + return TE->getVectorFactor() == VF; + }); + })) GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } @@ -9943,78 +10814,108 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { if (It != Scalars.end()) { // Replace undefs by the non-poisoned scalars and emit broadcast. int Pos = std::distance(Scalars.begin(), It); - for_each(UndefPos, [&](int I) { + for (int I : UndefPos) { // Set the undef position to the non-poisoned scalar. ReuseMask[I] = Pos; // Replace the undef by the poison, in the mask it is replaced by // non-poisoned scalar already. if (I != Pos) Scalars[I] = PoisonValue::get(ScalarTy); - }); + } } else { // Replace undefs by the poisons, emit broadcast and then emit // freeze. - for_each(UndefPos, [&](int I) { + for (int I : UndefPos) { ReuseMask[I] = PoisonMaskElem; if (isa<UndefValue>(Scalars[I])) Scalars[I] = PoisonValue::get(ScalarTy); - }); + } NeedFreeze = true; } } }; - if (ExtractShuffle || GatherShuffle) { + if (!ExtractShuffles.empty() || !GatherShuffles.empty()) { bool IsNonPoisoned = true; - bool IsUsedInExpr = false; + bool IsUsedInExpr = true; Value *Vec1 = nullptr; - if (ExtractShuffle) { + if (!ExtractShuffles.empty()) { // Gather of extractelements can be represented as just a shuffle of // a single/two vectors the scalars are extracted from. // Find input vectors. 
Value *Vec2 = nullptr; for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { - if (ExtractMask[I] == PoisonMaskElem || - (!Mask.empty() && Mask[I] != PoisonMaskElem)) { + if (!Mask.empty() && Mask[I] != PoisonMaskElem) ExtractMask[I] = PoisonMaskElem; - continue; - } - if (isa<UndefValue>(E->Scalars[I])) - continue; - auto *EI = cast<ExtractElementInst>(E->Scalars[I]); - if (!Vec1) { - Vec1 = EI->getVectorOperand(); - } else if (Vec1 != EI->getVectorOperand()) { - assert((!Vec2 || Vec2 == EI->getVectorOperand()) && - "Expected only 1 or 2 vectors shuffle."); - Vec2 = EI->getVectorOperand(); + } + if (UseVecBaseAsInput) { + Vec1 = ExtractVecBase; + } else { + for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { + if (ExtractMask[I] == PoisonMaskElem) + continue; + if (isa<UndefValue>(E->Scalars[I])) + continue; + auto *EI = cast<ExtractElementInst>(E->Scalars[I]); + Value *VecOp = EI->getVectorOperand(); + if (const auto *TE = getTreeEntry(VecOp)) + if (TE->VectorizedValue) + VecOp = TE->VectorizedValue; + if (!Vec1) { + Vec1 = VecOp; + } else if (Vec1 != EI->getVectorOperand()) { + assert((!Vec2 || Vec2 == EI->getVectorOperand()) && + "Expected only 1 or 2 vectors shuffle."); + Vec2 = VecOp; + } } } if (Vec2) { + IsUsedInExpr = false; IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2); ShuffleBuilder.add(Vec1, Vec2, ExtractMask); } else if (Vec1) { - IsUsedInExpr = FindReusedSplat(ExtractMask); - ShuffleBuilder.add(Vec1, ExtractMask); + IsUsedInExpr &= FindReusedSplat( + ExtractMask, + cast<FixedVectorType>(Vec1->getType())->getNumElements()); + ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true); IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1); } else { + IsUsedInExpr = false; ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get( ScalarTy, GatheredScalars.size())), - ExtractMask); + ExtractMask, /*ForExtracts=*/true); } } - if (GatherShuffle) { - if (Entries.size() == 1) { - IsUsedInExpr = FindReusedSplat(Mask); - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue); - } else { - ShuffleBuilder.add(Entries.front()->VectorizedValue, - Entries.back()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) && - isGuaranteedNotToBePoison(Entries.back()->VectorizedValue); + if (!GatherShuffles.empty()) { + unsigned SliceSize = E->Scalars.size() / NumParts; + SmallVector<int> VecMask(Mask.size(), PoisonMaskElem); + for (const auto [I, TEs] : enumerate(Entries)) { + if (TEs.empty()) { + assert(!GatherShuffles[I] && + "No shuffles with empty entries list expected."); + continue; + } + assert((TEs.size() == 1 || TEs.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize); + VecMask.assign(VecMask.size(), PoisonMaskElem); + copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); + if (TEs.size() == 1) { + IsUsedInExpr &= + FindReusedSplat(VecMask, TEs.front()->getVectorFactor()); + ShuffleBuilder.add(*TEs.front(), VecMask); + if (TEs.front()->VectorizedValue) + IsNonPoisoned &= + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue); + } else { + IsUsedInExpr = false; + ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask); + if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue) + IsNonPoisoned &= + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) && + isGuaranteedNotToBePoison(TEs.back()->VectorizedValue); + } } } 
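
The per-part shuffle assembly above (both in adjustExtracts and in the GatherShuffles loop) relies on two small mask transformations: lanes taken from a newly joined source are biased by the width of the vector accumulated so far, and once a shuffle has been emitted the surviving lanes are rewritten to an identity mapping so the next join treats the result as a single source. Below is a minimal standalone sketch of that bookkeeping in plain C++, not the patch's own helpers; the function names are illustrative only, and PoisonMaskElem is modeled as -1.

    // Standalone sketch (plain C++, not LLVM code) of the mask bookkeeping used
    // when per-register sub-vectors are joined into one wide vector.
    #include <cstddef>
    #include <vector>

    constexpr int PoisonMaskElem = -1;

    // Lanes taken from a newly joined source are biased by the element count of
    // the vector accumulated so far, so they index into the concatenation of
    // (accumulated vector, new sub-vector).
    void offsetNewSourceLanes(std::vector<int> &SubMask, unsigned AccumulatedVF) {
      for (int &Idx : SubMask)
        if (Idx != PoisonMaskElem)
          Idx += static_cast<int>(AccumulatedVF);
    }

    // After a shuffle has been emitted, the selected lanes already sit at their
    // final positions, so the mask collapses to an identity mapping on the
    // defined lanes.
    void transformToIdentity(std::vector<int> &Mask) {
      for (std::size_t I = 0, E = Mask.size(); I != E; ++I)
        if (Mask[I] != PoisonMaskElem)
          Mask[I] = static_cast<int>(I);
    }

In the patch this corresponds to the Idx += VF adjustment and the TransformToIdentity lambda applied after each createShuffle call in the multi-register paths shown above.
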
// Try to figure out best way to combine values: build a shuffle and insert @@ -10025,16 +10926,24 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { int MSz = Mask.size(); // Try to build constant vector and shuffle with it only if currently we // have a single permutation and more than 1 scalar constants. - bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle; + bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty(); bool IsIdentityShuffle = - (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) == - TTI::SK_PermuteSingleSrc && + ((UseVecBaseAsInput || + all_of(ExtractShuffles, + [](const std::optional<TTI::ShuffleKind> &SK) { + return SK.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc; + })) && none_of(ExtractMask, [&](int I) { return I >= EMSz; }) && - ShuffleVectorInst::isIdentityMask(ExtractMask)) || - (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) == - TTI::SK_PermuteSingleSrc && + ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) || + (!GatherShuffles.empty() && + all_of(GatherShuffles, + [](const std::optional<TTI::ShuffleKind> &SK) { + return SK.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc; + }) && none_of(Mask, [&](int I) { return I >= MSz; }) && - ShuffleVectorInst::isIdentityMask(Mask)); + ShuffleVectorInst::isIdentityMask(Mask, MSz)); bool EnoughConstsForShuffle = IsSingleShuffle && (none_of(GatheredScalars, @@ -10064,7 +10973,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { if (!all_of(GatheredScalars, PoisonValue::classof)) { SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem); TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true); - Value *BV = ShuffleBuilder.gather(GatheredScalars); + Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size()); ShuffleBuilder.add(BV, BVMask); } if (all_of(NonConstants, [=](Value *V) { @@ -10078,13 +10987,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { E->ReuseShuffleIndices, E->Scalars.size(), [&](Value *&Vec, SmallVectorImpl<int> &Mask) { TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false); - Vec = ShuffleBuilder.gather(NonConstants, Vec); + Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec); }); } else if (!allConstant(GatheredScalars)) { // Gather unique scalars and all constants. 
SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem); TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true); - Value *BV = ShuffleBuilder.gather(GatheredScalars); + Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size()); ShuffleBuilder.add(BV, ReuseMask); Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } else { @@ -10109,10 +11018,12 @@ Value *BoUpSLP::createBuildVector(const TreeEntry *E) { *this); } -Value *BoUpSLP::vectorizeTree(TreeEntry *E) { +Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { IRBuilder<>::InsertPointGuard Guard(Builder); - if (E->VectorizedValue) { + if (E->VectorizedValue && + (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI || + E->isAltShuffle())) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); return E->VectorizedValue; } @@ -10126,13 +11037,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return Vec; } - auto FinalShuffle = [&](Value *V, const TreeEntry *E) { + auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy, + bool IsSigned) { + if (V->getType() != VecTy) + V = Builder.CreateIntCast(V, VecTy, IsSigned); ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); if (E->getOpcode() == Instruction::Store) { ArrayRef<int> Mask = ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()), E->ReorderIndices.size()); ShuffleBuilder.add(V, Mask); + } else if (E->State == TreeEntry::PossibleStridedVectorize) { + ShuffleBuilder.addOrdered(V, std::nullopt); } else { ShuffleBuilder.addOrdered(V, E->ReorderIndices); } @@ -10140,7 +11056,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { }; assert((E->State == TreeEntry::Vectorize || - E->State == TreeEntry::ScatterVectorize) && + E->State == TreeEntry::ScatterVectorize || + E->State == TreeEntry::PossibleStridedVectorize) && "Unhandled state"); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); @@ -10150,6 +11067,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ScalarTy = Store->getValueOperand()->getType(); else if (auto *IE = dyn_cast<InsertElementInst>(VL0)) ScalarTy = IE->getOperand(1)->getType(); + bool IsSigned = false; + auto It = MinBWs.find(E); + if (It != MinBWs.end()) { + ScalarTy = IntegerType::get(F->getContext(), It->second.first); + IsSigned = It->second.second; + } auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { @@ -10157,32 +11080,45 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E != VectorizableTree.front().get() || !E->UserTreeIndices.empty()) && "PHI reordering is free."); + if (PostponedPHIs && E->VectorizedValue) + return E->VectorizedValue; auto *PH = cast<PHINode>(VL0); - Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); - Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); - Value *V = NewPhi; - - // Adjust insertion point once all PHI's have been generated. - Builder.SetInsertPoint(&*PH->getParent()->getFirstInsertionPt()); + Builder.SetInsertPoint(PH->getParent(), + PH->getParent()->getFirstNonPHIIt()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + if (PostponedPHIs || !E->VectorizedValue) { + PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); + E->PHI = NewPhi; + Value *V = NewPhi; + + // Adjust insertion point once all PHI's have been generated. 
+ Builder.SetInsertPoint(PH->getParent(), + PH->getParent()->getFirstInsertionPt()); + Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); - E->VectorizedValue = V; + E->VectorizedValue = V; + if (PostponedPHIs) + return V; + } + PHINode *NewPhi = cast<PHINode>(E->PHI); + // If phi node is fully emitted - exit. + if (NewPhi->getNumIncomingValues() != 0) + return NewPhi; // PHINodes may have multiple entries from the same block. We want to // visit every block once. SmallPtrSet<BasicBlock *, 4> VisitedBBs; - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) { ValueList Operands; - BasicBlock *IBB = PH->getIncomingBlock(i); + BasicBlock *IBB = PH->getIncomingBlock(I); // Stop emission if all incoming values are generated. if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return V; + return NewPhi; } if (!VisitedBBs.insert(IBB).second) { @@ -10192,37 +11128,54 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeOperand(E, i); + Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true); + if (VecTy != Vec->getType()) { + assert(MinBWs.contains(getOperandEntry(E, I)) && + "Expected item in MinBWs."); + Vec = Builder.CreateIntCast(Vec, VecTy, It->second.second); + } NewPhi->addIncoming(Vec, IBB); } assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && "Invalid number of incoming values"); - return V; + return NewPhi; } case Instruction::ExtractElement: { Value *V = E->getSingleOperand(0); setInsertPointAfterBundle(E); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; return V; } case Instruction::ExtractValue: { auto *LI = cast<LoadInst>(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); - auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); - Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); + Value *Ptr = LI->getPointerOperand(); LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); Value *NewV = propagateMetadata(V, E->Scalars); - NewV = FinalShuffle(NewV, E); + NewV = FinalShuffle(NewV, E, VecTy, IsSigned); E->VectorizedValue = NewV; return NewV; } case Instruction::InsertElement: { assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique"); Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back())); - Value *V = vectorizeOperand(E, 1); + Value *V = vectorizeOperand(E, 1, PostponedPHIs); + ArrayRef<Value *> Op = E->getOperand(1); + Type *ScalarTy = Op.front()->getType(); + if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) { + assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs."); + std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1)); + assert(Res.first > 0 && "Expected item in MinBWs."); + V = Builder.CreateIntCast( + V, + FixedVectorType::get( + ScalarTy, + cast<FixedVectorType>(V->getType())->getNumElements()), + Res.second); + } // Create InsertVector shuffle if necessary auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { @@ -10255,7 +11208,57 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Mask[InsertIdx - Offset] = I; } if (!IsIdentity || NumElts != NumScalars) { - V = Builder.CreateShuffleVector(V, Mask); + Value *V2 = nullptr; + bool 
IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V); + SmallVector<int> InsertMask(Mask); + if (NumElts != NumScalars && Offset == 0) { + // Follow all insert element instructions from the current buildvector + // sequence. + InsertElementInst *Ins = cast<InsertElementInst>(VL0); + do { + std::optional<unsigned> InsertIdx = getInsertIndex(Ins); + if (!InsertIdx) + break; + if (InsertMask[*InsertIdx] == PoisonMaskElem) + InsertMask[*InsertIdx] = *InsertIdx; + if (!Ins->hasOneUse()) + break; + Ins = dyn_cast_or_null<InsertElementInst>( + Ins->getUniqueUndroppableUser()); + } while (Ins); + SmallBitVector UseMask = + buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask); + SmallBitVector IsFirstPoison = + isUndefVector<true>(FirstInsert->getOperand(0), UseMask); + SmallBitVector IsFirstUndef = + isUndefVector(FirstInsert->getOperand(0), UseMask); + if (!IsFirstPoison.all()) { + unsigned Idx = 0; + for (unsigned I = 0; I < NumElts; I++) { + if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) && + IsFirstUndef.test(I)) { + if (IsVNonPoisonous) { + InsertMask[I] = I < NumScalars ? I : 0; + continue; + } + if (!V2) + V2 = UndefValue::get(V->getType()); + if (Idx >= NumScalars) + Idx = NumScalars - 1; + InsertMask[I] = NumScalars + Idx; + ++Idx; + } else if (InsertMask[I] != PoisonMaskElem && + Mask[I] == PoisonMaskElem) { + InsertMask[I] = PoisonMaskElem; + } + } + } else { + InsertMask = Mask; + } + } + if (!V2) + V2 = PoisonValue::get(V->getType()); + V = Builder.CreateShuffleVector(V, V2, InsertMask); if (auto *I = dyn_cast<Instruction>(V)) { GatherShuffleExtractSeq.insert(I); CSEBlocks.insert(I->getParent()); @@ -10274,15 +11277,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) && NumElts != NumScalars) { if (IsFirstUndef.all()) { - if (!ShuffleVectorInst::isIdentityMask(InsertMask)) { - SmallBitVector IsFirstPoison = - isUndefVector<true>(FirstInsert->getOperand(0), UseMask); - if (!IsFirstPoison.all()) { - for (unsigned I = 0; I < NumElts; I++) { - if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I)) - InsertMask[I] = I + NumElts; + if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) { + SmallBitVector IsFirstPoison = + isUndefVector<true>(FirstInsert->getOperand(0), UseMask); + if (!IsFirstPoison.all()) { + for (unsigned I = 0; I < NumElts; I++) { + if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I)) + InsertMask[I] = I + NumElts; + } } - } V = Builder.CreateShuffleVector( V, IsFirstPoison.all() ? PoisonValue::get(V->getType()) @@ -10330,15 +11333,36 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::BitCast: { setInsertPointAfterBundle(E); - Value *InVec = vectorizeOperand(E, 0); + Value *InVec = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } auto *CI = cast<CastInst>(VL0); - Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - V = FinalShuffle(V, E); + Instruction::CastOps VecOpcode = CI->getOpcode(); + Type *SrcScalarTy = VL0->getOperand(0)->getType(); + auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); + if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() && + (SrcIt != MinBWs.end() || It != MinBWs.end())) { + // Check if the values are candidates to demote. 
+ unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy); + if (SrcIt != MinBWs.end()) + SrcBWSz = SrcIt->second.first; + unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); + if (BWSz == SrcBWSz) { + VecOpcode = Instruction::BitCast; + } else if (BWSz < SrcBWSz) { + VecOpcode = Instruction::Trunc; + } else if (It != MinBWs.end()) { + assert(BWSz > SrcBWSz && "Invalid cast!"); + VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt; + } + } + Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast) + ? InVec + : Builder.CreateCast(VecOpcode, InVec, VecTy); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10348,21 +11372,30 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::ICmp: { setInsertPointAfterBundle(E); - Value *L = vectorizeOperand(E, 0); + Value *L = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - Value *R = vectorizeOperand(E, 1); + Value *R = vectorizeOperand(E, 1, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } + if (L->getType() != R->getType()) { + assert((MinBWs.contains(getOperandEntry(E, 0)) || + MinBWs.contains(getOperandEntry(E, 1))) && + "Expected item in MinBWs."); + L = Builder.CreateIntCast(L, VecTy, IsSigned); + R = Builder.CreateIntCast(R, VecTy, IsSigned); + } CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); - V = FinalShuffle(V, E); + // Do not cast for cmps. + VecTy = cast<FixedVectorType>(V->getType()); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10371,24 +11404,31 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Select: { setInsertPointAfterBundle(E); - Value *Cond = vectorizeOperand(E, 0); + Value *Cond = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - Value *True = vectorizeOperand(E, 1); + Value *True = vectorizeOperand(E, 1, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - Value *False = vectorizeOperand(E, 2); + Value *False = vectorizeOperand(E, 2, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } + if (True->getType() != False->getType()) { + assert((MinBWs.contains(getOperandEntry(E, 1)) || + MinBWs.contains(getOperandEntry(E, 2))) && + "Expected item in MinBWs."); + True = Builder.CreateIntCast(True, VecTy, IsSigned); + False = Builder.CreateIntCast(False, VecTy, IsSigned); + } Value *V = Builder.CreateSelect(Cond, True, False); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10397,7 +11437,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::FNeg: { setInsertPointAfterBundle(E); - Value *Op = vectorizeOperand(E, 0); + Value *Op = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -10410,7 +11450,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - V = FinalShuffle(V, 
E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10437,16 +11477,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Xor: { setInsertPointAfterBundle(E); - Value *LHS = vectorizeOperand(E, 0); + Value *LHS = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - Value *RHS = vectorizeOperand(E, 1); + Value *RHS = vectorizeOperand(E, 1, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } + if (LHS->getType() != RHS->getType()) { + assert((MinBWs.contains(getOperandEntry(E, 0)) || + MinBWs.contains(getOperandEntry(E, 1))) && + "Expected item in MinBWs."); + LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned); + RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned); + } Value *V = Builder.CreateBinOp( static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, @@ -10455,7 +11502,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10476,14 +11523,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // The pointer operand uses an in-tree scalar so we add the new // LoadInst to ExternalUses list to make sure that an extract will // be generated in the future. - if (TreeEntry *Entry = getTreeEntry(PO)) { - // Find which lane we need to extract. - unsigned FoundLane = Entry->findLaneForValue(PO); - ExternalUses.emplace_back(PO, NewLI, FoundLane); + if (isa<Instruction>(PO)) { + if (TreeEntry *Entry = getTreeEntry(PO)) { + // Find which lane we need to extract. + unsigned FoundLane = Entry->findLaneForValue(PO); + ExternalUses.emplace_back(PO, NewLI, FoundLane); + } } } else { - assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); - Value *VecPtr = vectorizeOperand(E, 0); + assert((E->State == TreeEntry::ScatterVectorize || + E->State == TreeEntry::PossibleStridedVectorize) && + "Unhandled state"); + Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; @@ -10497,35 +11548,32 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = propagateMetadata(NewLI, E->Scalars); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; return V; } case Instruction::Store: { auto *SI = cast<StoreInst>(VL0); - unsigned AS = SI->getPointerAddressSpace(); setInsertPointAfterBundle(E); - Value *VecValue = vectorizeOperand(E, 0); - VecValue = FinalShuffle(VecValue, E); + Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs); + VecValue = FinalShuffle(VecValue, E, VecTy, IsSigned); - Value *ScalarPtr = SI->getPointerOperand(); - Value *VecPtr = Builder.CreateBitCast( - ScalarPtr, VecValue->getType()->getPointerTo(AS)); + Value *Ptr = SI->getPointerOperand(); StoreInst *ST = - Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign()); + Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign()); - // The pointer operand uses an in-tree scalar, so add the new BitCast or - // StoreInst to ExternalUses to make sure that an extract will be - // generated in the future. - if (TreeEntry *Entry = getTreeEntry(ScalarPtr)) { - // Find which lane we need to extract. 
- unsigned FoundLane = Entry->findLaneForValue(ScalarPtr); - ExternalUses.push_back(ExternalUser( - ScalarPtr, ScalarPtr != VecPtr ? cast<User>(VecPtr) : ST, - FoundLane)); + // The pointer operand uses an in-tree scalar, so add the new StoreInst to + // ExternalUses to make sure that an extract will be generated in the + // future. + if (isa<Instruction>(Ptr)) { + if (TreeEntry *Entry = getTreeEntry(Ptr)) { + // Find which lane we need to extract. + unsigned FoundLane = Entry->findLaneForValue(Ptr); + ExternalUses.push_back(ExternalUser(Ptr, ST, FoundLane)); + } } Value *V = propagateMetadata(ST, E->Scalars); @@ -10538,7 +11586,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *GEP0 = cast<GetElementPtrInst>(VL0); setInsertPointAfterBundle(E); - Value *Op0 = vectorizeOperand(E, 0); + Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; @@ -10546,7 +11594,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector<Value *> OpVecs; for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) { - Value *OpVec = vectorizeOperand(E, J); + Value *OpVec = vectorizeOperand(E, J, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; @@ -10564,7 +11612,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V = propagateMetadata(I, GEPs); } - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10586,41 +11634,42 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { VecCallCosts.first <= VecCallCosts.second; Value *ScalarArg = nullptr; - std::vector<Value *> OpVecs; + SmallVector<Value *> OpVecs; SmallVector<Type *, 2> TysForDecl; // Add return type if intrinsic is overloaded on it. if (isVectorIntrinsicWithOverloadTypeAtArg(IID, -1)) TysForDecl.push_back( FixedVectorType::get(CI->getType(), E->Scalars.size())); - for (int j = 0, e = CI->arg_size(); j < e; ++j) { + for (unsigned I : seq<unsigned>(0, CI->arg_size())) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. 
- if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) { + if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, I)) { CallInst *CEI = cast<CallInst>(VL0); - ScalarArg = CEI->getArgOperand(j); - OpVecs.push_back(CEI->getArgOperand(j)); - if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) + ScalarArg = CEI->getArgOperand(I); + OpVecs.push_back(CEI->getArgOperand(I)); + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, I)) TysForDecl.push_back(ScalarArg->getType()); continue; } - Value *OpVec = vectorizeOperand(E, j); + Value *OpVec = vectorizeOperand(E, I, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); + LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); - if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, I)) TysForDecl.push_back(OpVec->getType()); } Function *CF; if (!UseIntrinsic) { VFShape Shape = - VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( - VecTy->getNumElements())), + VFShape::get(CI->getFunctionType(), + ElementCount::getFixed( + static_cast<unsigned>(VecTy->getNumElements())), false /*HasGlobalPred*/); CF = VFDatabase(*CI).getVectorizedFunction(Shape); } else { @@ -10634,7 +11683,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // The scalar argument uses an in-tree scalar so we add the new vectorized // call to ExternalUses list to make sure that an extract will be // generated in the future. - if (ScalarArg) { + if (isa_and_present<Instruction>(ScalarArg)) { if (TreeEntry *Entry = getTreeEntry(ScalarArg)) { // Find which lane we need to extract. unsigned FoundLane = Entry->findLaneForValue(ScalarArg); @@ -10644,7 +11693,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } propagateIRFlags(V, E->Scalars, VL0); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10662,20 +11711,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS = nullptr, *RHS = nullptr; if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) { setInsertPointAfterBundle(E); - LHS = vectorizeOperand(E, 0); + LHS = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - RHS = vectorizeOperand(E, 1); + RHS = vectorizeOperand(E, 1, PostponedPHIs); } else { setInsertPointAfterBundle(E); - LHS = vectorizeOperand(E, 0); + LHS = vectorizeOperand(E, 0, PostponedPHIs); } if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } + if (LHS && RHS && LHS->getType() != RHS->getType()) { + assert((MinBWs.contains(getOperandEntry(E, 0)) || + MinBWs.contains(getOperandEntry(E, 1))) && + "Expected item in MinBWs."); + LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned); + RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned); + } Value *V0, *V1; if (Instruction::isBinaryOp(E->getOpcode())) { @@ -10708,8 +11764,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // each vector operation. 
ValueList OpScalars, AltScalars; SmallVector<int> Mask; - buildShuffleEntryMask( - E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, + E->buildAltOpShuffleMask( [E, this](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(), @@ -10727,6 +11782,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CSEBlocks.insert(I->getParent()); } + if (V->getType() != VecTy && !isa<CmpInst>(VL0)) + V = Builder.CreateIntCast( + V, FixedVectorType::get(ScalarTy, E->getVectorFactor()), IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10767,9 +11825,19 @@ Value *BoUpSLP::vectorizeTree( // need to rebuild it. EntryToLastInstruction.clear(); - Builder.SetInsertPoint(ReductionRoot ? ReductionRoot - : &F->getEntryBlock().front()); - auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); + if (ReductionRoot) + Builder.SetInsertPoint(ReductionRoot->getParent(), + ReductionRoot->getIterator()); + else + Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); + + // Postpone emission of PHIs operands to avoid cyclic dependencies issues. + (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true); + for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) + if (TE->State == TreeEntry::Vectorize && + TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() && + TE->VectorizedValue) + (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false); // Run through the list of postponed gathers and emit them, replacing the temp // emitted allocas with actual vector instructions. ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef(); @@ -10786,9 +11854,32 @@ Value *BoUpSLP::vectorizeTree( TE->VectorizedValue = nullptr; auto *UserI = cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue); - Builder.SetInsertPoint(PrevVec); + // If user is a PHI node, its vector code have to be inserted right before + // block terminator. Since the node was delayed, there were some unresolved + // dependencies at the moment when stab instruction was emitted. In a case + // when any of these dependencies turn out an operand of another PHI, coming + // from this same block, position of a stab instruction will become invalid. + // The is because source vector that supposed to feed this gather node was + // inserted at the end of the block [after stab instruction]. So we need + // to adjust insertion point again to the end of block. + if (isa<PHINode>(UserI)) { + // Insert before all users. + Instruction *InsertPt = PrevVec->getParent()->getTerminator(); + for (User *U : PrevVec->users()) { + if (U == UserI) + continue; + auto *UI = dyn_cast<Instruction>(U); + if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent()) + continue; + if (UI->comesBefore(InsertPt)) + InsertPt = UI; + } + Builder.SetInsertPoint(InsertPt); + } else { + Builder.SetInsertPoint(PrevVec); + } Builder.SetCurrentDebugLocation(UserI->getDebugLoc()); - Value *Vec = vectorizeTree(TE); + Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false); PrevVec->replaceAllUsesWith(Vec); PostponedValues.try_emplace(Vec).first->second.push_back(TE); // Replace the stub vector node, if it was used before for one of the @@ -10801,26 +11892,6 @@ Value *BoUpSLP::vectorizeTree( eraseInstruction(PrevVec); } - // If the vectorized tree can be rewritten in a smaller type, we truncate the - // vectorized root. InstCombine will then rewrite the entire expression. 
We - // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - if (auto *I = dyn_cast<Instruction>(VectorRoot)) { - // If current instr is a phi and not the last phi, insert it after the - // last phi node. - if (isa<PHINode>(I)) - Builder.SetInsertPoint(&*I->getParent()->getFirstInsertionPt()); - else - Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); - } - auto BundleWidth = VectorizableTree[0]->Scalars.size(); - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto *VecTy = FixedVectorType::get(MinTy, BundleWidth); - auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); - VectorizableTree[0]->VectorizedValue = Trunc; - } - LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); @@ -10830,6 +11901,8 @@ Value *BoUpSLP::vectorizeTree( // Maps extract Scalar to the corresponding extractelement instruction in the // basic block. Only one extractelement per block should be emitted. DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs; + SmallDenseSet<Value *, 4> UsedInserts; + DenseMap<Value *, Value *> VectorCasts; // Extract all of the elements with the external uses. for (const auto &ExternalUse : ExternalUses) { Value *Scalar = ExternalUse.Scalar; @@ -10864,7 +11937,8 @@ Value *BoUpSLP::vectorizeTree( Instruction *I = EEIt->second; if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() && Builder.GetInsertPoint()->comesBefore(I)) - I->moveBefore(&*Builder.GetInsertPoint()); + I->moveBefore(*Builder.GetInsertPoint()->getParent(), + Builder.GetInsertPoint()); Ex = I; } } @@ -10887,11 +11961,10 @@ Value *BoUpSLP::vectorizeTree( } // If necessary, sign-extend or zero-extend ScalarRoot // to the larger type. - if (!MinBWs.count(ScalarRoot)) - return Ex; - if (MinBWs[ScalarRoot].second) - return Builder.CreateSExt(Ex, Scalar->getType()); - return Builder.CreateZExt(Ex, Scalar->getType()); + if (Scalar->getType() != Ex->getType()) + return Builder.CreateIntCast(Ex, Scalar->getType(), + MinBWs.find(E)->second.second); + return Ex; } assert(isa<FixedVectorType>(Scalar->getType()) && isa<InsertElementInst>(Scalar) && @@ -10909,12 +11982,13 @@ Value *BoUpSLP::vectorizeTree( "ExternallyUsedValues map"); if (auto *VecI = dyn_cast<Instruction>(Vec)) { if (auto *PHI = dyn_cast<PHINode>(VecI)) - Builder.SetInsertPoint(PHI->getParent()->getFirstNonPHI()); + Builder.SetInsertPoint(PHI->getParent(), + PHI->getParent()->getFirstNonPHIIt()); else Builder.SetInsertPoint(VecI->getParent(), std::next(VecI->getIterator())); } else { - Builder.SetInsertPoint(&F->getEntryBlock().front()); + Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); } Value *NewInst = ExtractAndExtendIfNeeded(Vec); // Required to update internally referenced instructions. @@ -10927,12 +12001,26 @@ Value *BoUpSLP::vectorizeTree( // Skip if the scalar is another vector op or Vec is not an instruction. if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) { if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) { + if (!UsedInserts.insert(VU).second) + continue; + // Need to use original vector, if the root is truncated. 
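With bit widths now tracked per tree entry, the extract helper above no longer special-cases the root: any extracted scalar whose type differs from the original is cast back using the recorded signedness. A small illustrative helper under that assumption (WasSigned mirrors the MinBWs signedness bit; the helper name is invented for this sketch):

  #include "llvm/IR/IRBuilder.h"

  using namespace llvm;

  // Extract lane `Lane` from Vec and, if the vector was built in a narrower
  // integer type, cast the scalar back to its original type.
  static Value *extractAndExtend(IRBuilder<> &B, Value *Vec, unsigned Lane,
                                 Type *OrigScalarTy, bool WasSigned) {
    Value *Ex = B.CreateExtractElement(Vec, Lane);
    if (Ex->getType() != OrigScalarTy)
      Ex = B.CreateIntCast(Ex, OrigScalarTy, WasSigned);
    return Ex;
  }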
+ auto BWIt = MinBWs.find(E); + if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) { + auto VecIt = VectorCasts.find(Scalar); + if (VecIt == VectorCasts.end()) { + IRBuilder<>::InsertPointGuard Guard(Builder); + if (auto *IVec = dyn_cast<Instruction>(Vec)) + Builder.SetInsertPoint(IVec->getNextNonDebugInstruction()); + Vec = Builder.CreateIntCast(Vec, VU->getType(), + BWIt->second.second); + VectorCasts.try_emplace(Scalar, Vec); + } else { + Vec = VecIt->second; + } + } + std::optional<unsigned> InsertIdx = getInsertIndex(VU); if (InsertIdx) { - // Need to use original vector, if the root is truncated. - if (MinBWs.count(Scalar) && - VectorizableTree[0]->VectorizedValue == Vec) - Vec = VectorRoot; auto *It = find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) { // Checks if 2 insertelements are from the same buildvector. @@ -10992,18 +12080,18 @@ Value *BoUpSLP::vectorizeTree( // Find the insertion point for the extractelement lane. if (auto *VecI = dyn_cast<Instruction>(Vec)) { if (PHINode *PH = dyn_cast<PHINode>(User)) { - for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { - if (PH->getIncomingValue(i) == Scalar) { + for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) { + if (PH->getIncomingValue(I) == Scalar) { Instruction *IncomingTerminator = - PH->getIncomingBlock(i)->getTerminator(); + PH->getIncomingBlock(I)->getTerminator(); if (isa<CatchSwitchInst>(IncomingTerminator)) { Builder.SetInsertPoint(VecI->getParent(), std::next(VecI->getIterator())); } else { - Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); + Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator()); } Value *NewInst = ExtractAndExtendIfNeeded(Vec); - PH->setOperand(i, NewInst); + PH->setOperand(I, NewInst); } } } else { @@ -11012,7 +12100,7 @@ Value *BoUpSLP::vectorizeTree( User->replaceUsesOfWith(Scalar, NewInst); } } else { - Builder.SetInsertPoint(&F->getEntryBlock().front()); + Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); Value *NewInst = ExtractAndExtendIfNeeded(Vec); User->replaceUsesOfWith(Scalar, NewInst); } @@ -11085,7 +12173,7 @@ Value *BoUpSLP::vectorizeTree( // non-resizing mask. if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType()) ->getNumElements() || - !ShuffleVectorInst::isIdentityMask(Mask)) + !ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) return CreateShuffle(Vals.front(), nullptr, Mask); return Vals.front(); } @@ -11676,7 +12764,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, } } - auto makeControlDependent = [&](Instruction *I) { + auto MakeControlDependent = [&](Instruction *I) { auto *DepDest = getScheduleData(I); assert(DepDest && "must be in schedule window"); DepDest->ControlDependencies.push_back(BundleMember); @@ -11698,7 +12786,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, continue; // Add the dependency - makeControlDependent(I); + MakeControlDependent(I); if (!isGuaranteedToTransferExecutionToSuccessor(I)) // Everything past here must be control dependent on I. 
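The new VectorCasts map ensures the widening cast of a narrowed vector is emitted once per original scalar and reused by every external insertelement user instead of being recreated at each use. A compact sketch of that memoization, with hypothetical parameter names:

  #include "llvm/ADT/DenseMap.h"
  #include "llvm/IR/IRBuilder.h"

  using namespace llvm;

  // Create (at most) one widening cast of the narrowed vector per original
  // scalar; later external users of the same scalar reuse the cached cast.
  static Value *getWidenedVector(IRBuilder<> &B,
                                 DenseMap<Value *, Value *> &Casts,
                                 Value *Scalar, Value *NarrowVec,
                                 Type *WideVecTy, bool IsSigned) {
    auto It = Casts.find(Scalar);
    if (It != Casts.end())
      return It->second;
    Value *Wide = B.CreateIntCast(NarrowVec, WideVecTy, IsSigned);
    Casts.try_emplace(Scalar, Wide);
    return Wide;
  }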
@@ -11724,7 +12812,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, continue; // Add the dependency - makeControlDependent(I); + MakeControlDependent(I); } } @@ -11742,7 +12830,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, continue; // Add the dependency - makeControlDependent(I); + MakeControlDependent(I); break; } } @@ -11757,7 +12845,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, "NextLoadStore list for non memory effecting bundle?"); MemoryLocation SrcLoc = getLocation(SrcInst); bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); - unsigned numAliased = 0; + unsigned NumAliased = 0; unsigned DistToSrc = 1; for (; DepDest; DepDest = DepDest->NextLoadStore) { @@ -11772,13 +12860,13 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, // check this limit even between two read-only instructions. if (DistToSrc >= MaxMemDepDistance || ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && - (numAliased >= AliasedCheckLimit || + (NumAliased >= AliasedCheckLimit || SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { // We increment the counter only if the locations are aliased // (instead of counting all alias checks). This gives a better // balance between reduced runtime and accurate dependencies. - numAliased++; + NumAliased++; DepDest->MemoryDependencies.push_back(BundleMember); BundleMember->Dependencies++; @@ -11880,20 +12968,20 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { // Do the "real" scheduling. while (!ReadyInsts.empty()) { - ScheduleData *picked = *ReadyInsts.begin(); + ScheduleData *Picked = *ReadyInsts.begin(); ReadyInsts.erase(ReadyInsts.begin()); // Move the scheduled instruction(s) to their dedicated places, if not // there yet. - for (ScheduleData *BundleMember = picked; BundleMember; + for (ScheduleData *BundleMember = Picked; BundleMember; BundleMember = BundleMember->NextInBundle) { - Instruction *pickedInst = BundleMember->Inst; - if (pickedInst->getNextNode() != LastScheduledInst) - pickedInst->moveBefore(LastScheduledInst); - LastScheduledInst = pickedInst; + Instruction *PickedInst = BundleMember->Inst; + if (PickedInst->getNextNode() != LastScheduledInst) + PickedInst->moveBefore(LastScheduledInst); + LastScheduledInst = PickedInst; } - BS->schedule(picked, ReadyInsts); + BS->schedule(Picked, ReadyInsts); } // Check that we didn't break any of our invariants. @@ -11994,21 +13082,22 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) { // Determine if a value V in a vectorizable expression Expr can be demoted to a // smaller type with a truncation. We collect the values that will be demoted // in ToDemote and additional roots that require investigating in Roots. -static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, - SmallVectorImpl<Value *> &ToDemote, - SmallVectorImpl<Value *> &Roots) { +bool BoUpSLP::collectValuesToDemote( + Value *V, SmallVectorImpl<Value *> &ToDemote, + DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts, + SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const { // We can always demote constants. - if (isa<Constant>(V)) { - ToDemote.push_back(V); + if (isa<Constant>(V)) return true; - } - // If the value is not an instruction in the expression with only one use, it - // cannot be demoted. + // If the value is not a vectorized instruction in the expression with only + // one use, it cannot be demoted. 
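The memory-dependence scan above bounds its cost with two budgets: MaxMemDepDistance caps how far past the source instruction it looks, and AliasedCheckLimit caps how many precise alias queries are issued before the scan turns conservative. A simplified standalone model of that control flow, in plain C++ with a toy MemOp type instead of ScheduleData:

  #include <functional>
  #include <vector>

  struct MemOp { bool MayWrite = false; };

  // Simplified model: once either budget is exhausted, a dependency is
  // recorded without asking the (costly) alias analysis again.
  static void addMemoryDeps(const std::vector<MemOp> &Later, bool SrcMayWrite,
                            unsigned MaxDist, unsigned AliasBudget,
                            const std::function<bool(const MemOp &)> &IsAliased) {
    unsigned NumAliased = 0, DistToSrc = 1;
    for (const MemOp &Dep : Later) {
      if (DistToSrc >= MaxDist ||
          ((SrcMayWrite || Dep.MayWrite) &&
           (NumAliased >= AliasBudget || IsAliased(Dep)))) {
        ++NumAliased;            // count aliased (or conservatively assumed) pairs
        /* record the dependency on Dep here */
      }
      ++DistToSrc;
    }
  }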
auto *I = dyn_cast<Instruction>(V); - if (!I || !I->hasOneUse() || !Expr.count(I)) + if (!I || !I->hasOneUse() || !getTreeEntry(I) || !Visited.insert(I).second) return false; + unsigned Start = 0; + unsigned End = I->getNumOperands(); switch (I->getOpcode()) { // We can always demote truncations and extensions. Since truncations can @@ -12030,16 +13119,21 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, case Instruction::And: case Instruction::Or: case Instruction::Xor: - if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) || - !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots)) + if (!collectValuesToDemote(I->getOperand(0), ToDemote, DemotedConsts, Roots, + Visited) || + !collectValuesToDemote(I->getOperand(1), ToDemote, DemotedConsts, Roots, + Visited)) return false; break; // We can demote selects if we can demote their true and false values. case Instruction::Select: { + Start = 1; SelectInst *SI = cast<SelectInst>(I); - if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) || - !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots)) + if (!collectValuesToDemote(SI->getTrueValue(), ToDemote, DemotedConsts, + Roots, Visited) || + !collectValuesToDemote(SI->getFalseValue(), ToDemote, DemotedConsts, + Roots, Visited)) return false; break; } @@ -12049,7 +13143,8 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, case Instruction::PHI: { PHINode *PN = cast<PHINode>(I); for (Value *IncValue : PN->incoming_values()) - if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots)) + if (!collectValuesToDemote(IncValue, ToDemote, DemotedConsts, Roots, + Visited)) return false; break; } @@ -12059,6 +13154,10 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, return false; } + // Gather demoted constant operands. + for (unsigned Idx : seq<unsigned>(Start, End)) + if (isa<Constant>(I->getOperand(Idx))) + DemotedConsts.try_emplace(I).first->getSecond().push_back(Idx); // Record the value that we can demote. ToDemote.push_back(V); return true; @@ -12076,44 +13175,26 @@ void BoUpSLP::computeMinimumValueSizes() { if (!TreeRootIT) return; - // If the expression is not rooted by a store, these roots should have - // external uses. We will rely on InstCombine to rewrite the expression in - // the narrower type. However, InstCombine only rewrites single-use values. - // This means that if a tree entry other than a root is used externally, it - // must have multiple uses and InstCombine will not rewrite it. The code - // below ensures that only the roots are used externally. - SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end()); - for (auto &EU : ExternalUses) - if (!Expr.erase(EU.Scalar)) - return; - if (!Expr.empty()) + // Ensure the roots of the vectorizable tree don't form a cycle. + if (!VectorizableTree.front()->UserTreeIndices.empty()) return; - // Collect the scalar values of the vectorizable expression. We will use this - // context to determine which values can be demoted. If we see a truncation, - // we mark it as seeding another demotion. - for (auto &EntryPtr : VectorizableTree) - Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end()); - - // Ensure the roots of the vectorizable tree don't form a cycle. They must - // have a single external user that is not in the vectorizable tree. 
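collectValuesToDemote is now a member that only walks vectorized, single-use instructions and carries a Visited set so cycles through PHIs terminate. Stripped of the LLVM specifics, the recursion has this shape (a toy model, not the pass's code):

  #include <unordered_set>
  #include <vector>

  struct Node {
    bool IsConstant = false;
    unsigned NumUses = 1;
    std::vector<Node *> Operands;   // operands that must also be demotable
  };

  // A value can be computed in a narrower type if it is a constant, or a
  // single-use node whose relevant operands can all be demoted as well.
  static bool collectToDemote(Node *N, std::vector<Node *> &ToDemote,
                              std::unordered_set<Node *> &Visited) {
    if (N->IsConstant)
      return true;
    if (N->NumUses != 1 || !Visited.insert(N).second)
      return false;
    for (Node *Op : N->Operands)
      if (!collectToDemote(Op, ToDemote, Visited))
        return false;
    ToDemote.push_back(N);
    return true;
  }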
- for (auto *Root : TreeRoot) - if (!Root->hasOneUse() || Expr.count(*Root->user_begin())) - return; - // Conservatively determine if we can actually truncate the roots of the // expression. Collect the values that can be demoted in ToDemote and // additional roots that require investigating in Roots. SmallVector<Value *, 32> ToDemote; + DenseMap<Instruction *, SmallVector<unsigned>> DemotedConsts; SmallVector<Value *, 4> Roots; - for (auto *Root : TreeRoot) - if (!collectValuesToDemote(Root, Expr, ToDemote, Roots)) + for (auto *Root : TreeRoot) { + DenseSet<Value *> Visited; + if (!collectValuesToDemote(Root, ToDemote, DemotedConsts, Roots, Visited)) return; + } // The maximum bit width required to represent all the values that can be // demoted without loss of precision. It would be safe to truncate the roots // of the expression to this width. - auto MaxBitWidth = 8u; + auto MaxBitWidth = 1u; // We first check if all the bits of the roots are demanded. If they're not, // we can truncate the roots to this narrower type. @@ -12138,9 +13219,9 @@ void BoUpSLP::computeMinimumValueSizes() { // maximum bit width required to store the scalar by using ValueTracking to // compute the number of high-order bits we can truncate. if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) && - llvm::all_of(TreeRoot, [](Value *R) { - assert(R->hasOneUse() && "Root should have only one use!"); - return isa<GetElementPtrInst>(R->user_back()); + all_of(TreeRoot, [](Value *V) { + return all_of(V->users(), + [](User *U) { return isa<GetElementPtrInst>(U); }); })) { MaxBitWidth = 8u; @@ -12189,12 +13270,39 @@ void BoUpSLP::computeMinimumValueSizes() { // If we can truncate the root, we must collect additional values that might // be demoted as a result. That is, those seeded by truncations we will // modify. - while (!Roots.empty()) - collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots); + while (!Roots.empty()) { + DenseSet<Value *> Visited; + collectValuesToDemote(Roots.pop_back_val(), ToDemote, DemotedConsts, Roots, + Visited); + } // Finally, map the values we can demote to the maximum bit with we computed. - for (auto *Scalar : ToDemote) - MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive); + for (auto *Scalar : ToDemote) { + auto *TE = getTreeEntry(Scalar); + assert(TE && "Expected vectorized scalar."); + if (MinBWs.contains(TE)) + continue; + bool IsSigned = any_of(TE->Scalars, [&](Value *R) { + KnownBits Known = computeKnownBits(R, *DL); + return !Known.isNonNegative(); + }); + MinBWs.try_emplace(TE, MaxBitWidth, IsSigned); + const auto *I = cast<Instruction>(Scalar); + auto DCIt = DemotedConsts.find(I); + if (DCIt != DemotedConsts.end()) { + for (unsigned Idx : DCIt->getSecond()) { + // Check that all instructions operands are demoted. 
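The minimum-bit-width computation now starts from 1 bit and grows it from what the roots actually need. As a rough, hedged sketch of the underlying idea only (the real pass also consults DemandedBits and special-cases roots used solely by GEPs, so this is an approximation, not its exact logic):

  #include <algorithm>

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/Support/KnownBits.h"
  #include "llvm/Support/MathExtras.h"

  using namespace llvm;

  // Unsigned-looking values need their active bits, possibly-negative values
  // their significant bits; round up to a power of two for the element type.
  static unsigned requiredBitWidth(const Value *Root, const DataLayout &DL) {
    unsigned TyBits = Root->getType()->getScalarSizeInBits();
    KnownBits Known = computeKnownBits(Root, DL);
    unsigned Bits = Known.isNonNegative()
                        ? Known.countMaxActiveBits()
                        : TyBits - ComputeNumSignBits(Root, DL) + 1;
    return std::max(1u, (unsigned)PowerOf2Ceil(Bits));
  }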
+ if (all_of(TE->Scalars, [&](Value *V) { + auto SIt = DemotedConsts.find(cast<Instruction>(V)); + return SIt != DemotedConsts.end() && + is_contained(SIt->getSecond(), Idx); + })) { + const TreeEntry *CTE = getOperandEntry(TE, Idx); + MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned); + } + } + } + } } PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { @@ -12348,139 +13456,206 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - int E = Stores.size(); - SmallBitVector Tails(E, false); - int MaxIter = MaxStoreLookup.getValue(); - SmallVector<std::pair<int, int>, 16> ConsecutiveChain( - E, std::make_pair(E, INT_MAX)); - SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false)); - int IterCnt; - auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter, - &CheckedPairs, - &ConsecutiveChain](int K, int Idx) { - if (IterCnt >= MaxIter) - return true; - if (CheckedPairs[Idx].test(K)) - return ConsecutiveChain[K].second == 1 && - ConsecutiveChain[K].first == Idx; - ++IterCnt; - CheckedPairs[Idx].set(K); - CheckedPairs[K].set(Idx); - std::optional<int> Diff = getPointersDiff( - Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(), - Stores[Idx]->getValueOperand()->getType(), - Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true); - if (!Diff || *Diff == 0) - return false; - int Val = *Diff; - if (Val < 0) { - if (ConsecutiveChain[Idx].second > -Val) { - Tails.set(K); - ConsecutiveChain[Idx] = std::make_pair(K, -Val); - } - return false; + // Stores the pair of stores (first_store, last_store) in a range, that were + // already tried to be vectorized. Allows to skip the store ranges that were + // already tried to be vectorized but the attempts were unsuccessful. + DenseSet<std::pair<Value *, Value *>> TriedSequences; + struct StoreDistCompare { + bool operator()(const std::pair<unsigned, int> &Op1, + const std::pair<unsigned, int> &Op2) const { + return Op1.second < Op2.second; } - if (ConsecutiveChain[K].second <= Val) - return false; - - Tails.set(Idx); - ConsecutiveChain[K] = std::make_pair(Idx, Val); - return Val == 1; }; - // Do a quadratic search on all of the given stores in reverse order and find - // all of the pairs of stores that follow each other. - for (int Idx = E - 1; Idx >= 0; --Idx) { - // If a store has multiple consecutive store candidates, search according - // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... - // This is because usually pairing with immediate succeeding or preceding - // candidate create the best chance to find slp vectorization opportunity. - const int MaxLookDepth = std::max(E - Idx, Idx + 1); - IterCnt = 0; - for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset) - if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) || - (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx))) - break; - } - - // Tracks if we tried to vectorize stores starting from the given tail - // already. - SmallBitVector TriedTails(E, false); - // For stores that start but don't end a link in the chain: - for (int Cnt = E; Cnt > 0; --Cnt) { - int I = Cnt - 1; - if (ConsecutiveChain[I].first == E || Tails.test(I)) - continue; - // We found a store instr that starts a chain. Now follow the chain and try - // to vectorize it. + // A set of pairs (index of store in Stores array ref, Distance of the store + // address relative to base store address in units). 
+ using StoreIndexToDistSet = + std::set<std::pair<unsigned, int>, StoreDistCompare>; + auto TryToVectorize = [&](const StoreIndexToDistSet &Set) { + int PrevDist = -1; BoUpSLP::ValueList Operands; // Collect the chain into a list. - while (I != E && !VectorizedStores.count(Stores[I])) { - Operands.push_back(Stores[I]); - Tails.set(I); - if (ConsecutiveChain[I].second != 1) { - // Mark the new end in the chain and go back, if required. It might be - // required if the original stores come in reversed order, for example. - if (ConsecutiveChain[I].first != E && - Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) && - !VectorizedStores.count(Stores[ConsecutiveChain[I].first])) { - TriedTails.set(I); - Tails.reset(ConsecutiveChain[I].first); - if (Cnt < ConsecutiveChain[I].first + 2) - Cnt = ConsecutiveChain[I].first + 2; + for (auto [Idx, Data] : enumerate(Set)) { + if (Operands.empty() || Data.second - PrevDist == 1) { + Operands.push_back(Stores[Data.first]); + PrevDist = Data.second; + if (Idx != Set.size() - 1) + continue; + } + if (Operands.size() <= 1) { + Operands.clear(); + Operands.push_back(Stores[Data.first]); + PrevDist = Data.second; + continue; + } + + unsigned MaxVecRegSize = R.getMaxVecRegSize(); + unsigned EltSize = R.getVectorElementSize(Operands[0]); + unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize); + + unsigned MaxVF = + std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts); + auto *Store = cast<StoreInst>(Operands[0]); + Type *StoreTy = Store->getValueOperand()->getType(); + Type *ValueTy = StoreTy; + if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) + ValueTy = Trunc->getSrcTy(); + unsigned MinVF = TTI->getStoreMinimumVF( + R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy); + + if (MaxVF <= MinVF) { + LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF + << ") <= " + << "MinVF (" << MinVF << ")\n"); + } + + // FIXME: Is division-by-2 the correct step? Should we assert that the + // register size is a power-of-2? + unsigned StartIdx = 0; + for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { + ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size); + assert( + all_of( + Slice, + [&](Value *V) { + return cast<StoreInst>(V)->getValueOperand()->getType() == + cast<StoreInst>(Slice.front()) + ->getValueOperand() + ->getType(); + }) && + "Expected all operands of same type."); + if (!VectorizedStores.count(Slice.front()) && + !VectorizedStores.count(Slice.back()) && + TriedSequences.insert(std::make_pair(Slice.front(), Slice.back())) + .second && + vectorizeStoreChain(Slice, R, Cnt, MinVF)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Slice.begin(), Slice.end()); + Changed = true; + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += Size; + Cnt += Size; + continue; + } + ++Cnt; } - break; + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Operands.size()) + break; } - // Move to the next value in the chain. 
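TryToVectorize above gathers a maximal run of distance-1 stores and then probes it with decreasing power-of-two slice sizes, remembering how much of the prefix has already been consumed at a larger width. The search structure, reduced to a standalone model (assumes MinVF >= 1; TrySlice stands in for vectorizeStoreChain):

  #include <cstddef>
  #include <functional>

  static void vectorizeChain(size_t NumStores, unsigned MaxVF, unsigned MinVF,
                             const std::function<bool(size_t, unsigned)> &TrySlice) {
    size_t StartIdx = 0;
    for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
      for (size_t Cnt = StartIdx; Cnt + Size <= NumStores;) {
        if (TrySlice(Cnt, Size)) {
          if (Cnt == StartIdx)
            StartIdx += Size;     // never revisit a fully vectorized prefix
          Cnt += Size;
          continue;
        }
        ++Cnt;
      }
      if (StartIdx >= NumStores)
        break;                    // everything was consumed at larger widths
    }
  }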
- I = ConsecutiveChain[I].first; + Operands.clear(); + Operands.push_back(Stores[Data.first]); + PrevDist = Data.second; } - assert(!Operands.empty() && "Expected non-empty list of stores."); + }; - unsigned MaxVecRegSize = R.getMaxVecRegSize(); - unsigned EltSize = R.getVectorElementSize(Operands[0]); - unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize); - - unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), - MaxElts); - auto *Store = cast<StoreInst>(Operands[0]); - Type *StoreTy = Store->getValueOperand()->getType(); - Type *ValueTy = StoreTy; - if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) - ValueTy = Trunc->getSrcTy(); - unsigned MinVF = TTI->getStoreMinimumVF( - R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy); - - if (MaxVF <= MinVF) { - LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF << ") <= " - << "MinVF (" << MinVF << ")\n"); - } - - // FIXME: Is division-by-2 the correct step? Should we assert that the - // register size is a power-of-2? - unsigned StartIdx = 0; - for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { - for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { - ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size); - if (!VectorizedStores.count(Slice.front()) && - !VectorizedStores.count(Slice.back()) && - vectorizeStoreChain(Slice, R, Cnt, MinVF)) { - // Mark the vectorized stores so that we don't vectorize them again. - VectorizedStores.insert(Slice.begin(), Slice.end()); - Changed = true; - // If we vectorized initial block, no need to try to vectorize it - // again. - if (Cnt == StartIdx) - StartIdx += Size; - Cnt += Size; - continue; - } - ++Cnt; + // Stores pair (first: index of the store into Stores array ref, address of + // which taken as base, second: sorted set of pairs {index, dist}, which are + // indices of stores in the set and their store location distances relative to + // the base address). + + // Need to store the index of the very first store separately, since the set + // may be reordered after the insertion and the first store may be moved. This + // container allows to reduce number of calls of getPointersDiff() function. + SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores; + // Inserts the specified store SI with the given index Idx to the set of the + // stores. If the store with the same distance is found already - stop + // insertion, try to vectorize already found stores. If some stores from this + // sequence were not vectorized - try to vectorize them with the new store + // later. But this logic is applied only to the stores, that come before the + // previous store with the same distance. + // Example: + // 1. store x, %p + // 2. store y, %p+1 + // 3. store z, %p+2 + // 4. store a, %p + // 5. store b, %p+3 + // - Scan this from the last to first store. The very first bunch of stores is + // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores + // vector). + // - The next store in the list - #1 - has the same distance from store #5 as + // the store #4. + // - Try to vectorize sequence of stores 4,2,3,5. + // - If all these stores are vectorized - just drop them. + // - If some of them are not vectorized (say, #3 and #5), do extra analysis. + // - Start new stores sequence. + // The new bunch of stores is {1, {1, 0}}. + // - Add the stores from previous sequence, that were not vectorized. 
+ // Here we consider the stores in the reversed order, rather they are used in + // the IR (Stores are reversed already, see vectorizeStoreChains() function). + // Store #3 can be added -> comes after store #4 with the same distance as + // store #1. + // Store #5 cannot be added - comes before store #4. + // This logic allows to improve the compile time, we assume that the stores + // after previous store with the same distance most likely have memory + // dependencies and no need to waste compile time to try to vectorize them. + // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}. + auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) { + for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) { + std::optional<int> Diff = getPointersDiff( + Stores[Set.first]->getValueOperand()->getType(), + Stores[Set.first]->getPointerOperand(), + SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE, + /*StrictCheck=*/true); + if (!Diff) + continue; + auto It = Set.second.find(std::make_pair(Idx, *Diff)); + if (It == Set.second.end()) { + Set.second.emplace(Idx, *Diff); + return; } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= Operands.size()) - break; + // Try to vectorize the first found set to avoid duplicate analysis. + TryToVectorize(Set.second); + StoreIndexToDistSet PrevSet; + PrevSet.swap(Set.second); + Set.first = Idx; + Set.second.emplace(Idx, 0); + // Insert stores that followed previous match to try to vectorize them + // with this store. + unsigned StartIdx = It->first + 1; + SmallBitVector UsedStores(Idx - StartIdx); + // Distances to previously found dup store (or this store, since they + // store to the same addresses). + SmallVector<int> Dists(Idx - StartIdx, 0); + for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) { + // Do not try to vectorize sequences, we already tried. + if (Pair.first <= It->first || + VectorizedStores.contains(Stores[Pair.first])) + break; + unsigned BI = Pair.first - StartIdx; + UsedStores.set(BI); + Dists[BI] = Pair.second - It->second; + } + for (unsigned I = StartIdx; I < Idx; ++I) { + unsigned BI = I - StartIdx; + if (UsedStores.test(BI)) + Set.second.emplace(I, Dists[BI]); + } + return; } + auto &Res = SortedStores.emplace_back(); + Res.first = Idx; + Res.second.emplace(Idx, 0); + }; + StoreInst *PrevStore = Stores.front(); + for (auto [I, SI] : enumerate(Stores)) { + // Check that we do not try to vectorize stores of different types. + if (PrevStore->getValueOperand()->getType() != + SI->getValueOperand()->getType()) { + for (auto &Set : SortedStores) + TryToVectorize(Set.second); + SortedStores.clear(); + PrevStore = SI; + } + FillStoresSet(I, SI); } + // Final vectorization attempt. + for (auto &Set : SortedStores) + TryToVectorize(Set.second); + return Changed; } @@ -12507,8 +13682,10 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { // constant index, or a pointer operand that doesn't point to a scalar // type. else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { - auto Idx = GEP->idx_begin()->get(); - if (GEP->getNumIndices() > 1 || isa<Constant>(Idx)) + if (GEP->getNumIndices() != 1) + continue; + Value *Idx = GEP->idx_begin()->get(); + if (isa<Constant>(Idx)) continue; if (!isValidElementType(Idx->getType())) continue; @@ -12542,8 +13719,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, // NOTE: the following will give user internal llvm type name, which may // not be useful. 
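The worked example in the comment boils down to one data structure: a set of (store index, distance-from-base) pairs ordered by distance, in which any run of distances increasing by exactly 1 is a candidate chain. A small standalone model of that set and of extracting such runs, purely for illustration:

  #include <set>
  #include <utility>
  #include <vector>

  struct DistCompare {
    bool operator()(const std::pair<unsigned, int> &A,
                    const std::pair<unsigned, int> &B) const {
      return A.second < B.second;   // order by distance, not by store index
    }
  };
  using StoreIndexToDistSet = std::set<std::pair<unsigned, int>, DistCompare>;

  // Walking the set in distance order exposes runs of consecutive addresses,
  // e.g. distances {-3,-2,-1,0} for stores #4, #2, #3, #5 in the example.
  static std::vector<std::vector<unsigned>>
  consecutiveRuns(const StoreIndexToDistSet &Set) {
    std::vector<std::vector<unsigned>> Runs;
    int PrevDist = 0;
    for (const auto &[Idx, Dist] : Set) {
      if (Runs.empty() || Dist != PrevDist + 1)
        Runs.emplace_back();
      Runs.back().push_back(Idx);
      PrevDist = Dist;
    }
    return Runs;
  }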
R.getORE()->emit([&]() { - std::string type_str; - llvm::raw_string_ostream rso(type_str); + std::string TypeStr; + llvm::raw_string_ostream rso(TypeStr); Ty->print(rso); return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0) << "Cannot SLP vectorize list: type " @@ -12878,10 +14055,12 @@ class HorizontalReduction { static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, Value *RHS, const Twine &Name, const ReductionOpsListType &ReductionOps) { - bool UseSelect = ReductionOps.size() == 2 || - // Logical or/and. - (ReductionOps.size() == 1 && - isa<SelectInst>(ReductionOps.front().front())); + bool UseSelect = + ReductionOps.size() == 2 || + // Logical or/and. + (ReductionOps.size() == 1 && any_of(ReductionOps.front(), [](Value *V) { + return isa<SelectInst>(V); + })); assert((!UseSelect || ReductionOps.size() != 2 || isa<SelectInst>(ReductionOps[1][0])) && "Expected cmp + select pairs for reduction"); @@ -13315,12 +14494,26 @@ public: // Update the final value in the reduction. Builder.SetCurrentDebugLocation( cast<Instruction>(ReductionOps.front().front())->getDebugLoc()); + if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) || + (isGuaranteedNotToBePoison(Res) && + !isGuaranteedNotToBePoison(VectorizedTree))) { + auto It = ReducedValsToOps.find(Res); + if (It != ReducedValsToOps.end() && + any_of(It->getSecond(), + [](Instruction *I) { return isBoolLogicOp(I); })) + std::swap(VectorizedTree, Res); + } + return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx", ReductionOps); } // Initialize the final value in the reduction. return Res; }; + bool AnyBoolLogicOp = + any_of(ReductionOps.back(), [](Value *V) { + return isBoolLogicOp(cast<Instruction>(V)); + }); // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; @@ -13364,10 +14557,12 @@ public: // Check if the reduction value was not overriden by the extractelement // instruction because of the vectorization and exclude it, if it is not // compatible with other values. - if (auto *Inst = dyn_cast<Instruction>(RdxVal)) - if (isVectorLikeInstWithConstOps(Inst) && - (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) - continue; + // Also check if the instruction was folded to constant/other value. + auto *Inst = dyn_cast<Instruction>(RdxVal); + if ((Inst && isVectorLikeInstWithConstOps(Inst) && + (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) || + (S.getOpcode() && !Inst)) + continue; Candidates.push_back(RdxVal); TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]); } @@ -13543,11 +14738,9 @@ public: for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) { if (Cnt == I || (ShuffledExtracts && Cnt == I - 1)) continue; - for_each(ReducedVals[Cnt], - [&LocalExternallyUsedValues, &TrackedVals](Value *V) { - if (isa<Instruction>(V)) - LocalExternallyUsedValues[TrackedVals[V]]; - }); + for (Value *V : ReducedVals[Cnt]) + if (isa<Instruction>(V)) + LocalExternallyUsedValues[TrackedVals[V]]; } if (!IsSupportedHorRdxIdentityOp) { // Number of uses of the candidates in the vector of values. @@ -13591,7 +14784,7 @@ public: // Update LocalExternallyUsedValues for the scalar, replaced by // extractelement instructions. 
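UseSelect above keeps i1 reductions in their select ("logical") form because `select i1 a, i1 b, false` does not depend on a poison b when a is false, whereas a plain `and a, b` would propagate the poison. When such operands later have to be combined with plain boolean ops, they are frozen first. An illustrative helper in that spirit, not the pass's createOp itself:

  #include "llvm/IR/IRBuilder.h"

  using namespace llvm;

  // Build a poison-safe logical AND; freeze the RHS first if it may be
  // poison and has to survive reassociation into non-short-circuit ops.
  static Value *buildLogicalAnd(IRBuilder<> &B, Value *LHS, Value *RHS,
                                bool RHSMayBePoison) {
    if (RHSMayBePoison)
      RHS = B.CreateFreeze(RHS);
    return B.CreateSelect(LHS, RHS, B.getFalse()); // select i1 LHS, i1 RHS, false
  }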
for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) { - auto It = ExternallyUsedValues.find(Pair.first); + auto *It = ExternallyUsedValues.find(Pair.first); if (It == ExternallyUsedValues.end()) continue; LocalExternallyUsedValues[Pair.second].append(It->second); @@ -13605,7 +14798,8 @@ public: InstructionCost ReductionCost = getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF); InstructionCost Cost = TreeCost + ReductionCost; - LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n"); + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost + << " for reduction\n"); if (!Cost.isValid()) return nullptr; if (Cost >= -SLPCostThreshold) { @@ -13652,7 +14846,9 @@ public: // To prevent poison from leaking across what used to be sequential, // safe, scalar boolean logic operations, the reduction operand must be // frozen. - if (isBoolLogicOp(RdxRootInst)) + if ((isBoolLogicOp(RdxRootInst) || + (AnyBoolLogicOp && VL.size() != TrackedVals.size())) && + !isGuaranteedNotToBePoison(VectorizedRoot)) VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); // Emit code to correctly handle reused reduced values, if required. @@ -13664,6 +14860,16 @@ public: Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); + if (ReducedSubTree->getType() != VL.front()->getType()) { + ReducedSubTree = Builder.CreateIntCast( + ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) { + KnownBits Known = computeKnownBits( + R, cast<Instruction>(ReductionOps.front().front()) + ->getModule() + ->getDataLayout()); + return !Known.isNonNegative(); + })); + } // Improved analysis for add/fadd/xor reductions with same scale factor // for all operands of reductions. We can emit scalar ops for them @@ -13716,31 +14922,33 @@ public: // RedOp2 = select i1 ?, i1 RHS, i1 false // Then, we must freeze LHS in the new op. - auto &&FixBoolLogicalOps = - [&Builder, VectorizedTree](Value *&LHS, Value *&RHS, - Instruction *RedOp1, Instruction *RedOp2) { - if (!isBoolLogicOp(RedOp1)) - return; - if (LHS == VectorizedTree || getRdxOperand(RedOp1, 0) == LHS || - isGuaranteedNotToBePoison(LHS)) - return; - if (!isBoolLogicOp(RedOp2)) - return; - if (RHS == VectorizedTree || getRdxOperand(RedOp2, 0) == RHS || - isGuaranteedNotToBePoison(RHS)) { - std::swap(LHS, RHS); - return; - } - LHS = Builder.CreateFreeze(LHS); - }; + auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS, + Instruction *RedOp1, + Instruction *RedOp2, + bool InitStep) { + if (!AnyBoolLogicOp) + return; + if (isBoolLogicOp(RedOp1) && + ((!InitStep && LHS == VectorizedTree) || + getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS))) + return; + if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) || + getRdxOperand(RedOp2, 0) == RHS || + isGuaranteedNotToBePoison(RHS))) { + std::swap(LHS, RHS); + return; + } + if (LHS != VectorizedTree) + LHS = Builder.CreateFreeze(LHS); + }; // Finish the reduction. // Need to add extra arguments and not vectorized possible reduction // values. // Try to avoid dependencies between the scalar remainders after // reductions. 
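The cast of a demoted reduction result back to the original type has to choose sign- versus zero-extension from the reduced scalars themselves, which is what the any_of/computeKnownBits check above does. The rule in isolation (helper name invented for the sketch):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/Support/KnownBits.h"

  using namespace llvm;

  // Sign-extend if any reduced scalar may be negative; otherwise zero-extend.
  static bool needsSignExtension(ArrayRef<Value *> ReducedVals,
                                 const DataLayout &DL) {
    return any_of(ReducedVals, [&](Value *V) {
      KnownBits Known = computeKnownBits(V, DL);
      return !Known.isNonNegative();   // not provably >= 0 -> assume signed
    });
  }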
- auto &&FinalGen = - [this, &Builder, &TrackedVals, &FixBoolLogicalOps]( - ArrayRef<std::pair<Instruction *, Value *>> InstVals) { + auto FinalGen = + [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals, + bool InitStep) { unsigned Sz = InstVals.size(); SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2); @@ -13761,7 +14969,7 @@ public: // sequential, safe, scalar boolean logic operations, the // reduction operand must be frozen. FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first, - RedOp); + RedOp, InitStep); Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1, StableRdxVal2, "op.rdx", ReductionOps); ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed); @@ -13791,11 +14999,13 @@ public: ExtraReductions.emplace_back(I, Pair.first); } // Iterate through all not-vectorized reduction values/extra arguments. + bool InitStep = true; while (ExtraReductions.size() > 1) { VectorizedTree = ExtraReductions.front().second; SmallVector<std::pair<Instruction *, Value *>> NewReds = - FinalGen(ExtraReductions); + FinalGen(ExtraReductions, InitStep); ExtraReductions.swap(NewReds); + InitStep = false; } VectorizedTree = ExtraReductions.front().second; @@ -13842,8 +15052,7 @@ private: bool IsCmpSelMinMax, unsigned ReduxWidth, FastMathFlags FMF) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - Value *FirstReducedVal = ReducedVals.front(); - Type *ScalarTy = FirstReducedVal->getType(); + Type *ScalarTy = ReducedVals.front()->getType(); FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); InstructionCost VectorCost = 0, ScalarCost; // If all of the reduced values are constant, the vector cost is 0, since @@ -13917,7 +15126,7 @@ private: } LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost - << " for reduction that starts with " << *FirstReducedVal + << " for reduction of " << shortBundleName(ReducedVals) << " (It is a splitting reduction)\n"); return VectorCost - ScalarCost; } @@ -13932,7 +15141,7 @@ private: "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; - return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind); + return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind); } /// Emits optimized code for unique scalar value reused \p Cnt times. 
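FinalGen combines the remaining scalar reduction values pairwise, halving the worklist each round instead of folding one long sequential chain; this keeps the extra ops shallow and lets the bool-logic fixup inspect both operands of every newly created op. A simplified, LLVM-free model of that shape (assumes a non-empty input):

  #include <functional>
  #include <utility>
  #include <vector>

  static int reduceRemainder(std::vector<int> Vals,
                             const std::function<int(int, int)> &Op) {
    while (Vals.size() > 1) {
      std::vector<int> Next((Vals.size() + 1) / 2);
      for (size_t I = 0, E = Vals.size(); I + 1 < E; I += 2)
        Next[I / 2] = Op(Vals[I], Vals[I + 1]);
      if (Vals.size() % 2 != 0)
        Next.back() = Vals.back();   // odd element is carried to the next round
      Vals = std::move(Next);
    }
    return Vals.front();
  }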
@@ -13979,8 +15188,8 @@ private: case RecurKind::Mul: case RecurKind::FMul: case RecurKind::FMulAdd: - case RecurKind::SelectICmp: - case RecurKind::SelectFCmp: + case RecurKind::IAnyOf: + case RecurKind::FAnyOf: case RecurKind::None: llvm_unreachable("Unexpected reduction kind for repeated scalar."); } @@ -14068,8 +15277,8 @@ private: case RecurKind::Mul: case RecurKind::FMul: case RecurKind::FMulAdd: - case RecurKind::SelectICmp: - case RecurKind::SelectFCmp: + case RecurKind::IAnyOf: + case RecurKind::FAnyOf: case RecurKind::None: llvm_unreachable("Unexpected reduction kind for reused scalars."); } @@ -14164,8 +15373,8 @@ static bool findBuildAggregate(Instruction *LastInsertInst, InsertElts.resize(*AggregateSize); findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0); - llvm::erase_value(BuildVectorOpds, nullptr); - llvm::erase_value(InsertElts, nullptr); + llvm::erase(BuildVectorOpds, nullptr); + llvm::erase(InsertElts, nullptr); if (BuildVectorOpds.size() >= 2) return true; @@ -14401,8 +15610,7 @@ bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts, bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, BoUpSLP &R) { - const DataLayout &DL = BB->getModule()->getDataLayout(); - if (!R.canMapToVector(IVI->getType(), DL)) + if (!R.canMapToVector(IVI->getType())) return false; SmallVector<Value *, 16> BuildVectorOpds; @@ -14541,11 +15749,11 @@ static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, if (BasePred1 > BasePred2) return false; // Compare operands. - bool LEPreds = Pred1 <= Pred2; - bool GEPreds = Pred1 >= Pred2; + bool CI1Preds = Pred1 == BasePred1; + bool CI2Preds = Pred2 == BasePred1; for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) { - auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1); - auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1); + auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1); + auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1); if (Op1->getValueID() < Op2->getValueID()) return !IsCompatibility; if (Op1->getValueID() > Op2->getValueID()) @@ -14691,14 +15899,20 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return true; if (Opcodes1.size() > Opcodes2.size()) return false; - std::optional<bool> ConstOrder; for (int I = 0, E = Opcodes1.size(); I < E; ++I) { // Undefs are compatible with any other value. 
if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) { - if (!ConstOrder) - ConstOrder = - !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]); - continue; + if (isa<Instruction>(Opcodes1[I])) + return true; + if (isa<Instruction>(Opcodes2[I])) + return false; + if (isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I])) + return true; + if (isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I])) + return false; + if (isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I])) + continue; + return isa<UndefValue>(Opcodes2[I]); } if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { @@ -14714,21 +15928,26 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (NodeI1 != NodeI2) return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); InstructionsState S = getSameOpcode({I1, I2}, *TLI); - if (S.getOpcode()) + if (S.getOpcode() && !S.isAltShuffle()) continue; return I1->getOpcode() < I2->getOpcode(); } - if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) { - if (!ConstOrder) - ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID(); - continue; - } + if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) + return Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID(); + if (isa<Instruction>(Opcodes1[I])) + return true; + if (isa<Instruction>(Opcodes2[I])) + return false; + if (isa<Constant>(Opcodes1[I])) + return true; + if (isa<Constant>(Opcodes2[I])) + return false; if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID()) return true; if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID()) return false; } - return ConstOrder && *ConstOrder; + return false; }; auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) { if (V1 == V2) @@ -14776,6 +15995,9 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { Incoming.push_back(P); } + if (Incoming.size() <= 1) + break; + // Find the corresponding non-phi nodes for better matching when trying to // build the tree. for (Value *V : Incoming) { @@ -14838,41 +16060,41 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return I->use_empty() && (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I)); }; - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) { // Skip instructions with scalable type. The num of elements is unknown at // compile-time for scalable type. - if (isa<ScalableVectorType>(it->getType())) + if (isa<ScalableVectorType>(It->getType())) continue; // Skip instructions marked for the deletion. - if (R.isDeleted(&*it)) + if (R.isDeleted(&*It)) continue; // We may go through BB multiple times so skip the one we have checked. - if (!VisitedInstrs.insert(&*it).second) { - if (HasNoUsers(&*it) && - VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator())) { + if (!VisitedInstrs.insert(&*It).second) { + if (HasNoUsers(&*It) && + VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. Changed = true; - it = BB->begin(); - e = BB->end(); + It = BB->begin(); + E = BB->end(); } continue; } - if (isa<DbgInfoIntrinsic>(it)) + if (isa<DbgInfoIntrinsic>(It)) continue; // Try to vectorize reductions that use PHINodes. 
- if (PHINode *P = dyn_cast<PHINode>(it)) { + if (PHINode *P = dyn_cast<PHINode>(It)) { // Check that the PHI is a reduction PHI. if (P->getNumIncomingValues() == 2) { // Try to match and vectorize a horizontal reduction. Instruction *Root = getReductionInstr(DT, P, BB, LI); if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) { Changed = true; - it = BB->begin(); - e = BB->end(); + It = BB->begin(); + E = BB->end(); continue; } } @@ -14897,23 +16119,23 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; } - if (HasNoUsers(&*it)) { + if (HasNoUsers(&*It)) { bool OpsChanged = false; - auto *SI = dyn_cast<StoreInst>(it); + auto *SI = dyn_cast<StoreInst>(It); bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI; if (SI) { - auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand())); + auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand())); // Try to vectorize chain in store, if this is the only store to the // address in the block. // TODO: This is just a temporarily solution to save compile time. Need // to investigate if we can safely turn on slp-vectorize-hor-store // instead to allow lookup for reduction chains in all non-vectorized // stores (need to check side effects and compile time). - TryToVectorizeRoot = (I == Stores.end() || I->second.size() == 1) && - SI->getValueOperand()->hasOneUse(); + TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) && + SI->getValueOperand()->hasOneUse(); } if (TryToVectorizeRoot) { - for (auto *V : it->operand_values()) { + for (auto *V : It->operand_values()) { // Postponed instructions should not be vectorized here, delay their // vectorization. if (auto *VI = dyn_cast<Instruction>(V); @@ -14926,21 +16148,21 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // top-tree instructions to try to vectorize as many instructions as // possible. OpsChanged |= - VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator()); + VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator()); if (OpsChanged) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. Changed = true; - it = BB->begin(); - e = BB->end(); + It = BB->begin(); + E = BB->end(); continue; } } - if (isa<InsertElementInst, InsertValueInst>(it)) - PostProcessInserts.insert(&*it); - else if (isa<CmpInst>(it)) - PostProcessCmps.insert(cast<CmpInst>(&*it)); + if (isa<InsertElementInst, InsertValueInst>(It)) + PostProcessInserts.insert(&*It); + else if (isa<CmpInst>(It)) + PostProcessCmps.insert(cast<CmpInst>(&*It)); } return Changed; @@ -15044,6 +16266,12 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { // compatible (have the same opcode, same parent), otherwise it is // definitely not profitable to try to vectorize them. 
auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) { + if (V->getValueOperand()->getType()->getTypeID() < + V2->getValueOperand()->getType()->getTypeID()) + return true; + if (V->getValueOperand()->getType()->getTypeID() > + V2->getValueOperand()->getType()->getTypeID()) + return false; if (V->getPointerOperandType()->getTypeID() < V2->getPointerOperandType()->getTypeID()) return true; @@ -15082,6 +16310,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) { if (V1 == V2) return true; + if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType()) + return false; if (V1->getPointerOperandType() != V2->getPointerOperandType()) return false; // Undefs are compatible with any other value. @@ -15113,8 +16343,13 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { if (!isValidElementType(Pair.second.front()->getValueOperand()->getType())) continue; + // Reverse stores to do bottom-to-top analysis. This is important if the + // values are stores to the same addresses several times, in this case need + // to follow the stores order (reversed to meet the memory dependecies). + SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(), + Pair.second.rend()); Changed |= tryToVectorizeSequence<StoreInst>( - Pair.second, StoreSorter, AreCompatibleStores, + ReversedStores, StoreSorter, AreCompatibleStores, [this, &R](ArrayRef<StoreInst *> Candidates, bool) { return vectorizeStores(Candidates, R); }, diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 1271d1424c03..7ff6749a0908 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -133,9 +133,12 @@ public: Ingredient2Recipe[I] = R; } + /// Create the mask for the vector loop header block. + void createHeaderMask(VPlan &Plan); + /// A helper function that computes the predicate of the block BB, assuming - /// that the header block of the loop is set to True. It returns the *entry* - /// mask for the block BB. + /// that the header block of the loop is set to True or the loop mask when + /// tail folding. It returns the *entry* mask for the block BB. VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan); /// A helper function that computes the predicate of the edge between SRC diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp index e81b88fd8099..1d7df9c9575a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -19,7 +19,6 @@ #include "VPlan.h" #include "VPlanCFG.h" #include "VPlanDominatorTree.h" -#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -234,6 +233,99 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { // set(Def, Extract, Instance); return Extract; } + +Value *VPTransformState::get(VPValue *Def, unsigned Part) { + // If Values have been set for this Def return the one relevant for \p Part. 
+ if (hasVectorValue(Def, Part)) + return Data.PerPartOutput[Def][Part]; + + auto GetBroadcastInstrs = [this, Def](Value *V) { + bool SafeToHoist = Def->isDefinedOutsideVectorRegions(); + if (VF.isScalar()) + return V; + // Place the code for broadcasting invariant variables in the new preheader. + IRBuilder<>::InsertPointGuard Guard(Builder); + if (SafeToHoist) { + BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>( + Plan->getVectorLoopRegion()->getSinglePredecessor())]; + if (LoopVectorPreHeader) + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + } + + // Place the code for broadcasting invariant variables in the new preheader. + // Broadcast the scalar into all locations in the vector. + Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); + + return Shuf; + }; + + if (!hasScalarValue(Def, {Part, 0})) { + assert(Def->isLiveIn() && "expected a live-in"); + if (Part != 0) + return get(Def, 0); + Value *IRV = Def->getLiveInIRValue(); + Value *B = GetBroadcastInstrs(IRV); + set(Def, B, Part); + return B; + } + + Value *ScalarValue = get(Def, {Part, 0}); + // If we aren't vectorizing, we can just copy the scalar map values over + // to the vector map. + if (VF.isScalar()) { + set(Def, ScalarValue, Part); + return ScalarValue; + } + + bool IsUniform = vputils::isUniformAfterVectorization(Def); + + unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; + // Check if there is a scalar value for the selected lane. + if (!hasScalarValue(Def, {Part, LastLane})) { + // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and + // VPExpandSCEVRecipes can also be uniform. + assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) || + isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) || + isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) && + "unexpected recipe found to be invariant"); + IsUniform = true; + LastLane = 0; + } + + auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); + // Set the insert point after the last scalarized instruction or after the + // last PHI, if LastInst is a PHI. This ensures the insertelement sequence + // will directly follow the scalar definitions. + auto OldIP = Builder.saveIP(); + auto NewIP = + isa<PHINode>(LastInst) + ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) + : std::next(BasicBlock::iterator(LastInst)); + Builder.SetInsertPoint(&*NewIP); + + // However, if we are vectorizing, we need to construct the vector values. + // If the value is known to be uniform after vectorization, we can just + // broadcast the scalar value corresponding to lane zero for each unroll + // iteration. Otherwise, we construct the vector values using + // insertelement instructions. Since the resulting vectors are stored in + // State, we will only generate the insertelements once. + Value *VectorValue = nullptr; + if (IsUniform) { + VectorValue = GetBroadcastInstrs(ScalarValue); + set(Def, VectorValue, Part); + } else { + // Initialize packing with insertelements to start from undef. 
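VPTransformState::get(Def, Part) above either splats a uniform scalar or packs per-lane scalars into a vector one insertelement at a time, starting from poison. The two paths, reduced to a self-contained sketch for fixed vectorization factors (LaneValues is a stand-in for the per-lane scalars held in State, not a real field):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Support/TypeSize.h"

  using namespace llvm;

  static Value *materializeVector(IRBuilder<> &B, ArrayRef<Value *> LaneValues,
                                  bool IsUniform, ElementCount VF) {
    if (IsUniform)
      return B.CreateVectorSplat(VF, LaneValues[0], "broadcast");
    // Non-uniform: insert each lane's scalar into an initially-poison vector.
    Value *Vec = PoisonValue::get(VectorType::get(LaneValues[0]->getType(), VF));
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      Vec = B.CreateInsertElement(Vec, LaneValues[Lane], Lane);
    return Vec;
  }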
+ assert(!VF.isScalable() && "VF is assumed to be non scalable."); + Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); + set(Def, Undef, Part); + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) + packScalarIntoVectorValue(Def, {Part, Lane}); + VectorValue = get(Def, Part); + } + Builder.restoreIP(OldIP); + return VectorValue; +} + BasicBlock *VPTransformState::CFGState::getPreheaderBBFor(VPRecipeBase *R) { VPRegionBlock *LoopRegion = R->getParent()->getEnclosingLoopRegion(); return VPBB2IRBB[LoopRegion->getPreheaderVPBB()]; @@ -267,18 +359,15 @@ void VPTransformState::addMetadata(ArrayRef<Value *> To, Instruction *From) { } } -void VPTransformState::setDebugLocFromInst(const Value *V) { - const Instruction *Inst = dyn_cast<Instruction>(V); - if (!Inst) { - Builder.SetCurrentDebugLocation(DebugLoc()); - return; - } - - const DILocation *DIL = Inst->getDebugLoc(); +void VPTransformState::setDebugLocFrom(DebugLoc DL) { + const DILocation *DIL = DL; // When a FSDiscriminator is enabled, we don't need to add the multiply // factors to the discriminators. - if (DIL && Inst->getFunction()->shouldEmitDebugInfoForProfiling() && - !Inst->isDebugOrPseudoInst() && !EnableFSDiscriminator) { + if (DIL && + Builder.GetInsertBlock() + ->getParent() + ->shouldEmitDebugInfoForProfiling() && + !EnableFSDiscriminator) { // FIXME: For scalable vectors, assume vscale=1. auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); @@ -291,6 +380,15 @@ void VPTransformState::setDebugLocFromInst(const Value *V) { Builder.SetCurrentDebugLocation(DIL); } +void VPTransformState::packScalarIntoVectorValue(VPValue *Def, + const VPIteration &Instance) { + Value *ScalarInst = get(Def, Instance); + Value *VectorValue = get(Def, Instance.Part); + VectorValue = Builder.CreateInsertElement( + VectorValue, ScalarInst, Instance.Lane.getAsRuntimeExpr(Builder, VF)); + set(Def, VectorValue, Instance.Part); +} + BasicBlock * VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks. @@ -616,22 +714,17 @@ VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE) { auto Plan = std::make_unique<VPlan>(Preheader, VecPreheader); Plan->TripCount = vputils::getOrCreateVPValueForSCEVExpr(*Plan, TripCount, SE); + // Create empty VPRegionBlock, to be filled during processing later. + auto *TopRegion = new VPRegionBlock("vector loop", false /*isReplicator*/); + VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader); + VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); + VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); return Plan; } -VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() { - VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); - for (VPRecipeBase &R : Header->phis()) { - if (isa<VPActiveLaneMaskPHIRecipe>(&R)) - return cast<VPActiveLaneMaskPHIRecipe>(&R); - } - return nullptr; -} - void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, Value *CanonicalIVStartValue, - VPTransformState &State, - bool IsEpilogueVectorization) { + VPTransformState &State) { // Check if the backedge taken count is needed, and if so build it. 
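The new VFxUF live-in is materialized with createStepForVF and models the number of scalar iterations retired per vector iteration. As a hedged sketch of how such a step value is typically built, a constant for fixed VF and a vscale-scaled product otherwise (an illustration only, not the helper's actual body):

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Support/TypeSize.h"

  using namespace llvm;

  static Value *createStep(IRBuilder<> &B, Type *Ty, ElementCount VF,
                           unsigned UF) {
    if (!VF.isScalable())
      return ConstantInt::get(Ty, (uint64_t)VF.getFixedValue() * UF);
    // Scalable VF: step = vscale * KnownMinVF * UF.
    Value *VScaled = B.CreateVScale(ConstantInt::get(Ty, VF.getKnownMinValue()));
    return B.CreateMul(VScaled, ConstantInt::get(Ty, UF));
  }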
if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); @@ -648,6 +741,12 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) State.set(&VectorTripCount, VectorTripCountV, Part); + IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); + // FIXME: Model VF * UF computation completely in VPlan. + State.set(&VFxUF, + createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF), + 0); + // When vectorizing the epilogue loop, the canonical induction start value // needs to be changed from zero to the value after the main vector loop. // FIXME: Improve modeling for canonical IV start values in the epilogue loop. @@ -656,16 +755,12 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, auto *IV = getCanonicalIV(); assert(all_of(IV->users(), [](const VPUser *U) { - if (isa<VPScalarIVStepsRecipe>(U) || - isa<VPDerivedIVRecipe>(U)) - return true; - auto *VPI = cast<VPInstruction>(U); - return VPI->getOpcode() == - VPInstruction::CanonicalIVIncrement || - VPI->getOpcode() == - VPInstruction::CanonicalIVIncrementNUW; + return isa<VPScalarIVStepsRecipe>(U) || + isa<VPDerivedIVRecipe>(U) || + cast<VPInstruction>(U)->getOpcode() == + Instruction::Add; }) && - "the canonical IV should only be used by its increments or " + "the canonical IV should only be used by its increment or " "ScalarIVSteps when resetting the start value"); IV->setOperand(0, VPV); } @@ -754,11 +849,14 @@ void VPlan::execute(VPTransformState *State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD -void VPlan::print(raw_ostream &O) const { +void VPlan::printLiveIns(raw_ostream &O) const { VPSlotTracker SlotTracker(this); - O << "VPlan '" << getName() << "' {"; + if (VFxUF.getNumUsers() > 0) { + O << "\nLive-in "; + VFxUF.printAsOperand(O, SlotTracker); + O << " = VF * UF"; + } if (VectorTripCount.getNumUsers() > 0) { O << "\nLive-in "; @@ -778,6 +876,15 @@ void VPlan::print(raw_ostream &O) const { TripCount->printAsOperand(O, SlotTracker); O << " = original trip-count"; O << "\n"; +} + +LLVM_DUMP_METHOD +void VPlan::print(raw_ostream &O) const { + VPSlotTracker SlotTracker(this); + + O << "VPlan '" << getName() << "' {"; + + printLiveIns(O); if (!getPreheader()->empty()) { O << "\n"; @@ -895,11 +1002,18 @@ void VPlanPrinter::dump() { OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan"; if (!Plan.getName().empty()) OS << "\\n" << DOT::EscapeString(Plan.getName()); - if (Plan.BackedgeTakenCount) { - OS << ", where:\\n"; - Plan.BackedgeTakenCount->print(OS, SlotTracker); - OS << " := BackedgeTakenCount"; + + { + // Print live-ins. 
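The new VPValue::replaceUsesWithIf mirrors the long-standing IR-level Value API of the same name, adding the operand index to the callback and an early-out when New is the value itself. For comparison, the IR-level form applied to a common case, rewriting only the uses that sit inside PHI nodes:

  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Value.h"

  using namespace llvm;

  // Replace only those uses of Old whose user is a PHI node.
  static void replacePhiUsesOnly(Value *Old, Value *New) {
    Old->replaceUsesWithIf(New, [](Use &U) {
      return isa<PHINode>(U.getUser());
    });
  }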
+ std::string Str; + raw_string_ostream SS(Str); + Plan.printLiveIns(SS); + SmallVector<StringRef, 0> Lines; + StringRef(Str).rtrim('\n').split(Lines, "\n"); + for (auto Line : Lines) + OS << DOT::EscapeString(Line.str()) << "\\n"; } + OS << "\"]\n"; OS << "node [shape=rect, fontname=Courier, fontsize=30]\n"; OS << "edge [fontname=Courier, fontsize=30]\n"; @@ -1021,16 +1135,43 @@ void VPlanIngredient::print(raw_ostream &O) const { template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT); void VPValue::replaceAllUsesWith(VPValue *New) { + if (this == New) + return; for (unsigned J = 0; J < getNumUsers();) { VPUser *User = Users[J]; - unsigned NumUsers = getNumUsers(); + bool RemovedUser = false; for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) - if (User->getOperand(I) == this) + if (User->getOperand(I) == this) { User->setOperand(I, New); + RemovedUser = true; + } // If a user got removed after updating the current user, the next user to // update will be moved to the current position, so we only need to // increment the index if the number of users did not change. - if (NumUsers == getNumUsers()) + if (!RemovedUser) + J++; + } +} + +void VPValue::replaceUsesWithIf( + VPValue *New, + llvm::function_ref<bool(VPUser &U, unsigned Idx)> ShouldReplace) { + if (this == New) + return; + for (unsigned J = 0; J < getNumUsers();) { + VPUser *User = Users[J]; + bool RemovedUser = false; + for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) { + if (User->getOperand(I) != this || !ShouldReplace(*User, I)) + continue; + + RemovedUser = true; + User->setOperand(I, New); + } + // If a user got removed after updating the current user, the next user to + // update will be moved to the current position, so we only need to + // increment the index if the number of users did not change. 
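The replaceUsesWithIf being added in this hunk mirrors replaceAllUsesWith but lets the caller decide, per operand use, whether to rewrite it. A minimal usage sketch, assuming placeholder names OldV, NewV and TargetRecipe that are not part of the patch:

  // Illustrative only: rewrite uses of OldV to NewV, but only in operands of
  // TargetRecipe; every other user keeps referencing OldV.
  OldV->replaceUsesWithIf(NewV, [&](llvm::VPUser &U, unsigned /*OpIdx*/) {
    return &U == TargetRecipe;
  });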
+ if (!RemovedUser) J++; } } @@ -1116,6 +1257,8 @@ void VPSlotTracker::assignSlot(const VPValue *V) { } void VPSlotTracker::assignSlots(const VPlan &Plan) { + if (Plan.VFxUF.getNumUsers() > 0) + assignSlot(&Plan.VFxUF); assignSlot(&Plan.VectorTripCount); if (Plan.BackedgeTakenCount) assignSlot(Plan.BackedgeTakenCount); @@ -1139,6 +1282,11 @@ bool vputils::onlyFirstLaneUsed(VPValue *Def) { [Def](VPUser *U) { return U->onlyFirstLaneUsed(Def); }); } +bool vputils::onlyFirstPartUsed(VPValue *Def) { + return all_of(Def->users(), + [Def](VPUser *U) { return U->onlyFirstPartUsed(Def); }); +} + VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE) { if (auto *Expanded = Plan.getSCEVExpansion(Expr)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index 73313465adea..94cb76889813 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -23,6 +23,7 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H +#include "VPlanAnalysis.h" #include "VPlanValue.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" @@ -233,9 +234,9 @@ struct VPIteration { struct VPTransformState { VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, - InnerLoopVectorizer *ILV, VPlan *Plan) + InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx) : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan), - LVer(nullptr) {} + LVer(nullptr), TypeAnalysis(Ctx) {} /// The chosen Vectorization and Unroll Factors of the loop being vectorized. ElementCount VF; @@ -274,10 +275,6 @@ struct VPTransformState { I->second[Part]; } - bool hasAnyVectorValue(VPValue *Def) const { - return Data.PerPartOutput.contains(Def); - } - bool hasScalarValue(VPValue *Def, VPIteration Instance) { auto I = Data.PerPartScalars.find(Def); if (I == Data.PerPartScalars.end()) @@ -349,8 +346,11 @@ struct VPTransformState { /// vector of instructions. void addMetadata(ArrayRef<Value *> To, Instruction *From); - /// Set the debug location in the builder using the debug location in \p V. - void setDebugLocFromInst(const Value *V); + /// Set the debug location in the builder using the debug location \p DL. + void setDebugLocFrom(DebugLoc DL); + + /// Construct the vector value of a scalarized value \p V one lane at a time. + void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance); /// Hold state information used when constructing the CFG of the output IR, /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. @@ -410,6 +410,9 @@ struct VPTransformState { /// Map SCEVs to their expanded values. Populated when executing /// VPExpandSCEVRecipes. DenseMap<const SCEV *, Value *> ExpandedSCEVs; + + /// VPlan-based type analysis. + VPTypeAnalysis TypeAnalysis; }; /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph. @@ -582,6 +585,8 @@ public: /// This VPBlockBase must have no successors. void setOneSuccessor(VPBlockBase *Successor) { assert(Successors.empty() && "Setting one successor when others exist."); + assert(Successor->getParent() == getParent() && + "connected blocks must have the same parent"); appendSuccessor(Successor); } @@ -693,7 +698,7 @@ public: }; /// VPRecipeBase is a base class modeling a sequence of one or more output IR -/// instructions. 
VPRecipeBase owns the the VPValues it defines through VPDef +/// instructions. VPRecipeBase owns the VPValues it defines through VPDef /// and is responsible for deleting its defined values. Single-value /// VPRecipeBases that also inherit from VPValue must make sure to inherit from /// VPRecipeBase before VPValue. @@ -706,13 +711,18 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>, /// Each VPRecipe belongs to a single VPBasicBlock. VPBasicBlock *Parent = nullptr; + /// The debug location for the recipe. + DebugLoc DL; + public: - VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands) - : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe) {} + VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands, + DebugLoc DL = {}) + : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {} template <typename IterT> - VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands) - : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe) {} + VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands, + DebugLoc DL = {}) + : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {} virtual ~VPRecipeBase() = default; /// \return the VPBasicBlock which this VPRecipe belongs to. @@ -789,6 +799,9 @@ public: bool mayReadOrWriteMemory() const { return mayReadFromMemory() || mayWriteToMemory(); } + + /// Returns the debug location of the recipe. + DebugLoc getDebugLoc() const { return DL; } }; // Helper macro to define common classof implementations for recipes. @@ -808,153 +821,30 @@ public: return R->getVPDefID() == VPDefID; \ } -/// This is a concrete Recipe that models a single VPlan-level instruction. -/// While as any Recipe it may generate a sequence of IR instructions when -/// executed, these instructions would always form a single-def expression as -/// the VPInstruction is also a single def-use vertex. -class VPInstruction : public VPRecipeBase, public VPValue { - friend class VPlanSlp; - -public: - /// VPlan opcodes, extending LLVM IR with idiomatics instructions. - enum { - FirstOrderRecurrenceSplice = - Instruction::OtherOpsEnd + 1, // Combines the incoming and previous - // values of a first-order recurrence. - Not, - ICmpULE, - SLPLoad, - SLPStore, - ActiveLaneMask, - CalculateTripCountMinusVF, - CanonicalIVIncrement, - CanonicalIVIncrementNUW, - // The next two are similar to the above, but instead increment the - // canonical IV separately for each unrolled part. - CanonicalIVIncrementForPart, - CanonicalIVIncrementForPartNUW, - BranchOnCount, - BranchOnCond - }; - -private: - typedef unsigned char OpcodeTy; - OpcodeTy Opcode; - FastMathFlags FMF; - DebugLoc DL; - - /// An optional name that can be used for the generated IR instruction. - const std::string Name; - - /// Utility method serving execute(): generates a single instance of the - /// modeled instruction. \returns the generated value for \p Part. - /// In some cases an existing value is returned rather than a generated - /// one. 
- Value *generateInstruction(VPTransformState &State, unsigned Part); - -protected: - void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } - -public: - VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL, - const Twine &Name = "") - : VPRecipeBase(VPDef::VPInstructionSC, Operands), VPValue(this), - Opcode(Opcode), DL(DL), Name(Name.str()) {} - - VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, - DebugLoc DL = {}, const Twine &Name = "") - : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {} - - VP_CLASSOF_IMPL(VPDef::VPInstructionSC) - - VPInstruction *clone() const { - SmallVector<VPValue *, 2> Operands(operands()); - return new VPInstruction(Opcode, Operands, DL, Name); - } - - unsigned getOpcode() const { return Opcode; } - - /// Generate the instruction. - /// TODO: We currently execute only per-part unless a specific instance is - /// provided. - void execute(VPTransformState &State) override; - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the VPInstruction to \p O. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; - - /// Print the VPInstruction to dbgs() (for debugging). - LLVM_DUMP_METHOD void dump() const; -#endif - - /// Return true if this instruction may modify memory. - bool mayWriteToMemory() const { - // TODO: we can use attributes of the called function to rule out memory - // modifications. - return Opcode == Instruction::Store || Opcode == Instruction::Call || - Opcode == Instruction::Invoke || Opcode == SLPStore; - } - - bool hasResult() const { - // CallInst may or may not have a result, depending on the called function. - // Conservatively return calls have results for now. - switch (getOpcode()) { - case Instruction::Ret: - case Instruction::Br: - case Instruction::Store: - case Instruction::Switch: - case Instruction::IndirectBr: - case Instruction::Resume: - case Instruction::CatchRet: - case Instruction::Unreachable: - case Instruction::Fence: - case Instruction::AtomicRMW: - case VPInstruction::BranchOnCond: - case VPInstruction::BranchOnCount: - return false; - default: - return true; - } - } - - /// Set the fast-math flags. - void setFastMathFlags(FastMathFlags FMFNew); - - /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { - assert(is_contained(operands(), Op) && - "Op must be an operand of the recipe"); - if (getOperand(0) != Op) - return false; - switch (getOpcode()) { - default: - return false; - case VPInstruction::ActiveLaneMask: - case VPInstruction::CalculateTripCountMinusVF: - case VPInstruction::CanonicalIVIncrement: - case VPInstruction::CanonicalIVIncrementNUW: - case VPInstruction::CanonicalIVIncrementForPart: - case VPInstruction::CanonicalIVIncrementForPartNUW: - case VPInstruction::BranchOnCount: - return true; - }; - llvm_unreachable("switch should return"); - } -}; - /// Class to record LLVM IR flag for a recipe along with it. 
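The VPRecipeWithIRFlags class that follows records wrapping, exactness, inbounds, non-negative, disjoint and fast-math flags from the original IR instruction so they can be re-applied to the instructions the recipe generates. A free-standing sketch of that capture-and-reapply pattern, using only the LLVM operator classes the patch itself queries; copyIRFlags is an illustrative helper, not part of the patch, and it assumes New has the same opcode as Orig:

  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Operator.h"

  // Illustrative only: copy the flag kinds handled by VPRecipeWithIRFlags
  // from an original instruction onto a newly generated one.
  static void copyIRFlags(const llvm::Instruction &Orig, llvm::Instruction &New) {
    using namespace llvm;
    if (auto *OB = dyn_cast<OverflowingBinaryOperator>(&Orig)) {
      New.setHasNoUnsignedWrap(OB->hasNoUnsignedWrap());
      New.setHasNoSignedWrap(OB->hasNoSignedWrap());
    } else if (auto *EO = dyn_cast<PossiblyExactOperator>(&Orig)) {
      New.setIsExact(EO->isExact());
    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&Orig)) {
      // Assumes New is also a GEP of the same shape.
      cast<GetElementPtrInst>(&New)->setIsInBounds(GEP->isInBounds());
    } else if (auto *FP = dyn_cast<FPMathOperator>(&Orig)) {
      New.setFastMathFlags(FP->getFastMathFlags());
    }
  }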
class VPRecipeWithIRFlags : public VPRecipeBase { enum class OperationType : unsigned char { + Cmp, OverflowingBinOp, + DisjointOp, PossiblyExactOp, GEPOp, FPMathOp, + NonNegOp, Other }; + +public: struct WrapFlagsTy { char HasNUW : 1; char HasNSW : 1; + + WrapFlagsTy(bool HasNUW, bool HasNSW) : HasNUW(HasNUW), HasNSW(HasNSW) {} + }; + +private: + struct DisjointFlagsTy { + char IsDisjoint : 1; }; struct ExactFlagsTy { char IsExact : 1; @@ -962,6 +852,9 @@ class VPRecipeWithIRFlags : public VPRecipeBase { struct GEPFlagsTy { char IsInBounds : 1; }; + struct NonNegFlagsTy { + char NonNeg : 1; + }; struct FastMathFlagsTy { char AllowReassoc : 1; char NoNaNs : 1; @@ -970,56 +863,81 @@ class VPRecipeWithIRFlags : public VPRecipeBase { char AllowReciprocal : 1; char AllowContract : 1; char ApproxFunc : 1; + + FastMathFlagsTy(const FastMathFlags &FMF); }; OperationType OpType; union { + CmpInst::Predicate CmpPredicate; WrapFlagsTy WrapFlags; + DisjointFlagsTy DisjointFlags; ExactFlagsTy ExactFlags; GEPFlagsTy GEPFlags; + NonNegFlagsTy NonNegFlags; FastMathFlagsTy FMFs; - unsigned char AllFlags; + unsigned AllFlags; }; public: template <typename IterT> - VPRecipeWithIRFlags(const unsigned char SC, iterator_range<IterT> Operands) - : VPRecipeBase(SC, Operands) { + VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, DebugLoc DL = {}) + : VPRecipeBase(SC, Operands, DL) { OpType = OperationType::Other; AllFlags = 0; } template <typename IterT> - VPRecipeWithIRFlags(const unsigned char SC, iterator_range<IterT> Operands, - Instruction &I) - : VPRecipeWithIRFlags(SC, Operands) { - if (auto *Op = dyn_cast<OverflowingBinaryOperator>(&I)) { + VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, Instruction &I) + : VPRecipeWithIRFlags(SC, Operands, I.getDebugLoc()) { + if (auto *Op = dyn_cast<CmpInst>(&I)) { + OpType = OperationType::Cmp; + CmpPredicate = Op->getPredicate(); + } else if (auto *Op = dyn_cast<PossiblyDisjointInst>(&I)) { + OpType = OperationType::DisjointOp; + DisjointFlags.IsDisjoint = Op->isDisjoint(); + } else if (auto *Op = dyn_cast<OverflowingBinaryOperator>(&I)) { OpType = OperationType::OverflowingBinOp; - WrapFlags.HasNUW = Op->hasNoUnsignedWrap(); - WrapFlags.HasNSW = Op->hasNoSignedWrap(); + WrapFlags = {Op->hasNoUnsignedWrap(), Op->hasNoSignedWrap()}; } else if (auto *Op = dyn_cast<PossiblyExactOperator>(&I)) { OpType = OperationType::PossiblyExactOp; ExactFlags.IsExact = Op->isExact(); } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { OpType = OperationType::GEPOp; GEPFlags.IsInBounds = GEP->isInBounds(); + } else if (auto *PNNI = dyn_cast<PossiblyNonNegInst>(&I)) { + OpType = OperationType::NonNegOp; + NonNegFlags.NonNeg = PNNI->hasNonNeg(); } else if (auto *Op = dyn_cast<FPMathOperator>(&I)) { OpType = OperationType::FPMathOp; - FastMathFlags FMF = Op->getFastMathFlags(); - FMFs.AllowReassoc = FMF.allowReassoc(); - FMFs.NoNaNs = FMF.noNaNs(); - FMFs.NoInfs = FMF.noInfs(); - FMFs.NoSignedZeros = FMF.noSignedZeros(); - FMFs.AllowReciprocal = FMF.allowReciprocal(); - FMFs.AllowContract = FMF.allowContract(); - FMFs.ApproxFunc = FMF.approxFunc(); + FMFs = Op->getFastMathFlags(); } } + template <typename IterT> + VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, + CmpInst::Predicate Pred, DebugLoc DL = {}) + : VPRecipeBase(SC, Operands, DL), OpType(OperationType::Cmp), + CmpPredicate(Pred) {} + + template <typename IterT> + VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, + WrapFlagsTy WrapFlags, DebugLoc DL = {}) + : VPRecipeBase(SC, 
Operands, DL), OpType(OperationType::OverflowingBinOp), + WrapFlags(WrapFlags) {} + + template <typename IterT> + VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, + FastMathFlags FMFs, DebugLoc DL = {}) + : VPRecipeBase(SC, Operands, DL), OpType(OperationType::FPMathOp), + FMFs(FMFs) {} + static inline bool classof(const VPRecipeBase *R) { - return R->getVPDefID() == VPRecipeBase::VPWidenSC || + return R->getVPDefID() == VPRecipeBase::VPInstructionSC || + R->getVPDefID() == VPRecipeBase::VPWidenSC || R->getVPDefID() == VPRecipeBase::VPWidenGEPSC || + R->getVPDefID() == VPRecipeBase::VPWidenCastSC || R->getVPDefID() == VPRecipeBase::VPReplicateSC; } @@ -1032,6 +950,9 @@ public: WrapFlags.HasNUW = false; WrapFlags.HasNSW = false; break; + case OperationType::DisjointOp: + DisjointFlags.IsDisjoint = false; + break; case OperationType::PossiblyExactOp: ExactFlags.IsExact = false; break; @@ -1042,6 +963,10 @@ public: FMFs.NoNaNs = false; FMFs.NoInfs = false; break; + case OperationType::NonNegOp: + NonNegFlags.NonNeg = false; + break; + case OperationType::Cmp: case OperationType::Other: break; } @@ -1054,6 +979,9 @@ public: I->setHasNoUnsignedWrap(WrapFlags.HasNUW); I->setHasNoSignedWrap(WrapFlags.HasNSW); break; + case OperationType::DisjointOp: + cast<PossiblyDisjointInst>(I)->setIsDisjoint(DisjointFlags.IsDisjoint); + break; case OperationType::PossiblyExactOp: I->setIsExact(ExactFlags.IsExact); break; @@ -1069,43 +997,209 @@ public: I->setHasAllowContract(FMFs.AllowContract); I->setHasApproxFunc(FMFs.ApproxFunc); break; + case OperationType::NonNegOp: + I->setNonNeg(NonNegFlags.NonNeg); + break; + case OperationType::Cmp: case OperationType::Other: break; } } + CmpInst::Predicate getPredicate() const { + assert(OpType == OperationType::Cmp && + "recipe doesn't have a compare predicate"); + return CmpPredicate; + } + bool isInBounds() const { assert(OpType == OperationType::GEPOp && "recipe doesn't have inbounds flag"); return GEPFlags.IsInBounds; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - FastMathFlags getFastMathFlags() const { - FastMathFlags Res; - Res.setAllowReassoc(FMFs.AllowReassoc); - Res.setNoNaNs(FMFs.NoNaNs); - Res.setNoInfs(FMFs.NoInfs); - Res.setNoSignedZeros(FMFs.NoSignedZeros); - Res.setAllowReciprocal(FMFs.AllowReciprocal); - Res.setAllowContract(FMFs.AllowContract); - Res.setApproxFunc(FMFs.ApproxFunc); - return Res; + /// Returns true if the recipe has fast-math flags. + bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; } + + FastMathFlags getFastMathFlags() const; + + bool hasNoUnsignedWrap() const { + assert(OpType == OperationType::OverflowingBinOp && + "recipe doesn't have a NUW flag"); + return WrapFlags.HasNUW; } + bool hasNoSignedWrap() const { + assert(OpType == OperationType::OverflowingBinOp && + "recipe doesn't have a NSW flag"); + return WrapFlags.HasNSW; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void printFlags(raw_ostream &O) const; #endif }; +/// This is a concrete Recipe that models a single VPlan-level instruction. +/// While as any Recipe it may generate a sequence of IR instructions when +/// executed, these instructions would always form a single-def expression as +/// the VPInstruction is also a single def-use vertex. +class VPInstruction : public VPRecipeWithIRFlags, public VPValue { + friend class VPlanSlp; + +public: + /// VPlan opcodes, extending LLVM IR with idiomatics instructions. 
+ enum { + FirstOrderRecurrenceSplice = + Instruction::OtherOpsEnd + 1, // Combines the incoming and previous + // values of a first-order recurrence. + Not, + SLPLoad, + SLPStore, + ActiveLaneMask, + CalculateTripCountMinusVF, + // Increment the canonical IV separately for each unrolled part. + CanonicalIVIncrementForPart, + BranchOnCount, + BranchOnCond + }; + +private: + typedef unsigned char OpcodeTy; + OpcodeTy Opcode; + + /// An optional name that can be used for the generated IR instruction. + const std::string Name; + + /// Utility method serving execute(): generates a single instance of the + /// modeled instruction. \returns the generated value for \p Part. + /// In some cases an existing value is returned rather than a generated + /// one. + Value *generateInstruction(VPTransformState &State, unsigned Part); + +#if !defined(NDEBUG) + /// Return true if the VPInstruction is a floating point math operation, i.e. + /// has fast-math flags. + bool isFPMathOp() const; +#endif + +protected: + void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } + +public: + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL, + const Twine &Name = "") + : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL), + VPValue(this), Opcode(Opcode), Name(Name.str()) {} + + VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, + DebugLoc DL = {}, const Twine &Name = "") + : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {} + + VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, VPValue *A, + VPValue *B, DebugLoc DL = {}, const Twine &Name = ""); + + VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, + WrapFlagsTy WrapFlags, DebugLoc DL = {}, const Twine &Name = "") + : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, WrapFlags, DL), + VPValue(this), Opcode(Opcode), Name(Name.str()) {} + + VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, + FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = ""); + + VP_CLASSOF_IMPL(VPDef::VPInstructionSC) + + unsigned getOpcode() const { return Opcode; } + + /// Generate the instruction. + /// TODO: We currently execute only per-part unless a specific instance is + /// provided. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the VPInstruction to \p O. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; + + /// Print the VPInstruction to dbgs() (for debugging). + LLVM_DUMP_METHOD void dump() const; +#endif + + /// Return true if this instruction may modify memory. + bool mayWriteToMemory() const { + // TODO: we can use attributes of the called function to rule out memory + // modifications. + return Opcode == Instruction::Store || Opcode == Instruction::Call || + Opcode == Instruction::Invoke || Opcode == SLPStore; + } + + bool hasResult() const { + // CallInst may or may not have a result, depending on the called function. + // Conservatively return calls have results for now. 
+ switch (getOpcode()) { + case Instruction::Ret: + case Instruction::Br: + case Instruction::Store: + case Instruction::Switch: + case Instruction::IndirectBr: + case Instruction::Resume: + case Instruction::CatchRet: + case Instruction::Unreachable: + case Instruction::Fence: + case Instruction::AtomicRMW: + case VPInstruction::BranchOnCond: + case VPInstruction::BranchOnCount: + return false; + default: + return true; + } + } + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + if (getOperand(0) != Op) + return false; + switch (getOpcode()) { + default: + return false; + case VPInstruction::ActiveLaneMask: + case VPInstruction::CalculateTripCountMinusVF: + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::BranchOnCount: + return true; + }; + llvm_unreachable("switch should return"); + } + + /// Returns true if the recipe only uses the first part of operand \p Op. + bool onlyFirstPartUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + if (getOperand(0) != Op) + return false; + switch (getOpcode()) { + default: + return false; + case VPInstruction::BranchOnCount: + return true; + }; + llvm_unreachable("switch should return"); + } +}; + /// VPWidenRecipe is a recipe for producing a copy of vector type its /// ingredient. This recipe covers most of the traditional vectorization cases /// where each ingredient transforms into a vectorized version of itself. class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue { + unsigned Opcode; public: template <typename IterT> VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands) - : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPValue(this, &I) {} + : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPValue(this, &I), + Opcode(I.getOpcode()) {} ~VPWidenRecipe() override = default; @@ -1114,6 +1208,8 @@ public: /// Produce widened copies of all Ingredients. void execute(VPTransformState &State) override; + unsigned getOpcode() const { return Opcode; } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, @@ -1122,7 +1218,7 @@ public: }; /// VPWidenCastRecipe is a recipe to create vector cast instructions. -class VPWidenCastRecipe : public VPRecipeBase, public VPValue { +class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPValue { /// Cast instruction opcode. 
Instruction::CastOps Opcode; @@ -1131,15 +1227,19 @@ class VPWidenCastRecipe : public VPRecipeBase, public VPValue { public: VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, - CastInst *UI = nullptr) - : VPRecipeBase(VPDef::VPWidenCastSC, Op), VPValue(this, UI), + CastInst &UI) + : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), VPValue(this, &UI), Opcode(Opcode), ResultTy(ResultTy) { - assert((!UI || UI->getOpcode() == Opcode) && + assert(UI.getOpcode() == Opcode && "opcode of underlying cast doesn't match"); - assert((!UI || UI->getType() == ResultTy) && + assert(UI.getType() == ResultTy && "result type of underlying cast doesn't match"); } + VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy) + : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), VPValue(this, nullptr), + Opcode(Opcode), ResultTy(ResultTy) {} + ~VPWidenCastRecipe() override = default; VP_CLASSOF_IMPL(VPDef::VPWidenCastSC) @@ -1196,7 +1296,8 @@ public: struct VPWidenSelectRecipe : public VPRecipeBase, public VPValue { template <typename IterT> VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands) - : VPRecipeBase(VPDef::VPWidenSelectSC, Operands), VPValue(this, &I) {} + : VPRecipeBase(VPDef::VPWidenSelectSC, Operands, I.getDebugLoc()), + VPValue(this, &I) {} ~VPWidenSelectRecipe() override = default; @@ -1282,8 +1383,8 @@ public: class VPHeaderPHIRecipe : public VPRecipeBase, public VPValue { protected: VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr, - VPValue *Start = nullptr) - : VPRecipeBase(VPDefID, {}), VPValue(this, UnderlyingInstr) { + VPValue *Start = nullptr, DebugLoc DL = {}) + : VPRecipeBase(VPDefID, {}, DL), VPValue(this, UnderlyingInstr) { if (Start) addOperand(Start); } @@ -1404,7 +1505,7 @@ public: bool isCanonical() const; /// Returns the scalar type of the induction. - const Type *getScalarType() const { + Type *getScalarType() const { return Trunc ? Trunc->getType() : IV->getType(); } }; @@ -1565,14 +1666,13 @@ public: /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. class VPBlendRecipe : public VPRecipeBase, public VPValue { - PHINode *Phi; - public: /// The blend operation is a User of the incoming values and of their /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value /// might be incoming with a full mask for which there is no VPValue. VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands) - : VPRecipeBase(VPDef::VPBlendSC, Operands), VPValue(this, Phi), Phi(Phi) { + : VPRecipeBase(VPDef::VPBlendSC, Operands, Phi->getDebugLoc()), + VPValue(this, Phi) { assert(Operands.size() > 0 && ((Operands.size() == 1) || (Operands.size() % 2 == 0)) && "Expected either a single incoming value or a positive even number " @@ -1701,16 +1801,13 @@ public: /// The Operands are {ChainOp, VecOp, [Condition]}. class VPReductionRecipe : public VPRecipeBase, public VPValue { /// The recurrence decriptor for the reduction in question. 
- const RecurrenceDescriptor *RdxDesc; - /// Pointer to the TTI, needed to create the target reduction - const TargetTransformInfo *TTI; + const RecurrenceDescriptor &RdxDesc; public: - VPReductionRecipe(const RecurrenceDescriptor *R, Instruction *I, - VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - const TargetTransformInfo *TTI) + VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I, + VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp) : VPRecipeBase(VPDef::VPReductionSC, {ChainOp, VecOp}), VPValue(this, I), - RdxDesc(R), TTI(TTI) { + RdxDesc(R) { if (CondOp) addOperand(CondOp); } @@ -2008,11 +2105,9 @@ public: /// loop). VPWidenCanonicalIVRecipe represents the vector version of the /// canonical induction variable. class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe { - DebugLoc DL; - public: VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL) - : VPHeaderPHIRecipe(VPDef::VPCanonicalIVPHISC, nullptr, StartV), DL(DL) {} + : VPHeaderPHIRecipe(VPDef::VPCanonicalIVPHISC, nullptr, StartV, DL) {} ~VPCanonicalIVPHIRecipe() override = default; @@ -2032,8 +2127,8 @@ public: #endif /// Returns the scalar type of the induction. - const Type *getScalarType() const { - return getOperand(0)->getLiveInIRValue()->getType(); + Type *getScalarType() const { + return getStartValue()->getLiveInIRValue()->getType(); } /// Returns true if the recipe only uses the first lane of operand \p Op. @@ -2043,6 +2138,13 @@ public: return true; } + /// Returns true if the recipe only uses the first part of operand \p Op. + bool onlyFirstPartUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } + /// Check if the induction described by \p Kind, /p Start and \p Step is /// canonical, i.e. has the same start, step (of 1), and type as the /// canonical IV. @@ -2055,12 +2157,10 @@ public: /// TODO: It would be good to use the existing VPWidenPHIRecipe instead and /// remove VPActiveLaneMaskPHIRecipe. class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { - DebugLoc DL; - public: VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL) - : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask), - DL(DL) {} + : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask, + DL) {} ~VPActiveLaneMaskPHIRecipe() override = default; @@ -2113,19 +2213,24 @@ public: /// an IV with different start and step values, using Start + CanonicalIV * /// Step. class VPDerivedIVRecipe : public VPRecipeBase, public VPValue { - /// The type of the result value. It may be smaller than the type of the - /// induction and in this case it will get truncated to ResultTy. - Type *ResultTy; + /// If not nullptr, the result of the induction will get truncated to + /// TruncResultTy. + Type *TruncResultTy; - /// Induction descriptor for the induction the canonical IV is transformed to. - const InductionDescriptor &IndDesc; + /// Kind of the induction. + const InductionDescriptor::InductionKind Kind; + /// If not nullptr, the floating point induction binary operator. Must be set + /// for floating point inductions. 
+ const FPMathOperator *FPBinOp; public: VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start, VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step, - Type *ResultTy) + Type *TruncResultTy) : VPRecipeBase(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}), - VPValue(this), ResultTy(ResultTy), IndDesc(IndDesc) {} + VPValue(this), TruncResultTy(TruncResultTy), Kind(IndDesc.getKind()), + FPBinOp(dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())) { + } ~VPDerivedIVRecipe() override = default; @@ -2141,6 +2246,11 @@ public: VPSlotTracker &SlotTracker) const override; #endif + Type *getScalarType() const { + return TruncResultTy ? TruncResultTy + : getStartValue()->getLiveInIRValue()->getType(); + } + VPValue *getStartValue() const { return getOperand(0); } VPValue *getCanonicalIV() const { return getOperand(1); } VPValue *getStepValue() const { return getOperand(2); } @@ -2155,14 +2265,23 @@ public: /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their scalar values. -class VPScalarIVStepsRecipe : public VPRecipeBase, public VPValue { - const InductionDescriptor &IndDesc; +class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags, public VPValue { + Instruction::BinaryOps InductionOpcode; public: + VPScalarIVStepsRecipe(VPValue *IV, VPValue *Step, + Instruction::BinaryOps Opcode, FastMathFlags FMFs) + : VPRecipeWithIRFlags(VPDef::VPScalarIVStepsSC, + ArrayRef<VPValue *>({IV, Step}), FMFs), + VPValue(this), InductionOpcode(Opcode) {} + VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV, VPValue *Step) - : VPRecipeBase(VPDef::VPScalarIVStepsSC, {IV, Step}), VPValue(this), - IndDesc(IndDesc) {} + : VPScalarIVStepsRecipe( + IV, Step, IndDesc.getInductionOpcode(), + dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()) + ? IndDesc.getInductionBinOp()->getFastMathFlags() + : FastMathFlags()) {} ~VPScalarIVStepsRecipe() override = default; @@ -2445,6 +2564,9 @@ class VPlan { /// Represents the vector trip count. VPValue VectorTripCount; + /// Represents the loop-invariant VF * UF of the vector loop region. + VPValue VFxUF; + /// Holds a mapping between Values and their corresponding VPValue inside /// VPlan. Value2VPValueTy Value2VPValue; @@ -2490,15 +2612,17 @@ public: ~VPlan(); - /// Create an initial VPlan with preheader and entry blocks. Creates a - /// VPExpandSCEVRecipe for \p TripCount and uses it as plan's trip count. + /// Create initial VPlan skeleton, having an "entry" VPBasicBlock (wrapping + /// original scalar pre-header) which contains SCEV expansions that need to + /// happen before the CFG is modified; a VPBasicBlock for the vector + /// pre-header, followed by a region for the vector loop, followed by the + /// middle VPBasicBlock. static VPlanPtr createInitialVPlan(const SCEV *TripCount, ScalarEvolution &PSE); /// Prepare the plan for execution, setting up the required live-in values. void prepareToExecute(Value *TripCount, Value *VectorTripCount, - Value *CanonicalIVStartValue, VPTransformState &State, - bool IsEpilogueVectorization); + Value *CanonicalIVStartValue, VPTransformState &State); /// Generate the IR code for this VPlan. void execute(VPTransformState *State); @@ -2522,6 +2646,9 @@ public: /// The vector trip count. VPValue &getVectorTripCount() { return VectorTripCount; } + /// Returns VF * UF of the vector loop region. + VPValue &getVFxUF() { return VFxUF; } + /// Mark the plan to indicate that using Value2VPValue is not safe any /// longer, because it may be stale. 
void disableValue2VPValue() { Value2VPValueEnabled = false; } @@ -2583,13 +2710,10 @@ public: return getVPValue(V); } - void removeVPValueFor(Value *V) { - assert(Value2VPValueEnabled && - "IR value to VPValue mapping may be out of date!"); - Value2VPValue.erase(V); - } - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the live-ins of this VPlan to \p O. + void printLiveIns(raw_ostream &O) const; + /// Print this VPlan to \p O. void print(raw_ostream &O) const; @@ -2628,10 +2752,6 @@ public: return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin()); } - /// Find and return the VPActiveLaneMaskPHIRecipe from the header - there - /// be only one at most. If there isn't one, then return nullptr. - VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi(); - void addLiveOut(PHINode *PN, VPValue *V); void removeLiveOut(PHINode *PN) { @@ -2959,6 +3079,9 @@ namespace vputils { /// Returns true if only the first lane of \p Def is used. bool onlyFirstLaneUsed(VPValue *Def); +/// Returns true if only the first part of \p Def is used. +bool onlyFirstPartUsed(VPValue *Def); + /// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p /// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in /// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp new file mode 100644 index 000000000000..97a8a1803bbf --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -0,0 +1,237 @@ +//===- VPlanAnalysis.cpp - Various Analyses working on VPlan ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "VPlanAnalysis.h" +#include "VPlan.h" +#include "llvm/ADT/TypeSwitch.h" + +using namespace llvm; + +#define DEBUG_TYPE "vplan" + +Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPBlendRecipe *R) { + Type *ResTy = inferScalarType(R->getIncomingValue(0)); + for (unsigned I = 1, E = R->getNumIncomingValues(); I != E; ++I) { + VPValue *Inc = R->getIncomingValue(I); + assert(inferScalarType(Inc) == ResTy && + "different types inferred for different incoming values"); + CachedTypes[Inc] = ResTy; + } + return ResTy; +} + +Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { + switch (R->getOpcode()) { + case Instruction::Select: { + Type *ResTy = inferScalarType(R->getOperand(1)); + VPValue *OtherV = R->getOperand(2); + assert(inferScalarType(OtherV) == ResTy && + "different types inferred for different operands"); + CachedTypes[OtherV] = ResTy; + return ResTy; + } + case VPInstruction::FirstOrderRecurrenceSplice: { + Type *ResTy = inferScalarType(R->getOperand(0)); + VPValue *OtherV = R->getOperand(1); + assert(inferScalarType(OtherV) == ResTy && + "different types inferred for different operands"); + CachedTypes[OtherV] = ResTy; + return ResTy; + } + default: + break; + } + // Type inference not implemented for opcode. 
+ LLVM_DEBUG({ + dbgs() << "LV: Found unhandled opcode for: "; + R->getVPSingleValue()->dump(); + }); + llvm_unreachable("Unhandled opcode!"); +} + +Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) { + unsigned Opcode = R->getOpcode(); + switch (Opcode) { + case Instruction::ICmp: + case Instruction::FCmp: + return IntegerType::get(Ctx, 1); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + Type *ResTy = inferScalarType(R->getOperand(0)); + assert(ResTy == inferScalarType(R->getOperand(1)) && + "types for both operands must match for binary op"); + CachedTypes[R->getOperand(1)] = ResTy; + return ResTy; + } + case Instruction::FNeg: + case Instruction::Freeze: + return inferScalarType(R->getOperand(0)); + default: + break; + } + + // Type inference not implemented for opcode. + LLVM_DEBUG({ + dbgs() << "LV: Found unhandled opcode for: "; + R->getVPSingleValue()->dump(); + }); + llvm_unreachable("Unhandled opcode!"); +} + +Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) { + auto &CI = *cast<CallInst>(R->getUnderlyingInstr()); + return CI.getType(); +} + +Type *VPTypeAnalysis::inferScalarTypeForRecipe( + const VPWidenMemoryInstructionRecipe *R) { + assert(!R->isStore() && "Store recipes should not define any values"); + return cast<LoadInst>(&R->getIngredient())->getType(); +} + +Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenSelectRecipe *R) { + Type *ResTy = inferScalarType(R->getOperand(1)); + VPValue *OtherV = R->getOperand(2); + assert(inferScalarType(OtherV) == ResTy && + "different types inferred for different operands"); + CachedTypes[OtherV] = ResTy; + return ResTy; +} + +Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) { + switch (R->getUnderlyingInstr()->getOpcode()) { + case Instruction::Call: { + unsigned CallIdx = R->getNumOperands() - (R->isPredicated() ? 
2 : 1); + return cast<Function>(R->getOperand(CallIdx)->getLiveInIRValue()) + ->getReturnType(); + } + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + Type *ResTy = inferScalarType(R->getOperand(0)); + assert(ResTy == inferScalarType(R->getOperand(1)) && + "inferred types for operands of binary op don't match"); + CachedTypes[R->getOperand(1)] = ResTy; + return ResTy; + } + case Instruction::Select: { + Type *ResTy = inferScalarType(R->getOperand(1)); + assert(ResTy == inferScalarType(R->getOperand(2)) && + "inferred types for operands of select op don't match"); + CachedTypes[R->getOperand(2)] = ResTy; + return ResTy; + } + case Instruction::ICmp: + case Instruction::FCmp: + return IntegerType::get(Ctx, 1); + case Instruction::Alloca: + case Instruction::BitCast: + case Instruction::Trunc: + case Instruction::SExt: + case Instruction::ZExt: + case Instruction::FPExt: + case Instruction::FPTrunc: + case Instruction::ExtractValue: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::FPToSI: + case Instruction::FPToUI: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + return R->getUnderlyingInstr()->getType(); + case Instruction::Freeze: + case Instruction::FNeg: + case Instruction::GetElementPtr: + return inferScalarType(R->getOperand(0)); + case Instruction::Load: + return cast<LoadInst>(R->getUnderlyingInstr())->getType(); + case Instruction::Store: + // FIXME: VPReplicateRecipes with store opcodes still define a result + // VPValue, so we need to handle them here. Remove the code here once this + // is modeled accurately in VPlan. + return Type::getVoidTy(Ctx); + default: + break; + } + // Type inference not implemented for opcode. + LLVM_DEBUG({ + dbgs() << "LV: Found unhandled opcode for: "; + R->getVPSingleValue()->dump(); + }); + llvm_unreachable("Unhandled opcode"); +} + +Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { + if (Type *CachedTy = CachedTypes.lookup(V)) + return CachedTy; + + if (V->isLiveIn()) + return V->getLiveInIRValue()->getType(); + + Type *ResultTy = + TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe()) + .Case<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe, + VPReductionPHIRecipe, VPWidenPointerInductionRecipe>( + [this](const auto *R) { + // Handle header phi recipes, except VPWienIntOrFpInduction + // which needs special handling due it being possibly truncated. + // TODO: consider inferring/caching type of siblings, e.g., + // backedge value, here and in cases below. 
+ return inferScalarType(R->getStartValue()); + }) + .Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>( + [](const auto *R) { return R->getScalarType(); }) + .Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe, + VPWidenGEPRecipe>([this](const VPRecipeBase *R) { + return inferScalarType(R->getOperand(0)); + }) + .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe, + VPWidenCallRecipe, VPWidenMemoryInstructionRecipe, + VPWidenSelectRecipe>( + [this](const auto *R) { return inferScalarTypeForRecipe(R); }) + .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) { + // TODO: Use info from interleave group. + return V->getUnderlyingValue()->getType(); + }) + .Case<VPWidenCastRecipe>( + [](const VPWidenCastRecipe *R) { return R->getResultType(); }); + assert(ResultTy && "could not infer type for the given VPValue"); + CachedTypes[V] = ResultTy; + return ResultTy; +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h new file mode 100644 index 000000000000..7276641551ae --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h @@ -0,0 +1,61 @@ +//===- VPlanAnalysis.h - Various Analyses working on VPlan ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H + +#include "llvm/ADT/DenseMap.h" + +namespace llvm { + +class LLVMContext; +class VPValue; +class VPBlendRecipe; +class VPInstruction; +class VPWidenRecipe; +class VPWidenCallRecipe; +class VPWidenIntOrFpInductionRecipe; +class VPWidenMemoryInstructionRecipe; +struct VPWidenSelectRecipe; +class VPReplicateRecipe; +class Type; + +/// An analysis for type-inference for VPValues. +/// It infers the scalar type for a given VPValue by bottom-up traversing +/// through defining recipes until root nodes with known types are reached (e.g. +/// live-ins or load recipes). The types are then propagated top down through +/// operations. +/// Note that the analysis caches the inferred types. A new analysis object must +/// be constructed once a VPlan has been modified in a way that invalidates any +/// of the previously inferred types. +class VPTypeAnalysis { + DenseMap<const VPValue *, Type *> CachedTypes; + LLVMContext &Ctx; + + Type *inferScalarTypeForRecipe(const VPBlendRecipe *R); + Type *inferScalarTypeForRecipe(const VPInstruction *R); + Type *inferScalarTypeForRecipe(const VPWidenCallRecipe *R); + Type *inferScalarTypeForRecipe(const VPWidenRecipe *R); + Type *inferScalarTypeForRecipe(const VPWidenIntOrFpInductionRecipe *R); + Type *inferScalarTypeForRecipe(const VPWidenMemoryInstructionRecipe *R); + Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R); + Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R); + +public: + VPTypeAnalysis(LLVMContext &Ctx) : Ctx(Ctx) {} + + /// Infer the type of \p V. Returns the scalar type of \p V. + Type *inferScalarType(const VPValue *V); + + /// Return the LLVMContext used by the analysis. 
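The VPTypeAnalysis declared above walks from a VPValue to its defining recipe, infers the scalar type bottom-up, and caches the result. A hedged usage sketch, where Ctx and Def are placeholders for the vectorized function's LLVMContext and some recipe-defined VPValue (they are not names from the patch):

  // Illustrative only: one analysis instance per (unmodified) VPlan state.
  llvm::VPTypeAnalysis TypeInfo(Ctx);
  llvm::Type *ScalarTy = TypeInfo.inferScalarType(Def);
  // Cached entries go stale once the VPlan is transformed, so construct a
  // fresh VPTypeAnalysis afterwards, as the class comment notes.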
+ LLVMContext &getContext() { return Ctx; } +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index f6e3a2a16db8..f950d4740e41 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -61,6 +61,7 @@ private: // Utility functions. void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB); + void setRegionPredsFromBB(VPRegionBlock *VPBB, BasicBlock *BB); void fixPhiNodes(); VPBasicBlock *getOrCreateVPBB(BasicBlock *BB); #ifndef NDEBUG @@ -81,14 +82,43 @@ public: // Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB // must have no predecessors. void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) { - SmallVector<VPBlockBase *, 8> VPBBPreds; + auto GetLatchOfExit = [this](BasicBlock *BB) -> BasicBlock * { + auto *SinglePred = BB->getSinglePredecessor(); + Loop *LoopForBB = LI->getLoopFor(BB); + if (!SinglePred || LI->getLoopFor(SinglePred) == LoopForBB) + return nullptr; + // The input IR must be in loop-simplify form, ensuring a single predecessor + // for exit blocks. + assert(SinglePred == LI->getLoopFor(SinglePred)->getLoopLatch() && + "SinglePred must be the only loop latch"); + return SinglePred; + }; + if (auto *LatchBB = GetLatchOfExit(BB)) { + auto *PredRegion = getOrCreateVPBB(LatchBB)->getParent(); + assert(VPBB == cast<VPBasicBlock>(PredRegion->getSingleSuccessor()) && + "successor must already be set for PredRegion; it must have VPBB " + "as single successor"); + VPBB->setPredecessors({PredRegion}); + return; + } // Collect VPBB predecessors. + SmallVector<VPBlockBase *, 2> VPBBPreds; for (BasicBlock *Pred : predecessors(BB)) VPBBPreds.push_back(getOrCreateVPBB(Pred)); - VPBB->setPredecessors(VPBBPreds); } +static bool isHeaderBB(BasicBlock *BB, Loop *L) { + return L && BB == L->getHeader(); +} + +void PlainCFGBuilder::setRegionPredsFromBB(VPRegionBlock *Region, + BasicBlock *BB) { + // BB is a loop header block. Connect the region to the loop preheader. + Loop *LoopOfBB = LI->getLoopFor(BB); + Region->setPredecessors({getOrCreateVPBB(LoopOfBB->getLoopPredecessor())}); +} + // Add operands to VPInstructions representing phi nodes from the input IR. void PlainCFGBuilder::fixPhiNodes() { for (auto *Phi : PhisToFix) { @@ -100,38 +130,85 @@ void PlainCFGBuilder::fixPhiNodes() { assert(VPPhi->getNumOperands() == 0 && "Expected VPInstruction with no operands."); + Loop *L = LI->getLoopFor(Phi->getParent()); + if (isHeaderBB(Phi->getParent(), L)) { + // For header phis, make sure the incoming value from the loop + // predecessor is the first operand of the recipe. 
+ assert(Phi->getNumOperands() == 2); + BasicBlock *LoopPred = L->getLoopPredecessor(); + VPPhi->addIncoming( + getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopPred)), + BB2VPBB[LoopPred]); + BasicBlock *LoopLatch = L->getLoopLatch(); + VPPhi->addIncoming( + getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopLatch)), + BB2VPBB[LoopLatch]); + continue; + } + for (unsigned I = 0; I != Phi->getNumOperands(); ++I) VPPhi->addIncoming(getOrCreateVPOperand(Phi->getIncomingValue(I)), BB2VPBB[Phi->getIncomingBlock(I)]); } } +static bool isHeaderVPBB(VPBasicBlock *VPBB) { + return VPBB->getParent() && VPBB->getParent()->getEntry() == VPBB; +} + +/// Return true of \p L loop is contained within \p OuterLoop. +static bool doesContainLoop(const Loop *L, const Loop *OuterLoop) { + if (L->getLoopDepth() < OuterLoop->getLoopDepth()) + return false; + const Loop *P = L; + while (P) { + if (P == OuterLoop) + return true; + P = P->getParentLoop(); + } + return false; +} + // Create a new empty VPBasicBlock for an incoming BasicBlock in the region // corresponding to the containing loop or retrieve an existing one if it was // already created. If no region exists yet for the loop containing \p BB, a new // one is created. VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) { - auto BlockIt = BB2VPBB.find(BB); - if (BlockIt != BB2VPBB.end()) + if (auto *VPBB = BB2VPBB.lookup(BB)) { // Retrieve existing VPBB. - return BlockIt->second; - - // Get or create a region for the loop containing BB. - Loop *CurrentLoop = LI->getLoopFor(BB); - VPRegionBlock *ParentR = nullptr; - if (CurrentLoop) { - auto Iter = Loop2Region.insert({CurrentLoop, nullptr}); - if (Iter.second) - Iter.first->second = new VPRegionBlock( - CurrentLoop->getHeader()->getName().str(), false /*isReplicator*/); - ParentR = Iter.first->second; + return VPBB; } // Create new VPBB. - LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << BB->getName() << "\n"); - VPBasicBlock *VPBB = new VPBasicBlock(BB->getName()); + StringRef Name = isHeaderBB(BB, TheLoop) ? "vector.body" : BB->getName(); + LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n"); + VPBasicBlock *VPBB = new VPBasicBlock(Name); BB2VPBB[BB] = VPBB; - VPBB->setParent(ParentR); + + // Get or create a region for the loop containing BB. + Loop *LoopOfBB = LI->getLoopFor(BB); + if (!LoopOfBB || !doesContainLoop(LoopOfBB, TheLoop)) + return VPBB; + + auto *RegionOfVPBB = Loop2Region.lookup(LoopOfBB); + if (!isHeaderBB(BB, LoopOfBB)) { + assert(RegionOfVPBB && + "Region should have been created by visiting header earlier"); + VPBB->setParent(RegionOfVPBB); + return VPBB; + } + + assert(!RegionOfVPBB && + "First visit of a header basic block expects to register its region."); + // Handle a header - take care of its Region. + if (LoopOfBB == TheLoop) { + RegionOfVPBB = Plan.getVectorLoopRegion(); + } else { + RegionOfVPBB = new VPRegionBlock(Name.str(), false /*isReplicator*/); + RegionOfVPBB->setParent(Loop2Region[LoopOfBB->getParentLoop()]); + } + RegionOfVPBB->setEntry(VPBB); + Loop2Region[LoopOfBB] = RegionOfVPBB; return VPBB; } @@ -254,6 +331,25 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, // Main interface to build the plain CFG. void PlainCFGBuilder::buildPlainCFG() { + // 0. Reuse the top-level region, vector-preheader and exit VPBBs from the + // skeleton. These were created directly rather than via getOrCreateVPBB(), + // revisit them now to update BB2VPBB. 
Note that header/entry and + // latch/exiting VPBB's of top-level region have yet to be created. + VPRegionBlock *TheRegion = Plan.getVectorLoopRegion(); + BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader(); + assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) && + "Unexpected loop preheader"); + auto *VectorPreheaderVPBB = + cast<VPBasicBlock>(TheRegion->getSinglePredecessor()); + // ThePreheaderBB conceptually corresponds to both Plan.getPreheader() (which + // wraps the original preheader BB) and Plan.getEntry() (which represents the + // new vector preheader); here we're interested in setting BB2VPBB to the + // latter. + BB2VPBB[ThePreheaderBB] = VectorPreheaderVPBB; + BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock(); + assert(LoopExitBB && "Loops with multiple exits are not supported."); + BB2VPBB[LoopExitBB] = cast<VPBasicBlock>(TheRegion->getSingleSuccessor()); + // 1. Scan the body of the loop in a topological order to visit each basic // block after having visited its predecessor basic blocks. Create a VPBB for // each BB and link it to its successor and predecessor VPBBs. Note that @@ -263,21 +359,11 @@ void PlainCFGBuilder::buildPlainCFG() { // Loop PH needs to be explicitly visited since it's not taken into account by // LoopBlocksDFS. - BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader(); - assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) && - "Unexpected loop preheader"); - VPBasicBlock *ThePreheaderVPBB = Plan.getEntry(); - BB2VPBB[ThePreheaderBB] = ThePreheaderVPBB; - ThePreheaderVPBB->setName("vector.ph"); for (auto &I : *ThePreheaderBB) { if (I.getType()->isVoidTy()) continue; IRDef2VPValue[&I] = Plan.getVPValueOrAddLiveIn(&I); } - // Create empty VPBB for Loop H so that we can link PH->H. - VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader()); - HeaderVPBB->setName("vector.body"); - ThePreheaderVPBB->setOneSuccessor(HeaderVPBB); LoopBlocksRPO RPO(TheLoop); RPO.perform(LI); @@ -286,88 +372,55 @@ void PlainCFGBuilder::buildPlainCFG() { // Create or retrieve the VPBasicBlock for this BB and create its // VPInstructions. VPBasicBlock *VPBB = getOrCreateVPBB(BB); + VPRegionBlock *Region = VPBB->getParent(); createVPInstructionsForVPBB(VPBB, BB); + Loop *LoopForBB = LI->getLoopFor(BB); + // Set VPBB predecessors in the same order as they are in the incoming BB. + if (!isHeaderBB(BB, LoopForBB)) { + setVPBBPredsFromBB(VPBB, BB); + } else { + // BB is a loop header, set the predecessor for the region, except for the + // top region, whose predecessor was set when creating VPlan's skeleton. + assert(isHeaderVPBB(VPBB) && "isHeaderBB and isHeaderVPBB disagree"); + if (TheRegion != Region) + setRegionPredsFromBB(Region, BB); + } // Set VPBB successors. We create empty VPBBs for successors if they don't // exist already. Recipes will be created when the successor is visited // during the RPO traversal. 
- Instruction *TI = BB->getTerminator(); - assert(TI && "Terminator expected."); - unsigned NumSuccs = TI->getNumSuccessors(); - + auto *BI = cast<BranchInst>(BB->getTerminator()); + unsigned NumSuccs = succ_size(BB); if (NumSuccs == 1) { - VPBasicBlock *SuccVPBB = getOrCreateVPBB(TI->getSuccessor(0)); - assert(SuccVPBB && "VPBB Successor not found."); - VPBB->setOneSuccessor(SuccVPBB); - } else if (NumSuccs == 2) { - VPBasicBlock *SuccVPBB0 = getOrCreateVPBB(TI->getSuccessor(0)); - assert(SuccVPBB0 && "Successor 0 not found."); - VPBasicBlock *SuccVPBB1 = getOrCreateVPBB(TI->getSuccessor(1)); - assert(SuccVPBB1 && "Successor 1 not found."); - - // Get VPBB's condition bit. - assert(isa<BranchInst>(TI) && "Unsupported terminator!"); - // Look up the branch condition to get the corresponding VPValue - // representing the condition bit in VPlan (which may be in another VPBB). - assert(IRDef2VPValue.count(cast<BranchInst>(TI)->getCondition()) && - "Missing condition bit in IRDef2VPValue!"); - - // Link successors. - VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1); - } else - llvm_unreachable("Number of successors not supported."); - - // Set VPBB predecessors in the same order as they are in the incoming BB. - setVPBBPredsFromBB(VPBB, BB); + auto *Successor = getOrCreateVPBB(BB->getSingleSuccessor()); + VPBB->setOneSuccessor(isHeaderVPBB(Successor) + ? Successor->getParent() + : static_cast<VPBlockBase *>(Successor)); + continue; + } + assert(BI->isConditional() && NumSuccs == 2 && BI->isConditional() && + "block must have conditional branch with 2 successors"); + // Look up the branch condition to get the corresponding VPValue + // representing the condition bit in VPlan (which may be in another VPBB). + assert(IRDef2VPValue.contains(BI->getCondition()) && + "Missing condition bit in IRDef2VPValue!"); + VPBasicBlock *Successor0 = getOrCreateVPBB(BI->getSuccessor(0)); + VPBasicBlock *Successor1 = getOrCreateVPBB(BI->getSuccessor(1)); + if (!LoopForBB || BB != LoopForBB->getLoopLatch()) { + VPBB->setTwoSuccessors(Successor0, Successor1); + continue; + } + // For a latch we need to set the successor of the region rather than that + // of VPBB and it should be set to the exit, i.e., non-header successor, + // except for the top region, whose successor was set when creating VPlan's + // skeleton. + if (TheRegion != Region) + Region->setOneSuccessor(isHeaderVPBB(Successor0) ? Successor1 + : Successor0); + Region->setExiting(VPBB); } - // 2. Process outermost loop exit. We created an empty VPBB for the loop - // single exit BB during the RPO traversal of the loop body but Instructions - // weren't visited because it's not part of the the loop. - BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock(); - assert(LoopExitBB && "Loops with multiple exits are not supported."); - VPBasicBlock *LoopExitVPBB = BB2VPBB[LoopExitBB]; - // Loop exit was already set as successor of the loop exiting BB. - // We only set its predecessor VPBB now. - setVPBBPredsFromBB(LoopExitVPBB, LoopExitBB); - - // 3. Fix up region blocks for loops. For each loop, - // * use the header block as entry to the corresponding region, - // * use the latch block as exit of the corresponding region, - // * set the region as successor of the loop pre-header, and - // * set the exit block as successor to the region. 
- SmallVector<Loop *> LoopWorkList; - LoopWorkList.push_back(TheLoop); - while (!LoopWorkList.empty()) { - Loop *L = LoopWorkList.pop_back_val(); - BasicBlock *Header = L->getHeader(); - BasicBlock *Exiting = L->getLoopLatch(); - assert(Exiting == L->getExitingBlock() && - "Latch must be the only exiting block"); - VPRegionBlock *Region = Loop2Region[L]; - VPBasicBlock *HeaderVPBB = getOrCreateVPBB(Header); - VPBasicBlock *ExitingVPBB = getOrCreateVPBB(Exiting); - - // Disconnect backedge and pre-header from header. - VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(L->getLoopPreheader()); - VPBlockUtils::disconnectBlocks(PreheaderVPBB, HeaderVPBB); - VPBlockUtils::disconnectBlocks(ExitingVPBB, HeaderVPBB); - - Region->setParent(PreheaderVPBB->getParent()); - Region->setEntry(HeaderVPBB); - VPBlockUtils::connectBlocks(PreheaderVPBB, Region); - - // Disconnect exit block from exiting (=latch) block, set exiting block and - // connect region to exit block. - VPBasicBlock *ExitVPBB = getOrCreateVPBB(L->getExitBlock()); - VPBlockUtils::disconnectBlocks(ExitingVPBB, ExitVPBB); - Region->setExiting(ExitingVPBB); - VPBlockUtils::connectBlocks(Region, ExitVPBB); - - // Queue sub-loops for processing. - LoopWorkList.append(L->begin(), L->end()); - } - // 4. The whole CFG has been built at this point so all the input Values must + // 2. The whole CFG has been built at this point so all the input Values must // have a VPlan couterpart. Fix VPlan phi nodes by adding their corresponding // VPlan operands. fixPhiNodes(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 26c309eed800..02e400d590be 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "VPlan.h" +#include "VPlanAnalysis.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" @@ -43,6 +44,8 @@ extern cl::opt<bool> EnableVPlanNativePath; bool VPRecipeBase::mayWriteToMemory() const { switch (getVPDefID()) { + case VPInterleaveSC: + return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; case VPWidenMemoryInstructionSC: { return cast<VPWidenMemoryInstructionRecipe>(this)->isStore(); } @@ -114,6 +117,16 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPDerivedIVSC: case VPPredInstPHISC: return false; + case VPInstructionSC: + switch (cast<VPInstruction>(this)->getOpcode()) { + case Instruction::ICmp: + case VPInstruction::Not: + case VPInstruction::CalculateTripCountMinusVF: + case VPInstruction::CanonicalIVIncrementForPart: + return false; + default: + return true; + } case VPWidenCallSC: return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) ->mayHaveSideEffects(); @@ -135,6 +148,8 @@ bool VPRecipeBase::mayHaveSideEffects() const { "underlying instruction has side-effects"); return false; } + case VPInterleaveSC: + return mayWriteToMemory(); case VPWidenMemoryInstructionSC: assert(cast<VPWidenMemoryInstructionRecipe>(this) ->getIngredient() @@ -156,8 +171,13 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { VPValue *ExitValue = getOperand(0); if (vputils::isUniformAfterVectorization(ExitValue)) Lane = VPLane::getFirstLane(); + VPBasicBlock *MiddleVPBB = + cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor()); + 
assert(MiddleVPBB->getNumSuccessors() == 0 && + "the middle block must not have any successors"); + BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB]; Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)), - State.Builder.GetInsertBlock()); + MiddleBB); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -216,15 +236,55 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB, insertBefore(BB, I); } +FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { + assert(OpType == OperationType::FPMathOp && + "recipe doesn't have fast math flags"); + FastMathFlags Res; + Res.setAllowReassoc(FMFs.AllowReassoc); + Res.setNoNaNs(FMFs.NoNaNs); + Res.setNoInfs(FMFs.NoInfs); + Res.setNoSignedZeros(FMFs.NoSignedZeros); + Res.setAllowReciprocal(FMFs.AllowReciprocal); + Res.setAllowContract(FMFs.AllowContract); + Res.setApproxFunc(FMFs.ApproxFunc); + return Res; +} + +VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, + VPValue *A, VPValue *B, DebugLoc DL, + const Twine &Name) + : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}), + Pred, DL), + VPValue(this), Opcode(Opcode), Name(Name.str()) { + assert(Opcode == Instruction::ICmp && + "only ICmp predicates supported at the moment"); +} + +VPInstruction::VPInstruction(unsigned Opcode, + std::initializer_list<VPValue *> Operands, + FastMathFlags FMFs, DebugLoc DL, const Twine &Name) + : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL), + VPValue(this), Opcode(Opcode), Name(Name.str()) { + // Make sure the VPInstruction is a floating-point operation. + assert(isFPMathOp() && "this op can't take fast-math flags"); +} + Value *VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilderBase &Builder = State.Builder; - Builder.SetCurrentDebugLocation(DL); + Builder.SetCurrentDebugLocation(getDebugLoc()); if (Instruction::isBinaryOp(getOpcode())) { + if (Part != 0 && vputils::onlyFirstPartUsed(this)) + return State.get(this, 0); + Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); - return Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); + auto *Res = + Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); + if (auto *I = dyn_cast<Instruction>(Res)) + setFlags(I); + return Res; } switch (getOpcode()) { @@ -232,10 +292,10 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Value *A = State.get(getOperand(0), Part); return Builder.CreateNot(A, Name); } - case VPInstruction::ICmpULE: { - Value *IV = State.get(getOperand(0), Part); - Value *TC = State.get(getOperand(1), Part); - return Builder.CreateICmpULE(IV, TC, Name); + case Instruction::ICmp: { + Value *A = State.get(getOperand(0), Part); + Value *B = State.get(getOperand(1), Part); + return Builder.CreateCmp(getPredicate(), A, B, Name); } case Instruction::Select: { Value *Cond = State.get(getOperand(0), Part); @@ -285,23 +345,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); return Builder.CreateSelect(Cmp, Sub, Zero); } - case VPInstruction::CanonicalIVIncrement: - case VPInstruction::CanonicalIVIncrementNUW: { - if (Part == 0) { - bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; - auto *Phi = State.get(getOperand(0), 0); - // The loop step is equal to the vectorization factor (num of SIMD - // elements) times the unroll factor (num of SIMD instructions). 
- Value *Step = - createStepForVF(Builder, Phi->getType(), State.VF, State.UF); - return Builder.CreateAdd(Phi, Step, Name, IsNUW, false); - } - return State.get(this, 0); - } - - case VPInstruction::CanonicalIVIncrementForPart: - case VPInstruction::CanonicalIVIncrementForPartNUW: { - bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW; + case VPInstruction::CanonicalIVIncrementForPart: { auto *IV = State.get(getOperand(0), VPIteration(0, 0)); if (Part == 0) return IV; @@ -309,7 +353,8 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, // The canonical IV is incremented by the vectorization factor (num of SIMD // elements) times the unroll part. Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); - return Builder.CreateAdd(IV, Step, Name, IsNUW, false); + return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), + hasNoSignedWrap()); } case VPInstruction::BranchOnCond: { if (Part != 0) @@ -361,10 +406,25 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, } } +#if !defined(NDEBUG) +bool VPInstruction::isFPMathOp() const { + // Inspired by FPMathOperator::classof. Notable differences are that we don't + // support Call, PHI and Select opcodes here yet. + return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || + Opcode == Instruction::FNeg || Opcode == Instruction::FSub || + Opcode == Instruction::FDiv || Opcode == Instruction::FRem || + Opcode == Instruction::FCmp || Opcode == Instruction::Select; +} +#endif + void VPInstruction::execute(VPTransformState &State) { assert(!State.Instance && "VPInstruction executing an Instance"); IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); - State.Builder.setFastMathFlags(FMF); + assert((hasFastMathFlags() == isFPMathOp() || + getOpcode() == Instruction::Select) && + "Recipe not a FPMathOp but has fast-math flags?"); + if (hasFastMathFlags()) + State.Builder.setFastMathFlags(getFastMathFlags()); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *GeneratedValue = generateInstruction(State, Part); if (!hasResult()) @@ -393,9 +453,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::Not: O << "not"; break; - case VPInstruction::ICmpULE: - O << "icmp ule"; - break; case VPInstruction::SLPLoad: O << "combined load"; break; @@ -408,12 +465,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::FirstOrderRecurrenceSplice: O << "first-order splice"; break; - case VPInstruction::CanonicalIVIncrement: - O << "VF * UF + "; - break; - case VPInstruction::CanonicalIVIncrementNUW: - O << "VF * UF +(nuw) "; - break; case VPInstruction::BranchOnCond: O << "branch-on-cond"; break; @@ -421,49 +472,35 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, O << "TC > VF ? 
TC - VF : 0"; break; case VPInstruction::CanonicalIVIncrementForPart: - O << "VF * Part + "; - break; - case VPInstruction::CanonicalIVIncrementForPartNUW: - O << "VF * Part +(nuw) "; + O << "VF * Part +"; break; case VPInstruction::BranchOnCount: - O << "branch-on-count "; + O << "branch-on-count"; break; default: O << Instruction::getOpcodeName(getOpcode()); } - O << FMF; - - for (const VPValue *Operand : operands()) { - O << " "; - Operand->printAsOperand(O, SlotTracker); - } + printFlags(O); + printOperands(O, SlotTracker); - if (DL) { + if (auto DL = getDebugLoc()) { O << ", !dbg "; DL.print(O); } } #endif -void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { - // Make sure the VPInstruction is a floating-point operation. - assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul || - Opcode == Instruction::FNeg || Opcode == Instruction::FSub || - Opcode == Instruction::FDiv || Opcode == Instruction::FRem || - Opcode == Instruction::FCmp) && - "this op can't take fast-math flags"); - FMF = FMFNew; -} - void VPWidenCallRecipe::execute(VPTransformState &State) { assert(State.VF.isVector() && "not widening"); auto &CI = *cast<CallInst>(getUnderlyingInstr()); assert(!isa<DbgInfoIntrinsic>(CI) && "DbgInfoIntrinsic should have been dropped during VPlan construction"); - State.setDebugLocFromInst(&CI); + State.setDebugLocFrom(CI.getDebugLoc()); + FunctionType *VFTy = nullptr; + if (Variant) + VFTy = Variant->getFunctionType(); for (unsigned Part = 0; Part < State.UF; ++Part) { SmallVector<Type *, 2> TysForDecl; // Add return type if intrinsic is overloaded on it. @@ -475,12 +512,15 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { for (const auto &I : enumerate(operands())) { // Some intrinsics have a scalar argument - don't replace it with a // vector. + // Some vectorized function variants may also take a scalar argument, + // e.g. linear parameters for pointers. Value *Arg; - if (VectorIntrinsicID == Intrinsic::not_intrinsic || - !isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) - Arg = State.get(I.value(), Part); - else + if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) || + (VectorIntrinsicID != Intrinsic::not_intrinsic && + isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))) Arg = State.get(I.value(), VPIteration(0, 0)); + else + Arg = State.get(I.value(), Part); if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index())) TysForDecl.push_back(Arg->getType()); Args.push_back(Arg); @@ -553,8 +593,7 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPWidenSelectRecipe::execute(VPTransformState &State) { - auto &I = *cast<SelectInst>(getUnderlyingInstr()); - State.setDebugLocFromInst(&I); + State.setDebugLocFrom(getDebugLoc()); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. 
@@ -569,13 +608,31 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) { Value *Op1 = State.get(getOperand(2), Part); Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); State.set(this, Sel, Part); - State.addMetadata(Sel, &I); + State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue())); } } +VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( + const FastMathFlags &FMF) { + AllowReassoc = FMF.allowReassoc(); + NoNaNs = FMF.noNaNs(); + NoInfs = FMF.noInfs(); + NoSignedZeros = FMF.noSignedZeros(); + AllowReciprocal = FMF.allowReciprocal(); + AllowContract = FMF.allowContract(); + ApproxFunc = FMF.approxFunc(); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { switch (OpType) { + case OperationType::Cmp: + O << " " << CmpInst::getPredicateName(getPredicate()); + break; + case OperationType::DisjointOp: + if (DisjointFlags.IsDisjoint) + O << " disjoint"; + break; case OperationType::PossiblyExactOp: if (ExactFlags.IsExact) O << " exact"; @@ -593,17 +650,22 @@ void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { if (GEPFlags.IsInBounds) O << " inbounds"; break; + case OperationType::NonNegOp: + if (NonNegFlags.NonNeg) + O << " nneg"; + break; case OperationType::Other: break; } - O << " "; + if (getNumOperands() > 0) + O << " "; } #endif void VPWidenRecipe::execute(VPTransformState &State) { - auto &I = *cast<Instruction>(getUnderlyingValue()); + State.setDebugLocFrom(getDebugLoc()); auto &Builder = State.Builder; - switch (I.getOpcode()) { + switch (Opcode) { case Instruction::Call: case Instruction::Br: case Instruction::PHI: @@ -630,28 +692,24 @@ void VPWidenRecipe::execute(VPTransformState &State) { case Instruction::Or: case Instruction::Xor: { // Just widen unops and binops. - State.setDebugLocFromInst(&I); - for (unsigned Part = 0; Part < State.UF; ++Part) { SmallVector<Value *, 2> Ops; for (VPValue *VPOp : operands()) Ops.push_back(State.get(VPOp, Part)); - Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); + Value *V = Builder.CreateNAryOp(Opcode, Ops); if (auto *VecOp = dyn_cast<Instruction>(V)) setFlags(VecOp); // Use this vector value for all users of the original instruction. State.set(this, V, Part); - State.addMetadata(V, &I); + State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue())); } break; } case Instruction::Freeze: { - State.setDebugLocFromInst(&I); - for (unsigned Part = 0; Part < State.UF; ++Part) { Value *Op = State.get(getOperand(0), Part); @@ -663,9 +721,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { case Instruction::ICmp: case Instruction::FCmp: { // Widen compares. Generate vector compares. - bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = cast<CmpInst>(&I); - State.setDebugLocFromInst(Cmp); + bool FCmp = Opcode == Instruction::FCmp; for (unsigned Part = 0; Part < State.UF; ++Part) { Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); @@ -673,51 +729,64 @@ void VPWidenRecipe::execute(VPTransformState &State) { if (FCmp) { // Propagate fast math flags. 
IRBuilder<>::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(Cmp->getFastMathFlags()); - C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue())) + Builder.setFastMathFlags(I->getFastMathFlags()); + C = Builder.CreateFCmp(getPredicate(), A, B); } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A, B); + C = Builder.CreateICmp(getPredicate(), A, B); } State.set(this, C, Part); - State.addMetadata(C, &I); + State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue())); } break; } default: // This instruction is not vectorized by simple widening. - LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " + << Instruction::getOpcodeName(Opcode)); llvm_unreachable("Unhandled instruction!"); } // end of switch. + +#if !defined(NDEBUG) + // Verify that VPlan type inference results agree with the type of the + // generated values. + for (unsigned Part = 0; Part < State.UF; ++Part) { + assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), + State.VF) == State.get(this, Part)->getType() && + "inferred type and type from generated instructions do not match"); + } +#endif } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN "; printAsOperand(O, SlotTracker); - const Instruction *UI = getUnderlyingInstr(); - O << " = " << UI->getOpcodeName(); + O << " = " << Instruction::getOpcodeName(Opcode); printFlags(O); - if (auto *Cmp = dyn_cast<CmpInst>(UI)) - O << Cmp->getPredicate() << " "; printOperands(O, SlotTracker); } #endif void VPWidenCastRecipe::execute(VPTransformState &State) { - auto *I = cast_or_null<Instruction>(getUnderlyingValue()); - if (I) - State.setDebugLocFromInst(I); + State.setDebugLocFrom(getDebugLoc()); auto &Builder = State.Builder; /// Vectorize casts. assert(State.VF.isVector() && "Not vectorizing?"); Type *DestTy = VectorType::get(getResultType(), State.VF); - + VPValue *Op = getOperand(0); for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *A = State.get(getOperand(0), Part); + if (Part > 0 && Op->isLiveIn()) { + // FIXME: Remove once explicit unrolling is implemented using VPlan. + State.set(this, State.get(this, 0), Part); + continue; + } + Value *A = State.get(Op, Part); Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); State.set(this, Cast, Part); - State.addMetadata(Cast, I); + State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue())); } } @@ -727,10 +796,182 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, O << Indent << "WIDEN-CAST "; printAsOperand(O, SlotTracker); O << " = " << Instruction::getOpcodeName(Opcode) << " "; + printFlags(O); printOperands(O, SlotTracker); O << " to " << *getResultType(); } +#endif + +/// This function adds +/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) +/// to each vector element of Val. The sequence starts at StartIndex. +/// \p Opcode is relevant for FP induction variable. +static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, + Instruction::BinaryOps BinOp, ElementCount VF, + IRBuilderBase &Builder) { + assert(VF.isVector() && "only vector VFs are supported"); + + // Create and check the types. 
+ auto *ValVTy = cast<VectorType>(Val->getType()); + ElementCount VLen = ValVTy->getElementCount(); + + Type *STy = Val->getType()->getScalarType(); + assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && + "Induction Step must be an integer or FP"); + assert(Step->getType() == STy && "Step has wrong type"); + + SmallVector<Constant *, 8> Indices; + + // Create a vector of consecutive numbers from zero to VF. + VectorType *InitVecValVTy = ValVTy; + if (STy->isFloatingPointTy()) { + Type *InitVecValSTy = + IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); + InitVecValVTy = VectorType::get(InitVecValSTy, VLen); + } + Value *InitVec = Builder.CreateStepVector(InitVecValVTy); + + // Splat the StartIdx + Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); + + if (STy->isIntegerTy()) { + InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); + Step = Builder.CreateVectorSplat(VLen, Step); + assert(Step->getType() == Val->getType() && "Invalid step vec"); + // FIXME: The newly created binary instructions should contain nsw/nuw + // flags, which can be found from the original scalar operations. + Step = Builder.CreateMul(InitVec, Step); + return Builder.CreateAdd(Val, Step, "induction"); + } + + // Floating point induction. + assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && + "Binary Opcode should be specified for FP induction"); + InitVec = Builder.CreateUIToFP(InitVec, ValVTy); + InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); + + Step = Builder.CreateVectorSplat(VLen, Step); + Value *MulOp = Builder.CreateFMul(InitVec, Step); + return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); +} + +/// A helper function that returns an integer or floating-point constant with +/// value C. +static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { + return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) + : ConstantFP::get(Ty, C); +} + +static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, + ElementCount VF) { + assert(FTy->isFloatingPointTy() && "Expected floating point type!"); + Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); + Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); + return B.CreateUIToFP(RuntimeVF, FTy); +} + +void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "Int or FP induction being replicated."); + + Value *Start = getStartValue()->getLiveInIRValue(); + const InductionDescriptor &ID = getInductionDescriptor(); + TruncInst *Trunc = getTruncInst(); + IRBuilderBase &Builder = State.Builder; + assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); + assert(State.VF.isVector() && "must have vector VF"); + + // The value from the original loop to which we are mapping the new induction + // variable. + Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; + + // Fast-math-flags propagate from the original induction instruction. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) + Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); + + // Now do the actual transformations, and start with fetching the step value. 
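The getStepVector helper shown above produces, for lane i, the value Val[i] + (StartIdx + i) * Step, built out of IR instructions. A plain scalar model of the integer case (a hypothetical helper for illustration, not the actual IR-building code):

```cpp
// Sketch only: lane i of the result is Val[i] + (StartIdx + i) * Step.
#include <cstdint>
#include <vector>

std::vector<int64_t> stepVector(const std::vector<int64_t> &Val,
                                int64_t StartIdx, int64_t Step) {
  std::vector<int64_t> Out(Val.size());
  for (size_t I = 0; I < Val.size(); ++I)
    Out[I] = Val[I] + (StartIdx + static_cast<int64_t>(I)) * Step;
  return Out;
}

// Example: Val = splat(10), StartIdx = 0, Step = 3 -> {10, 13, 16, 19}.
```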
+ Value *Step = State.get(getStepValue(), VPIteration(0, 0)); + + assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && + "Expected either an induction phi-node or a truncate of it!"); + + // Construct the initial value of the vector IV in the vector loop preheader + auto CurrIP = Builder.saveIP(); + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + Builder.SetInsertPoint(VectorPH->getTerminator()); + if (isa<TruncInst>(EntryVal)) { + assert(Start->getType()->isIntegerTy() && + "Truncation requires an integer type"); + auto *TruncType = cast<IntegerType>(EntryVal->getType()); + Step = Builder.CreateTrunc(Step, TruncType); + Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); + } + + Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); + Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); + Value *SteppedStart = getStepVector( + SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); + + // We create vector phi nodes for both integer and floating-point induction + // variables. Here, we determine the kind of arithmetic we will perform. + Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + if (Step->getType()->isIntegerTy()) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = ID.getInductionOpcode(); + MulOp = Instruction::FMul; + } + + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Type *StepType = Step->getType(); + Value *RuntimeVF; + if (Step->getType()->isFloatingPointTy()) + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); + else + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); + Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); + + // Create a vector splat to use in the induction update. + // + // FIXME: If the step is non-constant, we create the vector splat with + // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't + // handle a constant vector splat. + Value *SplatVF = isa<Constant>(Mul) + ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); + Builder.restoreIP(CurrIP); + + // We may need to add the step a number of times, depending on the unroll + // factor. The last of those goes into the PHI. + PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind"); + VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); + VecInd->setDebugLoc(EntryVal->getDebugLoc()); + Instruction *LastInduction = VecInd; + for (unsigned Part = 0; Part < State.UF; ++Part) { + State.set(this, LastInduction, Part); + + if (isa<TruncInst>(EntryVal)) + State.addMetadata(LastInduction, EntryVal); + LastInduction = cast<Instruction>( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + LastInduction->setDebugLoc(EntryVal->getDebugLoc()); + } + + LastInduction->setName("vec.ind.next"); + VecInd->addIncoming(SteppedStart, VectorPH); + // Add induction update using an incorrect block temporarily. The phi node + // will be fixed after VPlan execution. Note that at this point the latch + // block cannot be used, as it does not exist yet. + // TODO: Model increment value in VPlan, by turning the recipe into a + // multi-def and a subclass of VPHeaderPHIRecipe. 
+ VecInd->addIncoming(LastInduction, VectorPH); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-INDUCTION"; @@ -770,17 +1011,112 @@ void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, O << " * "; getStepValue()->printAsOperand(O, SlotTracker); - if (IndDesc.getStep()->getType() != ResultTy) - O << " (truncated to " << *ResultTy << ")"; + if (TruncResultTy) + O << " (truncated to " << *TruncResultTy << ")"; } #endif +void VPScalarIVStepsRecipe::execute(VPTransformState &State) { + // Fast-math-flags propagate from the original induction instruction. + IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); + if (hasFastMathFlags()) + State.Builder.setFastMathFlags(getFastMathFlags()); + + /// Compute scalar induction steps. \p ScalarIV is the scalar induction + /// variable on which to base the steps, \p Step is the size of the step. + + Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); + Value *Step = State.get(getStepValue(), VPIteration(0, 0)); + IRBuilderBase &Builder = State.Builder; + + // Ensure step has the same type as that of scalar IV. + Type *BaseIVTy = BaseIV->getType()->getScalarType(); + if (BaseIVTy != Step->getType()) { + // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to + // avoid separate truncate here. + assert(Step->getType()->isIntegerTy() && + "Truncation requires an integer step"); + Step = State.Builder.CreateTrunc(Step, BaseIVTy); + } + + // We build scalar steps for both integer and floating-point induction + // variables. Here, we determine the kind of arithmetic we will perform. + Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + if (BaseIVTy->isIntegerTy()) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = InductionOpcode; + MulOp = Instruction::FMul; + } + + // Determine the number of scalars we need to generate for each unroll + // iteration. + bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); + // Compute the scalar steps and save the results in State. + Type *IntStepTy = + IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); + Type *VecIVTy = nullptr; + Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; + if (!FirstLaneOnly && State.VF.isScalable()) { + VecIVTy = VectorType::get(BaseIVTy, State.VF); + UnitStepVec = + Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); + SplatStep = Builder.CreateVectorSplat(State.VF, Step); + SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); + } + + unsigned StartPart = 0; + unsigned EndPart = State.UF; + unsigned StartLane = 0; + unsigned EndLane = FirstLaneOnly ? 
1 : State.VF.getKnownMinValue(); + if (State.Instance) { + StartPart = State.Instance->Part; + EndPart = StartPart + 1; + StartLane = State.Instance->Lane.getKnownLane(); + EndLane = StartLane + 1; + } + for (unsigned Part = StartPart; Part < EndPart; ++Part) { + Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); + + if (!FirstLaneOnly && State.VF.isScalable()) { + auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); + auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); + if (BaseIVTy->isFloatingPointTy()) + InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); + auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); + auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); + State.set(this, Add, Part); + // It's useful to record the lane values too for the known minimum number + // of elements so we do those below. This improves the code quality when + // trying to extract the first element, for example. + } + + if (BaseIVTy->isFloatingPointTy()) + StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); + + for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { + Value *StartIdx = Builder.CreateBinOp( + AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); + // The step returned by `createStepForVF` is a runtime-evaluated value + // when VF is scalable. Otherwise, it should be folded into a Constant. + assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && + "Expected StartIdx to be folded to a constant when VF is not " + "scalable"); + auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); + auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); + State.set(this, Add, VPIteration(Part, Lane)); + } + } +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent; printAsOperand(O, SlotTracker); - O << Indent << "= SCALAR-STEPS "; + O << " = SCALAR-STEPS "; printOperands(O, SlotTracker); } #endif @@ -874,7 +1210,7 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPBlendRecipe::execute(VPTransformState &State) { - State.setDebugLocFromInst(Phi); + State.setDebugLocFrom(getDebugLoc()); // We know that all PHIs in non-header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. @@ -916,7 +1252,7 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "BLEND "; - Phi->printAsOperand(O, false); + printAsOperand(O, SlotTracker); O << " ="; if (getNumIncomingValues() == 1) { // Not a User of any mask: not really blending, this is a @@ -942,14 +1278,14 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, O << " +"; if (isa<FPMathOperator>(getUnderlyingInstr())) O << getUnderlyingInstr()->getFastMathFlags(); - O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " ("; + O << " reduce." 
<< Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; getVecOp()->printAsOperand(O, SlotTracker); if (getCondOp()) { O << ", "; getCondOp()->printAsOperand(O, SlotTracker); } O << ")"; - if (RdxDesc->IntermediateStore) + if (RdxDesc.IntermediateStore) O << " (with final reduction value stored in invariant address sank " "outside of loop)"; } @@ -1093,12 +1429,12 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { Value *Start = getStartValue()->getLiveInIRValue(); - PHINode *EntryPart = PHINode::Create( - Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt()); + PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index"); + EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); EntryPart->addIncoming(Start, VectorPH); - EntryPart->setDebugLoc(DL); + EntryPart->setDebugLoc(getDebugLoc()); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) State.set(this, EntryPart, Part); } @@ -1108,7 +1444,8 @@ void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; printAsOperand(O, SlotTracker); - O << " = CANONICAL-INDUCTION"; + O << " = CANONICAL-INDUCTION "; + printOperands(O, SlotTracker); } #endif @@ -1221,8 +1558,8 @@ void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { } // Create a phi node for the new recurrence. - PHINode *EntryPart = PHINode::Create( - VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt()); + PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur"); + EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); EntryPart->addIncoming(VectorInit, VectorPH); State.set(this, EntryPart, 0); } @@ -1254,8 +1591,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { "recipe must be in the vector loop header"); unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF; for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *EntryPart = - PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt()); + Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi"); + EntryPart->insertBefore(HeaderBB->getFirstInsertionPt()); State.set(this, EntryPart, Part); } @@ -1269,8 +1606,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { Value *Iden = nullptr; RecurKind RK = RdxDesc.getRecurrenceKind(); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || - RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) { - // MinMax reduction have the start value as their identify. + RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { + // MinMax and AnyOf reductions have the start value as their identity. if (ScalarPHI) { Iden = StartV; } else { @@ -1316,23 +1653,7 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) { assert(EnableVPlanNativePath && "Non-native vplans are not expected to have VPWidenPHIRecipes."); - // Currently we enter here in the VPlan-native path for non-induction - // PHIs where all control flow is uniform. We simply widen these PHIs. - // Create a vector phi with no operands - the vector phi operands will be - // set at the end of vector code generation. - VPBasicBlock *Parent = getParent(); - VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion(); - unsigned StartIdx = 0; - // For phis in header blocks of loop regions, use the index of the value - // coming from the preheader. 
- if (LoopRegion->getEntryBasicBlock() == Parent) { - for (unsigned I = 0; I < getNumOperands(); ++I) { - if (getIncomingBlock(I) == - LoopRegion->getSinglePredecessor()->getExitingBasicBlock()) - StartIdx = I; - } - } - Value *Op0 = State.get(getOperand(StartIdx), 0); + Value *Op0 = State.get(getOperand(0), 0); Type *VecTy = Op0->getType(); Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); State.set(this, VecPhi, 0); @@ -1368,7 +1689,7 @@ void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { PHINode *EntryPart = State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); EntryPart->addIncoming(StartMask, VectorPH); - EntryPart->setDebugLoc(DL); + EntryPart->setDebugLoc(getDebugLoc()); State.set(this, EntryPart, Part); } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 83bfdfd09d19..33132880d5a4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -12,17 +12,22 @@ //===----------------------------------------------------------------------===// #include "VPlanTransforms.h" -#include "VPlanDominatorTree.h" #include "VPRecipeBuilder.h" +#include "VPlanAnalysis.h" #include "VPlanCFG.h" +#include "VPlanDominatorTree.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/PatternMatch.h" using namespace llvm; +using namespace llvm::PatternMatch; + void VPlanTransforms::VPInstructionsToVPRecipes( VPlanPtr &Plan, function_ref<const InductionDescriptor *(PHINode *)> @@ -76,7 +81,7 @@ void VPlanTransforms::VPInstructionsToVPRecipes( NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands()); } else if (auto *CI = dyn_cast<CastInst>(Inst)) { NewRecipe = new VPWidenCastRecipe( - CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI); + CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI); } else { NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands()); } @@ -158,17 +163,10 @@ static bool sinkScalarOperands(VPlan &Plan) { // TODO: add ".cloned" suffix to name of Clone's VPValue. 
Clone->insertBefore(SinkCandidate); - for (auto *U : to_vector(SinkCandidate->getVPSingleValue()->users())) { - auto *UI = cast<VPRecipeBase>(U); - if (UI->getParent() == SinkTo) - continue; - - for (unsigned Idx = 0; Idx != UI->getNumOperands(); Idx++) { - if (UI->getOperand(Idx) != SinkCandidate->getVPSingleValue()) - continue; - UI->setOperand(Idx, Clone); - } - } + SinkCandidate->getVPSingleValue()->replaceUsesWithIf( + Clone, [SinkTo](VPUser &U, unsigned) { + return cast<VPRecipeBase>(&U)->getParent() != SinkTo; + }); } SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); for (VPValue *Op : SinkCandidate->operands()) @@ -273,16 +271,10 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { VPValue *PredInst1 = cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0); VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue(); - for (VPUser *U : to_vector(Phi1ToMoveV->users())) { - auto *UI = dyn_cast<VPRecipeBase>(U); - if (!UI || UI->getParent() != Then2) - continue; - for (unsigned I = 0, E = U->getNumOperands(); I != E; ++I) { - if (Phi1ToMoveV != U->getOperand(I)) - continue; - U->setOperand(I, PredInst1); - } - } + Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) { + auto *UI = dyn_cast<VPRecipeBase>(&U); + return UI && UI->getParent() == Then2; + }); Phi1ToMove.moveBefore(*Merge2, Merge2->begin()); } @@ -479,15 +471,45 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) { // The recipes in the block are processed in reverse order, to catch chains // of dead recipes. for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { - if (R.mayHaveSideEffects() || any_of(R.definedValues(), [](VPValue *V) { - return V->getNumUsers() > 0; - })) + // A user keeps R alive: + if (any_of(R.definedValues(), + [](VPValue *V) { return V->getNumUsers(); })) + continue; + + // Having side effects keeps R alive, but do remove conditional assume + // instructions as their conditions may be flattened. + auto *RepR = dyn_cast<VPReplicateRecipe>(&R); + bool IsConditionalAssume = + RepR && RepR->isPredicated() && + match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>()); + if (R.mayHaveSideEffects() && !IsConditionalAssume) continue; + R.eraseFromParent(); } } } +static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID, + ScalarEvolution &SE, Instruction *TruncI, + Type *IVTy, VPValue *StartV, + VPValue *Step) { + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + auto IP = HeaderVPBB->getFirstNonPhi(); + VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); + Type *TruncTy = TruncI ? TruncI->getType() : IVTy; + VPValue *BaseIV = CanonicalIV; + if (!CanonicalIV->isCanonical(ID.getKind(), StartV, Step, TruncTy)) { + BaseIV = new VPDerivedIVRecipe(ID, StartV, CanonicalIV, Step, + TruncI ? 
TruncI->getType() : nullptr); + HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP); + } + + VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step); + HeaderVPBB->insert(Steps, IP); + return Steps; +} + void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) { SmallVector<VPRecipeBase *> ToRemove; VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); @@ -501,36 +523,18 @@ void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) { })) continue; - auto IP = HeaderVPBB->getFirstNonPhi(); - VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); - Type *ResultTy = WideIV->getPHINode()->getType(); - if (Instruction *TruncI = WideIV->getTruncInst()) - ResultTy = TruncI->getType(); const InductionDescriptor &ID = WideIV->getInductionDescriptor(); - VPValue *Step = WideIV->getStepValue(); - VPValue *BaseIV = CanonicalIV; - if (!CanonicalIV->isCanonical(ID.getKind(), WideIV->getStartValue(), Step, - ResultTy)) { - BaseIV = new VPDerivedIVRecipe(ID, WideIV->getStartValue(), CanonicalIV, - Step, ResultTy); - HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP); - } - - VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step); - HeaderVPBB->insert(Steps, IP); + VPValue *Steps = createScalarIVSteps( + Plan, ID, SE, WideIV->getTruncInst(), WideIV->getPHINode()->getType(), + WideIV->getStartValue(), WideIV->getStepValue()); - // Update scalar users of IV to use Step instead. Use SetVector to ensure - // the list of users doesn't contain duplicates. - SetVector<VPUser *> Users(WideIV->user_begin(), WideIV->user_end()); - for (VPUser *U : Users) { - if (HasOnlyVectorVFs && !U->usesScalars(WideIV)) - continue; - for (unsigned I = 0, E = U->getNumOperands(); I != E; I++) { - if (U->getOperand(I) != WideIV) - continue; - U->setOperand(I, Steps); - } - } + // Update scalar users of IV to use Step instead. + if (!HasOnlyVectorVFs) + WideIV->replaceAllUsesWith(Steps); + else + WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) { + return U.usesScalars(WideIV); + }); } } @@ -778,3 +782,375 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { } } } + +/// Returns true is \p V is constant one. +static bool isConstantOne(VPValue *V) { + if (!V->isLiveIn()) + return false; + auto *C = dyn_cast<ConstantInt>(V->getLiveInIRValue()); + return C && C->isOne(); +} + +/// Returns the llvm::Instruction opcode for \p R. +static unsigned getOpcodeForRecipe(VPRecipeBase &R) { + if (auto *WidenR = dyn_cast<VPWidenRecipe>(&R)) + return WidenR->getUnderlyingInstr()->getOpcode(); + if (auto *WidenC = dyn_cast<VPWidenCastRecipe>(&R)) + return WidenC->getOpcode(); + if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) + return RepR->getUnderlyingInstr()->getOpcode(); + if (auto *VPI = dyn_cast<VPInstruction>(&R)) + return VPI->getOpcode(); + return 0; +} + +/// Try to simplify recipe \p R. 
+static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { + switch (getOpcodeForRecipe(R)) { + case Instruction::Mul: { + VPValue *A = R.getOperand(0); + VPValue *B = R.getOperand(1); + if (isConstantOne(A)) + return R.getVPSingleValue()->replaceAllUsesWith(B); + if (isConstantOne(B)) + return R.getVPSingleValue()->replaceAllUsesWith(A); + break; + } + case Instruction::Trunc: { + VPRecipeBase *Ext = R.getOperand(0)->getDefiningRecipe(); + if (!Ext) + break; + unsigned ExtOpcode = getOpcodeForRecipe(*Ext); + if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt) + break; + VPValue *A = Ext->getOperand(0); + VPValue *Trunc = R.getVPSingleValue(); + Type *TruncTy = TypeInfo.inferScalarType(Trunc); + Type *ATy = TypeInfo.inferScalarType(A); + if (TruncTy == ATy) { + Trunc->replaceAllUsesWith(A); + } else if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { + auto *VPC = + new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy); + VPC->insertBefore(&R); + Trunc->replaceAllUsesWith(VPC); + } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) { + auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy); + VPC->insertBefore(&R); + Trunc->replaceAllUsesWith(VPC); + } +#ifndef NDEBUG + // Verify that the cached type info is for both A and its users is still + // accurate by comparing it to freshly computed types. + VPTypeAnalysis TypeInfo2(TypeInfo.getContext()); + assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A)); + for (VPUser *U : A->users()) { + auto *R = dyn_cast<VPRecipeBase>(U); + if (!R) + continue; + for (VPValue *VPV : R->definedValues()) + assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV)); + } +#endif + break; + } + default: + break; + } +} + +/// Try to simplify the recipes in \p Plan. +static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) { + ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( + Plan.getEntry()); + VPTypeAnalysis TypeInfo(Ctx); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + simplifyRecipe(R, TypeInfo); + } + } +} + +void VPlanTransforms::truncateToMinimalBitwidths( + VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs, + LLVMContext &Ctx) { +#ifndef NDEBUG + // Count the processed recipes and cross check the count later with MinBWs + // size, to make sure all entries in MinBWs have been handled. + unsigned NumProcessedRecipes = 0; +#endif + // Keep track of created truncates, so they can be re-used. Note that we + // cannot use RAUW after creating a new truncate, as this would could make + // other uses have different types for their operands, making them invalidly + // typed. + DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs; + VPTypeAnalysis TypeInfo(Ctx); + VPBasicBlock *PH = Plan.getEntry(); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_deep(Plan.getVectorLoopRegion()))) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe, + VPWidenSelectRecipe>(&R)) + continue; + + VPValue *ResultVPV = R.getVPSingleValue(); + auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue()); + unsigned NewResSizeInBits = MinBWs.lookup(UI); + if (!NewResSizeInBits) + continue; + +#ifndef NDEBUG + NumProcessedRecipes++; +#endif + // If the value wasn't vectorized, we must maintain the original scalar + // type. 
Skip those here, after incrementing NumProcessedRecipes. Also + // skip casts which do not need to be handled explicitly here, as + // redundant casts will be removed during recipe simplification. + if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R)) { +#ifndef NDEBUG + // If any of the operands is a live-in and not used by VPWidenRecipe or + // VPWidenSelectRecipe, but in MinBWs, make sure it is counted as + // processed as well. When MinBWs is currently constructed, there is no + // information about whether recipes are widened or replicated and in + // case they are reciplicated the operands are not truncated. Counting + // them them here ensures we do not miss any recipes in MinBWs. + // TODO: Remove once the analysis is done on VPlan. + for (VPValue *Op : R.operands()) { + if (!Op->isLiveIn()) + continue; + auto *UV = dyn_cast_or_null<Instruction>(Op->getUnderlyingValue()); + if (UV && MinBWs.contains(UV) && !ProcessedTruncs.contains(Op) && + all_of(Op->users(), [](VPUser *U) { + return !isa<VPWidenRecipe, VPWidenSelectRecipe>(U); + })) { + // Add an entry to ProcessedTruncs to avoid counting the same + // operand multiple times. + ProcessedTruncs[Op] = nullptr; + NumProcessedRecipes += 1; + } + } +#endif + continue; + } + + Type *OldResTy = TypeInfo.inferScalarType(ResultVPV); + unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits(); + assert(OldResTy->isIntegerTy() && "only integer types supported"); + if (OldResSizeInBits == NewResSizeInBits) + continue; + assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?"); + (void)OldResSizeInBits; + + auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits); + + // Shrink operands by introducing truncates as needed. + unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0; + for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) { + auto *Op = R.getOperand(Idx); + unsigned OpSizeInBits = + TypeInfo.inferScalarType(Op)->getScalarSizeInBits(); + if (OpSizeInBits == NewResSizeInBits) + continue; + assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate"); + auto [ProcessedIter, IterIsEmpty] = + ProcessedTruncs.insert({Op, nullptr}); + VPWidenCastRecipe *NewOp = + IterIsEmpty + ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy) + : ProcessedIter->second; + R.setOperand(Idx, NewOp); + if (!IterIsEmpty) + continue; + ProcessedIter->second = NewOp; + if (!Op->isLiveIn()) { + NewOp->insertBefore(&R); + } else { + PH->appendRecipe(NewOp); +#ifndef NDEBUG + auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue()); + bool IsContained = MinBWs.contains(OpInst); + NumProcessedRecipes += IsContained; +#endif + } + } + + // Any wrapping introduced by shrinking this operation shouldn't be + // considered undefined behavior. So, we can't unconditionally copy + // arithmetic wrapping flags to VPW. + if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R)) + VPW->dropPoisonGeneratingFlags(); + + // Extend result to original width. 
+ auto *Ext = new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy); + Ext->insertAfter(&R); + ResultVPV->replaceAllUsesWith(Ext); + Ext->setOperand(0, ResultVPV); + } + } + + assert(MinBWs.size() == NumProcessedRecipes && + "some entries in MinBWs haven't been processed"); +} + +void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) { + removeRedundantCanonicalIVs(Plan); + removeRedundantInductionCasts(Plan); + + optimizeInductions(Plan, SE); + simplifyRecipes(Plan, SE.getContext()); + removeDeadRecipes(Plan); + + createAndOptimizeReplicateRegions(Plan); + + removeRedundantExpandSCEVRecipes(Plan); + mergeBlocksIntoPredecessors(Plan); +} + +// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace +// the loop terminator with a branch-on-cond recipe with the negated +// active-lane-mask as operand. Note that this turns the loop into an +// uncountable one. Only the existing terminator is replaced, all other existing +// recipes/users remain unchanged, except for poison-generating flags being +// dropped from the canonical IV increment. Return the created +// VPActiveLaneMaskPHIRecipe. +// +// The function uses the following definitions: +// +// %TripCount = DataWithControlFlowWithoutRuntimeCheck ? +// calculate-trip-count-minus-VF (original TC) : original TC +// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ? +// CanonicalIVPhi : CanonicalIVIncrement +// %StartV is the canonical induction start value. +// +// The function adds the following recipes: +// +// vector.ph: +// %TripCount = calculate-trip-count-minus-VF (original TC) +// [if DataWithControlFlowWithoutRuntimeCheck] +// %EntryInc = canonical-iv-increment-for-part %StartV +// %EntryALM = active-lane-mask %EntryInc, %TripCount +// +// vector.body: +// ... +// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ] +// ... +// %InLoopInc = canonical-iv-increment-for-part %IncrementValue +// %ALM = active-lane-mask %InLoopInc, TripCount +// %Negated = Not %ALM +// branch-on-cond %Negated +// +static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( + VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) { + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); + auto *CanonicalIVPHI = Plan.getCanonicalIV(); + VPValue *StartV = CanonicalIVPHI->getStartValue(); + + auto *CanonicalIVIncrement = + cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue()); + // TODO: Check if dropping the flags is needed if + // !DataAndControlFlowWithoutRuntimeCheck. + CanonicalIVIncrement->dropPoisonGeneratingFlags(); + DebugLoc DL = CanonicalIVIncrement->getDebugLoc(); + // We can't use StartV directly in the ActiveLaneMask VPInstruction, since + // we have to take unrolling into account. Each part needs to start at + // Part * VF + auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor()); + VPBuilder Builder(VecPreheader); + + // Create the ActiveLaneMask instruction using the correct start values. + VPValue *TC = Plan.getTripCount(); + + VPValue *TripCount, *IncrementValue; + if (!DataAndControlFlowWithoutRuntimeCheck) { + // When the loop is guarded by a runtime overflow check for the loop + // induction variable increment by VF, we can increment the value before + // the get.active.lane mask and use the unmodified tripcount. 
+ IncrementValue = CanonicalIVIncrement; + TripCount = TC; + } else { + // When avoiding a runtime check, the active.lane.mask inside the loop + // uses a modified trip count and the induction variable increment is + // done after the active.lane.mask intrinsic is called. + IncrementValue = CanonicalIVPHI; + TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF, + {TC}, DL); + } + auto *EntryIncrement = Builder.createOverflowingOp( + VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL, + "index.part.next"); + + // Create the active lane mask instruction in the VPlan preheader. + auto *EntryALM = + Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC}, + DL, "active.lane.mask.entry"); + + // Now create the ActiveLaneMaskPhi recipe in the main loop using the + // preheader ActiveLaneMask instruction. + auto LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); + LaneMaskPhi->insertAfter(CanonicalIVPHI); + + // Create the active lane mask for the next iteration of the loop before the + // original terminator. + VPRecipeBase *OriginalTerminator = EB->getTerminator(); + Builder.setInsertPoint(OriginalTerminator); + auto *InLoopIncrement = + Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, + {IncrementValue}, {false, false}, DL); + auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, + {InLoopIncrement, TripCount}, DL, + "active.lane.mask.next"); + LaneMaskPhi->addOperand(ALM); + + // Replace the original terminator with BranchOnCond. We have to invert the + // mask here because a true condition means jumping to the exit block. + auto *NotMask = Builder.createNot(ALM, DL); + Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL); + OriginalTerminator->eraseFromParent(); + return LaneMaskPhi; +} + +void VPlanTransforms::addActiveLaneMask( + VPlan &Plan, bool UseActiveLaneMaskForControlFlow, + bool DataAndControlFlowWithoutRuntimeCheck) { + assert((!DataAndControlFlowWithoutRuntimeCheck || + UseActiveLaneMaskForControlFlow) && + "DataAndControlFlowWithoutRuntimeCheck implies " + "UseActiveLaneMaskForControlFlow"); + + auto FoundWidenCanonicalIVUser = + find_if(Plan.getCanonicalIV()->users(), + [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }); + assert(FoundWidenCanonicalIVUser && + "Must have widened canonical IV when tail folding!"); + auto *WideCanonicalIV = + cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser); + VPRecipeBase *LaneMask; + if (UseActiveLaneMaskForControlFlow) { + LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( + Plan, DataAndControlFlowWithoutRuntimeCheck); + } else { + LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask, + {WideCanonicalIV, Plan.getTripCount()}, + nullptr, "active.lane.mask"); + LaneMask->insertAfter(WideCanonicalIV); + } + + // Walk users of WideCanonicalIV and replace all compares of the form + // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an + // active-lane-mask. 
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); + for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) { + auto *CompareToReplace = dyn_cast<VPInstruction>(U); + if (!CompareToReplace || + CompareToReplace->getOpcode() != Instruction::ICmp || + CompareToReplace->getPredicate() != CmpInst::ICMP_ULE || + CompareToReplace->getOperand(1) != BTC) + continue; + + assert(CompareToReplace->getOperand(0) == WideCanonicalIV && + "WidenCanonicalIV must be the first operand of the compare"); + CompareToReplace->replaceAllUsesWith(LaneMask->getVPSingleValue()); + CompareToReplace->eraseFromParent(); + } +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 3eccf6e9600d..3bf91115debb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -22,11 +22,9 @@ class InductionDescriptor; class Instruction; class PHINode; class ScalarEvolution; -class Loop; class PredicatedScalarEvolution; class TargetLibraryInfo; class VPBuilder; -class VPRecipeBuilder; struct VPlanTransforms { /// Replaces the VPInstructions in \p Plan with corresponding @@ -37,12 +35,56 @@ struct VPlanTransforms { GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI); + /// Sink users of fixed-order recurrences after the recipe defining their + /// previous value. Then introduce FirstOrderRecurrenceSplice VPInstructions + /// to combine the value from the recurrence phis and previous values. The + /// current implementation assumes all users can be sunk after the previous + /// value, which is enforced by earlier legality checks. + /// \returns true if all users of fixed-order recurrences could be re-arranged + /// as needed or false if it is not possible. In the latter case, \p Plan is + /// not valid. + static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder); + + /// Clear NSW/NUW flags from reduction instructions if necessary. + static void clearReductionWrapFlags(VPlan &Plan); + + /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the + /// resulting plan to \p BestVF and \p BestUF. + static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, + unsigned BestUF, + PredicatedScalarEvolution &PSE); + + /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe + /// optimizations, dead recipe removal, replicate region optimizations and + /// block merging. + static void optimize(VPlan &Plan, ScalarEvolution &SE); + /// Wrap predicated VPReplicateRecipes with a mask operand in an if-then /// region block and remove the mask operand. Optimize the created regions by /// iteratively sinking scalar operands into the region, followed by merging /// regions until no improvements are remaining. static void createAndOptimizeReplicateRegions(VPlan &Plan); + /// Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an + /// (active-lane-mask recipe, wide canonical IV, trip-count). If \p + /// UseActiveLaneMaskForControlFlow is true, introduce an + /// VPActiveLaneMaskPHIRecipe. If \p DataAndControlFlowWithoutRuntimeCheck is + /// true, no minimum-iteration runtime check will be created (during skeleton + /// creation) and instead it is handled using active-lane-mask. \p + /// DataAndControlFlowWithoutRuntimeCheck implies \p + /// UseActiveLaneMaskForControlFlow. 
+ static void addActiveLaneMask(VPlan &Plan, + bool UseActiveLaneMaskForControlFlow, + bool DataAndControlFlowWithoutRuntimeCheck); + + /// Insert truncates and extends for any truncated recipe. Redundant casts + /// will be folded later. + static void + truncateToMinimalBitwidths(VPlan &Plan, + const MapVector<Instruction *, uint64_t> &MinBWs, + LLVMContext &Ctx); + +private: /// Remove redundant VPBasicBlocks by merging them into their predecessor if /// the predecessor has a single successor. static bool mergeBlocksIntoPredecessors(VPlan &Plan); @@ -71,24 +113,6 @@ struct VPlanTransforms { /// them with already existing recipes expanding the same SCEV expression. static void removeRedundantExpandSCEVRecipes(VPlan &Plan); - /// Sink users of fixed-order recurrences after the recipe defining their - /// previous value. Then introduce FirstOrderRecurrenceSplice VPInstructions - /// to combine the value from the recurrence phis and previous values. The - /// current implementation assumes all users can be sunk after the previous - /// value, which is enforced by earlier legality checks. - /// \returns true if all users of fixed-order recurrences could be re-arranged - /// as needed or false if it is not possible. In the latter case, \p Plan is - /// not valid. - static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder); - - /// Clear NSW/NUW flags from reduction instructions if necessary. - static void clearReductionWrapFlags(VPlan &Plan); - - /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the - /// resulting plan to \p BestVF and \p BestUF. - static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, - unsigned BestUF, - PredicatedScalarEvolution &PSE); }; } // namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h index ac110bb3b0ef..116acad8e8f3 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -121,18 +121,11 @@ public: /// Remove a single \p User from the list of users. void removeUser(VPUser &User) { - bool Found = false; // The same user can be added multiple times, e.g. because the same VPValue // is used twice by the same VPUser. Remove a single one. - erase_if(Users, [&User, &Found](VPUser *Other) { - if (Found) - return false; - if (Other == &User) { - Found = true; - return true; - } - return false; - }); + auto *I = find(Users, &User); + if (I != Users.end()) + Users.erase(I); } typedef SmallVectorImpl<VPUser *>::iterator user_iterator; @@ -163,6 +156,13 @@ public: void replaceAllUsesWith(VPValue *New); + /// Go through the uses list for this VPValue and make each use point to \p + /// New if the callback ShouldReplace returns true for the given use specified + /// by a pair of (VPUser, the use index). + void replaceUsesWithIf( + VPValue *New, + llvm::function_ref<bool(VPUser &U, unsigned Idx)> ShouldReplace); + /// Returns the recipe defining this VPValue or nullptr if it is not defined /// by a recipe, i.e. is a live-in. VPRecipeBase *getDefiningRecipe(); @@ -296,6 +296,14 @@ public: "Op must be an operand of the recipe"); return false; } + + /// Returns true if the VPUser only uses the first part of operand \p Op. + /// Conservatively returns false. 
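Editor's note: the VPlanValue.h hunk above replaces the erase_if-with-a-found-flag dance in `removeUser` with a straightforward find-and-erase, which removes exactly one occurrence even when the same user is registered several times. A sketch of that pattern with the standard library (illustrative, not the VPValue code itself); the `onlyFirstPartUsed` declaration documented just above continues after this note.

```cpp
// Remove a single occurrence of a value from a vector, leaving any duplicates
// in place, which is the behaviour removeUser needs when one VPUser holds
// several uses of the same VPValue.  Plain std:: types, for illustration only.
#include <algorithm>
#include <cassert>
#include <vector>

template <typename T>
void removeSingleOccurrence(std::vector<T> &Vec, const T &Val) {
  auto It = std::find(Vec.begin(), Vec.end(), Val);
  if (It != Vec.end())
    Vec.erase(It);
}

int main() {
  std::vector<int> Users = {1, 2, 2, 3};
  removeSingleOccurrence(Users, 2);
  // One '2' remains: the old erase_if needed the extra Found flag to stop
  // after the first match, which this form gets for free.
  assert((Users == std::vector<int>{1, 2, 3}));
}
```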
+ virtual bool onlyFirstPartUsed(const VPValue *Op) const { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return false; + } }; /// This class augments a recipe with a set of VPValues defined by the recipe. @@ -325,7 +333,7 @@ class VPDef { assert(V->Def == this && "can only remove VPValue linked with this VPDef"); assert(is_contained(DefinedValues, V) && "VPValue to remove must be in DefinedValues"); - erase_value(DefinedValues, V); + llvm::erase(DefinedValues, V); V->Def = nullptr; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 13464c9d3496..f18711ba30b7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -13,6 +13,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" @@ -28,6 +30,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/Local.h" #include <numeric> +#include <queue> #define DEBUG_TYPE "vector-combine" #include "llvm/Transforms/Utils/InstructionWorklist.h" @@ -100,8 +103,9 @@ private: Instruction &I); bool foldExtractExtract(Instruction &I); bool foldInsExtFNeg(Instruction &I); - bool foldBitcastShuf(Instruction &I); + bool foldBitcastShuffle(Instruction &I); bool scalarizeBinopOrCmp(Instruction &I); + bool scalarizeVPIntrinsic(Instruction &I); bool foldExtractedCmps(Instruction &I); bool foldSingleElementStore(Instruction &I); bool scalarizeLoadExtract(Instruction &I); @@ -258,8 +262,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) { // It is safe and potentially profitable to load a vector directly: // inselt undef, load Scalar, 0 --> load VecPtr IRBuilder<> Builder(Load); - Value *CastedPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( - SrcPtr, MinVecTy->getPointerTo(AS)); + Value *CastedPtr = + Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS)); Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment); VecLd = Builder.CreateShuffleVector(VecLd, Mask); @@ -321,7 +325,7 @@ bool VectorCombine::widenSubvectorLoad(Instruction &I) { IRBuilder<> Builder(Load); Value *CastedPtr = - Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Ty->getPointerTo(AS)); + Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS)); Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment); replaceValue(I, *VecLd); ++NumVecLoad; @@ -677,7 +681,7 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { /// If this is a bitcast of a shuffle, try to bitcast the source vector to the /// destination type followed by shuffle. This can enable further transforms by /// moving bitcasts or shuffles together. -bool VectorCombine::foldBitcastShuf(Instruction &I) { +bool VectorCombine::foldBitcastShuffle(Instruction &I) { Value *V; ArrayRef<int> Mask; if (!match(&I, m_BitCast( @@ -687,35 +691,43 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) { // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for // scalable type is unknown; Second, we cannot reason if the narrowed shuffle // mask for scalable type is a splat or not. 
- // 2) Disallow non-vector casts and length-changing shuffles. + // 2) Disallow non-vector casts. // TODO: We could allow any shuffle. + auto *DestTy = dyn_cast<FixedVectorType>(I.getType()); auto *SrcTy = dyn_cast<FixedVectorType>(V->getType()); - if (!SrcTy || I.getOperand(0)->getType() != SrcTy) + if (!DestTy || !SrcTy) + return false; + + unsigned DestEltSize = DestTy->getScalarSizeInBits(); + unsigned SrcEltSize = SrcTy->getScalarSizeInBits(); + if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0) return false; - auto *DestTy = cast<FixedVectorType>(I.getType()); - unsigned DestNumElts = DestTy->getNumElements(); - unsigned SrcNumElts = SrcTy->getNumElements(); SmallVector<int, 16> NewMask; - if (SrcNumElts <= DestNumElts) { + if (DestEltSize <= SrcEltSize) { // The bitcast is from wide to narrow/equal elements. The shuffle mask can // always be expanded to the equivalent form choosing narrower elements. - assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask"); - unsigned ScaleFactor = DestNumElts / SrcNumElts; + assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask"); + unsigned ScaleFactor = SrcEltSize / DestEltSize; narrowShuffleMaskElts(ScaleFactor, Mask, NewMask); } else { // The bitcast is from narrow elements to wide elements. The shuffle mask // must choose consecutive elements to allow casting first. - assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask"); - unsigned ScaleFactor = SrcNumElts / DestNumElts; + assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask"); + unsigned ScaleFactor = DestEltSize / SrcEltSize; if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask)) return false; } + // Bitcast the shuffle src - keep its original width but using the destination + // scalar type. + unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize; + auto *ShuffleTy = FixedVectorType::get(DestTy->getScalarType(), NumSrcElts); + // The new shuffle must not cost more than the old shuffle. The bitcast is // moved ahead of the shuffle, so assume that it has the same cost as before. InstructionCost DestCost = TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, DestTy, NewMask); + TargetTransformInfo::SK_PermuteSingleSrc, ShuffleTy, NewMask); InstructionCost SrcCost = TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy, Mask); if (DestCost > SrcCost || !DestCost.isValid()) @@ -723,12 +735,131 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) { // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' ++NumShufOfBitcast; - Value *CastV = Builder.CreateBitCast(V, DestTy); + Value *CastV = Builder.CreateBitCast(V, ShuffleTy); Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask); replaceValue(I, *Shuf); return true; } +/// VP Intrinsics whose vector operands are both splat values may be simplified +/// into the scalar version of the operation and the result splatted. This +/// can lead to scalarization down the line. +bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) { + if (!isa<VPIntrinsic>(I)) + return false; + VPIntrinsic &VPI = cast<VPIntrinsic>(I); + Value *Op0 = VPI.getArgOperand(0); + Value *Op1 = VPI.getArgOperand(1); + + if (!isSplatValue(Op0) || !isSplatValue(Op1)) + return false; + + // Check getSplatValue early in this function, to avoid doing unnecessary + // work. 
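Editor's note: `foldBitcastShuffle` above now scales the shuffle mask by the ratio of source to destination element sizes rather than by element counts, which also admits length-changing shuffles. The sketch below re-implements the two mask rewrites in isolation so the scaling direction is easy to see; it is a simplified stand-in for `narrowShuffleMaskElts`/`widenShuffleMaskElts`, not the LLVM helpers themselves, and it ignores undef (-1) mask entries for brevity. The splat-value checks for the VP-intrinsic scalarization described just above continue right after this note.

```cpp
// Simplified stand-ins for the mask rewrites used by foldBitcastShuffle: when
// a bitcast changes the element size by an integral factor, the shuffle mask
// can be rewritten to index the re-typed elements.
#include <cstdio>
#include <vector>

// Each wide element M expands to Scale narrow elements M*Scale..M*Scale+Scale-1.
std::vector<int> narrowMask(unsigned Scale, const std::vector<int> &Mask) {
  std::vector<int> Out;
  for (int M : Mask)
    for (unsigned I = 0; I < Scale; ++I)
      Out.push_back(M * (int)Scale + (int)I);
  return Out;
}

// Scale consecutive, aligned narrow elements fold into one wide element;
// returns false when the mask does not select whole wide elements.
bool widenMask(unsigned Scale, const std::vector<int> &Mask,
               std::vector<int> &Out) {
  if (Mask.size() % Scale != 0)
    return false;
  for (size_t I = 0; I < Mask.size(); I += Scale) {
    int Leader = Mask[I];
    if (Leader % (int)Scale != 0)
      return false;
    for (unsigned J = 1; J < Scale; ++J)
      if (Mask[I + J] != Leader + (int)J)
        return false;
    Out.push_back(Leader / (int)Scale);
  }
  return true;
}

int main() {
  // A <2 x i64> shuffle mask <1,0> viewed as <4 x i32> becomes <2,3,0,1>.
  for (int M : narrowMask(2, {1, 0}))
    std::printf("%d ", M);
  std::printf("\n");
  // A <4 x i32> mask <2,3,0,1> viewed as <2 x i64> becomes <1,0>.
  std::vector<int> Wide;
  if (widenMask(2, {2, 3, 0, 1}, Wide))
    std::printf("%d %d\n", Wide[0], Wide[1]);
}
```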
+ Value *ScalarOp0 = getSplatValue(Op0); + Value *ScalarOp1 = getSplatValue(Op1); + if (!ScalarOp0 || !ScalarOp1) + return false; + + // For the binary VP intrinsics supported here, the result on disabled lanes + // is a poison value. For now, only do this simplification if all lanes + // are active. + // TODO: Relax the condition that all lanes are active by using insertelement + // on inactive lanes. + auto IsAllTrueMask = [](Value *MaskVal) { + if (Value *SplattedVal = getSplatValue(MaskVal)) + if (auto *ConstValue = dyn_cast<Constant>(SplattedVal)) + return ConstValue->isAllOnesValue(); + return false; + }; + if (!IsAllTrueMask(VPI.getArgOperand(2))) + return false; + + // Check to make sure we support scalarization of the intrinsic + Intrinsic::ID IntrID = VPI.getIntrinsicID(); + if (!VPBinOpIntrinsic::isVPBinOp(IntrID)) + return false; + + // Calculate cost of splatting both operands into vectors and the vector + // intrinsic + VectorType *VecTy = cast<VectorType>(VPI.getType()); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + InstructionCost SplatCost = + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) + + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); + + // Calculate the cost of the VP Intrinsic + SmallVector<Type *, 4> Args; + for (Value *V : VPI.args()) + Args.push_back(V->getType()); + IntrinsicCostAttributes Attrs(IntrID, VecTy, Args); + InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind); + InstructionCost OldCost = 2 * SplatCost + VectorOpCost; + + // Determine scalar opcode + std::optional<unsigned> FunctionalOpcode = + VPI.getFunctionalOpcode(); + std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt; + if (!FunctionalOpcode) { + ScalarIntrID = VPI.getFunctionalIntrinsicID(); + if (!ScalarIntrID) + return false; + } + + // Calculate cost of scalarizing + InstructionCost ScalarOpCost = 0; + if (ScalarIntrID) { + IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args); + ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind); + } else { + ScalarOpCost = + TTI.getArithmeticInstrCost(*FunctionalOpcode, VecTy->getScalarType()); + } + + // The existing splats may be kept around if other instructions use them. + InstructionCost CostToKeepSplats = + (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse()); + InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats; + + LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI + << "\n"); + LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost + << ", Cost of scalarizing:" << NewCost << "\n"); + + // We want to scalarize unless the vector variant actually has lower cost. + if (OldCost < NewCost || !NewCost.isValid()) + return false; + + // Scalarize the intrinsic + ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount(); + Value *EVL = VPI.getArgOperand(3); + const DataLayout &DL = VPI.getModule()->getDataLayout(); + + // If the VP op might introduce UB or poison, we can scalarize it provided + // that we know the EVL > 0: If the EVL is zero, then the original VP op + // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by + // scalarizing it. 
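Editor's note: the cost comparison above weighs the original sequence (splat both operands, then run the vector VP op) against running the scalar op once and splatting the result, charging the operand splats again only when they have other users. A tiny arithmetic sketch of that bookkeeping with invented cost numbers; in the pass the values come from TargetTransformInfo. The EVL / speculation-safety check that the comment just above describes follows next in the diff.

```cpp
// Toy version of the scalarizeVPIntrinsic cost comparison.  The numbers are
// invented; the pass obtains them from TTI.getVectorInstrCost,
// TTI.getShuffleCost, TTI.getIntrinsicInstrCost and TTI.getArithmeticInstrCost.
#include <cstdio>

struct Costs {
  unsigned SplatCost;    // insertelement + broadcast shuffle
  unsigned VectorOpCost; // the VP intrinsic itself
  unsigned ScalarOpCost; // the scalar add/sub/...
};

bool shouldScalarize(const Costs &C, bool Op0HasOtherUses,
                     bool Op1HasOtherUses) {
  unsigned OldCost = 2 * C.SplatCost + C.VectorOpCost;
  // One splat is still needed to broadcast the scalar result, and any operand
  // splat that has other users must be kept around as well.
  unsigned CostToKeepSplats = (Op0HasOtherUses ? C.SplatCost : 0) +
                              (Op1HasOtherUses ? C.SplatCost : 0);
  unsigned NewCost = C.ScalarOpCost + C.SplatCost + CostToKeepSplats;
  return NewCost <= OldCost; // scalarize unless the vector form is cheaper
}

int main() {
  Costs C{/*SplatCost=*/3, /*VectorOpCost=*/2, /*ScalarOpCost=*/1};
  std::printf("%d\n", shouldScalarize(C, /*Op0*/ false, /*Op1*/ false)); // 1
  std::printf("%d\n", shouldScalarize(C, /*Op0*/ true, /*Op1*/ true));   // 0
}
```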
+ bool SafeToSpeculate; + if (ScalarIntrID) + SafeToSpeculate = Intrinsic::getAttributes(I.getContext(), *ScalarIntrID) + .hasFnAttr(Attribute::AttrKind::Speculatable); + else + SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode( + *FunctionalOpcode, &VPI, nullptr, &AC, &DT); + if (!SafeToSpeculate && !isKnownNonZero(EVL, DL, 0, &AC, &VPI, &DT)) + return false; + + Value *ScalarVal = + ScalarIntrID + ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID, + {ScalarOp0, ScalarOp1}) + : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode), + ScalarOp0, ScalarOp1); + + replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal)); + return true; +} + /// Match a vector binop or compare instruction with at least one inserted /// scalar operand and convert to scalar binop/cmp followed by insertelement. bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { @@ -1013,19 +1144,24 @@ public: /// Check if it is legal to scalarize a memory access to \p VecTy at index \p /// Idx. \p Idx must access a valid vector element. -static ScalarizationResult canScalarizeAccess(FixedVectorType *VecTy, - Value *Idx, Instruction *CtxI, +static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, + Instruction *CtxI, AssumptionCache &AC, const DominatorTree &DT) { + // We do checks for both fixed vector types and scalable vector types. + // This is the number of elements of fixed vector types, + // or the minimum number of elements of scalable vector types. + uint64_t NumElements = VecTy->getElementCount().getKnownMinValue(); + if (auto *C = dyn_cast<ConstantInt>(Idx)) { - if (C->getValue().ult(VecTy->getNumElements())) + if (C->getValue().ult(NumElements)) return ScalarizationResult::safe(); return ScalarizationResult::unsafe(); } unsigned IntWidth = Idx->getType()->getScalarSizeInBits(); APInt Zero(IntWidth, 0); - APInt MaxElts(IntWidth, VecTy->getNumElements()); + APInt MaxElts(IntWidth, NumElements); ConstantRange ValidIndices(Zero, MaxElts); ConstantRange IdxRange(IntWidth, true); @@ -1074,8 +1210,7 @@ static Align computeAlignmentAfterScalarization(Align VectorAlignment, // store i32 %b, i32* %1 bool VectorCombine::foldSingleElementStore(Instruction &I) { auto *SI = cast<StoreInst>(&I); - if (!SI->isSimple() || - !isa<FixedVectorType>(SI->getValueOperand()->getType())) + if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType())) return false; // TODO: Combine more complicated patterns (multiple insert) by referencing @@ -1089,13 +1224,13 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) { return false; if (auto *Load = dyn_cast<LoadInst>(Source)) { - auto VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType()); + auto VecTy = cast<VectorType>(SI->getValueOperand()->getType()); const DataLayout &DL = I.getModule()->getDataLayout(); Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts(); // Don't optimize for atomic/volatile load or store. Ensure memory is not // modified between, vector type matches store size, and index is inbounds. 
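Editor's note: `canScalarizeAccess` above now handles both fixed and scalable vectors by validating the index against the known minimum number of elements, which is exact for fixed types and a conservative lower bound for scalable ones (a `<vscale x 4 x i32>` has at least 4 lanes). Below is a small sketch of that bound check; `ToyElementCount` mimics the shape of `llvm::ElementCount` but is a local stand-in. The load/store preconditions listed in the comment just above are checked immediately after this note.

```cpp
// Sketch of the "index < known-minimum element count" bound used by
// canScalarizeAccess.  For scalable vectors the known-minimum lane count is a
// lower bound on the real lane count, so an index below it is always in
// bounds; anything else is rejected as not provably safe.
#include <cstdint>
#include <cstdio>

struct ToyElementCount {
  uint64_t MinVal;  // known minimum number of lanes
  bool IsScalable;  // multiplied by the runtime vscale when true
  uint64_t knownMin() const { return MinVal; }
};

bool constantIndexIsSafe(uint64_t Idx, ToyElementCount EC) {
  // Fixed <4 x i32>: exactly 4 lanes.  Scalable <vscale x 4 x i32>: at least
  // 4 lanes.  Either way, Idx < knownMin() cannot go out of bounds.
  return Idx < EC.knownMin();
}

int main() {
  ToyElementCount Fixed{4, false}, Scalable{4, true};
  std::printf("%d %d\n", constantIndexIsSafe(3, Fixed),
              constantIndexIsSafe(3, Scalable)); // 1 1
  // Index 7 might be in bounds at runtime for the scalable type, but it is
  // not provably so, hence the conservative rejection.
  std::printf("%d %d\n", constantIndexIsSafe(4, Fixed),
              constantIndexIsSafe(7, Scalable)); // 0 0
}
```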
if (!Load->isSimple() || Load->getParent() != SI->getParent() || - !DL.typeSizeEqualsStoreSize(Load->getType()) || + !DL.typeSizeEqualsStoreSize(Load->getType()->getScalarType()) || SrcAddr != SI->getPointerOperand()->stripPointerCasts()) return false; @@ -1130,19 +1265,26 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (!match(&I, m_Load(m_Value(Ptr)))) return false; - auto *FixedVT = cast<FixedVectorType>(I.getType()); + auto *VecTy = cast<VectorType>(I.getType()); auto *LI = cast<LoadInst>(&I); const DataLayout &DL = I.getModule()->getDataLayout(); - if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(FixedVT)) + if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(VecTy->getScalarType())) return false; InstructionCost OriginalCost = - TTI.getMemoryOpCost(Instruction::Load, FixedVT, LI->getAlign(), + TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(), LI->getPointerAddressSpace()); InstructionCost ScalarizedCost = 0; Instruction *LastCheckedInst = LI; unsigned NumInstChecked = 0; + DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze; + auto FailureGuard = make_scope_exit([&]() { + // If the transform is aborted, discard the ScalarizationResults. + for (auto &Pair : NeedFreeze) + Pair.second.discard(); + }); + // Check if all users of the load are extracts with no memory modifications // between the load and the extract. Compute the cost of both the original // code and the scalarized version. @@ -1151,9 +1293,6 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (!UI || UI->getParent() != LI->getParent()) return false; - if (!isGuaranteedNotToBePoison(UI->getOperand(1), &AC, LI, &DT)) - return false; - // Check if any instruction between the load and the extract may modify // memory. if (LastCheckedInst->comesBefore(UI)) { @@ -1168,22 +1307,23 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { LastCheckedInst = UI; } - auto ScalarIdx = canScalarizeAccess(FixedVT, UI->getOperand(1), &I, AC, DT); - if (!ScalarIdx.isSafe()) { - // TODO: Freeze index if it is safe to do so. - ScalarIdx.discard(); + auto ScalarIdx = canScalarizeAccess(VecTy, UI->getOperand(1), &I, AC, DT); + if (ScalarIdx.isUnsafe()) return false; + if (ScalarIdx.isSafeWithFreeze()) { + NeedFreeze.try_emplace(UI, ScalarIdx); + ScalarIdx.discard(); } auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1)); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; OriginalCost += - TTI.getVectorInstrCost(Instruction::ExtractElement, FixedVT, CostKind, + TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, Index ? Index->getZExtValue() : -1); ScalarizedCost += - TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(), + TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(), Align(1), LI->getPointerAddressSpace()); - ScalarizedCost += TTI.getAddressComputationCost(FixedVT->getElementType()); + ScalarizedCost += TTI.getAddressComputationCost(VecTy->getElementType()); } if (ScalarizedCost >= OriginalCost) @@ -1192,21 +1332,27 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { // Replace extracts with narrow scalar loads. for (User *U : LI->users()) { auto *EI = cast<ExtractElementInst>(U); - Builder.SetInsertPoint(EI); - Value *Idx = EI->getOperand(1); + + // Insert 'freeze' for poison indexes. 
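Editor's note: `scalarizeLoadExtract` above collects ScalarizationResults that may require a freeze and guards them with `llvm::make_scope_exit`: if any later check bails out, the guard discards the pending results, and on success `FailureGuard.release()` disarms it. A minimal sketch of that RAII pattern using `llvm/ADT/ScopeExit.h`; the surrounding bookkeeping (`PendingFreezes`, `tryTransform`) is invented. The freeze insertion announced by the comment just above follows next in the diff.

```cpp
// The FailureGuard idiom: register a cleanup that throws away speculative
// bookkeeping, and disarm it with release() only once the whole transform has
// committed.  PendingFreezes is a toy stand-in for the NeedFreeze map.
#include "llvm/ADT/ScopeExit.h"
#include <cstdio>
#include <map>

bool tryTransform(bool EverythingChecksOut) {
  std::map<int, const char *> PendingFreezes;
  auto FailureGuard = llvm::make_scope_exit([&]() {
    // Runs on every early return: drop state that only a successful
    // transform is allowed to consume.
    PendingFreezes.clear();
    std::printf("aborted, discarded pending work\n");
  });

  PendingFreezes[0] = "freeze idx0";
  if (!EverythingChecksOut)
    return false; // guard fires here

  // ... apply the rewrite using PendingFreezes ...
  FailureGuard.release(); // success: the cleanup must not run
  return true;
}

int main() {
  tryTransform(false); // prints the abort message
  tryTransform(true);  // silent
}
```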
+ auto It = NeedFreeze.find(EI); + if (It != NeedFreeze.end()) + It->second.freeze(Builder, *cast<Instruction>(Idx)); + + Builder.SetInsertPoint(EI); Value *GEP = - Builder.CreateInBoundsGEP(FixedVT, Ptr, {Builder.getInt32(0), Idx}); + Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx}); auto *NewLoad = cast<LoadInst>(Builder.CreateLoad( - FixedVT->getElementType(), GEP, EI->getName() + ".scalar")); + VecTy->getElementType(), GEP, EI->getName() + ".scalar")); Align ScalarOpAlignment = computeAlignmentAfterScalarization( - LI->getAlign(), FixedVT->getElementType(), Idx, DL); + LI->getAlign(), VecTy->getElementType(), Idx, DL); NewLoad->setAlignment(ScalarOpAlignment); replaceValue(*EI, *NewLoad); } + FailureGuard.release(); return true; } @@ -1340,21 +1486,28 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) { dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType()); if (!ShuffleInputType) return false; - int NumInputElts = ShuffleInputType->getNumElements(); + unsigned NumInputElts = ShuffleInputType->getNumElements(); // Find the mask from sorting the lanes into order. This is most likely to // become a identity or concat mask. Undef elements are pushed to the end. SmallVector<int> ConcatMask; Shuffle->getShuffleMask(ConcatMask); sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; }); + // In the case of a truncating shuffle it's possible for the mask + // to have an index greater than the size of the resulting vector. + // This requires special handling. + bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts; bool UsesSecondVec = - any_of(ConcatMask, [&](int M) { return M >= NumInputElts; }); + any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; }); + + FixedVectorType *VecTyForCost = + (UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType; InstructionCost OldCost = TTI.getShuffleCost( - UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType, - Shuffle->getShuffleMask()); + UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, + VecTyForCost, Shuffle->getShuffleMask()); InstructionCost NewCost = TTI.getShuffleCost( - UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType, - ConcatMask); + UsesSecondVec ? 
TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, + VecTyForCost, ConcatMask); LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle << "\n"); @@ -1657,16 +1810,16 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { return SSV->getOperand(Op); return SV->getOperand(Op); }; - Builder.SetInsertPoint(SVI0A->getInsertionPointAfterDef()); + Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef()); Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0), GetShuffleOperand(SVI0A, 1), V1A); - Builder.SetInsertPoint(SVI0B->getInsertionPointAfterDef()); + Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef()); Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0), GetShuffleOperand(SVI0B, 1), V1B); - Builder.SetInsertPoint(SVI1A->getInsertionPointAfterDef()); + Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef()); Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0), GetShuffleOperand(SVI1A, 1), V2A); - Builder.SetInsertPoint(SVI1B->getInsertionPointAfterDef()); + Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef()); Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0), GetShuffleOperand(SVI1B, 1), V2B); Builder.SetInsertPoint(Op0); @@ -1723,9 +1876,6 @@ bool VectorCombine::run() { case Instruction::ShuffleVector: MadeChange |= widenSubvectorLoad(I); break; - case Instruction::Load: - MadeChange |= scalarizeLoadExtract(I); - break; default: break; } @@ -1733,13 +1883,15 @@ bool VectorCombine::run() { // This transform works with scalable and fixed vectors // TODO: Identify and allow other scalable transforms - if (isa<VectorType>(I.getType())) + if (isa<VectorType>(I.getType())) { MadeChange |= scalarizeBinopOrCmp(I); + MadeChange |= scalarizeLoadExtract(I); + MadeChange |= scalarizeVPIntrinsic(I); + } if (Opcode == Instruction::Store) MadeChange |= foldSingleElementStore(I); - // If this is an early pipeline invocation of this pass, we are done. if (TryEarlyFoldsOnly) return; @@ -1758,7 +1910,7 @@ bool VectorCombine::run() { MadeChange |= foldSelectShuffle(I); break; case Instruction::BitCast: - MadeChange |= foldBitcastShuf(I); + MadeChange |= foldBitcastShuffle(I); break; } } else { |
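Editor's note: the `run()` changes at the end of this diff gate `scalarizeBinopOrCmp`, `scalarizeLoadExtract` and the new `scalarizeVPIntrinsic` on any vector result type, fixed or scalable, and keep the cheap folds available to the early, `TryEarlyFoldsOnly` invocation of the pass. For context, here is a sketch of driving VectorCombine over a module with the new pass manager; it assumes the `VectorCombinePass(bool TryEarlyFoldsOnly)` constructor visible in this tree, and the IR string is an arbitrary example, not taken from the diff.

```cpp
// Minimal new-pass-manager driver for VectorCombine (a sketch, assuming the
// VectorCombinePass(bool TryEarlyFoldsOnly) constructor).
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  SMDiagnostic Err;
  std::unique_ptr<Module> M = parseAssemblyString(
      "define <4 x i32> @f(<4 x i32> %v, i32 %s) {\n"
      "  %i = insertelement <4 x i32> poison, i32 %s, i32 0\n"
      "  %r = add <4 x i32> %v, %i\n"
      "  ret <4 x i32> %r\n"
      "}\n",
      Err, Ctx);
  if (!M)
    return 1;

  // Standard analysis-manager wiring for a function pass pipeline.
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/false));
  for (Function &F : *M)
    if (!F.isDeclaration())
      FPM.run(F, FAM);

  M->print(outs(), nullptr);
}
```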
