path: root/contrib/llvm-project/llvm/lib/Transforms/Vectorize
author     Dimitry Andric <dim@FreeBSD.org>  2023-12-18 20:30:12 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2024-04-06 20:11:55 +0000
commit     5f757f3ff9144b609b3c433dfd370cc6bdc191ad (patch)
tree       1b4e980b866cd26a00af34c0a653eb640bd09caf /contrib/llvm-project/llvm/lib/Transforms/Vectorize
parent     3e1c8a35f741a5d114d0ba670b15191355711fe9 (diff)
parent     312c0ed19cc5276a17bacf2120097bec4515b0f1 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp |   11
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp |   38
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h |   62
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2062
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4231
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h |    7
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp |  230
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h |  585
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp |  237
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h |   61
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp |  257
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp |  575
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp |  484
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h |   64
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h |   30
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp |  270
16 files changed, 5822 insertions, 3382 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 260d7889906b..fa2459d1ca02 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -103,13 +103,11 @@
#include "llvm/Support/ModRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
-#include <limits>
#include <numeric>
#include <optional>
#include <tuple>
@@ -900,9 +898,9 @@ bool Vectorizer::vectorizeChain(Chain &C) {
// Chain is in offset order, so C[0] is the instr with the lowest offset,
// i.e. the root of the vector.
- Value *Bitcast = Builder.CreateBitCast(
- getLoadStorePointerOperand(C[0].Inst), VecTy->getPointerTo(AS));
- VecInst = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
+ VecInst = Builder.CreateAlignedLoad(VecTy,
+ getLoadStorePointerOperand(C[0].Inst),
+ Alignment);
unsigned VecIdx = 0;
for (const ChainElem &E : C) {
@@ -976,8 +974,7 @@ bool Vectorizer::vectorizeChain(Chain &C) {
// i.e. the root of the vector.
VecInst = Builder.CreateAlignedStore(
Vec,
- Builder.CreateBitCast(getLoadStorePointerOperand(C[0].Inst),
- VecTy->getPointerTo(AS)),
+ getLoadStorePointerOperand(C[0].Inst),
Alignment);
}
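
The two hunks above drop the explicit bitcast to a typed vector pointer: with opaque pointers, the chain's root pointer operand feeds the vector load or store directly. A minimal sketch of the new pattern, using a hypothetical helper that is not part of the vectorizer and assuming the LLVM C++ headers are available:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// Hypothetical helper: emit one vector load for a chain of NumElts scalar
// loads rooted at RootPtr, the way the updated vectorizeChain() does it.
static LoadInst *emitChainLoad(IRBuilder<> &Builder, Type *EltTy,
                               Value *RootPtr, unsigned NumElts,
                               Align Alignment) {
  auto *VecTy = FixedVectorType::get(EltTy, NumElts);
  // No CreateBitCast to VecTy->getPointerTo(AS) is needed any more; the
  // original (opaque) pointer is used as-is.
  return Builder.CreateAlignedLoad(VecTy, RootPtr, Alignment, "chain.vec");
}
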
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index f923f0be6621..37a356c43e29 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -289,7 +289,7 @@ void LoopVectorizeHints::getHintsFromMetadata() {
}
void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
- if (!Name.startswith(Prefix()))
+ if (!Name.starts_with(Prefix()))
return;
Name = Name.substr(Prefix().size(), StringRef::npos);
@@ -943,6 +943,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
}
}
+ // If we found a vectorized variant of a function, note that so LV can
+ // make better decisions about maximum VF.
+ if (CI && !VFDatabase::getMappings(*CI).empty())
+ VecCallVariantsFound = true;
+
// Check that the instruction return type is vectorizable.
// Also, we can't vectorize extractelement instructions.
if ((!VectorType::isValidElementType(I.getType()) &&
@@ -1242,13 +1247,12 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
bool LoopVectorizationLegality::blockCanBePredicated(
BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
- SmallPtrSetImpl<const Instruction *> &MaskedOp,
- SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const {
+ SmallPtrSetImpl<const Instruction *> &MaskedOp) const {
for (Instruction &I : *BB) {
// We can predicate blocks with calls to assume, as long as we drop them in
// case we flatten the CFG via predication.
if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
- ConditionalAssumes.insert(&I);
+ MaskedOp.insert(&I);
continue;
}
@@ -1345,16 +1349,13 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
}
// We must be able to predicate all blocks that need to be predicated.
- if (blockNeedsPredication(BB)) {
- if (!blockCanBePredicated(BB, SafePointers, MaskedOp,
- ConditionalAssumes)) {
- reportVectorizationFailure(
- "Control flow cannot be substituted for a select",
- "control flow cannot be substituted for a select",
- "NoCFGForSelect", ORE, TheLoop,
- BB->getTerminator());
- return false;
- }
+ if (blockNeedsPredication(BB) &&
+ !blockCanBePredicated(BB, SafePointers, MaskedOp)) {
+ reportVectorizationFailure(
+ "Control flow cannot be substituted for a select",
+ "control flow cannot be substituted for a select", "NoCFGForSelect",
+ ORE, TheLoop, BB->getTerminator());
+ return false;
}
}
@@ -1554,14 +1555,14 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
// The list of pointers that we can safely read and write to remains empty.
SmallPtrSet<Value *, 8> SafePointers;
+ // Collect masked ops in temporary set first to avoid partially populating
+ // MaskedOp if a block cannot be predicated.
SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
- SmallPtrSet<Instruction *, 8> TmpConditionalAssumes;
// Check and mark all blocks for predication, including those that ordinarily
// do not need predication such as the header block.
for (BasicBlock *BB : TheLoop->blocks()) {
- if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp,
- TmpConditionalAssumes)) {
+ if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n");
return false;
}
@@ -1570,9 +1571,6 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end());
- ConditionalAssumes.insert(TmpConditionalAssumes.begin(),
- TmpConditionalAssumes.end());
-
return true;
}
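
The legality hunks above fold the separate ConditionalAssumes set into MaskedOp: assume intrinsics found while checking whether a block can be predicated are now recorded in the same set as the other masked operations. A standalone sketch of that per-block scan, as a hypothetical reduced function (the real checks elided here live in blockCanBePredicated()):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Hypothetical reduced version of the per-block predication scan: assumes may
// be dropped when the CFG is flattened, so they are tracked in MaskedOp
// alongside the other operations that will need a mask.
static bool collectMaskedOps(BasicBlock &BB,
                             SmallPtrSetImpl<const Instruction *> &MaskedOp) {
  for (Instruction &I : BB) {
    if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
      MaskedOp.insert(&I);
      continue;
    }
    // The remaining legality checks (safe pointers, masked load/store
    // support, ...) are omitted in this sketch.
  }
  return true;
}
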
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 13357cb06c55..577ce8000de2 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -31,6 +31,7 @@
namespace llvm {
class LoopInfo;
+class DominatorTree;
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class PredicatedScalarEvolution;
@@ -45,13 +46,17 @@ class VPBuilder {
VPBasicBlock *BB = nullptr;
VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
+ /// Insert \p VPI in BB at InsertPt if BB is set.
+ VPInstruction *tryInsertInstruction(VPInstruction *VPI) {
+ if (BB)
+ BB->insert(VPI, InsertPt);
+ return VPI;
+ }
+
VPInstruction *createInstruction(unsigned Opcode,
ArrayRef<VPValue *> Operands, DebugLoc DL,
const Twine &Name = "") {
- VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL, Name);
- if (BB)
- BB->insert(Instr, InsertPt);
- return Instr;
+ return tryInsertInstruction(new VPInstruction(Opcode, Operands, DL, Name));
}
VPInstruction *createInstruction(unsigned Opcode,
@@ -62,6 +67,7 @@ class VPBuilder {
public:
VPBuilder() = default;
+ VPBuilder(VPBasicBlock *InsertBB) { setInsertPoint(InsertBB); }
/// Clear the insertion point: created instructions will not be inserted into
/// a block.
@@ -116,10 +122,11 @@ public:
InsertPt = IP;
}
- /// Insert and return the specified instruction.
- VPInstruction *insert(VPInstruction *I) const {
- BB->insert(I, InsertPt);
- return I;
+ /// This specifies that created instructions should be inserted at the
+ /// specified point.
+ void setInsertPoint(VPRecipeBase *IP) {
+ BB = IP->getParent();
+ InsertPt = IP->getIterator();
}
/// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
@@ -138,6 +145,13 @@ public:
return createInstruction(Opcode, Operands, DL, Name);
}
+ VPInstruction *createOverflowingOp(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands,
+ VPRecipeWithIRFlags::WrapFlagsTy WrapFlags,
+ DebugLoc DL, const Twine &Name = "") {
+ return tryInsertInstruction(
+ new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
+ }
VPValue *createNot(VPValue *Operand, DebugLoc DL, const Twine &Name = "") {
return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
}
@@ -158,6 +172,12 @@ public:
Name);
}
+ /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A
+ /// and \p B.
+ /// TODO: add createFCmp when needed.
+ VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
+ DebugLoc DL = {}, const Twine &Name = "");
+
//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//
@@ -268,6 +288,9 @@ class LoopVectorizationPlanner {
/// Loop Info analysis.
LoopInfo *LI;
+ /// The dominator tree.
+ DominatorTree *DT;
+
/// Target Library Info.
const TargetLibraryInfo *TLI;
@@ -298,16 +321,14 @@ class LoopVectorizationPlanner {
VPBuilder Builder;
public:
- LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
- const TargetTransformInfo &TTI,
- LoopVectorizationLegality *Legal,
- LoopVectorizationCostModel &CM,
- InterleavedAccessInfo &IAI,
- PredicatedScalarEvolution &PSE,
- const LoopVectorizeHints &Hints,
- OptimizationRemarkEmitter *ORE)
- : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
- PSE(PSE), Hints(Hints), ORE(ORE) {}
+ LoopVectorizationPlanner(
+ Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo &TTI, LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM, InterleavedAccessInfo &IAI,
+ PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints,
+ OptimizationRemarkEmitter *ORE)
+ : OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
+ IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {}
/// Plan how to best vectorize, return the best VF and its cost, or
/// std::nullopt if vectorization and interleaving should be avoided up front.
@@ -333,7 +354,7 @@ public:
executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
InnerLoopVectorizer &LB, DominatorTree *DT,
bool IsEpilogueVectorization,
- DenseMap<const SCEV *, Value *> *ExpandedSCEVs = nullptr);
+ const DenseMap<const SCEV *, Value *> *ExpandedSCEVs = nullptr);
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void printPlans(raw_ostream &O);
@@ -377,8 +398,7 @@ private:
/// returned VPlan is valid for. If no VPlan can be built for the input range,
/// set the largest included VF to the maximum VF for which no plan could be
/// built.
- std::optional<VPlanPtr> tryToBuildVPlanWithVPRecipes(
- VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions);
+ VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range);
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
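
The planner header above gains a VPBuilder constructor taking an insert block, plus createICmp and createOverflowingOp helpers that route through tryInsertInstruction. A minimal usage sketch, assuming the in-tree VPlan headers and pre-existing VPBB/A/B values; the helper function itself is hypothetical and not part of the patch:

// Sketch only: relies on the in-tree headers under lib/Transforms/Vectorize,
// so it compiles only within that directory.
#include "LoopVectorizationPlanner.h"
#include "VPlan.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

// Build an unsigned-less-than compare with the new VPBuilder helpers; VPBB, A
// and B are assumed to come from an existing VPlan.
static VPValue *emitGuardCompare(VPBasicBlock *VPBB, VPValue *A, VPValue *B,
                                 DebugLoc DL) {
  VPBuilder Builder(VPBB);            // new constructor taking an insert block
  // createICmp inserts an ICmp VPInstruction at the current insert point.
  return Builder.createICmp(CmpInst::ICMP_ULT, A, B, DL, "guard.cmp");
}
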
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b603bbe55dc9..f82e161fb846 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -27,7 +27,7 @@
//
// There is a development effort going on to migrate loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
-// docs/Proposal/VectorizationPlan.rst and
+// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
@@ -57,6 +57,7 @@
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
+#include "VPlanAnalysis.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
@@ -111,10 +112,12 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
@@ -390,6 +393,21 @@ static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
cl::desc(
"Override cost based safe divisor widening for div/rem instructions"));
+static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
+ "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
+ cl::Hidden,
+ cl::desc("Try wider VFs if they enable the use of vector variants"));
+
+// Likelyhood of bypassing the vectorized loop because assumptions about SCEV
+// variables not overflowing do not hold. See `emitSCEVChecks`.
+static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
+// Likelyhood of bypassing the vectorized loop because pointers overlap. See
+// `emitMemRuntimeChecks`.
+static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
+// Likelyhood of bypassing the vectorized loop because there are zero trips left
+// after prolog. See `emitIterationCountCheck`.
+static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
+
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
@@ -408,13 +426,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
-/// A helper function that returns an integer or floating-point constant with
-/// value C.
-static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
- return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
- : ConstantFP::get(Ty, C);
-}
-
/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
/// 1) Returns exact trip count if it is known.
@@ -556,10 +567,6 @@ public:
const VPIteration &Instance,
VPTransformState &State);
- /// Construct the vector value of a scalarized value \p V one lane at a time.
- void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
- VPTransformState &State);
-
/// Try to vectorize interleaved access group \p Group with the base address
/// given in \p Addr, optionally masking the vector operations if \p
/// BlockInMask is non-null. Use \p State to translate given VPValues to IR
@@ -634,10 +641,6 @@ protected:
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
- /// Shrinks vector element sizes to the smallest bitwidth they can be legally
- /// represented as.
- void truncateToMinimalBitwidths(VPTransformState &State);
-
/// Returns (and creates if needed) the trip count of the widened loop.
Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
@@ -943,21 +946,21 @@ protected:
/// Look for a meaningful debug location on the instruction or it's
/// operands.
-static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
+static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
if (!I)
- return I;
+ return DebugLoc();
DebugLoc Empty;
if (I->getDebugLoc() != Empty)
- return I;
+ return I->getDebugLoc();
for (Use &Op : I->operands()) {
if (Instruction *OpInst = dyn_cast<Instruction>(Op))
if (OpInst->getDebugLoc() != Empty)
- return OpInst;
+ return OpInst->getDebugLoc();
}
- return I;
+ return I->getDebugLoc();
}
/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
@@ -1021,14 +1024,6 @@ const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
}
-static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
- ElementCount VF) {
- assert(FTy->isFloatingPointTy() && "Expected floating point type!");
- Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
- Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
- return B.CreateUIToFP(RuntimeVF, FTy);
-}
-
void reportVectorizationFailure(const StringRef DebugMsg,
const StringRef OREMsg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop,
@@ -1050,6 +1045,23 @@ void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
<< Msg);
}
+/// Report successful vectorization of the loop. In case an outer loop is
+/// vectorized, prepend "outer" to the vectorization remark.
+static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
+ VectorizationFactor VF, unsigned IC) {
+ LLVM_DEBUG(debugVectorizationMessage(
+ "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
+ nullptr));
+ StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "vectorized " << LoopType << "loop (vectorization width: "
+ << ore::NV("VectorizationFactor", VF.Width)
+ << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
+ });
+}
+
} // end namespace llvm
#ifndef NDEBUG
@@ -1104,7 +1116,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
RecWithFlags->dropPoisonGeneratingFlags();
} else {
- Instruction *Instr = CurRec->getUnderlyingInstr();
+ Instruction *Instr = dyn_cast_or_null<Instruction>(
+ CurRec->getVPSingleValue()->getUnderlyingValue());
(void)Instr;
assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
"found instruction with poison generating flags not covered by "
@@ -1247,6 +1260,13 @@ public:
/// avoid redundant calculations.
void setCostBasedWideningDecision(ElementCount VF);
+ /// A call may be vectorized in different ways depending on whether we have
+ /// vectorized variants available and whether the target supports masking.
+ /// This function analyzes all calls in the function at the supplied VF,
+ /// makes a decision based on the costs of available options, and stores that
+ /// decision in a map for use in planning and plan execution.
+ void setVectorizedCallDecision(ElementCount VF);
+
/// A struct that represents some properties of the register usage
/// of a loop.
struct RegisterUsage {
@@ -1270,7 +1290,7 @@ public:
void collectElementTypesForWidening();
/// Split reductions into those that happen in the loop, and those that happen
- /// outside. In loop reductions are collected into InLoopReductionChains.
+ /// outside. In loop reductions are collected into InLoopReductions.
void collectInLoopReductions();
/// Returns true if we should use strict in-order reductions for the given
@@ -1358,7 +1378,9 @@ public:
CM_Widen_Reverse, // For consecutive accesses with stride -1.
CM_Interleave,
CM_GatherScatter,
- CM_Scalarize
+ CM_Scalarize,
+ CM_VectorCall,
+ CM_IntrinsicCall
};
/// Save vectorization decision \p W and \p Cost taken by the cost model for
@@ -1414,6 +1436,29 @@ public:
return WideningDecisions[InstOnVF].second;
}
+ struct CallWideningDecision {
+ InstWidening Kind;
+ Function *Variant;
+ Intrinsic::ID IID;
+ std::optional<unsigned> MaskPos;
+ InstructionCost Cost;
+ };
+
+ void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
+ Function *Variant, Intrinsic::ID IID,
+ std::optional<unsigned> MaskPos,
+ InstructionCost Cost) {
+ assert(!VF.isScalar() && "Expected vector VF");
+ CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
+ MaskPos, Cost};
+ }
+
+ CallWideningDecision getCallWideningDecision(CallInst *CI,
+ ElementCount VF) const {
+ assert(!VF.isScalar() && "Expected vector VF");
+ return CallWideningDecisions.at(std::make_pair(CI, VF));
+ }
+
/// Return True if instruction \p I is an optimizable truncate whose operand
/// is an induction variable. Such a truncate will be removed by adding a new
/// induction variable with the destination type.
@@ -1447,11 +1492,15 @@ public:
/// Collect Uniform and Scalar values for the given \p VF.
/// The sets depend on CM decision for Load/Store instructions
/// that may be vectorized as interleave, gather-scatter or scalarized.
+ /// Also make a decision on what to do about call instructions in the loop
+ /// at that VF -- scalarize, call a known vector routine, or call a
+ /// vector intrinsic.
void collectUniformsAndScalars(ElementCount VF) {
// Do the analysis once.
if (VF.isScalar() || Uniforms.contains(VF))
return;
setCostBasedWideningDecision(VF);
+ setVectorizedCallDecision(VF);
collectLoopUniforms(VF);
collectLoopScalars(VF);
}
@@ -1606,20 +1655,9 @@ public:
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}
- /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
- /// nodes to the chain of instructions representing the reductions. Uses a
- /// MapVector to ensure deterministic iteration order.
- using ReductionChainMap =
- SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
-
- /// Return the chain of instructions representing an inloop reduction.
- const ReductionChainMap &getInLoopReductionChains() const {
- return InLoopReductionChains;
- }
-
/// Returns true if the Phi is part of an inloop reduction.
bool isInLoopReduction(PHINode *Phi) const {
- return InLoopReductionChains.count(Phi);
+ return InLoopReductions.contains(Phi);
}
/// Estimate cost of an intrinsic call instruction CI if it were vectorized
@@ -1629,16 +1667,13 @@ public:
/// Estimate cost of a call instruction CI if it were vectorized with factor
/// VF. Return the cost of the instruction, including scalarization overhead
- /// if it's needed. The flag NeedToScalarize shows if the call needs to be
- /// scalarized -
- /// i.e. either vector version isn't available, or is too expensive.
- InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
- Function **Variant,
- bool *NeedsMask = nullptr) const;
+ /// if it's needed.
+ InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
/// Invalidates decisions already taken by the cost model.
void invalidateCostModelingDecisions() {
WideningDecisions.clear();
+ CallWideningDecisions.clear();
Uniforms.clear();
Scalars.clear();
}
@@ -1675,14 +1710,14 @@ private:
/// elements is a power-of-2 larger than zero. If scalable vectorization is
/// disabled or unsupported, then the scalable part will be equal to
/// ElementCount::getScalable(0).
- FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
+ FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
ElementCount UserVF,
bool FoldTailByMasking);
/// \return the maximized element count based on the targets vector
/// registers and the loop trip-count, but limited to a maximum safe VF.
/// This is a helper function of computeFeasibleMaxVF.
- ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
+ ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
unsigned SmallestType,
unsigned WidestType,
ElementCount MaxSafeVF,
@@ -1705,7 +1740,7 @@ private:
/// part of that pattern.
std::optional<InstructionCost>
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
- TTI::TargetCostKind CostKind);
+ TTI::TargetCostKind CostKind) const;
/// Calculate vectorization cost of memory instruction \p I.
InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
@@ -1783,15 +1818,12 @@ private:
/// scalarized.
DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
- /// PHINodes of the reductions that should be expanded in-loop along with
- /// their associated chains of reduction operations, in program order from top
- /// (PHI) to bottom
- ReductionChainMap InLoopReductionChains;
+ /// PHINodes of the reductions that should be expanded in-loop.
+ SmallPtrSet<PHINode *, 4> InLoopReductions;
/// A Map of inloop reduction operations and their immediate chain operand.
/// FIXME: This can be removed once reductions can be costed correctly in
- /// vplan. This was added to allow quick lookup to the inloop operations,
- /// without having to loop through InLoopReductionChains.
+ /// VPlan. This was added to allow quick lookup of the inloop operations.
DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
/// Returns the expected difference in cost from scalarizing the expression
@@ -1830,6 +1862,11 @@ private:
DecisionList WideningDecisions;
+ using CallDecisionList =
+ DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
+
+ CallDecisionList CallWideningDecisions;
+
/// Returns true if \p V is expected to be vectorized and it needs to be
/// extracted.
bool needsExtract(Value *V, ElementCount VF) const {
@@ -1933,12 +1970,14 @@ class GeneratedRTChecks {
SCEVExpander MemCheckExp;
bool CostTooHigh = false;
+ const bool AddBranchWeights;
public:
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
- TargetTransformInfo *TTI, const DataLayout &DL)
+ TargetTransformInfo *TTI, const DataLayout &DL,
+ bool AddBranchWeights)
: DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
- MemCheckExp(SE, DL, "scev.check") {}
+ MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
@@ -1990,9 +2029,9 @@ public:
},
IC);
} else {
- MemRuntimeCheckCond =
- addRuntimeChecks(MemCheckBlock->getTerminator(), L,
- RtPtrChecking.getChecks(), MemCheckExp);
+ MemRuntimeCheckCond = addRuntimeChecks(
+ MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
+ MemCheckExp, VectorizerParams::HoistRuntimeChecks);
}
assert(MemRuntimeCheckCond &&
"no RT checks generated although RtPtrChecking "
@@ -2131,8 +2170,10 @@ public:
DT->addNewBlock(SCEVCheckBlock, Pred);
DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
- ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
+ BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
+ if (AddBranchWeights)
+ setBranchWeights(BI, SCEVCheckBypassWeights);
+ ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
return SCEVCheckBlock;
}
@@ -2156,9 +2197,12 @@ public:
if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
PL->addBasicBlockToLoop(MemCheckBlock, *LI);
- ReplaceInstWithInst(
- MemCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
+ BranchInst &BI =
+ *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
+ if (AddBranchWeights) {
+ setBranchWeights(BI, MemCheckBypassWeights);
+ }
+ ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
MemCheckBlock->getTerminator()->setDebugLoc(
Pred->getTerminator()->getDebugLoc());
@@ -2252,157 +2296,17 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI,
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//
-/// This function adds
-/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
-/// to each vector element of Val. The sequence starts at StartIndex.
-/// \p Opcode is relevant for FP induction variable.
-static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
- Instruction::BinaryOps BinOp, ElementCount VF,
- IRBuilderBase &Builder) {
- assert(VF.isVector() && "only vector VFs are supported");
-
- // Create and check the types.
- auto *ValVTy = cast<VectorType>(Val->getType());
- ElementCount VLen = ValVTy->getElementCount();
-
- Type *STy = Val->getType()->getScalarType();
- assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
- "Induction Step must be an integer or FP");
- assert(Step->getType() == STy && "Step has wrong type");
-
- SmallVector<Constant *, 8> Indices;
-
- // Create a vector of consecutive numbers from zero to VF.
- VectorType *InitVecValVTy = ValVTy;
- if (STy->isFloatingPointTy()) {
- Type *InitVecValSTy =
- IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
- InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
- }
- Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
-
- // Splat the StartIdx
- Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
-
- if (STy->isIntegerTy()) {
- InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
- Step = Builder.CreateVectorSplat(VLen, Step);
- assert(Step->getType() == Val->getType() && "Invalid step vec");
- // FIXME: The newly created binary instructions should contain nsw/nuw
- // flags, which can be found from the original scalar operations.
- Step = Builder.CreateMul(InitVec, Step);
- return Builder.CreateAdd(Val, Step, "induction");
- }
-
- // Floating point induction.
- assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
- "Binary Opcode should be specified for FP induction");
- InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
- InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
-
- Step = Builder.CreateVectorSplat(VLen, Step);
- Value *MulOp = Builder.CreateFMul(InitVec, Step);
- return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
-}
-
-/// Compute scalar induction steps. \p ScalarIV is the scalar induction
-/// variable on which to base the steps, \p Step is the size of the step.
-static void buildScalarSteps(Value *ScalarIV, Value *Step,
- const InductionDescriptor &ID, VPValue *Def,
- VPTransformState &State) {
- IRBuilderBase &Builder = State.Builder;
-
- // Ensure step has the same type as that of scalar IV.
- Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
- if (ScalarIVTy != Step->getType()) {
- // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
- // avoid separate truncate here.
- assert(Step->getType()->isIntegerTy() &&
- "Truncation requires an integer step");
- Step = State.Builder.CreateTrunc(Step, ScalarIVTy);
- }
-
- // We build scalar steps for both integer and floating-point induction
- // variables. Here, we determine the kind of arithmetic we will perform.
- Instruction::BinaryOps AddOp;
- Instruction::BinaryOps MulOp;
- if (ScalarIVTy->isIntegerTy()) {
- AddOp = Instruction::Add;
- MulOp = Instruction::Mul;
- } else {
- AddOp = ID.getInductionOpcode();
- MulOp = Instruction::FMul;
- }
-
- // Determine the number of scalars we need to generate for each unroll
- // iteration.
- bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
- // Compute the scalar steps and save the results in State.
- Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
- ScalarIVTy->getScalarSizeInBits());
- Type *VecIVTy = nullptr;
- Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
- if (!FirstLaneOnly && State.VF.isScalable()) {
- VecIVTy = VectorType::get(ScalarIVTy, State.VF);
- UnitStepVec =
- Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
- SplatStep = Builder.CreateVectorSplat(State.VF, Step);
- SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
- }
-
- unsigned StartPart = 0;
- unsigned EndPart = State.UF;
- unsigned StartLane = 0;
- unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
- if (State.Instance) {
- StartPart = State.Instance->Part;
- EndPart = StartPart + 1;
- StartLane = State.Instance->Lane.getKnownLane();
- EndLane = StartLane + 1;
- }
- for (unsigned Part = StartPart; Part < EndPart; ++Part) {
- Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
-
- if (!FirstLaneOnly && State.VF.isScalable()) {
- auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
- auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
- if (ScalarIVTy->isFloatingPointTy())
- InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
- auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
- auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
- State.set(Def, Add, Part);
- // It's useful to record the lane values too for the known minimum number
- // of elements so we do those below. This improves the code quality when
- // trying to extract the first element, for example.
- }
-
- if (ScalarIVTy->isFloatingPointTy())
- StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
-
- for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
- Value *StartIdx = Builder.CreateBinOp(
- AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
- // The step returned by `createStepForVF` is a runtime-evaluated value
- // when VF is scalable. Otherwise, it should be folded into a Constant.
- assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
- "Expected StartIdx to be folded to a constant when VF is not "
- "scalable");
- auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
- auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
- State.set(Def, Add, VPIteration(Part, Lane));
- }
- }
-}
-
/// Compute the transformed value of Index at offset StartValue using step
/// StepValue.
/// For integer induction, returns StartValue + Index * StepValue.
/// For pointer induction, returns StartValue[Index * StepValue].
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
-static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
- Value *StartValue, Value *Step,
- const InductionDescriptor &ID) {
+static Value *
+emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
+ Value *Step,
+ InductionDescriptor::InductionKind InductionKind,
+ const BinaryOperator *InductionBinOp) {
Type *StepTy = Step->getType();
Value *CastedIndex = StepTy->isIntegerTy()
? B.CreateSExtOrTrunc(Index, StepTy)
@@ -2446,7 +2350,7 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
return B.CreateMul(X, Y);
};
- switch (ID.getKind()) {
+ switch (InductionKind) {
case InductionDescriptor::IK_IntInduction: {
assert(!isa<VectorType>(Index->getType()) &&
"Vector indices not supported for integer inductions yet");
@@ -2464,7 +2368,6 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
assert(!isa<VectorType>(Index->getType()) &&
"Vector indices not supported for FP inductions yet");
assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
- auto InductionBinOp = ID.getInductionBinOp();
assert(InductionBinOp &&
(InductionBinOp->getOpcode() == Instruction::FAdd ||
InductionBinOp->getOpcode() == Instruction::FSub) &&
@@ -2524,17 +2427,6 @@ static bool isIndvarOverflowCheckKnownFalse(
return false;
}
-void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
- const VPIteration &Instance,
- VPTransformState &State) {
- Value *ScalarInst = State.get(Def, Instance);
- Value *VectorValue = State.get(Def, Instance.Part);
- VectorValue = Builder.CreateInsertElement(
- VectorValue, ScalarInst,
- Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
- State.set(Def, VectorValue, Instance.Part);
-}
-
// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
@@ -2612,7 +2504,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
for (unsigned Part = 0; Part < UF; Part++) {
Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
- State.setDebugLocFromInst(AddrPart);
+ if (auto *I = dyn_cast<Instruction>(AddrPart))
+ State.setDebugLocFrom(I->getDebugLoc());
// Notice current instruction could be any index. Need to adjust the address
// to the member of index 0.
@@ -2630,14 +2523,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
InBounds = gep->isInBounds();
AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
-
- // Cast to the vector pointer type.
- unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
- Type *PtrTy = VecTy->getPointerTo(AddressSpace);
- AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
+ AddrParts.push_back(AddrPart);
}
- State.setDebugLocFromInst(Instr);
+ State.setDebugLocFrom(Instr->getDebugLoc());
Value *PoisonVec = PoisonValue::get(VecTy);
auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
@@ -2835,13 +2724,20 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
bool IsVoidRetTy = Instr->getType()->isVoidTy();
Instruction *Cloned = Instr->clone();
- if (!IsVoidRetTy)
+ if (!IsVoidRetTy) {
Cloned->setName(Instr->getName() + ".cloned");
+#if !defined(NDEBUG)
+ // Verify that VPlan type inference results agree with the type of the
+ // generated values.
+ assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
+ "inferred type and type from generated instructions do not match");
+#endif
+ }
RepRecipe->setFlags(Cloned);
- if (Instr->getDebugLoc())
- State.setDebugLocFromInst(Instr);
+ if (auto DL = Instr->getDebugLoc())
+ State.setDebugLocFrom(DL);
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
@@ -3019,9 +2915,11 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
// dominator of the exit blocks.
DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
- ReplaceInstWithInst(
- TCCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+ BranchInst &BI =
+ *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
+ if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
+ setBranchWeights(BI, MinItersBypassWeights);
+ ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
LoopBypassBlocks.push_back(TCCheckBlock);
}
@@ -3151,15 +3049,17 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
- EndValue =
- emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
+ EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
+ Step, II.getKind(), II.getInductionBinOp());
EndValue->setName("ind.end");
// Compute the end value for the additional bypass (if applicable).
if (AdditionalBypass.first) {
- B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
- EndValueFromAdditionalBypass = emitTransformedIndex(
- B, AdditionalBypass.second, II.getStartValue(), Step, II);
+ B.SetInsertPoint(AdditionalBypass.first,
+ AdditionalBypass.first->getFirstInsertionPt());
+ EndValueFromAdditionalBypass =
+ emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
+ Step, II.getKind(), II.getInductionBinOp());
EndValueFromAdditionalBypass->setName("ind.end");
}
}
@@ -3240,16 +3140,25 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
// 3) Otherwise, construct a runtime check.
if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
!Cost->foldTailByMasking()) {
- Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
- Count, VectorTripCount, "cmp.n",
- LoopMiddleBlock->getTerminator());
-
// Here we use the same DebugLoc as the scalar loop latch terminator instead
// of the corresponding compare because they may have ended up with
// different line numbers and we want to avoid awkward line stepping while
// debugging. Eg. if the compare has got a line number inside the loop.
- CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
- cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
+ // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
+ // operands. Perform simplification directly on VPlan once the branch is
+ // modeled there.
+ IRBuilder<> B(LoopMiddleBlock->getTerminator());
+ B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
+ Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
+ BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
+ BI.setCondition(CmpN);
+ if (hasBranchWeightMD(*ScalarLatchTerm)) {
+ // Assume that `Count % VectorTripCount` is equally distributed.
+ unsigned TripCount = UF * VF.getKnownMinValue();
+ assert(TripCount > 0 && "trip count should not be zero");
+ const uint32_t Weights[] = {1, TripCount - 1};
+ setBranchWeights(BI, Weights);
+ }
}
#ifdef EXPENSIVE_CHECKS
@@ -3373,7 +3282,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
: State.get(StepVPV, {0, 0});
Value *Escape =
- emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
+ emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
+ II.getKind(), II.getInductionBinOp());
Escape->setName("ind.escape");
MissingVals[UI] = Escape;
}
@@ -3445,76 +3355,33 @@ static void cse(BasicBlock *BB) {
}
}
-InstructionCost LoopVectorizationCostModel::getVectorCallCost(
- CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const {
- Function *F = CI->getCalledFunction();
- Type *ScalarRetTy = CI->getType();
- SmallVector<Type *, 4> Tys, ScalarTys;
- bool MaskRequired = Legal->isMaskRequired(CI);
- for (auto &ArgOp : CI->args())
- ScalarTys.push_back(ArgOp->getType());
+InstructionCost
+LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
+ ElementCount VF) const {
+ // We only need to calculate a cost if the VF is scalar; for actual vectors
+ // we should already have a pre-calculated cost at each VF.
+ if (!VF.isScalar())
+ return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
- // Estimate cost of scalarized vector call. The source operands are assumed
- // to be vectors, so we need to extract individual elements from there,
- // execute VF scalar calls, and then gather the result into the vector return
- // value.
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- InstructionCost ScalarCallCost =
- TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
- if (VF.isScalar())
- return ScalarCallCost;
-
- // Compute corresponding vector type for return value and arguments.
- Type *RetTy = ToVectorTy(ScalarRetTy, VF);
- for (Type *ScalarTy : ScalarTys)
- Tys.push_back(ToVectorTy(ScalarTy, VF));
-
- // Compute costs of unpacking argument values for the scalar calls and
- // packing the return values to a vector.
- InstructionCost ScalarizationCost =
- getScalarizationOverhead(CI, VF, CostKind);
+ Type *RetTy = CI->getType();
+ if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
+ if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
+ return *RedCost;
- InstructionCost Cost =
- ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
-
- // If we can't emit a vector call for this function, then the currently found
- // cost is the cost we need to return.
- InstructionCost MaskCost = 0;
- VFShape Shape = VFShape::get(*CI, VF, MaskRequired);
- if (NeedsMask)
- *NeedsMask = MaskRequired;
- Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
- // If we want an unmasked vector function but can't find one matching the VF,
- // maybe we can find vector function that does use a mask and synthesize
- // an all-true mask.
- if (!VecFunc && !MaskRequired) {
- Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true);
- VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
- // If we found one, add in the cost of creating a mask
- if (VecFunc) {
- if (NeedsMask)
- *NeedsMask = true;
- MaskCost = TTI.getShuffleCost(
- TargetTransformInfo::SK_Broadcast,
- VectorType::get(
- IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()),
- VF));
- }
- }
+ SmallVector<Type *, 4> Tys;
+ for (auto &ArgOp : CI->args())
+ Tys.push_back(ArgOp->getType());
- // We don't support masked function calls yet, but we can scalarize a
- // masked call with branches (unless VF is scalable).
- if (!TLI || CI->isNoBuiltin() || !VecFunc)
- return VF.isScalable() ? InstructionCost::getInvalid() : Cost;
+ InstructionCost ScalarCallCost =
+ TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
- // If the corresponding vector cost is cheaper, return its cost.
- InstructionCost VectorCallCost =
- TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
- if (VectorCallCost < Cost) {
- *Variant = VecFunc;
- Cost = VectorCallCost;
+ // If this is an intrinsic we may have a lower cost for it.
+ if (getVectorIntrinsicIDForCall(CI, TLI)) {
+ InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
+ return std::min(ScalarCallCost, IntrinsicCost);
}
- return Cost;
+ return ScalarCallCost;
}
static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
@@ -3558,146 +3425,8 @@ static Type *largestIntegerVectorType(Type *T1, Type *T2) {
return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}
-void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
- // For every instruction `I` in MinBWs, truncate the operands, create a
- // truncated version of `I` and reextend its result. InstCombine runs
- // later and will remove any ext/trunc pairs.
- SmallPtrSet<Value *, 4> Erased;
- for (const auto &KV : Cost->getMinimalBitwidths()) {
- // If the value wasn't vectorized, we must maintain the original scalar
- // type. The absence of the value from State indicates that it
- // wasn't vectorized.
- // FIXME: Should not rely on getVPValue at this point.
- VPValue *Def = State.Plan->getVPValue(KV.first, true);
- if (!State.hasAnyVectorValue(Def))
- continue;
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *I = State.get(Def, Part);
- if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
- continue;
- Type *OriginalTy = I->getType();
- Type *ScalarTruncatedTy =
- IntegerType::get(OriginalTy->getContext(), KV.second);
- auto *TruncatedTy = VectorType::get(
- ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
- if (TruncatedTy == OriginalTy)
- continue;
-
- IRBuilder<> B(cast<Instruction>(I));
- auto ShrinkOperand = [&](Value *V) -> Value * {
- if (auto *ZI = dyn_cast<ZExtInst>(V))
- if (ZI->getSrcTy() == TruncatedTy)
- return ZI->getOperand(0);
- return B.CreateZExtOrTrunc(V, TruncatedTy);
- };
-
- // The actual instruction modification depends on the instruction type,
- // unfortunately.
- Value *NewI = nullptr;
- if (auto *BO = dyn_cast<BinaryOperator>(I)) {
- NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
- ShrinkOperand(BO->getOperand(1)));
-
- // Any wrapping introduced by shrinking this operation shouldn't be
- // considered undefined behavior. So, we can't unconditionally copy
- // arithmetic wrapping flags to NewI.
- cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
- } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
- NewI =
- B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
- ShrinkOperand(CI->getOperand(1)));
- } else if (auto *SI = dyn_cast<SelectInst>(I)) {
- NewI = B.CreateSelect(SI->getCondition(),
- ShrinkOperand(SI->getTrueValue()),
- ShrinkOperand(SI->getFalseValue()));
- } else if (auto *CI = dyn_cast<CastInst>(I)) {
- switch (CI->getOpcode()) {
- default:
- llvm_unreachable("Unhandled cast!");
- case Instruction::Trunc:
- NewI = ShrinkOperand(CI->getOperand(0));
- break;
- case Instruction::SExt:
- NewI = B.CreateSExtOrTrunc(
- CI->getOperand(0),
- smallestIntegerVectorType(OriginalTy, TruncatedTy));
- break;
- case Instruction::ZExt:
- NewI = B.CreateZExtOrTrunc(
- CI->getOperand(0),
- smallestIntegerVectorType(OriginalTy, TruncatedTy));
- break;
- }
- } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
- auto Elements0 =
- cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
- auto *O0 = B.CreateZExtOrTrunc(
- SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
- auto Elements1 =
- cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
- auto *O1 = B.CreateZExtOrTrunc(
- SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
-
- NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
- } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
- // Don't do anything with the operands, just extend the result.
- continue;
- } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
- auto Elements =
- cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
- auto *O0 = B.CreateZExtOrTrunc(
- IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
- auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
- NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
- } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
- auto Elements =
- cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
- auto *O0 = B.CreateZExtOrTrunc(
- EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
- NewI = B.CreateExtractElement(O0, EE->getOperand(2));
- } else {
- // If we don't know what to do, be conservative and don't do anything.
- continue;
- }
-
- // Lastly, extend the result.
- NewI->takeName(cast<Instruction>(I));
- Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
- I->replaceAllUsesWith(Res);
- cast<Instruction>(I)->eraseFromParent();
- Erased.insert(I);
- State.reset(Def, Res, Part);
- }
- }
-
- // We'll have created a bunch of ZExts that are now parentless. Clean up.
- for (const auto &KV : Cost->getMinimalBitwidths()) {
- // If the value wasn't vectorized, we must maintain the original scalar
- // type. The absence of the value from State indicates that it
- // wasn't vectorized.
- // FIXME: Should not rely on getVPValue at this point.
- VPValue *Def = State.Plan->getVPValue(KV.first, true);
- if (!State.hasAnyVectorValue(Def))
- continue;
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *I = State.get(Def, Part);
- ZExtInst *Inst = dyn_cast<ZExtInst>(I);
- if (Inst && Inst->use_empty()) {
- Value *NewI = Inst->getOperand(0);
- Inst->eraseFromParent();
- State.reset(Def, NewI, Part);
- }
- }
- }
-}
-
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VPlan &Plan) {
- // Insert truncates and extends for any truncated instructions as hints to
- // InstCombine.
- if (VF.isVector())
- truncateToMinimalBitwidths(State);
-
// Fix widened non-induction PHIs by setting up the PHI operands.
if (EnableVPlanNativePath)
fixNonInductionPHIs(Plan, State);
@@ -3710,6 +3439,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);
+ PSE.getSE()->forgetBlockAndLoopDispositions();
// After vectorization, the exit blocks of the original loop will have
// additional predecessors. Invalidate SCEVs for the exit phis in case SE
@@ -3718,7 +3448,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
OrigLoop->getExitBlocks(ExitBlocks);
for (BasicBlock *Exit : ExitBlocks)
for (PHINode &PN : Exit->phis())
- PSE.getSE()->forgetValue(&PN);
+ PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
@@ -3744,7 +3474,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
// Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
// in the exit block, so update the builder.
- State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
+ State.Builder.SetInsertPoint(State.CFG.ExitBB,
+ State.CFG.ExitBB->getFirstNonPHIIt());
for (const auto &KV : Plan.getLiveOuts())
KV.second->fixPhi(Plan, State);
@@ -3782,40 +3513,10 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
VPBasicBlock *Header =
State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
- // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
- // sank outside of the loop would keep the same order as they had in the
- // original loop.
- SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
for (VPRecipeBase &R : Header->phis()) {
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
- ReductionPHIList.emplace_back(ReductionPhi);
+ fixReduction(ReductionPhi, State);
}
- stable_sort(ReductionPHIList, [this](const VPReductionPHIRecipe *R1,
- const VPReductionPHIRecipe *R2) {
- auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
- auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
-
- // If neither of the recipes has an intermediate store, keep the order the
- // same.
- if (!IS1 && !IS2)
- return false;
-
- // If only one of the recipes has an intermediate store, then move it
- // towards the beginning of the list.
- if (IS1 && !IS2)
- return true;
-
- if (!IS1 && IS2)
- return false;
-
- // If both recipes have an intermediate store, then the recipe with the
- // later store should be processed earlier. So it should go to the beginning
- // of the list.
- return DT->dominates(IS2, IS1);
- });
-
- for (VPReductionPHIRecipe *ReductionPhi : ReductionPHIList)
- fixReduction(ReductionPhi, State);
for (VPRecipeBase &R : Header->phis()) {
if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
@@ -3929,7 +3630,7 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
}
// Fix the initial value of the original recurrence in the scalar loop.
- Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
+ Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
@@ -3953,90 +3654,56 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
RecurKind RK = RdxDesc.getRecurrenceKind();
TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
- State.setDebugLocFromInst(ReductionStartValue);
+ if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue))
+ State.setDebugLocFrom(I->getDebugLoc());
VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
- // This is the vector-clone of the value that leaves the loop.
- Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
// Before each round, move the insertion point right between
// the PHIs and the values we are going to write.
// This allows us to write both PHINodes and the extractelement
// instructions.
- Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+ Builder.SetInsertPoint(LoopMiddleBlock,
+ LoopMiddleBlock->getFirstInsertionPt());
- State.setDebugLocFromInst(LoopExitInst);
+ State.setDebugLocFrom(LoopExitInst->getDebugLoc());
Type *PhiTy = OrigPhi->getType();
-
- VPBasicBlock *LatchVPBB =
- PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
- BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
// If tail is folded by masking, the vector value to leave the loop should be
// a Select choosing between the vectorized LoopExitInst and vectorized Phi,
// instead of the former. For an inloop reduction the reduction will already
// be predicated, and does not need to be handled here.
if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
- SelectInst *Sel = nullptr;
- for (User *U : VecLoopExitInst->users()) {
- if (isa<SelectInst>(U)) {
- assert(!Sel && "Reduction exit feeding two selects");
- Sel = cast<SelectInst>(U);
- } else
- assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
- }
- assert(Sel && "Reduction exit feeds no select");
- State.reset(LoopExitInstDef, Sel, Part);
-
- if (isa<FPMathOperator>(Sel))
- Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
-
- // If the target can create a predicated operator for the reduction at no
- // extra cost in the loop (for example a predicated vadd), it can be
- // cheaper for the select to remain in the loop than be sunk out of it,
- // and so use the select value for the phi instead of the old
- // LoopExitValue.
- if (PreferPredicatedReductionSelect ||
- TTI->preferPredicatedReductionSelect(
- RdxDesc.getOpcode(), PhiTy,
- TargetTransformInfo::ReductionFlags())) {
- auto *VecRdxPhi =
- cast<PHINode>(State.get(PhiR, Part));
- VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
+ VPValue *Def = nullptr;
+ for (VPUser *U : LoopExitInstDef->users()) {
+ auto *S = dyn_cast<VPInstruction>(U);
+ if (S && S->getOpcode() == Instruction::Select) {
+ Def = S;
+ break;
}
}
+ if (Def)
+ LoopExitInstDef = Def;
}
+ VectorParts RdxParts(UF);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ RdxParts[Part] = State.get(LoopExitInstDef, Part);
+
// If the vector reduction can be performed in a smaller type, we truncate
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
- assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
+ Builder.SetInsertPoint(LoopMiddleBlock,
+ LoopMiddleBlock->getFirstInsertionPt());
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
- Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
- VectorParts RdxParts(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- RdxParts[Part] = State.get(LoopExitInstDef, Part);
- Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
- Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
- : Builder.CreateZExt(Trunc, VecTy);
- for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
- if (U != Trunc) {
- U->replaceUsesOfWith(RdxParts[Part], Extnd);
- RdxParts[Part] = Extnd;
- }
- }
- Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
for (unsigned Part = 0; Part < UF; ++Part) {
RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
- State.reset(LoopExitInstDef, RdxParts[Part], Part);
}
}
// Reduce all of the unrolled parts into a single vector.
- Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
+ Value *ReducedPartRdx = RdxParts[0];
unsigned Op = RecurrenceDescriptor::getOpcode(RK);
// The middle block terminator has already been assigned a DebugLoc here (the
@@ -4046,21 +3713,21 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// conditional branch, and (c) other passes may add new predecessors which
// terminate on this line. This is the easiest way to ensure we don't
// accidentally cause an extra step back into the loop while debugging.
- State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
+ State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc());
if (PhiR->isOrdered())
- ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
+ ReducedPartRdx = RdxParts[UF - 1];
else {
// Floating-point operations should have some FMF to enable the reduction.
IRBuilderBase::FastMathFlagGuard FMFG(Builder);
Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
for (unsigned Part = 1; Part < UF; ++Part) {
- Value *RdxPart = State.get(LoopExitInstDef, Part);
- if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ Value *RdxPart = RdxParts[Part];
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
ReducedPartRdx = Builder.CreateBinOp(
(Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
- } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
- ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
- ReducedPartRdx, RdxPart);
+ else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
+ ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
+ ReducedPartRdx, RdxPart);
else
ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
}
@@ -4070,7 +3737,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// target reduction in the loop using a Reduction recipe.
if (VF.isVector() && !PhiR->isInLoop()) {
ReducedPartRdx =
- createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
+ createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
// If the reduction can be performed in a smaller type, we need to extend
// the reduction to the wider type before we branch to the original loop.
if (PhiTy != RdxDesc.getRecurrenceType())
@@ -4107,7 +3774,8 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// inside the loop, create the final store here.
if (StoreInst *SI = RdxDesc.IntermediateStore) {
StoreInst *NewSI =
- Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
+ Builder.CreateAlignedStore(ReducedPartRdx, SI->getPointerOperand(),
+ SI->getAlign());
propagateMetadata(NewSI, SI);
// If the reduction value is used in other places,
@@ -4436,7 +4104,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
default:
return true;
case Instruction::Call:
- return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
+ if (VF.isScalar())
+ return true;
+ return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
+ .Kind == CM_Scalarize;
case Instruction::Load:
case Instruction::Store: {
auto *Ptr = getLoadStorePointerOperand(I);
@@ -4988,7 +4659,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
}
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
- unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
+ unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -5076,12 +4747,12 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
FixedScalableVFPair Result(ElementCount::getFixed(1),
ElementCount::getScalable(0));
if (auto MaxVF =
- getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
+ getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
MaxSafeFixedVF, FoldTailByMasking))
Result.FixedVF = MaxVF;
if (auto MaxVF =
- getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
+ getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
MaxSafeScalableVF, FoldTailByMasking))
if (MaxVF.isScalable()) {
Result.ScalableVF = MaxVF;
@@ -5105,6 +4776,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
if (TC == 1) {
reportVectorizationFailure("Single iteration (non) loop",
@@ -5115,7 +4787,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed:
- return computeFeasibleMaxVF(TC, UserVF, false);
+ return computeFeasibleMaxVF(MaxTC, UserVF, false);
case CM_ScalarEpilogueNotAllowedUsePredicate:
[[fallthrough]];
case CM_ScalarEpilogueNotNeededUsePredicate:
@@ -5153,7 +4825,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n");
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
- return computeFeasibleMaxVF(TC, UserVF, false);
+ return computeFeasibleMaxVF(MaxTC, UserVF, false);
}
return FixedScalableVFPair::getNone();
}
@@ -5170,7 +4842,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
- FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
+ FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
// Avoid tail folding if the trip count is known to be a multiple of any VF
// we choose.
@@ -5246,7 +4918,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
- unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
+ unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
ElementCount MaxSafeVF, bool FoldTailByMasking) {
bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
const TypeSize WidestRegister = TTI.getRegisterBitWidth(
@@ -5285,31 +4957,35 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
}
// When a scalar epilogue is required, at least one iteration of the scalar
- // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a
+ // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
// max VF that results in a dead vector loop.
- if (ConstTripCount > 0 && requiresScalarEpilogue(true))
- ConstTripCount -= 1;
-
- if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
- (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
- // If loop trip count (TC) is known at compile time there is no point in
- // choosing VF greater than TC (as done in the loop below). Select maximum
- // power of two which doesn't exceed TC.
- // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
- // when the TC is less than or equal to the known number of lanes.
- auto ClampedConstTripCount = llvm::bit_floor(ConstTripCount);
+ if (MaxTripCount > 0 && requiresScalarEpilogue(true))
+ MaxTripCount -= 1;
+
+ if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
+ (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
+    // If the upper bound on the loop trip count (TC) is known at compile
+    // time, there is no point in choosing a VF greater than TC (as done in
+    // the loop below). Select the maximum power of two which doesn't exceed
+    // TC. If MaxVectorElementCount is scalable, we only fall back on a fixed
+    // VF when the TC is less than or equal to the known number of lanes.
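+    // E.g., assuming no tail folding and a hypothetical MaxTripCount of 7
+    // with a required scalar epilogue, MaxTripCount was reduced to 6 above,
+    // and the clamped VF computed below is bit_floor(6) = 4.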
+ auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
"exceeding the constant trip count: "
- << ClampedConstTripCount << "\n");
- return ElementCount::getFixed(ClampedConstTripCount);
+ << ClampedUpperTripCount << "\n");
+ return ElementCount::get(
+ ClampedUpperTripCount,
+ FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
}
TargetTransformInfo::RegisterKind RegKind =
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector;
ElementCount MaxVF = MaxVectorElementCount;
- if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
- TTI.shouldMaximizeVectorBandwidth(RegKind))) {
+ if (MaximizeBandwidth ||
+ (MaximizeBandwidth.getNumOccurrences() == 0 &&
+ (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
+ (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
auto MaxVectorElementCountMaxBW = ElementCount::get(
llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
ComputeScalableMaxVF);
@@ -5981,7 +5657,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
HasReductions &&
any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
const RecurrenceDescriptor &RdxDesc = Reduction.second;
- return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
+ return RecurrenceDescriptor::isAnyOfRecurrenceKind(
RdxDesc.getRecurrenceKind());
});
if (HasSelectCmpReductions) {
@@ -6149,6 +5825,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
if (ValuesToIgnore.count(I))
continue;
+ collectInLoopReductions();
+
// For each VF find the maximum usage of registers.
for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
// Count the number of registers used, per register class, given all open
@@ -6668,10 +6346,11 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(
- Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
+ Instruction *I, ElementCount VF, Type *Ty,
+ TTI::TargetCostKind CostKind) const {
using namespace llvm::PatternMatch;
// Early exit for no inloop reductions
- if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
+ if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
return std::nullopt;
auto *VectorTy = cast<VectorType>(Ty);
@@ -6706,10 +6385,10 @@ LoopVectorizationCostModel::getReductionPatternCost(
// Find the reduction this chain is a part of and calculate the basic cost of
// the reduction on its own.
- Instruction *LastChain = InLoopReductionImmediateChains[RetI];
+ Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
Instruction *ReductionPhi = LastChain;
while (!isa<PHINode>(ReductionPhi))
- ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
+ ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
const RecurrenceDescriptor &RdxDesc =
Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
@@ -7127,6 +6806,168 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
}
}
+void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
+ assert(!VF.isScalar() &&
+ "Trying to set a vectorization decision for a scalar VF");
+
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // For each instruction in the old loop.
+ for (Instruction &I : *BB) {
+ CallInst *CI = dyn_cast<CallInst>(&I);
+
+ if (!CI)
+ continue;
+
+ InstructionCost ScalarCost = InstructionCost::getInvalid();
+ InstructionCost VectorCost = InstructionCost::getInvalid();
+ InstructionCost IntrinsicCost = InstructionCost::getInvalid();
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ Function *ScalarFunc = CI->getCalledFunction();
+ Type *ScalarRetTy = CI->getType();
+ SmallVector<Type *, 4> Tys, ScalarTys;
+ bool MaskRequired = Legal->isMaskRequired(CI);
+ for (auto &ArgOp : CI->args())
+ ScalarTys.push_back(ArgOp->getType());
+
+ // Compute corresponding vector type for return value and arguments.
+ Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+ for (Type *ScalarTy : ScalarTys)
+ Tys.push_back(ToVectorTy(ScalarTy, VF));
+
+ // An in-loop reduction using an fmuladd intrinsic is a special case;
+ // we don't want the normal cost for that intrinsic.
+ if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
+ if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
+ setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
+ getVectorIntrinsicIDForCall(CI, TLI),
+ std::nullopt, *RedCost);
+ continue;
+ }
+
+ // Estimate cost of scalarized vector call. The source operands are
+ // assumed to be vectors, so we need to extract individual elements from
+ // there, execute VF scalar calls, and then gather the result into the
+ // vector return value.
+ InstructionCost ScalarCallCost =
+ TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
+
+ // Compute costs of unpacking argument values for the scalar calls and
+ // packing the return values to a vector.
+ InstructionCost ScalarizationCost =
+ getScalarizationOverhead(CI, VF, CostKind);
+
+ ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
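+      // E.g., assuming a hypothetical VF of 4, a scalar call cost of 10 and
+      // a scalarization overhead of 6, ScalarCost is 10 * 4 + 6 = 46.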
+
+ // Find the cost of vectorizing the call, if we can find a suitable
+ // vector variant of the function.
+ bool UsesMask = false;
+ VFInfo FuncInfo;
+ Function *VecFunc = nullptr;
+ // Search through any available variants for one we can use at this VF.
+ for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
+ // Must match requested VF.
+ if (Info.Shape.VF != VF)
+ continue;
+
+ // Must take a mask argument if one is required
+ if (MaskRequired && !Info.isMasked())
+ continue;
+
+ // Check that all parameter kinds are supported
+ bool ParamsOk = true;
+ for (VFParameter Param : Info.Shape.Parameters) {
+ switch (Param.ParamKind) {
+ case VFParamKind::Vector:
+ break;
+ case VFParamKind::OMP_Uniform: {
+ Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
+ // Make sure the scalar parameter in the loop is invariant.
+ if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
+ TheLoop))
+ ParamsOk = false;
+ break;
+ }
+ case VFParamKind::OMP_Linear: {
+ Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
+ // Find the stride for the scalar parameter in this loop and see if
+ // it matches the stride for the variant.
+ // TODO: do we need to figure out the cost of an extract to get the
+ // first lane? Or do we hope that it will be folded away?
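+          // E.g., for a variant declared with a hypothetical `linear(x:4)`
+          // clause, Param.LinearStepOrPos is 4, so the argument must be an
+          // add-recurrence in this loop whose constant step is 4.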
+ ScalarEvolution *SE = PSE.getSE();
+ const auto *SAR =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
+
+ if (!SAR || SAR->getLoop() != TheLoop) {
+ ParamsOk = false;
+ break;
+ }
+
+ const SCEVConstant *Step =
+ dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
+
+ if (!Step ||
+ Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
+ ParamsOk = false;
+
+ break;
+ }
+ case VFParamKind::GlobalPredicate:
+ UsesMask = true;
+ break;
+ default:
+ ParamsOk = false;
+ break;
+ }
+ }
+
+ if (!ParamsOk)
+ continue;
+
+ // Found a suitable candidate, stop here.
+ VecFunc = CI->getModule()->getFunction(Info.VectorName);
+ FuncInfo = Info;
+ break;
+ }
+
+ // Add in the cost of synthesizing a mask if one wasn't required.
+ InstructionCost MaskCost = 0;
+ if (VecFunc && UsesMask && !MaskRequired)
+ MaskCost = TTI.getShuffleCost(
+ TargetTransformInfo::SK_Broadcast,
+ VectorType::get(IntegerType::getInt1Ty(
+ VecFunc->getFunctionType()->getContext()),
+ VF));
+
+ if (TLI && VecFunc && !CI->isNoBuiltin())
+ VectorCost =
+ TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
+
+ // Find the cost of an intrinsic; some targets may have instructions that
+ // perform the operation without needing an actual call.
+ Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (IID != Intrinsic::not_intrinsic)
+ IntrinsicCost = getVectorIntrinsicCost(CI, VF);
+
+ InstructionCost Cost = ScalarCost;
+ InstWidening Decision = CM_Scalarize;
+
+ if (VectorCost <= Cost) {
+ Cost = VectorCost;
+ Decision = CM_VectorCall;
+ }
+
+ if (IntrinsicCost <= Cost) {
+ Cost = IntrinsicCost;
+ Decision = CM_IntrinsicCall;
+ }
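+      // E.g., with hypothetical costs ScalarCost = 46, VectorCost = 20 and
+      // IntrinsicCost = 12, the checks above select CM_IntrinsicCall with a
+      // final cost of 12.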
+
+ setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
+ FuncInfo.getParamIndexForOptionalMask(), Cost);
+ }
+ }
+}
+
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Type *&VectorTy) {
@@ -7156,7 +6997,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// With the exception of GEPs and PHIs, after scalarization there should
// only be one copy of the instruction generated in the loop. This is
// because the VF is either 1, or any instructions that need scalarizing
- // have already been dealt with by the the time we get here. As a result,
+ // have already been dealt with by the time we get here. As a result,
// it means we don't have to multiply the instruction cost by VF.
assert(I->getOpcode() == Instruction::GetElementPtr ||
I->getOpcode() == Instruction::PHI ||
@@ -7384,6 +7225,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return TTI::CastContextHint::Reversed;
case LoopVectorizationCostModel::CM_Unknown:
llvm_unreachable("Instr did not go through cost modelling?");
+ case LoopVectorizationCostModel::CM_VectorCall:
+ case LoopVectorizationCostModel::CM_IntrinsicCall:
+ llvm_unreachable_internal("Instr has invalid widening decision");
}
llvm_unreachable("Unhandled case!");
@@ -7441,19 +7285,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}
- case Instruction::Call: {
- if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
- if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
- return *RedCost;
- Function *Variant;
- CallInst *CI = cast<CallInst>(I);
- InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant);
- if (getVectorIntrinsicIDForCall(CI, TLI)) {
- InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
- return std::min(CallCost, IntrinsicCost);
- }
- return CallCost;
- }
+ case Instruction::Call:
+ return getVectorCallCost(cast<CallInst>(I), VF);
case Instruction::ExtractValue:
return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
case Instruction::Alloca:
@@ -7521,8 +7354,9 @@ void LoopVectorizationCostModel::collectInLoopReductions() {
SmallVector<Instruction *, 4> ReductionOperations =
RdxDesc.getReductionOpChain(Phi, TheLoop);
bool InLoop = !ReductionOperations.empty();
+
if (InLoop) {
- InLoopReductionChains[Phi] = ReductionOperations;
+ InLoopReductions.insert(Phi);
// Add the elements to InLoopReductionImmediateChains for cost modelling.
Instruction *LastChain = Phi;
for (auto *I : ReductionOperations) {
@@ -7535,21 +7369,38 @@ void LoopVectorizationCostModel::collectInLoopReductions() {
}
}
+VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
+ DebugLoc DL, const Twine &Name) {
+ assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
+ Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
+ return tryInsertInstruction(
+ new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
+}
+
+// This function will select a scalable VF if the target supports scalable
+// vectors and a fixed one otherwise.
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
-static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
- LoopVectorizationCostModel &CM) {
+static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
+ LoopVectorizationCostModel &CM) {
unsigned WidestType;
std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
- return WidestVectorRegBits / WidestType;
+
+ TargetTransformInfo::RegisterKind RegKind =
+ TTI.enableScalableVectorization()
+ ? TargetTransformInfo::RGK_ScalableVector
+ : TargetTransformInfo::RGK_FixedWidthVector;
+
+ TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
+ unsigned N = RegSize.getKnownMinValue() / WidestType;
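+  // E.g., for a hypothetical 128-bit minimum register width and a widest
+  // type of 32 bits, this yields a VF of 4 (scalable if the target enables
+  // scalable vectorization).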
+ return ElementCount::get(N, RegSize.isScalable());
}
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
- assert(!UserVF.isScalable() && "scalable vectors not yet supported");
ElementCount VF = UserVF;
// Outer loop handling: They may require CFG and instruction level
// transformations before even evaluating whether vectorization is profitable.
@@ -7559,10 +7410,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
// If the user doesn't provide a vectorization factor, determine a
// reasonable one.
if (UserVF.isZero()) {
- VF = ElementCount::getFixed(determineVPlanVF(
- TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
- .getFixedValue(),
- CM));
+ VF = determineVPlanVF(TTI, CM);
LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
// Make sure we have a VF > 1 for stress testing.
@@ -7571,6 +7419,17 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
<< "overriding computed VF.\n");
VF = ElementCount::getFixed(4);
}
+ } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
+ !ForceTargetSupportsScalableVectors) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
+ << "not supported by the target.\n");
+ reportVectorizationFailure(
+ "Scalable vectorization requested but not supported by the target",
+ "the scalable user-specified vectorization width for outer-loop "
+ "vectorization cannot be used because the target does not support "
+ "scalable vectors.",
+ "ScalableVFUnfeasible", ORE, OrigLoop);
+ return VectorizationFactor::Disabled();
}
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
assert(isPowerOf2_32(VF.getKnownMinValue()) &&
@@ -7624,9 +7483,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
"VF needs to be a power of two");
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
+ CM.collectInLoopReductions();
if (CM.selectUserVectorizationFactor(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
- CM.collectInLoopReductions();
buildVPlansWithVPRecipes(UserVF, UserVF);
if (!hasPlanWithVF(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
@@ -7650,6 +7509,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
VFCandidates.insert(VF);
+ CM.collectInLoopReductions();
for (const auto &VF : VFCandidates) {
// Collect Uniform and Scalar instructions after vectorization with VF.
CM.collectUniformsAndScalars(VF);
@@ -7660,7 +7520,6 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.collectInstsToScalarize(VF);
}
- CM.collectInLoopReductions();
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
@@ -7705,7 +7564,7 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
if (MD) {
const auto *S = dyn_cast<MDString>(MD->getOperand(0));
IsUnrollMetadata =
- S && S->getString().startswith("llvm.loop.unroll.disable");
+ S && S->getString().starts_with("llvm.loop.unroll.disable");
}
MDs.push_back(LoopID->getOperand(i));
}
@@ -7729,7 +7588,7 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
SCEV2ValueTy LoopVectorizationPlanner::executePlan(
ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
- DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
+ const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
assert(BestVPlan.hasVF(BestVF) &&
"Trying to execute plan with unsupported VF");
assert(BestVPlan.hasUF(BestUF) &&
@@ -7745,7 +7604,8 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan(
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
// Perform the actual loop transformation.
- VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
+ VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
+ OrigLoop->getHeader()->getContext());
// 0. Generate SCEV-dependent code into the preheader, including TripCount,
// before making any changes to the CFG.
@@ -7798,9 +7658,9 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan(
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
- BestVPlan.prepareToExecute(
- ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr),
- CanonicalIVStartValue, State, IsEpilogueVectorization);
+ BestVPlan.prepareToExecute(ILV.getTripCount(),
+ ILV.getOrCreateVectorTripCount(nullptr),
+ CanonicalIVStartValue, State);
BestVPlan.execute(&State);
@@ -7964,9 +7824,11 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
EPI.TripCount = Count;
}
- ReplaceInstWithInst(
- TCCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+ BranchInst &BI =
+ *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
+ if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
+ setBranchWeights(BI, MinItersBypassWeights);
+ ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
return TCCheckBlock;
}
@@ -8064,8 +7926,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
// Generate a resume induction for the vector epilogue and put it in the
// vector epilogue preheader
Type *IdxTy = Legal->getWidestInductionType();
- PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
- LoopVectorPreHeader->getFirstNonPHI());
+ PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
+ EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
EPI.MainLoopIterationCountCheck);
@@ -8110,9 +7972,22 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
EPI.EpilogueVF, EPI.EpilogueUF),
"min.epilog.iters.check");
- ReplaceInstWithInst(
- Insert->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+ BranchInst &BI =
+ *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
+ if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+ unsigned MainLoopStep = UF * VF.getKnownMinValue();
+ unsigned EpilogueLoopStep =
+ EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
+    // We assume the remaining `Count` is uniformly distributed in
+    // [0, MainLoopStep), so the probability for `Count < EpilogueLoopStep`
+    // should be min(MainLoopStep, EpilogueLoopStep) / MainLoopStep.
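+    // E.g., assuming hypothetical values MainLoopStep = 16 and
+    // EpilogueLoopStep = 4, EstimatedSkipCount is 4 and the weights become
+    // {4, 12}, i.e. a 4/16 chance of taking the bypass branch.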
+ unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
+ const uint32_t Weights[] = {EstimatedSkipCount,
+ MainLoopStep - EstimatedSkipCount};
+ setBranchWeights(BI, Weights);
+ }
+ ReplaceInstWithInst(Insert->getTerminator(), &BI);
LoopBypassBlocks.push_back(Insert);
return Insert;
@@ -8206,6 +8081,33 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
return EdgeMaskCache[Edge] = EdgeMask;
}
+void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
+ BasicBlock *Header = OrigLoop->getHeader();
+
+ // When not folding the tail, use nullptr to model all-true mask.
+ if (!CM.foldTailByMasking()) {
+ BlockMaskCache[Header] = nullptr;
+ return;
+ }
+
+ // Introduce the early-exit compare IV <= BTC to form header block mask.
+ // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
+ // constructing the desired canonical IV in the header block as its first
+ // non-phi instructions.
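+  // E.g., for a hypothetical 8-bit IV with a trip count of 256, TC wraps to
+  // 0 while BTC is 255, so comparing the widened IV against BTC still yields
+  // the intended mask, whereas comparing against TC would not.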
+
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
+ auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
+ HeaderVPBB->insert(IV, NewInsertionPoint);
+
+ VPBuilder::InsertPointGuard Guard(Builder);
+ Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
+ VPValue *BlockMask = nullptr;
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+ BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
+ BlockMaskCache[Header] = BlockMask;
+}
+
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
@@ -8214,45 +8116,12 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
if (BCEntryIt != BlockMaskCache.end())
return BCEntryIt->second;
+ assert(OrigLoop->getHeader() != BB &&
+ "Loop header must have cached block mask");
+
// All-one mask is modelled as no-mask following the convention for masked
// load/store/gather/scatter. Initialize BlockMask to no-mask.
VPValue *BlockMask = nullptr;
-
- if (OrigLoop->getHeader() == BB) {
- if (!CM.blockNeedsPredicationForAnyReason(BB))
- return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
-
- assert(CM.foldTailByMasking() && "must fold the tail");
-
- // If we're using the active lane mask for control flow, then we get the
- // mask from the active lane mask PHI that is cached in the VPlan.
- TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
- if (useActiveLaneMaskForControlFlow(TFStyle))
- return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi();
-
- // Introduce the early-exit compare IV <= BTC to form header block mask.
- // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
- // constructing the desired canonical IV in the header block as its first
- // non-phi instructions.
-
- VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
- auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
- auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
- HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
-
- VPBuilder::InsertPointGuard Guard(Builder);
- Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
- if (useActiveLaneMask(TFStyle)) {
- VPValue *TC = Plan.getTripCount();
- BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
- nullptr, "active.lane.mask");
- } else {
- VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
- BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
- }
- return BlockMaskCache[BB] = BlockMask;
- }
-
// This is the block mask. We OR all incoming edges.
for (auto *Predecessor : predecessors(BB)) {
VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
@@ -8458,22 +8327,15 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
bool ShouldUseVectorIntrinsic =
ID && LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) -> bool {
- Function *Variant;
- // Is it beneficial to perform intrinsic call compared to lib
- // call?
- InstructionCost CallCost =
- CM.getVectorCallCost(CI, VF, &Variant);
- InstructionCost IntrinsicCost =
- CM.getVectorIntrinsicCost(CI, VF);
- return IntrinsicCost <= CallCost;
+ return CM.getCallWideningDecision(CI, VF).Kind ==
+ LoopVectorizationCostModel::CM_IntrinsicCall;
},
Range);
if (ShouldUseVectorIntrinsic)
return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
Function *Variant = nullptr;
- ElementCount VariantVF;
- bool NeedsMask = false;
+ std::optional<unsigned> MaskPos;
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8492,16 +8354,19 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
// finds a valid variant.
if (Variant)
return false;
- CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask);
- // If we found a valid vector variant at this VF, then store the VF
- // in case we need to generate a mask.
- if (Variant)
- VariantVF = VF;
- return Variant != nullptr;
+ LoopVectorizationCostModel::CallWideningDecision Decision =
+ CM.getCallWideningDecision(CI, VF);
+ if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
+ Variant = Decision.Variant;
+ MaskPos = Decision.MaskPos;
+ return true;
+ }
+
+ return false;
},
Range);
if (ShouldUseVectorCall) {
- if (NeedsMask) {
+ if (MaskPos.has_value()) {
// We have 2 cases that would require a mask:
// 1) The block needs to be predicated, either due to a conditional
// in the scalar loop or use of an active lane mask with
@@ -8516,17 +8381,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
- VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true);
- unsigned MaskPos = 0;
-
- for (const VFInfo &Info : VFDatabase::getMappings(*CI))
- if (Info.Shape == Shape) {
- assert(Info.isMasked() && "Vector function info shape mismatch");
- MaskPos = Info.getParamIndexForOptionalMask().value();
- break;
- }
-
- Ops.insert(Ops.begin() + MaskPos, Mask);
+ Ops.insert(Ops.begin() + *MaskPos, Mask);
}
return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
@@ -8747,8 +8602,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
}
if (auto *CI = dyn_cast<CastInst>(Instr)) {
- return toVPRecipeResult(
- new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI));
+ return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0],
+ CI->getType(), *CI));
}
return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
@@ -8758,27 +8613,26 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
- // Add assume instructions we need to drop to DeadInstructions, to prevent
- // them from being added to the VPlan.
- // TODO: We only need to drop assumes in blocks that get flattend. If the
- // control flow is preserved, we should keep them.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
- auto &ConditionalAssumes = Legal->getConditionalAssumes();
- DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
-
auto MaxVFTimes2 = MaxVF * 2;
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
VFRange SubRange = {VF, MaxVFTimes2};
- if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions))
- VPlans.push_back(std::move(*Plan));
+ if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
+ // Now optimize the initial VPlan.
+ if (!Plan->hasVF(ElementCount::getFixed(1)))
+ VPlanTransforms::truncateToMinimalBitwidths(
+ *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
+ VPlanTransforms::optimize(*Plan, *PSE.getSE());
+ assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
+ VPlans.push_back(std::move(Plan));
+ }
VF = SubRange.End;
}
}
// Add the necessary canonical IV and branch recipes required to control the
// loop.
-static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
- TailFoldingStyle Style) {
+static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
+ DebugLoc DL) {
Value *StartIdx = ConstantInt::get(IdxTy, 0);
auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
@@ -8790,102 +8644,24 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
// Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
// IV by VF * UF.
- bool HasNUW = Style == TailFoldingStyle::None;
auto *CanonicalIVIncrement =
- new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
- : VPInstruction::CanonicalIVIncrement,
- {CanonicalIVPHI}, DL, "index.next");
+ new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()},
+ {HasNUW, false}, DL, "index.next");
CanonicalIVPHI->addOperand(CanonicalIVIncrement);
VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
- if (useActiveLaneMaskForControlFlow(Style)) {
- // Create the active lane mask instruction in the vplan preheader.
- VPBasicBlock *VecPreheader =
- cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
-
- // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
- // we have to take unrolling into account. Each part needs to start at
- // Part * VF
- auto *CanonicalIVIncrementParts =
- new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
- : VPInstruction::CanonicalIVIncrementForPart,
- {StartV}, DL, "index.part.next");
- VecPreheader->appendRecipe(CanonicalIVIncrementParts);
-
- // Create the ActiveLaneMask instruction using the correct start values.
- VPValue *TC = Plan.getTripCount();
-
- VPValue *TripCount, *IncrementValue;
- if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
- // When avoiding a runtime check, the active.lane.mask inside the loop
- // uses a modified trip count and the induction variable increment is
- // done after the active.lane.mask intrinsic is called.
- auto *TCMinusVF =
- new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL);
- VecPreheader->appendRecipe(TCMinusVF);
- IncrementValue = CanonicalIVPHI;
- TripCount = TCMinusVF;
- } else {
- // When the loop is guarded by a runtime overflow check for the loop
- // induction variable increment by VF, we can increment the value before
- // the get.active.lane mask and use the unmodified tripcount.
- EB->appendRecipe(CanonicalIVIncrement);
- IncrementValue = CanonicalIVIncrement;
- TripCount = TC;
- }
-
- auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
- {CanonicalIVIncrementParts, TC}, DL,
- "active.lane.mask.entry");
- VecPreheader->appendRecipe(EntryALM);
-
- // Now create the ActiveLaneMaskPhi recipe in the main loop using the
- // preheader ActiveLaneMask instruction.
- auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
- Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
-
- // Create the active lane mask for the next iteration of the loop.
- CanonicalIVIncrementParts =
- new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
- : VPInstruction::CanonicalIVIncrementForPart,
- {IncrementValue}, DL);
- EB->appendRecipe(CanonicalIVIncrementParts);
-
- auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
- {CanonicalIVIncrementParts, TripCount}, DL,
- "active.lane.mask.next");
- EB->appendRecipe(ALM);
- LaneMaskPhi->addOperand(ALM);
-
- if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
- // Do the increment of the canonical IV after the active.lane.mask, because
- // that value is still based off %CanonicalIVPHI
- EB->appendRecipe(CanonicalIVIncrement);
- }
-
- // We have to invert the mask here because a true condition means jumping
- // to the exit block.
- auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
- EB->appendRecipe(NotMask);
-
- VPInstruction *BranchBack =
- new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
- EB->appendRecipe(BranchBack);
- } else {
- EB->appendRecipe(CanonicalIVIncrement);
+ EB->appendRecipe(CanonicalIVIncrement);
- // Add the BranchOnCount VPInstruction to the latch.
- VPInstruction *BranchBack = new VPInstruction(
- VPInstruction::BranchOnCount,
- {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
- EB->appendRecipe(BranchBack);
- }
+ // Add the BranchOnCount VPInstruction to the latch.
+ VPInstruction *BranchBack =
+ new VPInstruction(VPInstruction::BranchOnCount,
+ {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
+ EB->appendRecipe(BranchBack);
}
// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
// original exit block.
-static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
- VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
+static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
VPlan &Plan) {
BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
@@ -8902,8 +8678,8 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
}
}
-std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
- VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+VPlanPtr
+LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -8914,24 +8690,6 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// process after constructing the initial VPlan.
// ---------------------------------------------------------------------------
- for (const auto &Reduction : CM.getInLoopReductionChains()) {
- PHINode *Phi = Reduction.first;
- RecurKind Kind =
- Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
- const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
-
- RecipeBuilder.recordRecipeOf(Phi);
- for (const auto &R : ReductionOperations) {
- RecipeBuilder.recordRecipeOf(R);
- // For min/max reductions, where we have a pair of icmp/select, we also
- // need to record the ICmp recipe, so it can be removed later.
- assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
- "Only min/max recurrences allowed for inloop reductions");
- if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
- RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
- }
- }
-
// For each interleave group which is relevant for this (possibly trimmed)
// Range, add it to the set of groups to be later applied to the VPlan and add
// placeholders for its members' Recipes which we'll be replacing with a
@@ -8972,23 +8730,27 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
- auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
- VPBlockUtils::insertBlockAfter(TopRegion, Plan->getEntry());
- VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
- VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
+ Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
+ Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
// Don't use getDecisionAndClampRange here, because we don't know the UF
// so this function is better to be conservative, rather than to split
// it up into different VPlans.
+ // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
bool IVUpdateMayOverflow = false;
for (ElementCount VF : Range)
IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
- Instruction *DLInst =
- getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
- addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
- DLInst ? DLInst->getDebugLoc() : DebugLoc(),
- CM.getTailFoldingStyle(IVUpdateMayOverflow));
+ DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
+ TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
+ // When not folding the tail, we know that the induction increment will not
+ // overflow.
+ bool HasNUW = Style == TailFoldingStyle::None;
+ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
+
+ // Proactively create header mask. Masks for other blocks are created on
+ // demand.
+ RecipeBuilder.createHeaderMask(*Plan);
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
@@ -9005,14 +8767,8 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// Introduce each ingredient into VPlan.
// TODO: Model and preserve debug intrinsics in VPlan.
- for (Instruction &I : BB->instructionsWithoutDebug(false)) {
+ for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
Instruction *Instr = &I;
-
- // First filter out irrelevant instructions, to ensure no recipes are
- // built for them.
- if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
- continue;
-
SmallVector<VPValue *, 4> Operands;
auto *Phi = dyn_cast<PHINode>(Instr);
if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
@@ -9052,11 +8808,18 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
}
RecipeBuilder.setRecipe(Instr, Recipe);
- if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
- HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
- // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the
- // phi section of HeaderVPBB.
- assert(isa<TruncInst>(Instr));
+ if (isa<VPHeaderPHIRecipe>(Recipe)) {
+ // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
+ // the following cases, VPHeaderPHIRecipes may be created after non-phi
+ // recipes and need to be moved to the phi section of HeaderVPBB:
+ // * tail-folding (non-phi recipes computing the header mask are
+ // introduced earlier than regular header phi recipes, and should appear
+ // after them)
+ // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
+
+ assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
+ CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
+ "unexpected recipe needs moving");
Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
} else
VPBB->appendRecipe(Recipe);
@@ -9074,7 +8837,7 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// and there is nothing to fix from vector loop; phis should have incoming
// from scalar loop only.
} else
- addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
+ addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan);
assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
!Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
@@ -9088,8 +8851,7 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// ---------------------------------------------------------------------------
// Adjust the recipes for any inloop reductions.
- adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
- RecipeBuilder, Range.Start);
+ adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
@@ -9150,21 +8912,18 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// Sink users of fixed-order recurrence past the recipe defining the previous
// value and introduce FirstOrderRecurrenceSplice VPInstructions.
if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
- return std::nullopt;
-
- VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
- VPlanTransforms::removeRedundantInductionCasts(*Plan);
-
- VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
- VPlanTransforms::removeDeadRecipes(*Plan);
-
- VPlanTransforms::createAndOptimizeReplicateRegions(*Plan);
-
- VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
- VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
+ return nullptr;
- assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
- return std::make_optional(std::move(Plan));
+ if (useActiveLaneMask(Style)) {
+ // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
+ // TailFoldingStyle is visible there.
+ bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
+ bool WithoutRuntimeCheck =
+ Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+ VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
+ WithoutRuntimeCheck);
+ }
+ return Plan;
}
VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
@@ -9198,8 +8957,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
Term->eraseFromParent();
- addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
- CM.getTailFoldingStyle());
+ // Tail folding is not supported for outer loops, so the induction increment
+ // is guaranteed to not wrap.
+ bool HasNUW = true;
+ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
+ DebugLoc());
return Plan;
}
@@ -9211,105 +8973,211 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
void LoopVectorizationPlanner::adjustRecipesForReductions(
VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF) {
- for (const auto &Reduction : CM.getInLoopReductionChains()) {
- PHINode *Phi = Reduction.first;
- const RecurrenceDescriptor &RdxDesc =
- Legal->getReductionVars().find(Phi)->second;
- const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
-
- if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
+ VPBasicBlock *Header = Plan->getVectorLoopRegion()->getEntryBasicBlock();
+  // Gather all VPReductionPHIRecipes and sort them so that intermediate
+  // stores sunk outside of the loop keep the same order as they had in the
+  // original loop.
+ SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
+ for (VPRecipeBase &R : Header->phis()) {
+ if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
+ ReductionPHIList.emplace_back(ReductionPhi);
+ }
+ bool HasIntermediateStore = false;
+ stable_sort(ReductionPHIList,
+ [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
+ const VPReductionPHIRecipe *R2) {
+ auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
+ auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
+ HasIntermediateStore |= IS1 || IS2;
+
+ // If neither of the recipes has an intermediate store, keep the
+ // order the same.
+ if (!IS1 && !IS2)
+ return false;
+
+ // If only one of the recipes has an intermediate store, then
+ // move it towards the beginning of the list.
+ if (IS1 && !IS2)
+ return true;
+
+ if (!IS1 && IS2)
+ return false;
+
+ // If both recipes have an intermediate store, then the recipe
+ // with the later store should be processed earlier. So it
+ // should go to the beginning of the list.
+ return DT->dominates(IS2, IS1);
+ });
+
+ if (HasIntermediateStore && ReductionPHIList.size() > 1)
+ for (VPRecipeBase *R : ReductionPHIList)
+ R->moveBefore(*Header, Header->getFirstNonPhi());
+
+ SmallVector<VPReductionPHIRecipe *> InLoopReductionPhis;
+ for (VPRecipeBase &R : Header->phis()) {
+ auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
+ if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
continue;
+ InLoopReductionPhis.push_back(PhiR);
+ }
+
+ for (VPReductionPHIRecipe *PhiR : InLoopReductionPhis) {
+ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+ RecurKind Kind = RdxDesc.getRecurrenceKind();
+ assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
+ "AnyOf reductions are not allowed for in-loop reductions");
+
+ // Collect the chain of "link" recipes for the reduction starting at PhiR.
+ SetVector<VPRecipeBase *> Worklist;
+ Worklist.insert(PhiR);
+ for (unsigned I = 0; I != Worklist.size(); ++I) {
+ VPRecipeBase *Cur = Worklist[I];
+ for (VPUser *U : Cur->getVPSingleValue()->users()) {
+ auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
+ if (!UserRecipe)
+ continue;
+ assert(UserRecipe->getNumDefinedValues() == 1 &&
+ "recipes must define exactly one result value");
+ Worklist.insert(UserRecipe);
+ }
+ }
+
+ // Visit operation "Links" along the reduction chain top-down starting from
+ // the phi until LoopExitValue. We keep track of the previous item
+ // (PreviousLink) to tell which of the two operands of a Link will remain
+ // scalar and which will be reduced. For minmax by select(cmp), Link will be
+ // the select instructions.
+ VPRecipeBase *PreviousLink = PhiR; // Aka Worklist[0].
+ for (VPRecipeBase *CurrentLink : Worklist.getArrayRef().drop_front()) {
+ VPValue *PreviousLinkV = PreviousLink->getVPSingleValue();
+
+ Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
- // ReductionOperations are orders top-down from the phi's use to the
- // LoopExitValue. We keep a track of the previous item (the Chain) to tell
- // which of the two operands will remain scalar and which will be reduced.
- // For minmax the chain will be the select instructions.
- Instruction *Chain = Phi;
- for (Instruction *R : ReductionOperations) {
- VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
- RecurKind Kind = RdxDesc.getRecurrenceKind();
-
- VPValue *ChainOp = Plan->getVPValue(Chain);
- unsigned FirstOpId;
- assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
- "Only min/max recurrences allowed for inloop reductions");
+ // Index of the first operand which holds a non-mask vector operand.
+ unsigned IndexOfFirstOperand;
// Recognize a call to the llvm.fmuladd intrinsic.
bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
- assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
- "Expected instruction to be a call to the llvm.fmuladd intrinsic");
- if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
- assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
- "Expected to replace a VPWidenSelectSC");
- FirstOpId = 1;
+ VPValue *VecOp;
+ VPBasicBlock *LinkVPBB = CurrentLink->getParent();
+ if (IsFMulAdd) {
+ assert(
+ RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
+ "Expected instruction to be a call to the llvm.fmuladd intrinsic");
+ assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
+ isa<VPWidenCallRecipe>(CurrentLink)) &&
+ CurrentLink->getOperand(2) == PreviousLinkV &&
+ "expected a call where the previous link is the added operand");
+
+ // If the instruction is a call to the llvm.fmuladd intrinsic then we
+ // need to create an fmul recipe (multiplying the first two operands of
+ // the fmuladd together) to use as the vector operand for the fadd
+ // reduction.
+ VPInstruction *FMulRecipe = new VPInstruction(
+ Instruction::FMul,
+ {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
+ CurrentLinkI->getFastMathFlags());
+ LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
+ VecOp = FMulRecipe;
} else {
- assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
- (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
- "Expected to replace a VPWidenSC");
- FirstOpId = 0;
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
+ if (isa<VPWidenRecipe>(CurrentLink)) {
+ assert(isa<CmpInst>(CurrentLinkI) &&
+ "need to have the compare of the select");
+ continue;
+ }
+ assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
+ "must be a select recipe");
+ IndexOfFirstOperand = 1;
+ } else {
+ assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
+ "Expected to replace a VPWidenSC");
+ IndexOfFirstOperand = 0;
+ }
+ // Note that for non-commutable operands (cmp-selects), the semantics of
+ // the cmp-select are captured in the recurrence kind.
+ unsigned VecOpId =
+ CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLinkV
+ ? IndexOfFirstOperand + 1
+ : IndexOfFirstOperand;
+ VecOp = CurrentLink->getOperand(VecOpId);
+ assert(VecOp != PreviousLinkV &&
+ CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
+ (VecOpId - IndexOfFirstOperand)) ==
+ PreviousLinkV &&
+ "PreviousLinkV must be the operand other than VecOp");
}
- unsigned VecOpId =
- R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
- VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
+ BasicBlock *BB = CurrentLinkI->getParent();
VPValue *CondOp = nullptr;
- if (CM.blockNeedsPredicationForAnyReason(R->getParent())) {
+ if (CM.blockNeedsPredicationForAnyReason(BB)) {
VPBuilder::InsertPointGuard Guard(Builder);
- Builder.setInsertPoint(WidenRecipe->getParent(),
- WidenRecipe->getIterator());
- CondOp = RecipeBuilder.createBlockInMask(R->getParent(), *Plan);
+ Builder.setInsertPoint(CurrentLink);
+ CondOp = RecipeBuilder.createBlockInMask(BB, *Plan);
}
- if (IsFMulAdd) {
- // If the instruction is a call to the llvm.fmuladd intrinsic then we
- // need to create an fmul recipe to use as the vector operand for the
- // fadd reduction.
- VPInstruction *FMulRecipe = new VPInstruction(
- Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
- FMulRecipe->setFastMathFlags(R->getFastMathFlags());
- WidenRecipe->getParent()->insert(FMulRecipe,
- WidenRecipe->getIterator());
- VecOp = FMulRecipe;
- }
- VPReductionRecipe *RedRecipe =
- new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, &TTI);
- WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
- Plan->removeVPValueFor(R);
- Plan->addVPValue(R, RedRecipe);
+ VPReductionRecipe *RedRecipe = new VPReductionRecipe(
+ RdxDesc, CurrentLinkI, PreviousLinkV, VecOp, CondOp);
// Append the recipe to the end of the VPBasicBlock because we need to
      // ensure that it comes after all of its inputs, including CondOp.
- WidenRecipe->getParent()->appendRecipe(RedRecipe);
- WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
- WidenRecipe->eraseFromParent();
-
- if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
- VPRecipeBase *CompareRecipe =
- RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
- assert(isa<VPWidenRecipe>(CompareRecipe) &&
- "Expected to replace a VPWidenSC");
- assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
- "Expected no remaining users");
- CompareRecipe->eraseFromParent();
- }
- Chain = R;
+ // Note that this transformation may leave over dead recipes (including
+ // CurrentLink), which will be cleaned by a later VPlan transform.
+ LinkVPBB->appendRecipe(RedRecipe);
+ CurrentLink->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
+ PreviousLink = RedRecipe;
}
}
-
- // If tail is folded by masking, introduce selects between the phi
- // and the live-out instruction of each reduction, at the beginning of the
- // dedicated latch block.
- if (CM.foldTailByMasking()) {
- Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
+ Builder.setInsertPoint(&*LatchVPBB->begin());
for (VPRecipeBase &R :
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
- VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
- if (!PhiR || PhiR->isInLoop())
- continue;
+ VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
+ if (!PhiR || PhiR->isInLoop())
+ continue;
+
+ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+ auto *Result = PhiR->getBackedgeValue()->getDefiningRecipe();
+ // If tail is folded by masking, introduce selects between the phi
+ // and the live-out instruction of each reduction, at the beginning of the
+ // dedicated latch block.
+ if (CM.foldTailByMasking()) {
VPValue *Cond =
RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan);
VPValue *Red = PhiR->getBackedgeValue();
assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
"reduction recipe must be defined before latch");
- Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
+ FastMathFlags FMFs = RdxDesc.getFastMathFlags();
+ Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
+ Result =
+ PhiTy->isFloatingPointTy()
+ ? new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs)
+ : new VPInstruction(Instruction::Select, {Cond, Red, PhiR});
+ Result->insertBefore(&*Builder.getInsertPoint());
+ Red->replaceUsesWithIf(
+ Result->getVPSingleValue(),
+ [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
+ if (PreferPredicatedReductionSelect ||
+ TTI.preferPredicatedReductionSelect(
+ PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
+ TargetTransformInfo::ReductionFlags()))
+ PhiR->setOperand(1, Result->getVPSingleValue());
+ }
+ // If the vector reduction can be performed in a smaller type, we truncate
+ // then extend the loop exit value to enable InstCombine to evaluate the
+ // entire expression in the smaller type.
+ Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
+ if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
+ assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
+ Type *RdxTy = RdxDesc.getRecurrenceType();
+ auto *Trunc = new VPWidenCastRecipe(Instruction::Trunc,
+ Result->getVPSingleValue(), RdxTy);
+ auto *Extnd =
+ RdxDesc.isSigned()
+ ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
+ : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
+
+ Trunc->insertAfter(Result);
+ Extnd->insertAfter(Trunc);
+ Result->getVPSingleValue()->replaceAllUsesWith(Extnd);
+ Trunc->setOperand(0, Result->getVPSingleValue());
}
}
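
The fmuladd handling in the hunk above splits the intrinsic into a separate multiply that feeds the add reduction. A minimal standalone model of that decomposition, in plain C++ rather than LLVM code, with made-up input values:

// Standalone sketch, not LLVM code: an fmuladd-based reduction modelled as a
// separate multiply feeding the fadd chain, which is what the extra FMul
// VPInstruction provides as the vector operand of the VPReductionRecipe.
#include <cstdio>

int main() {
  const float A[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  const float B[4] = {0.5f, 0.5f, 0.5f, 0.5f};
  float Acc = 0.0f;           // reduction phi (the "previous link")
  for (int I = 0; I < 4; ++I) {
    float Mul = A[I] * B[I];  // the separately emitted multiply
    Acc = Acc + Mul;          // the fadd reduction link
  }
  std::printf("%f\n", Acc);   // prints 5.000000
  return 0;
}

The split keeps the reduction recipe itself a plain fadd chain; only the extra multiply is specific to fmuladd.
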
@@ -9347,107 +9215,6 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
- assert(!State.Instance && "Int or FP induction being replicated.");
-
- Value *Start = getStartValue()->getLiveInIRValue();
- const InductionDescriptor &ID = getInductionDescriptor();
- TruncInst *Trunc = getTruncInst();
- IRBuilderBase &Builder = State.Builder;
- assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
- assert(State.VF.isVector() && "must have vector VF");
-
- // The value from the original loop to which we are mapping the new induction
- // variable.
- Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
-
- // Fast-math-flags propagate from the original induction instruction.
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
- Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
-
- // Now do the actual transformations, and start with fetching the step value.
- Value *Step = State.get(getStepValue(), VPIteration(0, 0));
-
- assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
- "Expected either an induction phi-node or a truncate of it!");
-
- // Construct the initial value of the vector IV in the vector loop preheader
- auto CurrIP = Builder.saveIP();
- BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
- Builder.SetInsertPoint(VectorPH->getTerminator());
- if (isa<TruncInst>(EntryVal)) {
- assert(Start->getType()->isIntegerTy() &&
- "Truncation requires an integer type");
- auto *TruncType = cast<IntegerType>(EntryVal->getType());
- Step = Builder.CreateTrunc(Step, TruncType);
- Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
- }
-
- Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
- Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
- Value *SteppedStart = getStepVector(
- SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
-
- // We create vector phi nodes for both integer and floating-point induction
- // variables. Here, we determine the kind of arithmetic we will perform.
- Instruction::BinaryOps AddOp;
- Instruction::BinaryOps MulOp;
- if (Step->getType()->isIntegerTy()) {
- AddOp = Instruction::Add;
- MulOp = Instruction::Mul;
- } else {
- AddOp = ID.getInductionOpcode();
- MulOp = Instruction::FMul;
- }
-
- // Multiply the vectorization factor by the step using integer or
- // floating-point arithmetic as appropriate.
- Type *StepType = Step->getType();
- Value *RuntimeVF;
- if (Step->getType()->isFloatingPointTy())
- RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
- else
- RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
- Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
-
- // Create a vector splat to use in the induction update.
- //
- // FIXME: If the step is non-constant, we create the vector splat with
- // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
- // handle a constant vector splat.
- Value *SplatVF = isa<Constant>(Mul)
- ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
- : Builder.CreateVectorSplat(State.VF, Mul);
- Builder.restoreIP(CurrIP);
-
- // We may need to add the step a number of times, depending on the unroll
- // factor. The last of those goes into the PHI.
- PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
- &*State.CFG.PrevBB->getFirstInsertionPt());
- VecInd->setDebugLoc(EntryVal->getDebugLoc());
- Instruction *LastInduction = VecInd;
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- State.set(this, LastInduction, Part);
-
- if (isa<TruncInst>(EntryVal))
- State.addMetadata(LastInduction, EntryVal);
-
- LastInduction = cast<Instruction>(
- Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
- LastInduction->setDebugLoc(EntryVal->getDebugLoc());
- }
-
- LastInduction->setName("vec.ind.next");
- VecInd->addIncoming(SteppedStart, VectorPH);
- // Add induction update using an incorrect block temporarily. The phi node
- // will be fixed after VPlan execution. Note that at this point the latch
- // block cannot be used, as it does not exist yet.
- // TODO: Model increment value in VPlan, by turning the recipe into a
- // multi-def and a subclass of VPHeaderPHIRecipe.
- VecInd->addIncoming(LastInduction, VectorPH);
-}
-
void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
"Not a pointer induction according to InductionDescriptor!");
@@ -9480,7 +9247,8 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
Value *SclrGep = emitTransformedIndex(
- State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
+ State.Builder, GlobalIdx, IndDesc.getStartValue(), Step,
+ IndDesc.getKind(), IndDesc.getInductionBinOp());
SclrGep->setName("next.gep");
State.set(this, SclrGep, VPIteration(Part, Lane));
}
@@ -9547,41 +9315,26 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
// Fast-math-flags propagate from the original induction instruction.
IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
- if (IndDesc.getInductionBinOp() &&
- isa<FPMathOperator>(IndDesc.getInductionBinOp()))
- State.Builder.setFastMathFlags(
- IndDesc.getInductionBinOp()->getFastMathFlags());
+ if (FPBinOp)
+ State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
Value *Step = State.get(getStepValue(), VPIteration(0, 0));
Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
- Value *DerivedIV =
- emitTransformedIndex(State.Builder, CanonicalIV,
- getStartValue()->getLiveInIRValue(), Step, IndDesc);
+ Value *DerivedIV = emitTransformedIndex(
+ State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
+ Kind, cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName("offset.idx");
- if (ResultTy != DerivedIV->getType()) {
- assert(Step->getType()->isIntegerTy() &&
+ if (TruncResultTy) {
+ assert(TruncResultTy != DerivedIV->getType() &&
+ Step->getType()->isIntegerTy() &&
"Truncation requires an integer step");
- DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy);
+ DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy);
}
assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
State.set(this, DerivedIV, VPIteration(0, 0));
}
-void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
- // Fast-math-flags propagate from the original induction instruction.
- IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
- if (IndDesc.getInductionBinOp() &&
- isa<FPMathOperator>(IndDesc.getInductionBinOp()))
- State.Builder.setFastMathFlags(
- IndDesc.getInductionBinOp()->getFastMathFlags());
-
- Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
- Value *Step = State.get(getStepValue(), VPIteration(0, 0));
-
- buildScalarSteps(BaseIV, Step, IndDesc, this, State);
-}
-
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Interleave group being replicated.");
State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
@@ -9592,48 +9345,51 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
void VPReductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Reduction being replicated.");
Value *PrevInChain = State.get(getChainOp(), 0);
- RecurKind Kind = RdxDesc->getRecurrenceKind();
- bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
+ RecurKind Kind = RdxDesc.getRecurrenceKind();
+ bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
// Propagate the fast-math flags carried by the underlying instruction.
IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
- State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
+ State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *NewVecOp = State.get(getVecOp(), Part);
if (VPValue *Cond = getCondOp()) {
- Value *NewCond = State.get(Cond, Part);
- VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
- Value *Iden = RdxDesc->getRecurrenceIdentity(
- Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
- Value *IdenVec =
- State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
- Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
+ Value *NewCond = State.VF.isVector() ? State.get(Cond, Part)
+ : State.get(Cond, {Part, 0});
+ VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
+ Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
+ Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
+ RdxDesc.getFastMathFlags());
+ if (State.VF.isVector()) {
+ Iden =
+ State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
+ }
+
+ Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
NewVecOp = Select;
}
Value *NewRed;
Value *NextInChain;
if (IsOrdered) {
if (State.VF.isVector())
- NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
+ NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
PrevInChain);
else
NewRed = State.Builder.CreateBinOp(
- (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
+ (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
NewVecOp);
PrevInChain = NewRed;
} else {
PrevInChain = State.get(getChainOp(), Part);
- NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
+ NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
}
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
- NextInChain =
- createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
- NewRed, PrevInChain);
+ NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
+ NewRed, PrevInChain);
} else if (IsOrdered)
NextInChain = NewRed;
else
NextInChain = State.Builder.CreateBinOp(
- (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
- PrevInChain);
+ (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
State.set(this, NextInChain, Part);
}
}
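
The CondOp path in VPReductionRecipe::execute above substitutes the recurrence identity for lanes disabled by the mask before reducing. A minimal standalone model of that idea for an integer add reduction, with made-up values:

// Standalone sketch, not LLVM code: masked-out lanes contribute the
// recurrence identity (0 for add), so the select(NewCond, NewVecOp, Iden)
// above leaves the reduction result unchanged for those lanes.
#include <cstdio>

int main() {
  const int Vec[4] = {10, 20, 30, 40};
  const bool Mask[4] = {true, false, true, false};
  const int Identity = 0;    // identity element of the add recurrence
  int Red = 0;               // chain value flowing into the reduction
  for (int I = 0; I < 4; ++I)
    Red += Mask[I] ? Vec[I] : Identity;
  std::printf("%d\n", Red);  // prints 40
  return 0;
}
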
@@ -9652,7 +9408,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
VectorType::get(UI->getType(), State.VF));
State.set(this, Poison, State.Instance->Part);
}
- State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
+ State.packScalarIntoVectorValue(this, *State.Instance);
}
return;
}
@@ -9718,9 +9474,16 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
bool isMaskRequired = getMask();
- if (isMaskRequired)
- for (unsigned Part = 0; Part < State.UF; ++Part)
- BlockInMaskParts[Part] = State.get(getMask(), Part);
+ if (isMaskRequired) {
+    // Mask reversal is only needed for non-all-one (null) masks, as reverse of a
+ // null all-one mask is a null mask.
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *Mask = State.get(getMask(), Part);
+ if (isReverse())
+ Mask = Builder.CreateVectorReverse(Mask, "reverse");
+ BlockInMaskParts[Part] = Mask;
+ }
+ }
const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
// Calculate the pointer for the specific unroll-part.
@@ -9731,7 +9494,8 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
const DataLayout &DL =
Builder.GetInsertBlock()->getModule()->getDataLayout();
Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0)
- ? DL.getIndexType(ScalarDataTy->getPointerTo())
+ ? DL.getIndexType(PointerType::getUnqual(
+ ScalarDataTy->getContext()))
: Builder.getInt32Ty();
bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
@@ -9751,21 +9515,17 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
PartPtr =
Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
- if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
- BlockInMaskParts[Part] =
- Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
} else {
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
}
- unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
- return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+ return PartPtr;
};
// Handle Stores:
if (SI) {
- State.setDebugLocFromInst(SI);
+ State.setDebugLocFrom(SI->getDebugLoc());
for (unsigned Part = 0; Part < State.UF; ++Part) {
Instruction *NewSI = nullptr;
@@ -9798,7 +9558,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
// Handle loads.
assert(LI && "Must have a load instruction");
- State.setDebugLocFromInst(LI);
+ State.setDebugLocFrom(LI->getDebugLoc());
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *NewLI;
if (CreateGatherScatter) {
@@ -9877,95 +9637,6 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
return CM_ScalarEpilogueAllowed;
}
-Value *VPTransformState::get(VPValue *Def, unsigned Part) {
- // If Values have been set for this Def return the one relevant for \p Part.
- if (hasVectorValue(Def, Part))
- return Data.PerPartOutput[Def][Part];
-
- auto GetBroadcastInstrs = [this, Def](Value *V) {
- bool SafeToHoist = Def->isDefinedOutsideVectorRegions();
- if (VF.isScalar())
- return V;
- // Place the code for broadcasting invariant variables in the new preheader.
- IRBuilder<>::InsertPointGuard Guard(Builder);
- if (SafeToHoist) {
- BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>(
- Plan->getVectorLoopRegion()->getSinglePredecessor())];
- if (LoopVectorPreHeader)
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
- }
-
- // Place the code for broadcasting invariant variables in the new preheader.
- // Broadcast the scalar into all locations in the vector.
- Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
-
- return Shuf;
- };
-
- if (!hasScalarValue(Def, {Part, 0})) {
- Value *IRV = Def->getLiveInIRValue();
- Value *B = GetBroadcastInstrs(IRV);
- set(Def, B, Part);
- return B;
- }
-
- Value *ScalarValue = get(Def, {Part, 0});
- // If we aren't vectorizing, we can just copy the scalar map values over
- // to the vector map.
- if (VF.isScalar()) {
- set(Def, ScalarValue, Part);
- return ScalarValue;
- }
-
- bool IsUniform = vputils::isUniformAfterVectorization(Def);
-
- unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
- // Check if there is a scalar value for the selected lane.
- if (!hasScalarValue(Def, {Part, LastLane})) {
- // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
- // VPExpandSCEVRecipes can also be uniform.
- assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
- isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) ||
- isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
- "unexpected recipe found to be invariant");
- IsUniform = true;
- LastLane = 0;
- }
-
- auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
- // Set the insert point after the last scalarized instruction or after the
- // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
- // will directly follow the scalar definitions.
- auto OldIP = Builder.saveIP();
- auto NewIP =
- isa<PHINode>(LastInst)
- ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
- : std::next(BasicBlock::iterator(LastInst));
- Builder.SetInsertPoint(&*NewIP);
-
- // However, if we are vectorizing, we need to construct the vector values.
- // If the value is known to be uniform after vectorization, we can just
- // broadcast the scalar value corresponding to lane zero for each unroll
- // iteration. Otherwise, we construct the vector values using
- // insertelement instructions. Since the resulting vectors are stored in
- // State, we will only generate the insertelements once.
- Value *VectorValue = nullptr;
- if (IsUniform) {
- VectorValue = GetBroadcastInstrs(ScalarValue);
- set(Def, VectorValue, Part);
- } else {
- // Initialize packing with insertelements to start from undef.
- assert(!VF.isScalable() && "VF is assumed to be non scalable.");
- Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
- set(Def, Undef, Part);
- for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
- ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
- VectorValue = get(Def, Part);
- }
- Builder.restoreIP(OldIP);
- return VectorValue;
-}
-
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
@@ -9994,7 +9665,8 @@ static bool processLoopInVPlanNativePath(
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
- LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, LVL, CM, IAI, PSE, Hints, ORE);
+ LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
+ ORE);
// Get user vectorization factor.
ElementCount UserVF = Hints.getWidth();
@@ -10013,8 +9685,10 @@ static bool processLoopInVPlanNativePath(
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
{
+ bool AddBranchWeights =
+ hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
- F->getParent()->getDataLayout());
+ F->getParent()->getDataLayout(), AddBranchWeights);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -10022,6 +9696,8 @@ static bool processLoopInVPlanNativePath(
LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
}
+ reportVectorization(ORE, L, VF, 1);
+
// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized();
assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
@@ -10076,7 +9752,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
VectorizationFactor &VF,
std::optional<unsigned> VScale, Loop *L,
- ScalarEvolution &SE) {
+ ScalarEvolution &SE,
+ ScalarEpilogueLowering SEL) {
InstructionCost CheckCost = Checks.getCost();
if (!CheckCost.isValid())
return false;
@@ -10146,11 +9823,13 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
// RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
double MinTC2 = RtC * 10 / ScalarC;
- // Now pick the larger minimum. If it is not a multiple of VF, choose the
- // next closest multiple of VF. This should partly compensate for ignoring
- // the epilogue cost.
+ // Now pick the larger minimum. If it is not a multiple of VF and a scalar
+ // epilogue is allowed, choose the next closest multiple of VF. This should
+ // partly compensate for ignoring the epilogue cost.
uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
- VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
+ if (SEL == CM_ScalarEpilogueAllowed)
+ MinTC = alignTo(MinTC, IntVF);
+ VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
LLVM_DEBUG(
dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
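
With made-up example costs, the trip-count threshold computed in this hunk works out as follows. RtC, ScalarC, MinTC1 and the vectorization factor are treated as given inputs; the concrete numbers are purely illustrative:

// Standalone sketch with made-up numbers: the minimum trip count for which
// the runtime checks pay off, rounded up to a multiple of VF only when a
// scalar epilogue is allowed (per the comment above, the rounding partly
// compensates for the ignored epilogue cost).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  double RtC = 200.0;    // assumed cost of the runtime checks
  double ScalarC = 4.0;  // assumed cost of one scalar loop iteration
  double MinTC1 = 32.0;  // assumed first bound, computed earlier in the code
  double MinTC2 = RtC * 10 / ScalarC;                    // = 500
  uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));  // = 500
  uint64_t IntVF = 8;    // assumed vectorization factor
  bool ScalarEpilogueAllowed = true;
  if (ScalarEpilogueAllowed)
    MinTC = (MinTC + IntVF - 1) / IntVF * IntVF;         // alignTo -> 504
  std::printf("MinProfitableTripCount = %llu\n", (unsigned long long)MinTC);
  return 0;
}
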
@@ -10270,7 +9949,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
else {
if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
LLVM_DEBUG(dbgs() << "\n");
- SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+ // Predicate tail-folded loops are efficient even when the loop
+ // iteration count is low. However, setting the epilogue policy to
+ // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
+ // with runtime checks. It's more effective to let
+ // `areRuntimeChecksProfitable` determine if vectorization is beneficial
+ // for the loop.
+ if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
+ SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
} else {
LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
"small to consider vectorizing.\n");
@@ -10334,7 +10020,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
F, &Hints, IAI);
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
+ LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
ORE);
// Get user vectorization factor and interleave count.
@@ -10347,8 +10033,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
+ bool AddBranchWeights =
+ hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
- F->getParent()->getDataLayout());
+ F->getParent()->getDataLayout(), AddBranchWeights);
if (MaybeVF) {
VF = *MaybeVF;
// Select the interleave count.
@@ -10365,7 +10053,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
if (!ForceVectorization &&
!areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
- *PSE.getSE())) {
+ *PSE.getSE(), SEL)) {
ORE->emit([&]() {
return OptimizationRemarkAnalysisAliasing(
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
@@ -10587,13 +10275,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
DisableRuntimeUnroll = true;
}
// Report the vectorization decision.
- ORE->emit([&]() {
- return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
- L->getHeader())
- << "vectorized loop (vectorization width: "
- << NV("VectorizationFactor", VF.Width)
- << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
- });
+ reportVectorization(ORE, L, VF, IC);
}
if (ORE->allowExtraAnalysis(LV_NAME))
@@ -10676,8 +10358,14 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
Changed |= CFGChanged |= processLoop(L);
- if (Changed)
+ if (Changed) {
LAIs->clear();
+
+#ifndef NDEBUG
+ if (VerifySCEV)
+ SE->verify();
+#endif
+ }
}
// Process each loop nest in the function.
@@ -10725,10 +10413,6 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
PA.preserve<LoopAnalysis>();
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<ScalarEvolutionAnalysis>();
-
-#ifdef EXPENSIVE_CHECKS
- SE.verify();
-#endif
}
if (Result.MadeCFGChange) {
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9870ffbb586c..9d799124074c 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -19,7 +19,6 @@
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
@@ -34,6 +33,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
@@ -97,7 +97,6 @@
#include <string>
#include <tuple>
#include <utility>
-#include <vector>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -108,8 +107,9 @@ using namespace slpvectorizer;
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
-cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
- cl::desc("Run the SLP vectorization passes"));
+static cl::opt<bool>
+ RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
+ cl::desc("Run the SLP vectorization passes"));
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
@@ -140,10 +140,6 @@ static cl::opt<unsigned>
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
-static cl::opt<int>
-MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
- cl::desc("Maximum depth of the lookup for consecutive stores."));
-
/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
@@ -232,6 +228,17 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
return isConstant(I->getOperand(2));
}
+#if !defined(NDEBUG)
+/// Print a short descriptor of the instruction bundle suitable for debug output.
+static std::string shortBundleName(ArrayRef<Value *> VL) {
+ std::string Result;
+ raw_string_ostream OS(Result);
+ OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
+ OS.flush();
+ return Result;
+}
+#endif
+
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
@@ -384,8 +391,10 @@ static SmallBitVector isUndefVector(const Value *V,
if (isa<T>(II->getOperand(1)))
continue;
std::optional<unsigned> Idx = getInsertIndex(II);
- if (!Idx)
- continue;
+ if (!Idx) {
+ Res.reset();
+ return Res;
+ }
if (*Idx < UseMask.size() && !UseMask.test(*Idx))
Res.reset(*Idx);
}
@@ -429,26 +438,6 @@ static SmallBitVector isUndefVector(const Value *V,
/// i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
-/// We convert this initially to something like:
-/// %x0 = extractelement <4 x i8> %x, i32 0
-/// %x3 = extractelement <4 x i8> %x, i32 3
-/// %y1 = extractelement <4 x i8> %y, i32 1
-/// %y2 = extractelement <4 x i8> %y, i32 2
-/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
-/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
-/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
-/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
-/// %5 = mul <4 x i8> %4, %4
-/// %6 = extractelement <4 x i8> %5, i32 0
-/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
-/// %7 = extractelement <4 x i8> %5, i32 1
-/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
-/// %8 = extractelement <4 x i8> %5, i32 2
-/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
-/// %9 = extractelement <4 x i8> %5, i32 3
-/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
-/// ret <4 x i8> %ins4
-/// InstCombiner transforms this into a shuffle and vector mul
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
@@ -539,117 +528,6 @@ static std::optional<unsigned> getExtractIndex(Instruction *E) {
return *EI->idx_begin();
}
-/// Tries to find extractelement instructions with constant indices from fixed
-/// vector type and gather such instructions into a bunch, which highly likely
-/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
-/// successful, the matched scalars are replaced by poison values in \p VL for
-/// future analysis.
-static std::optional<TTI::ShuffleKind>
-tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
- SmallVectorImpl<int> &Mask) {
- // Scan list of gathered scalars for extractelements that can be represented
- // as shuffles.
- MapVector<Value *, SmallVector<int>> VectorOpToIdx;
- SmallVector<int> UndefVectorExtracts;
- for (int I = 0, E = VL.size(); I < E; ++I) {
- auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
- if (!EI) {
- if (isa<UndefValue>(VL[I]))
- UndefVectorExtracts.push_back(I);
- continue;
- }
- auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
- if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
- continue;
- std::optional<unsigned> Idx = getExtractIndex(EI);
- // Undefined index.
- if (!Idx) {
- UndefVectorExtracts.push_back(I);
- continue;
- }
- SmallBitVector ExtractMask(VecTy->getNumElements(), true);
- ExtractMask.reset(*Idx);
- if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
- UndefVectorExtracts.push_back(I);
- continue;
- }
- VectorOpToIdx[EI->getVectorOperand()].push_back(I);
- }
- // Sort the vector operands by the maximum number of uses in extractelements.
- MapVector<unsigned, SmallVector<Value *>> VFToVector;
- for (const auto &Data : VectorOpToIdx)
- VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
- .push_back(Data.first);
- for (auto &Data : VFToVector) {
- stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
- return VectorOpToIdx.find(V1)->second.size() >
- VectorOpToIdx.find(V2)->second.size();
- });
- }
- // Find the best pair of the vectors with the same number of elements or a
- // single vector.
- const int UndefSz = UndefVectorExtracts.size();
- unsigned SingleMax = 0;
- Value *SingleVec = nullptr;
- unsigned PairMax = 0;
- std::pair<Value *, Value *> PairVec(nullptr, nullptr);
- for (auto &Data : VFToVector) {
- Value *V1 = Data.second.front();
- if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
- SingleMax = VectorOpToIdx[V1].size() + UndefSz;
- SingleVec = V1;
- }
- Value *V2 = nullptr;
- if (Data.second.size() > 1)
- V2 = *std::next(Data.second.begin());
- if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
- UndefSz) {
- PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
- PairVec = std::make_pair(V1, V2);
- }
- }
- if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
- return std::nullopt;
- // Check if better to perform a shuffle of 2 vectors or just of a single
- // vector.
- SmallVector<Value *> SavedVL(VL.begin(), VL.end());
- SmallVector<Value *> GatheredExtracts(
- VL.size(), PoisonValue::get(VL.front()->getType()));
- if (SingleMax >= PairMax && SingleMax) {
- for (int Idx : VectorOpToIdx[SingleVec])
- std::swap(GatheredExtracts[Idx], VL[Idx]);
- } else {
- for (Value *V : {PairVec.first, PairVec.second})
- for (int Idx : VectorOpToIdx[V])
- std::swap(GatheredExtracts[Idx], VL[Idx]);
- }
- // Add extracts from undefs too.
- for (int Idx : UndefVectorExtracts)
- std::swap(GatheredExtracts[Idx], VL[Idx]);
- // Check that gather of extractelements can be represented as just a
- // shuffle of a single/two vectors the scalars are extracted from.
- std::optional<TTI::ShuffleKind> Res =
- isFixedVectorShuffle(GatheredExtracts, Mask);
- if (!Res) {
- // TODO: try to check other subsets if possible.
- // Restore the original VL if attempt was not successful.
- VL.swap(SavedVL);
- return std::nullopt;
- }
- // Restore unused scalars from mask, if some of the extractelements were not
- // selected for shuffle.
- for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
- auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
- if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
- !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
- is_contained(UndefVectorExtracts, I))
- continue;
- if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]))
- std::swap(VL[I], GatheredExtracts[I]);
- }
- return Res;
-}
-
namespace {
/// Main data required for vectorization of instructions.
@@ -695,7 +573,7 @@ static Value *isOneOf(const InstructionsState &S, Value *Op) {
return S.OpValue;
}
-/// \returns true if \p Opcode is allowed as part of of the main/alternate
+/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// Example of unsupported opcode is SDIV that can potentially cause UB if the
@@ -889,18 +767,14 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
- Type *Ty = VL[0]->getType();
- for (int i = 1, e = VL.size(); i < e; i++)
- if (VL[i]->getType() != Ty)
- return false;
-
- return true;
+ Type *Ty = VL.front()->getType();
+ return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
}
/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
-static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
- TargetLibraryInfo *TLI) {
+static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
+ TargetLibraryInfo *TLI) {
unsigned Opcode = UserInst->getOpcode();
switch (Opcode) {
case Instruction::Load: {
@@ -914,11 +788,10 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
case Instruction::Call: {
CallInst *CI = cast<CallInst>(UserInst);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
- if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
- return (CI->getArgOperand(i) == Scalar);
- }
- [[fallthrough]];
+ return any_of(enumerate(CI->args()), [&](auto &&Arg) {
+ return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
+ Arg.value().get() == Scalar;
+ });
}
default:
return false;
@@ -1181,6 +1054,7 @@ public:
void deleteTree() {
VectorizableTree.clear();
ScalarToTreeEntry.clear();
+ MultiNodeScalars.clear();
MustGather.clear();
EntryToLastInstruction.clear();
ExternalUses.clear();
@@ -1273,7 +1147,7 @@ public:
/// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
///
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
- unsigned canMapToVector(Type *T, const DataLayout &DL) const;
+ unsigned canMapToVector(Type *T) const;
/// \returns True if the VectorizableTree is both tiny and not fully
/// vectorizable. We do not vectorize such trees.
@@ -1324,6 +1198,9 @@ public:
}
LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
+ bool operator == (const EdgeInfo &Other) const {
+ return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
+ }
};
/// A helper class used for scoring candidates for two consecutive lanes.
@@ -1764,7 +1641,7 @@ public:
auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
return 0;
- return R.areAllUsersVectorized(IdxLaneI, std::nullopt)
+ return R.areAllUsersVectorized(IdxLaneI)
? LookAheadHeuristics::ScoreAllUserVectorized
: 0;
}
@@ -1941,7 +1818,7 @@ public:
HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
} else if (NumFreeOpsHash.NumOfAPOs == Min &&
NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
- auto It = HashMap.find(NumFreeOpsHash.Hash);
+ auto *It = HashMap.find(NumFreeOpsHash.Hash);
if (It == HashMap.end())
HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
else
@@ -2203,7 +2080,7 @@ public:
for (int Pass = 0; Pass != 2; ++Pass) {
      // Check if no need to reorder operands since they are a perfect or
// shuffled diamond match.
- // Need to to do it to avoid extra external use cost counting for
+ // Need to do it to avoid extra external use cost counting for
// shuffled matches, which may cause regressions.
if (SkipReordering())
break;
@@ -2388,6 +2265,18 @@ public:
~BoUpSLP();
private:
+ /// Determine if a vectorized value \p V in can be demoted to
+ /// a smaller type with a truncation. We collect the values that will be
+ /// demoted in ToDemote and additional roots that require investigating in
+ /// Roots.
+ /// \param DemotedConsts list of Instruction/OperandIndex pairs that are
+ /// constant and to be demoted. Required to correctly identify constant nodes
+ /// to be demoted.
+ bool collectValuesToDemote(
+ Value *V, SmallVectorImpl<Value *> &ToDemote,
+ DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
+ SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const;
+
/// Check if the operands on the edges \p Edges of the \p UserTE allows
/// reordering (i.e. the operands can be reordered because they have only one
/// user and reordarable).
@@ -2410,12 +2299,25 @@ private:
TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
TreeEntry *TE = nullptr;
- const auto *It = find_if(VL, [this, &TE](Value *V) {
+ const auto *It = find_if(VL, [&](Value *V) {
TE = getTreeEntry(V);
- return TE;
+ if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
+ return true;
+ auto It = MultiNodeScalars.find(V);
+ if (It != MultiNodeScalars.end()) {
+ for (TreeEntry *E : It->second) {
+ if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
+ TE = E;
+ return true;
+ }
+ }
+ }
+ return false;
});
- if (It != VL.end() && TE->isSame(VL))
+ if (It != VL.end()) {
+ assert(TE->isSame(VL) && "Expected same scalars.");
return TE;
+ }
return nullptr;
}
@@ -2428,13 +2330,16 @@ private:
}
/// Checks if all users of \p I are the part of the vectorization tree.
- bool areAllUsersVectorized(Instruction *I,
- ArrayRef<Value *> VectorizedVals) const;
+ bool areAllUsersVectorized(
+ Instruction *I,
+ const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
/// Return information about the vector formed for the specified index
/// of a vector of (the same) instruction.
- TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> VL,
- unsigned OpIdx);
+ TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
+
+  /// \returns the graph entry for the \p Idx operand of the \p E entry.
+ const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
/// \returns the cost of the vectorizable entry.
InstructionCost getEntryCost(const TreeEntry *E,
@@ -2450,15 +2355,22 @@ private:
/// vector) and sets \p CurrentOrder to the identity permutation; otherwise
/// returns false, setting \p CurrentOrder to either an empty vector or a
/// non-identity permutation that allows to reuse extract instructions.
+ /// \param ResizeAllowed indicates whether it is allowed to handle subvector
+ /// extract order.
bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
- SmallVectorImpl<unsigned> &CurrentOrder) const;
+ SmallVectorImpl<unsigned> &CurrentOrder,
+ bool ResizeAllowed = false) const;
/// Vectorize a single entry in the tree.
- Value *vectorizeTree(TreeEntry *E);
+ /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
+ /// avoid issues with def-use order.
+ Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
/// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
/// \p E.
- Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
+ /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
+ /// avoid issues with def-use order.
+ Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
/// Create a new vector from a list of scalar values. Produces a sequence
/// which exploits values reused across lanes, and arranges the inserts
@@ -2477,17 +2389,50 @@ private:
/// instruction in the list).
Instruction &getLastInstructionInBundle(const TreeEntry *E);
- /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
- /// tree entries.
+ /// Tries to find extractelement instructions with constant indices from fixed
+ /// vector type and gather such instructions into a bunch, which highly likely
+ /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
+ /// was successful, the matched scalars are replaced by poison values in \p VL
+ /// for future analysis.
+ std::optional<TargetTransformInfo::ShuffleKind>
+ tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
+ SmallVectorImpl<int> &Mask) const;
+
+ /// Tries to find extractelement instructions with constant indices from fixed
+ /// vector type and gather such instructions into a bunch, which highly likely
+ /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
+ /// was successful, the matched scalars are replaced by poison values in \p VL
+ /// for future analysis.
+ SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
+ tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
+ SmallVectorImpl<int> &Mask,
+ unsigned NumParts) const;
+
+ /// Checks if the gathered \p VL can be represented as a single register
+ /// shuffle(s) of previous tree entries.
/// \param TE Tree entry checked for permutation.
/// \param VL List of scalars (a subset of the TE scalar), checked for
- /// permutations.
+ /// permutations. Must form single-register vector.
/// \returns ShuffleKind, if gathered values can be represented as shuffles of
- /// previous tree entries. \p Mask is filled with the shuffle mask.
+ /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
std::optional<TargetTransformInfo::ShuffleKind>
- isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
- SmallVectorImpl<int> &Mask,
- SmallVectorImpl<const TreeEntry *> &Entries);
+ isGatherShuffledSingleRegisterEntry(
+ const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
+ SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part);
+
+ /// Checks if the gathered \p VL can be represented as multi-register
+ /// shuffle(s) of previous tree entries.
+ /// \param TE Tree entry checked for permutation.
+ /// \param VL List of scalars (a subset of the TE scalar), checked for
+ /// permutations.
+ /// \returns per-register series of ShuffleKind, if gathered values can be
+ /// represented as shuffles of previous tree entries. \p Mask is filled with
+ /// the shuffle mask (also on per-register base).
+ SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
+ isGatherShuffledEntry(
+ const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
+ unsigned NumParts);
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
@@ -2517,14 +2462,14 @@ private:
/// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
/// users of \p TE and collects the stores. It returns the map from the store
/// pointers to the collected stores.
- DenseMap<Value *, SmallVector<StoreInst *, 4>>
+ DenseMap<Value *, SmallVector<StoreInst *>>
collectUserStores(const BoUpSLP::TreeEntry *TE) const;
/// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
- /// stores in \p StoresVec can form a vector instruction. If so it returns true
- /// and populates \p ReorderIndices with the shuffle indices of the the stores
- /// when compared to the sorted vector.
- bool canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
+ /// stores in \p StoresVec can form a vector instruction. If so it returns
+ /// true and populates \p ReorderIndices with the shuffle indices of the
+ /// stores when compared to the sorted vector.
+ bool canFormVector(ArrayRef<StoreInst *> StoresVec,
OrdersType &ReorderIndices) const;
/// Iterates through the users of \p TE, looking for scalar stores that can be
@@ -2621,10 +2566,18 @@ private:
/// The Scalars are vectorized into this value. It is initialized to Null.
WeakTrackingVH VectorizedValue = nullptr;
+ /// New vector phi instructions emitted for the vectorized phi nodes.
+ PHINode *PHI = nullptr;
+
/// Do we need to gather this sequence or vectorize it
/// (either with vector instruction or with scatter/gather
/// intrinsics for store/load)?
- enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
+ enum EntryState {
+ Vectorize,
+ ScatterVectorize,
+ PossibleStridedVectorize,
+ NeedToGather
+ };
EntryState State;
/// Does this sequence require some shuffling?
@@ -2772,6 +2725,14 @@ private:
return FoundLane;
}
+ /// Build a shuffle mask for graph entry which represents a merge of main
+ /// and alternate operations.
+ void
+ buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
+ SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<Value *> *OpScalars = nullptr,
+ SmallVectorImpl<Value *> *AltScalars = nullptr) const;
+
#ifndef NDEBUG
/// Debug printer.
LLVM_DUMP_METHOD void dump() const {
@@ -2792,6 +2753,9 @@ private:
case ScatterVectorize:
dbgs() << "ScatterVectorize\n";
break;
+ case PossibleStridedVectorize:
+ dbgs() << "PossibleStridedVectorize\n";
+ break;
case NeedToGather:
dbgs() << "NeedToGather\n";
break;
@@ -2892,7 +2856,14 @@ private:
}
if (Last->State != TreeEntry::NeedToGather) {
for (Value *V : VL) {
- assert(!getTreeEntry(V) && "Scalar already in tree!");
+ const TreeEntry *TE = getTreeEntry(V);
+ assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
+ "Scalar already in tree!");
+ if (TE) {
+ if (TE != Last)
+ MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
+ continue;
+ }
ScalarToTreeEntry[V] = Last;
}
// Update the scheduler bundle to point to this TreeEntry.
@@ -2905,7 +2876,8 @@ private:
for (Value *V : VL) {
if (doesNotNeedToBeScheduled(V))
continue;
- assert(BundleMember && "Unexpected end of bundle.");
+ if (!BundleMember)
+ continue;
BundleMember->TE = Last;
BundleMember = BundleMember->NextInBundle;
}
@@ -2913,6 +2885,10 @@ private:
assert(!BundleMember && "Bundle and VL out of sync");
} else {
MustGather.insert(VL.begin(), VL.end());
+ // Build a map for gathered scalars to the nodes where they are used.
+ for (Value *V : VL)
+ if (!isConstant(V))
+ ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
}
if (UserTreeIdx.UserTE)
@@ -2950,6 +2926,10 @@ private:
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
+  /// List of scalars used in several vectorize nodes, and the list of those
+  /// nodes.
+ SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
+
/// Maps a value to the proposed vectorizable size.
SmallDenseMap<Value *, unsigned> InstrElementSize;
@@ -2995,25 +2975,25 @@ private:
/// is invariant in the calling loop.
bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
Instruction *Inst2) {
+ if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
+ return true;
// First check if the result is already in the cache.
- AliasCacheKey key = std::make_pair(Inst1, Inst2);
- std::optional<bool> &result = AliasCache[key];
- if (result) {
- return *result;
- }
- bool aliased = true;
- if (Loc1.Ptr && isSimple(Inst1))
- aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
+ AliasCacheKey Key = std::make_pair(Inst1, Inst2);
+ auto It = AliasCache.find(Key);
+ if (It != AliasCache.end())
+ return It->second;
+ bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
// Store the result in the cache.
- result = aliased;
- return aliased;
+ AliasCache.try_emplace(Key, Aliased);
+ AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
+ return Aliased;
}
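
The rewritten isAliased above records the result under both key orders, so a later query with the two instructions swapped also hits the cache. A small standalone sketch of that symmetric-key caching pattern, with illustrative types and a stand-in predicate rather than the SLP ones:

// Standalone sketch, not SLP code: cache a symmetric relation under both key
// orders so either lookup direction finds the stored answer.
#include <cstdio>
#include <map>
#include <utility>

using Key = std::pair<int, int>;
static std::map<Key, bool> Cache;

static bool queryAliased(int A, int B) {
  auto It = Cache.find({A, B});
  if (It != Cache.end())
    return It->second;               // hit, regardless of insertion order
  bool Aliased = (A % 3) == (B % 3); // stand-in for the real alias query
  Cache.insert({{A, B}, Aliased});
  Cache.insert({{B, A}, Aliased});   // also record the swapped key
  return Aliased;
}

int main() {
  std::printf("%d %zu\n", queryAliased(3, 6), Cache.size()); // 1 2
  std::printf("%d %zu\n", queryAliased(6, 3), Cache.size()); // 1 2 (cache hit)
  return 0;
}
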
using AliasCacheKey = std::pair<Instruction *, Instruction *>;
/// Cache for alias results.
/// TODO: consider moving this to the AliasAnalysis itself.
- DenseMap<AliasCacheKey, std::optional<bool>> AliasCache;
+ DenseMap<AliasCacheKey, bool> AliasCache;
// Cache for pointerMayBeCaptured calls inside AA. This is preserved
// globally through SLP because we don't perform any action which
@@ -3047,7 +3027,7 @@ private:
SetVector<Instruction *> GatherShuffleExtractSeq;
/// A list of blocks that we are going to CSE.
- SetVector<BasicBlock *> CSEBlocks;
+ DenseSet<BasicBlock *> CSEBlocks;
/// Contains all scheduling relevant data for an instruction.
/// A ScheduleData either represents a single instruction or a member of an
@@ -3497,7 +3477,7 @@ private:
BasicBlock *BB;
/// Simple memory allocation for ScheduleData.
- std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
+ SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
/// The size of a ScheduleData array in ScheduleDataChunks.
int ChunkSize;
@@ -3607,7 +3587,7 @@ private:
/// where "width" indicates the minimum bit width and "signed" is True if the
/// value must be signed-extended, rather than zero-extended, back to its
/// original width.
- MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
+ DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
};
} // end namespace slpvectorizer
@@ -3676,7 +3656,7 @@ template <> struct GraphTraits<BoUpSLP *> {
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
using TreeEntry = BoUpSLP::TreeEntry;
- DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+ DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
std::string Str;
@@ -3699,7 +3679,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
const BoUpSLP *) {
if (Entry->State == TreeEntry::NeedToGather)
return "color=red";
- if (Entry->State == TreeEntry::ScatterVectorize)
+ if (Entry->State == TreeEntry::ScatterVectorize ||
+ Entry->State == TreeEntry::PossibleStridedVectorize)
return "color=blue";
return "";
}
@@ -3761,7 +3742,7 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
inversePermutation(Order, MaskOrder);
}
reorderReuses(MaskOrder, Mask);
- if (ShuffleVectorInst::isIdentityMask(MaskOrder)) {
+ if (ShuffleVectorInst::isIdentityMask(MaskOrder, MaskOrder.size())) {
Order.clear();
return;
}
@@ -3779,7 +3760,40 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
OrdersType CurrentOrder(NumScalars, NumScalars);
SmallVector<int> Positions;
SmallBitVector UsedPositions(NumScalars);
- const TreeEntry *STE = nullptr;
+ DenseMap<const TreeEntry *, unsigned> UsedEntries;
+ DenseMap<Value *, std::pair<const TreeEntry *, unsigned>> ValueToEntryPos;
+ for (Value *V : TE.Scalars) {
+ if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
+ continue;
+ const auto *LocalSTE = getTreeEntry(V);
+ if (!LocalSTE)
+ continue;
+ unsigned Lane =
+ std::distance(LocalSTE->Scalars.begin(), find(LocalSTE->Scalars, V));
+ if (Lane >= NumScalars)
+ continue;
+ ++UsedEntries.try_emplace(LocalSTE, 0).first->getSecond();
+ ValueToEntryPos.try_emplace(V, LocalSTE, Lane);
+ }
+ if (UsedEntries.empty())
+ return std::nullopt;
+ const TreeEntry &BestSTE =
+ *std::max_element(UsedEntries.begin(), UsedEntries.end(),
+ [](const std::pair<const TreeEntry *, unsigned> &P1,
+ const std::pair<const TreeEntry *, unsigned> &P2) {
+ return P1.second < P2.second;
+ })
+ ->first;
+ UsedEntries.erase(&BestSTE);
+ const TreeEntry *SecondBestSTE = nullptr;
+ if (!UsedEntries.empty())
+ SecondBestSTE =
+ std::max_element(UsedEntries.begin(), UsedEntries.end(),
+ [](const std::pair<const TreeEntry *, unsigned> &P1,
+ const std::pair<const TreeEntry *, unsigned> &P2) {
+ return P1.second < P2.second;
+ })
+ ->first;
  // Try to find all gathered scalars that get vectorized in other
  // vectorize nodes. Here we can have only a single tree vector node to
// correctly identify order of the gathered scalars.
@@ -3787,58 +3801,56 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
Value *V = TE.Scalars[I];
if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
continue;
- if (const auto *LocalSTE = getTreeEntry(V)) {
- if (!STE)
- STE = LocalSTE;
- else if (STE != LocalSTE)
- // Take the order only from the single vector node.
- return std::nullopt;
- unsigned Lane =
- std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
- if (Lane >= NumScalars)
- return std::nullopt;
- if (CurrentOrder[Lane] != NumScalars) {
- if (Lane != I)
- continue;
- UsedPositions.reset(CurrentOrder[Lane]);
- }
- // The partial identity (where only some elements of the gather node are
- // in the identity order) is good.
- CurrentOrder[Lane] = I;
- UsedPositions.set(I);
+ const auto [LocalSTE, Lane] = ValueToEntryPos.lookup(V);
+ if (!LocalSTE || (LocalSTE != &BestSTE && LocalSTE != SecondBestSTE))
+ continue;
+ if (CurrentOrder[Lane] != NumScalars) {
+ if ((CurrentOrder[Lane] >= BestSTE.Scalars.size() ||
+ BestSTE.Scalars[CurrentOrder[Lane]] == V) &&
+ (Lane != I || LocalSTE == SecondBestSTE))
+ continue;
+ UsedPositions.reset(CurrentOrder[Lane]);
}
+ // The partial identity (where only some elements of the gather node are
+ // in the identity order) is good.
+ CurrentOrder[Lane] = I;
+ UsedPositions.set(I);
}
// Need to keep the order if we have a vector entry and at least 2 scalars or
// the vectorized entry has just 2 scalars.
- if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
- auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
- for (unsigned I = 0; I < NumScalars; ++I)
- if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
- return false;
- return true;
- };
- if (IsIdentityOrder(CurrentOrder))
- return OrdersType();
- auto *It = CurrentOrder.begin();
- for (unsigned I = 0; I < NumScalars;) {
- if (UsedPositions.test(I)) {
- ++I;
- continue;
- }
- if (*It == NumScalars) {
- *It = I;
- ++I;
- }
- ++It;
+ if (BestSTE.Scalars.size() != 2 && UsedPositions.count() <= 1)
+ return std::nullopt;
+ auto IsIdentityOrder = [&](ArrayRef<unsigned> CurrentOrder) {
+ for (unsigned I = 0; I < NumScalars; ++I)
+ if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
+ return false;
+ return true;
+ };
+ if (IsIdentityOrder(CurrentOrder))
+ return OrdersType();
+ auto *It = CurrentOrder.begin();
+ for (unsigned I = 0; I < NumScalars;) {
+ if (UsedPositions.test(I)) {
+ ++I;
+ continue;
}
- return std::move(CurrentOrder);
+ if (*It == NumScalars) {
+ *It = I;
+ ++I;
+ }
+ ++It;
}
- return std::nullopt;
+ return std::move(CurrentOrder);
}
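// --- Illustrative sketch (not part of the upstream patch) -------------------
// The loop above now keeps per-entry use counts and picks the most used tree
// entry plus an optional runner-up instead of bailing out on the first
// mismatch. A minimal standalone rendering of that selection with std::
// containers; "Entry" is a hypothetical stand-in for the real TreeEntry.
#include <unordered_map>
#include <utility>

struct Entry; // opaque stand-in for the real TreeEntry

static std::pair<const Entry *, const Entry *>
pickBestEntries(const std::unordered_map<const Entry *, unsigned> &UseCounts) {
  const Entry *Best = nullptr, *Second = nullptr;
  unsigned BestCnt = 0, SecondCnt = 0;
  for (const auto &[E, Cnt] : UseCounts) {
    if (Cnt > BestCnt) {
      Second = Best;
      SecondCnt = BestCnt;
      Best = E;
      BestCnt = Cnt;
    } else if (Cnt > SecondCnt) {
      Second = E;
      SecondCnt = Cnt;
    }
  }
  return {Best, Second}; // Second stays null if only one entry was seen
}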
namespace {
/// Tracks the state we can represent the loads in the given sequence.
-enum class LoadsState { Gather, Vectorize, ScatterVectorize };
+enum class LoadsState {
+ Gather,
+ Vectorize,
+ ScatterVectorize,
+ PossibleStridedVectorize
+};
} // anonymous namespace
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
@@ -3898,6 +3910,7 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
if (IsSorted || all_of(PointerOps, [&](Value *P) {
return arePointersCompatible(P, PointerOps.front(), TLI);
})) {
+ bool IsPossibleStrided = false;
if (IsSorted) {
Value *Ptr0;
Value *PtrN;
@@ -3913,6 +3926,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
// Check that the sorted loads are consecutive.
if (static_cast<unsigned>(*Diff) == VL.size() - 1)
return LoadsState::Vectorize;
+ // Simple check if not a strided access - clear order.
+ IsPossibleStrided = *Diff % (VL.size() - 1) == 0;
}
// TODO: need to improve analysis of the pointers, if not all of them are
// GEPs or have > 2 operands, we end up with a gather node, which just
@@ -3934,7 +3949,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
!TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
- return LoadsState::ScatterVectorize;
+ return IsPossibleStrided ? LoadsState::PossibleStridedVectorize
+ : LoadsState::ScatterVectorize;
}
}
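// --- Illustrative sketch (not part of the upstream patch) -------------------
// Simplified standalone view of the classification performed above: given the
// element offsets of the already-sorted loads, decide how the group can be
// vectorized. "CanMaskedGather" is a hypothetical stand-in for the TTI
// legality query; at least two offsets are assumed.
#include <cstdint>
#include <vector>

enum class LoadsKind { Gather, Vectorize, ScatterVectorize, PossibleStridedVectorize };

static LoadsKind classifyLoads(const std::vector<int64_t> &SortedOffsets,
                               bool CanMaskedGather) {
  const int64_t N = static_cast<int64_t>(SortedOffsets.size());
  const int64_t Diff = SortedOffsets.back() - SortedOffsets.front();
  if (Diff == N - 1)
    return LoadsKind::Vectorize; // fully consecutive accesses
  // A distance that is a multiple of (N - 1) hints at a constant stride.
  const bool PossiblyStrided = Diff % (N - 1) == 0;
  if (CanMaskedGather)
    return PossiblyStrided ? LoadsKind::PossibleStridedVectorize
                           : LoadsKind::ScatterVectorize;
  return LoadsKind::Gather;
}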
@@ -4050,7 +4066,8 @@ static bool areTwoInsertFromSameBuildVector(
// Go through the vector operand of insertelement instructions trying to find
// either VU as the original vector for IE2 or V as the original vector for
// IE1.
- SmallSet<int, 8> ReusedIdx;
+ SmallBitVector ReusedIdx(
+ cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
bool IsReusedIdx = false;
do {
if (IE2 == VU && !IE1)
@@ -4058,16 +4075,18 @@ static bool areTwoInsertFromSameBuildVector(
if (IE1 == V && !IE2)
return V->hasOneUse();
if (IE1 && IE1 != V) {
- IsReusedIdx |=
- !ReusedIdx.insert(getInsertIndex(IE1).value_or(*Idx2)).second;
+ unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
+ IsReusedIdx |= ReusedIdx.test(Idx1);
+ ReusedIdx.set(Idx1);
if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
IE1 = nullptr;
else
IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
}
if (IE2 && IE2 != VU) {
- IsReusedIdx |=
- !ReusedIdx.insert(getInsertIndex(IE2).value_or(*Idx1)).second;
+ unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
+ IsReusedIdx |= ReusedIdx.test(Idx2);
+ ReusedIdx.set(Idx2);
if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
IE2 = nullptr;
else
@@ -4135,13 +4154,16 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
return std::nullopt; // No need to reorder.
return std::move(ResOrder);
}
- if (TE.State == TreeEntry::Vectorize &&
+ if ((TE.State == TreeEntry::Vectorize ||
+ TE.State == TreeEntry::PossibleStridedVectorize) &&
(isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
(TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
!TE.isAltShuffle())
return TE.ReorderIndices;
if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
- auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) {
+ auto PHICompare = [&](unsigned I1, unsigned I2) {
+ Value *V1 = TE.Scalars[I1];
+ Value *V2 = TE.Scalars[I2];
if (V1 == V2)
return false;
if (!V1->hasOneUse() || !V2->hasOneUse())
@@ -4180,14 +4202,13 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
};
if (!TE.ReorderIndices.empty())
return TE.ReorderIndices;
- DenseMap<Value *, unsigned> PhiToId;
- SmallVector<Value *, 4> Phis;
+ DenseMap<unsigned, unsigned> PhiToId;
+ SmallVector<unsigned> Phis(TE.Scalars.size());
+ std::iota(Phis.begin(), Phis.end(), 0);
OrdersType ResOrder(TE.Scalars.size());
- for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) {
- PhiToId[TE.Scalars[Id]] = Id;
- Phis.push_back(TE.Scalars[Id]);
- }
- llvm::stable_sort(Phis, PHICompare);
+ for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
+ PhiToId[Id] = Id;
+ stable_sort(Phis, PHICompare);
for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
ResOrder[Id] = PhiToId[Phis[Id]];
if (IsIdentityOrder(ResOrder))
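// --- Illustrative sketch (not part of the upstream patch) -------------------
// The PHI reordering above now sorts lane indices (0..N-1) instead of the PHI
// values themselves and reads the result back as an order. Minimal standalone
// pattern; "Keys" is a hypothetical stand-in for whatever the comparator
// inspects.
#include <algorithm>
#include <numeric>
#include <vector>

static std::vector<unsigned> sortedLaneOrder(const std::vector<int> &Keys) {
  std::vector<unsigned> Order(Keys.size());
  std::iota(Order.begin(), Order.end(), 0u); // 0, 1, ..., N-1
  std::stable_sort(Order.begin(), Order.end(),
                   [&](unsigned I1, unsigned I2) { return Keys[I1] < Keys[I2]; });
  return Order; // Order[K] = lane that ends up at position K
}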
@@ -4214,7 +4235,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
// Check that gather of extractelements can be represented as
// just a shuffle of a single vector.
OrdersType CurrentOrder;
- bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
+ bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
+ /*ResizeAllowed=*/true);
if (Reuse || !CurrentOrder.empty()) {
if (!CurrentOrder.empty())
fixupOrderingIndices(CurrentOrder);
@@ -4270,7 +4292,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
unsigned Sz) {
ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
- if (ShuffleVectorInst::isIdentityMask(FirstCluster))
+ if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
return false;
for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
ArrayRef<int> Cluster = Mask.slice(I, Sz);
@@ -4386,7 +4408,9 @@ void BoUpSLP::reorderTopToBottom() {
++Cnt;
}
VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
- if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
+ if (!(TE->State == TreeEntry::Vectorize ||
+ TE->State == TreeEntry::PossibleStridedVectorize) ||
+ !TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
if (TE->State == TreeEntry::Vectorize &&
TE->getOpcode() == Instruction::PHI)
@@ -4409,6 +4433,9 @@ void BoUpSLP::reorderTopToBottom() {
MapVector<OrdersType, unsigned,
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
OrdersUses;
+ // Last chance orders - scatter vectorize. Try to use their orders if there
+ // are no other orders or if the order is counted already.
+ SmallVector<OrdersType> StridedVectorizeOrders;
SmallPtrSet<const TreeEntry *, 4> VisitedOps;
for (const TreeEntry *OpTE : OrderedEntries) {
// No need to reorder these nodes, still need to extend and to use shuffle,
@@ -4455,6 +4482,11 @@ void BoUpSLP::reorderTopToBottom() {
if (Order.empty())
continue;
}
+ // Postpone scatter orders.
+ if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
+ StridedVectorizeOrders.push_back(Order);
+ continue;
+ }
// Stores actually store the mask, not the order, need to invert.
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4472,8 +4504,21 @@ void BoUpSLP::reorderTopToBottom() {
}
}
// Set order of the user node.
- if (OrdersUses.empty())
- continue;
+ if (OrdersUses.empty()) {
+ if (StridedVectorizeOrders.empty())
+ continue;
+ // Add (potentially!) strided vectorize orders.
+ for (OrdersType &Order : StridedVectorizeOrders)
+ ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+ } else {
+ // Account (potentially!) strided vectorize orders only if they were used
+ // already.
+ for (OrdersType &Order : StridedVectorizeOrders) {
+ auto *It = OrdersUses.find(Order);
+ if (It != OrdersUses.end())
+ ++It->second;
+ }
+ }
// Choose the most used order.
ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
unsigned Cnt = OrdersUses.front().second;
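// --- Illustrative sketch (not part of the upstream patch) -------------------
// Standalone version of the bookkeeping above: regular orders are counted
// immediately, while (possibly) strided orders are postponed and only merged
// in when no regular order exists or when the same order was counted already.
#include <map>
#include <vector>

using OrderVec = std::vector<unsigned>;

static void mergeStridedOrders(std::map<OrderVec, unsigned> &OrderUses,
                               const std::vector<OrderVec> &StridedOrders) {
  if (OrderUses.empty()) {
    for (const OrderVec &O : StridedOrders)
      ++OrderUses[O]; // last chance: seed the counts from strided orders
    return;
  }
  for (const OrderVec &O : StridedOrders) {
    auto It = OrderUses.find(O);
    if (It != OrderUses.end())
      ++It->second; // only boost orders that were already seen elsewhere
  }
}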
@@ -4514,7 +4559,8 @@ void BoUpSLP::reorderTopToBottom() {
}
continue;
}
- if (TE->State == TreeEntry::Vectorize &&
+ if ((TE->State == TreeEntry::Vectorize ||
+ TE->State == TreeEntry::PossibleStridedVectorize) &&
isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
InsertElementInst>(TE->getMainOp()) &&
!TE->isAltShuffle()) {
@@ -4555,6 +4601,10 @@ bool BoUpSLP::canReorderOperands(
}))
continue;
if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
+ // FIXME: Do not reorder (possible!) strided vectorized nodes, they
+ // require reordering of the operands, which is not implemented yet.
+ if (TE->State == TreeEntry::PossibleStridedVectorize)
+ return false;
// Do not reorder if operand node is used by many user nodes.
if (any_of(TE->UserTreeIndices,
[UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
@@ -4567,7 +4617,8 @@ bool BoUpSLP::canReorderOperands(
// simply add to the list of gathered ops.
// If there are reused scalars, process this node as a regular vectorize
// node, just reorder reuses mask.
- if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty())
+ if (TE->State != TreeEntry::Vectorize &&
+ TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
GatherOps.push_back(TE);
continue;
}
@@ -4602,18 +4653,19 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Currently these are vectorized loads, extracts without alternate operands +
// some gathering of extracts.
SmallVector<TreeEntry *> NonVectorized;
- for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders,
- &NonVectorized](
- const std::unique_ptr<TreeEntry> &TE) {
- if (TE->State != TreeEntry::Vectorize)
+ for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ if (TE->State != TreeEntry::Vectorize &&
+ TE->State != TreeEntry::PossibleStridedVectorize)
NonVectorized.push_back(TE.get());
if (std::optional<OrdersType> CurrentOrder =
getReorderingData(*TE, /*TopToBottom=*/false)) {
OrderedEntries.insert(TE.get());
- if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
+ if (!(TE->State == TreeEntry::Vectorize ||
+ TE->State == TreeEntry::PossibleStridedVectorize) ||
+ !TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
}
- });
+ }
// 1. Propagate order to the graph nodes, which use only reordered nodes.
// I.e., if the node has operands that are reordered, try to make at least
@@ -4627,6 +4679,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SmallVector<TreeEntry *> Filtered;
for (TreeEntry *TE : OrderedEntries) {
if (!(TE->State == TreeEntry::Vectorize ||
+ TE->State == TreeEntry::PossibleStridedVectorize ||
(TE->State == TreeEntry::NeedToGather &&
GathersToOrders.count(TE))) ||
TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
@@ -4649,8 +4702,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
}
// Erase filtered entries.
- for_each(Filtered,
- [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
+ for (TreeEntry *TE : Filtered)
+ OrderedEntries.remove(TE);
SmallVector<
std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
UsersVec(Users.begin(), Users.end());
@@ -4662,10 +4715,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SmallVector<TreeEntry *> GatherOps;
if (!canReorderOperands(Data.first, Data.second, NonVectorized,
GatherOps)) {
- for_each(Data.second,
- [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
- OrderedEntries.remove(Op.second);
- });
+ for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
+ OrderedEntries.remove(Op.second);
continue;
}
// All operands are reordered and used only in this node - propagate the
@@ -4673,6 +4724,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
MapVector<OrdersType, unsigned,
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
OrdersUses;
+ // Last chance orders - scatter vectorize. Try to use their orders if there
+ // are no other orders or if the order is counted already.
+ SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders;
// Do the analysis for each tree entry only once, otherwise the order of
// the same node may be considered several times, though it might not be
// profitable.
@@ -4694,6 +4748,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
return P.second == OpTE;
});
+ // Postpone scatter orders.
+ if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
+ StridedVectorizeOrders.emplace_back(Order, NumOps);
+ continue;
+ }
// Stores actually store the mask, not the order, need to invert.
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4754,11 +4813,27 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
// If no orders - skip current nodes and jump to the next one, if any.
if (OrdersUses.empty()) {
- for_each(Data.second,
- [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
- OrderedEntries.remove(Op.second);
- });
- continue;
+ if (StridedVectorizeOrders.empty() ||
+ (Data.first->ReorderIndices.empty() &&
+ Data.first->ReuseShuffleIndices.empty() &&
+ !(IgnoreReorder &&
+ Data.first == VectorizableTree.front().get()))) {
+ for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
+ OrderedEntries.remove(Op.second);
+ continue;
+ }
+ // Add (potentially!) strided vectorize orders.
+ for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders)
+ OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second +=
+ Pair.second;
+ } else {
+ // Account (potentially!) strided vectorize orders only if they were used
+ // already.
+ for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) {
+ auto *It = OrdersUses.find(Pair.first);
+ if (It != OrdersUses.end())
+ It->second += Pair.second;
+ }
}
// Choose the best order.
ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
@@ -4771,10 +4846,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
// Set order of the user node (reordering of operands and user nodes).
if (BestOrder.empty()) {
- for_each(Data.second,
- [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
- OrderedEntries.remove(Op.second);
- });
+ for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
+ OrderedEntries.remove(Op.second);
continue;
}
// Erase operands from OrderedEntries list and adjust their orders.
@@ -4796,7 +4869,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
continue;
}
// Gathers are processed separately.
- if (TE->State != TreeEntry::Vectorize)
+ if (TE->State != TreeEntry::Vectorize &&
+ TE->State != TreeEntry::PossibleStridedVectorize &&
+ (TE->State != TreeEntry::ScatterVectorize ||
+ TE->ReorderIndices.empty()))
continue;
assert((BestOrder.size() == TE->ReorderIndices.size() ||
TE->ReorderIndices.empty()) &&
@@ -4825,7 +4901,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Data.first->isAltShuffle())
Data.first->reorderOperands(Mask);
if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
- Data.first->isAltShuffle()) {
+ Data.first->isAltShuffle() ||
+ Data.first->State == TreeEntry::PossibleStridedVectorize) {
reorderScalars(Data.first->Scalars, Mask);
reorderOrder(Data.first->ReorderIndices, MaskOrder);
if (Data.first->ReuseShuffleIndices.empty() &&
@@ -4859,10 +4936,12 @@ void BoUpSLP::buildExternalUses(
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
+ if (!isa<Instruction>(Scalar))
+ continue;
int FoundLane = Entry->findLaneForValue(Scalar);
// Check if the scalar is externally used as an extra arg.
- auto ExtI = ExternallyUsedValues.find(Scalar);
+ const auto *ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
<< Lane << " from " << *Scalar << ".\n");
@@ -4886,7 +4965,8 @@ void BoUpSLP::buildExternalUses(
// be used.
if (UseScalar != U ||
UseEntry->State == TreeEntry::ScatterVectorize ||
- !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
+ UseEntry->State == TreeEntry::PossibleStridedVectorize ||
+ !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
<< ".\n");
assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
@@ -4906,9 +4986,9 @@ void BoUpSLP::buildExternalUses(
}
}
-DenseMap<Value *, SmallVector<StoreInst *, 4>>
+DenseMap<Value *, SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
- DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap;
+ DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
Value *V = TE->Scalars[Lane];
// To save compilation time we don't visit if we have too many users.
@@ -4947,14 +5027,14 @@ BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
return PtrToStoresMap;
}
-bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
+bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
OrdersType &ReorderIndices) const {
// We check whether the stores in StoreVec can form a vector by sorting them
// and checking whether they are consecutive.
// To avoid calling getPointersDiff() while sorting we create a vector of
// pairs {store, offset from first} and sort this instead.
- SmallVector<std::pair<StoreInst *, int>, 4> StoreOffsetVec(StoresVec.size());
+ SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
StoreInst *S0 = StoresVec[0];
StoreOffsetVec[0] = {S0, 0};
Type *S0Ty = S0->getValueOperand()->getType();
@@ -5023,7 +5103,7 @@ SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
unsigned NumLanes = TE->Scalars.size();
- DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap =
+ DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
collectUserStores(TE);
// Holds the reorder indices for each candidate store vector that is a user of
@@ -5244,6 +5324,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
return TreeEntry::Vectorize;
case LoadsState::ScatterVectorize:
return TreeEntry::ScatterVectorize;
+ case LoadsState::PossibleStridedVectorize:
+ return TreeEntry::PossibleStridedVectorize;
case LoadsState::Gather:
#ifndef NDEBUG
Type *ScalarTy = VL0->getType();
@@ -5416,7 +5498,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
VFShape Shape = VFShape::get(
- *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
+ CI->getFunctionType(),
+ ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
@@ -5488,9 +5571,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
SmallVector<int> ReuseShuffleIndicies;
SmallVector<Value *> UniqueValues;
- auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
- &UserTreeIdx,
- this](const InstructionsState &S) {
+ SmallVector<Value *> NonUniqueValueVL;
+ auto TryToFindDuplicates = [&](const InstructionsState &S,
+ bool DoNotFail = false) {
// Check that every instruction appears once in this bundle.
DenseMap<Value *, unsigned> UniquePositions(VL.size());
for (Value *V : VL) {
@@ -5517,6 +5600,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
!isConstant(V);
})) ||
!llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
+ if (DoNotFail && UniquePositions.size() > 1 &&
+ NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
+ all_of(UniqueValues, [=](Value *V) {
+ return isa<ExtractElementInst>(V) ||
+ areAllUsersVectorized(cast<Instruction>(V),
+ UserIgnoreList);
+ })) {
+ unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
+ if (PWSz == VL.size()) {
+ ReuseShuffleIndicies.clear();
+ } else {
+ NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
+ NonUniqueValueVL.append(PWSz - UniqueValues.size(),
+ UniqueValues.back());
+ VL = NonUniqueValueVL;
+ }
+ return true;
+ }
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return false;
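// --- Illustrative sketch (not part of the upstream patch) -------------------
// The DoNotFail path above pads the unique scalars up to the next power of two
// by repeating the last unique value, so the bundle can still be vectorized.
// Standalone equivalent (a non-empty input is assumed):
#include <cstddef>
#include <vector>

template <typename T>
static std::vector<T> padToPowerOfTwo(std::vector<T> Unique) {
  std::size_t PW = 1;
  while (PW < Unique.size())
    PW *= 2; // smallest power of two >= current size
  const T Last = Unique.back(); // copy before resizing
  Unique.resize(PW, Last);      // repeat the last unique value
  return Unique;
}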
@@ -5528,6 +5629,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
InstructionsState S = getSameOpcode(VL, *TLI);
+ // Don't vectorize ephemeral values.
+ if (!EphValues.empty()) {
+ for (Value *V : VL) {
+ if (EphValues.count(V)) {
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
+ << ") is ephemeral.\n");
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ }
+ }
+
// Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
// a load), in which case peek through to include it in the tree, without
// ballooning over-budget.
@@ -5633,7 +5746,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BasicBlock *BB = nullptr;
bool IsScatterVectorizeUserTE =
UserTreeIdx.UserTE &&
- UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
+ (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
+ UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize);
bool AreAllSameInsts =
(S.getOpcode() && allSameBlock(VL)) ||
(S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
@@ -5665,39 +5779,44 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// We now know that this is a vector of instructions of the same type from
// the same block.
- // Don't vectorize ephemeral values.
- if (!EphValues.empty()) {
- for (Value *V : VL) {
- if (EphValues.count(V)) {
- LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
- << ") is ephemeral.\n");
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
- return;
- }
- }
- }
-
// Check if this is a duplicate of another entry.
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
if (!E->isSame(VL)) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- if (TryToFindDuplicates(S))
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
+ auto It = MultiNodeScalars.find(S.OpValue);
+ if (It != MultiNodeScalars.end()) {
+ auto *TEIt = find_if(It->getSecond(),
+ [&](TreeEntry *ME) { return ME->isSame(VL); });
+ if (TEIt != It->getSecond().end())
+ E = *TEIt;
+ else
+ E = nullptr;
+ } else {
+ E = nullptr;
+ }
+ }
+ if (!E) {
+ if (!doesNotNeedToBeScheduled(S.OpValue)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ } else {
+ // Record the reuse of the tree node. FIXME, currently this is only used
+ // to properly draw the graph rather than for the actual vectorization.
+ E->UserTreeIndices.push_back(UserTreeIdx);
+ LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
+ << ".\n");
return;
}
- // Record the reuse of the tree node. FIXME, currently this is only used to
- // properly draw the graph rather than for the actual vectorization.
- E->UserTreeIndices.push_back(UserTreeIdx);
- LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
- << ".\n");
- return;
}
// Check that none of the instructions in the bundle are already in the tree.
for (Value *V : VL) {
- if (!IsScatterVectorizeUserTE && !isa<Instruction>(V))
+ if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
+ doesNotNeedToBeScheduled(V))
continue;
if (getTreeEntry(V)) {
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
@@ -5725,7 +5844,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Special processing for sorted pointers for ScatterVectorize node with
// constant indices only.
if (AreAllSameInsts && UserTreeIdx.UserTE &&
- UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
+ (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
+ UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize) &&
!(S.getOpcode() && allSameBlock(VL))) {
assert(S.OpValue->getType()->isPointerTy() &&
count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
@@ -5760,7 +5880,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
// Check that every instruction appears once in this bundle.
- if (!TryToFindDuplicates(S))
+ if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
return;
// Perform specific checks for each particular instruction kind.
@@ -5780,7 +5900,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BlockScheduling &BS = *BSRef;
- std::optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+ std::optional<ScheduleData *> Bundle =
+ BS.tryScheduleBundle(UniqueValues, this, S);
#ifdef EXPENSIVE_CHECKS
// Make sure we didn't break any internal invariants
BS.verify();
@@ -5905,6 +6026,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// from such a struct, we read/write packed bits disagreeing with the
// unvectorized version.
TreeEntry *TE = nullptr;
+ fixupOrderingIndices(CurrentOrder);
switch (State) {
case TreeEntry::Vectorize:
if (CurrentOrder.empty()) {
@@ -5913,7 +6035,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
} else {
- fixupOrderingIndices(CurrentOrder);
// Need to reorder.
TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, CurrentOrder);
@@ -5921,6 +6042,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
TE->setOperandsInOrder();
break;
+ case TreeEntry::PossibleStridedVectorize:
+ // Vectorizing non-consecutive loads with `llvm.masked.gather`.
+ if (CurrentOrder.empty()) {
+ TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
+ UserTreeIdx, ReuseShuffleIndicies);
+ } else {
+ TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
+ UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
+ }
+ TE->setOperandsInOrder();
+ buildTree_rec(PointerOps, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
+ break;
case TreeEntry::ScatterVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
@@ -5951,13 +6085,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
TE->setOperandsInOrder();
- for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL)
- Operands.push_back(cast<Instruction>(V)->getOperand(i));
+ Operands.push_back(cast<Instruction>(V)->getOperand(I));
- buildTree_rec(Operands, Depth + 1, {TE, i});
+ buildTree_rec(Operands, Depth + 1, {TE, I});
}
return;
}
@@ -6031,13 +6165,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
TE->setOperandsInOrder();
- for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL)
- Operands.push_back(cast<Instruction>(V)->getOperand(i));
+ Operands.push_back(cast<Instruction>(V)->getOperand(I));
- buildTree_rec(Operands, Depth + 1, {TE, i});
+ buildTree_rec(Operands, Depth + 1, {TE, I});
}
return;
}
@@ -6087,8 +6221,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!CI)
Operands.back().push_back(Op);
else
- Operands.back().push_back(ConstantExpr::getIntegerCast(
- CI, Ty, CI->getValue().isSignBitSet()));
+ Operands.back().push_back(ConstantFoldIntegerCast(
+ CI, Ty, CI->getValue().isSignBitSet(), *DL));
}
TE->setOperand(IndexIdx, Operands.back());
@@ -6132,18 +6266,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
TE->setOperandsInOrder();
- for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
- // For scalar operands no need to to create an entry since no need to
+ for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
+ // For scalar operands no need to create an entry since no need to
// vectorize it.
- if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
continue;
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL) {
auto *CI2 = cast<CallInst>(V);
- Operands.push_back(CI2->getArgOperand(i));
+ Operands.push_back(CI2->getArgOperand(I));
}
- buildTree_rec(Operands, Depth + 1, {TE, i});
+ buildTree_rec(Operands, Depth + 1, {TE, I});
}
return;
}
@@ -6194,13 +6328,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
TE->setOperandsInOrder();
- for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL)
- Operands.push_back(cast<Instruction>(V)->getOperand(i));
+ Operands.push_back(cast<Instruction>(V)->getOperand(I));
- buildTree_rec(Operands, Depth + 1, {TE, i});
+ buildTree_rec(Operands, Depth + 1, {TE, I});
}
return;
}
@@ -6210,7 +6344,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
llvm_unreachable("Unexpected vectorization of the instructions.");
}
-unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
+unsigned BoUpSLP::canMapToVector(Type *T) const {
unsigned N = 1;
Type *EltTy = T;
@@ -6234,15 +6368,16 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
if (!isValidElementType(EltTy))
return 0;
- uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
+ uint64_t VTSize = DL->getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
- VTSize != DL.getTypeStoreSizeInBits(T))
+ VTSize != DL->getTypeStoreSizeInBits(T))
return 0;
return N;
}
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
- SmallVectorImpl<unsigned> &CurrentOrder) const {
+ SmallVectorImpl<unsigned> &CurrentOrder,
+ bool ResizeAllowed) const {
const auto *It = find_if(VL, [](Value *V) {
return isa<ExtractElementInst, ExtractValueInst>(V);
});
@@ -6263,8 +6398,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
// We have to extract from a vector/aggregate with the same number of elements.
unsigned NElts;
if (E0->getOpcode() == Instruction::ExtractValue) {
- const DataLayout &DL = E0->getModule()->getDataLayout();
- NElts = canMapToVector(Vec->getType(), DL);
+ NElts = canMapToVector(Vec->getType());
if (!NElts)
return false;
// Check if load can be rewritten as load of vector.
@@ -6275,46 +6409,55 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
}
- if (NElts != VL.size())
- return false;
-
- // Check that all of the indices extract from the correct offset.
- bool ShouldKeepOrder = true;
unsigned E = VL.size();
- // Assign to all items the initial value E + 1 so we can check if the extract
- // instruction index was used already.
- // Also, later we can check that all the indices are used and we have a
- // consecutive access in the extract instructions, by checking that no
- // element of CurrentOrder still has value E + 1.
- CurrentOrder.assign(E, E);
- unsigned I = 0;
- for (; I < E; ++I) {
- auto *Inst = dyn_cast<Instruction>(VL[I]);
+ if (!ResizeAllowed && NElts != E)
+ return false;
+ SmallVector<int> Indices(E, PoisonMaskElem);
+ unsigned MinIdx = NElts, MaxIdx = 0;
+ for (auto [I, V] : enumerate(VL)) {
+ auto *Inst = dyn_cast<Instruction>(V);
if (!Inst)
continue;
if (Inst->getOperand(0) != Vec)
- break;
+ return false;
if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
if (isa<UndefValue>(EE->getIndexOperand()))
continue;
std::optional<unsigned> Idx = getExtractIndex(Inst);
if (!Idx)
- break;
+ return false;
const unsigned ExtIdx = *Idx;
- if (ExtIdx != I) {
- if (ExtIdx >= E || CurrentOrder[ExtIdx] != E)
- break;
- ShouldKeepOrder = false;
- CurrentOrder[ExtIdx] = I;
- } else {
- if (CurrentOrder[I] != E)
- break;
- CurrentOrder[I] = I;
- }
+ if (ExtIdx >= NElts)
+ continue;
+ Indices[I] = ExtIdx;
+ if (MinIdx > ExtIdx)
+ MinIdx = ExtIdx;
+ if (MaxIdx < ExtIdx)
+ MaxIdx = ExtIdx;
}
- if (I < E) {
- CurrentOrder.clear();
+ if (MaxIdx - MinIdx + 1 > E)
return false;
+ if (MaxIdx + 1 <= E)
+ MinIdx = 0;
+
+ // Check that all of the indices extract from the correct offset.
+ bool ShouldKeepOrder = true;
+ // Assign to all items the initial value E so we can check if the extract
+ // instruction index was used already.
+ // Also, later we can check that all the indices are used and we have a
+ // consecutive access in the extract instructions, by checking that no
+ // element of CurrentOrder still has value E.
+ CurrentOrder.assign(E, E);
+ for (unsigned I = 0; I < E; ++I) {
+ if (Indices[I] == PoisonMaskElem)
+ continue;
+ const unsigned ExtIdx = Indices[I] - MinIdx;
+ if (CurrentOrder[ExtIdx] != E) {
+ CurrentOrder.clear();
+ return false;
+ }
+ ShouldKeepOrder &= ExtIdx == I;
+ CurrentOrder[ExtIdx] = I;
}
if (ShouldKeepOrder)
CurrentOrder.clear();
@@ -6322,9 +6465,9 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
return ShouldKeepOrder;
}
-bool BoUpSLP::areAllUsersVectorized(Instruction *I,
- ArrayRef<Value *> VectorizedVals) const {
- return (I->hasOneUse() && is_contained(VectorizedVals, I)) ||
+bool BoUpSLP::areAllUsersVectorized(
+ Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
+ return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
all_of(I->users(), [this](User *U) {
return ScalarToTreeEntry.count(U) > 0 ||
isVectorLikeInstWithConstOps(U) ||
@@ -6351,8 +6494,8 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
auto IntrinsicCost =
TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
- auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
- VecTy->getNumElements())),
+ auto Shape = VFShape::get(CI->getFunctionType(),
+ ElementCount::getFixed(VecTy->getNumElements()),
false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
auto LibCost = IntrinsicCost;
@@ -6365,16 +6508,11 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
return {IntrinsicCost, LibCost};
}
-/// Build shuffle mask for shuffle graph entries and lists of main and alternate
-/// operations operands.
-static void
-buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
- ArrayRef<int> ReusesIndices,
- const function_ref<bool(Instruction *)> IsAltOp,
- SmallVectorImpl<int> &Mask,
- SmallVectorImpl<Value *> *OpScalars = nullptr,
- SmallVectorImpl<Value *> *AltScalars = nullptr) {
- unsigned Sz = VL.size();
+void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
+ const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<Value *> *OpScalars,
+ SmallVectorImpl<Value *> *AltScalars) const {
+ unsigned Sz = Scalars.size();
Mask.assign(Sz, PoisonMaskElem);
SmallVector<int> OrderMask;
if (!ReorderIndices.empty())
@@ -6383,7 +6521,7 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
unsigned Idx = I;
if (!ReorderIndices.empty())
Idx = OrderMask[I];
- auto *OpInst = cast<Instruction>(VL[Idx]);
+ auto *OpInst = cast<Instruction>(Scalars[Idx]);
if (IsAltOp(OpInst)) {
Mask[I] = Sz + Idx;
if (AltScalars)
@@ -6394,9 +6532,9 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
OpScalars->push_back(OpInst);
}
}
- if (!ReusesIndices.empty()) {
- SmallVector<int> NewMask(ReusesIndices.size(), PoisonMaskElem);
- transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) {
+ if (!ReuseShuffleIndices.empty()) {
+ SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
+ transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
});
Mask.swap(NewMask);
@@ -6429,52 +6567,27 @@ static bool isAlternateInstruction(const Instruction *I,
return I->getOpcode() == AltOp->getOpcode();
}
-TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> VL,
- unsigned OpIdx) {
- assert(!VL.empty());
- const auto *I0 = cast<Instruction>(*find_if(VL, Instruction::classof));
- const auto *Op0 = I0->getOperand(OpIdx);
+TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
+ assert(!Ops.empty());
+ const auto *Op0 = Ops.front();
- const bool IsConstant = all_of(VL, [&](Value *V) {
+ const bool IsConstant = all_of(Ops, [](Value *V) {
// TODO: We should allow undef elements here
- const auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return true;
- auto *Op = I->getOperand(OpIdx);
- return isConstant(Op) && !isa<UndefValue>(Op);
+ return isConstant(V) && !isa<UndefValue>(V);
});
- const bool IsUniform = all_of(VL, [&](Value *V) {
+ const bool IsUniform = all_of(Ops, [=](Value *V) {
// TODO: We should allow undef elements here
- const auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
- return I->getOperand(OpIdx) == Op0;
+ return V == Op0;
});
- const bool IsPowerOfTwo = all_of(VL, [&](Value *V) {
+ const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
// TODO: We should allow undef elements here
- const auto *I = dyn_cast<Instruction>(V);
- if (!I) {
- assert((isa<UndefValue>(V) ||
- I0->getOpcode() == Instruction::GetElementPtr) &&
- "Expected undef or GEP.");
- return true;
- }
- auto *Op = I->getOperand(OpIdx);
- if (auto *CI = dyn_cast<ConstantInt>(Op))
+ if (auto *CI = dyn_cast<ConstantInt>(V))
return CI->getValue().isPowerOf2();
return false;
});
- const bool IsNegatedPowerOfTwo = all_of(VL, [&](Value *V) {
+ const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
// TODO: We should allow undef elements here
- const auto *I = dyn_cast<Instruction>(V);
- if (!I) {
- assert((isa<UndefValue>(V) ||
- I0->getOpcode() == Instruction::GetElementPtr) &&
- "Expected undef or GEP.");
- return true;
- }
- const auto *Op = I->getOperand(OpIdx);
- if (auto *CI = dyn_cast<ConstantInt>(Op))
+ if (auto *CI = dyn_cast<ConstantInt>(V))
return CI->getValue().isNegatedPowerOf2();
return false;
});
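// --- Illustrative sketch (not part of the upstream patch) -------------------
// getOperandInfo above now inspects the operand list directly. The same kind
// of classification on plain integers (the real code works on llvm::Value and
// also tracks the constant/undef and negated-power-of-two cases); a non-empty
// input is assumed.
#include <cstdint>
#include <vector>

struct OperandKindSketch {
  bool AllUniform = true;    // every operand equals the first one
  bool AllPowerOfTwo = true; // every operand is a positive power of two
};

static OperandKindSketch classifyOperands(const std::vector<uint64_t> &Ops) {
  OperandKindSketch K;
  for (uint64_t V : Ops) {
    K.AllUniform &= V == Ops.front();
    K.AllPowerOfTwo &= V != 0 && (V & (V - 1)) == 0;
  }
  return K;
}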
@@ -6505,9 +6618,24 @@ protected:
bool IsStrict) {
int Limit = Mask.size();
int VF = VecTy->getNumElements();
- return (VF == Limit || !IsStrict) &&
- all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
- ShuffleVectorInst::isIdentityMask(Mask);
+ int Index = -1;
+ if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
+ return true;
+ if (!IsStrict) {
+ // Consider extract subvector starting from index 0.
+ if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
+ Index == 0)
+ return true;
+ // All VF-size submasks are identity (e.g.
+ // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
+ if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
+ ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
+ return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
+ ShuffleVectorInst::isIdentityMask(Slice, VF);
+ }))
+ return true;
+ }
+ return false;
}
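// --- Illustrative sketch (not part of the upstream patch) -------------------
// Standalone rendering of the relaxed identity test above on a plain mask
// (-1 encodes a poison lane): a strict identity always passes; in the relaxed
// mode an extract of the leading subvector, or a mask whose VF-sized slices
// are each identity/poison, also passes.
#include <vector>

static bool isRelaxedIdentityMask(const std::vector<int> &Mask, int VF,
                                  bool IsStrict) {
  const int Limit = static_cast<int>(Mask.size());
  auto IsIdentitySlice = [&](int Base, int Len) {
    for (int I = 0; I < Len; ++I)
      if (Mask[Base + I] != -1 && Mask[Base + I] != I)
        return false;
    return true;
  };
  if (Limit == VF && IsIdentitySlice(0, Limit))
    return true; // strict identity
  if (IsStrict)
    return false;
  if (Limit <= VF && IsIdentitySlice(0, Limit))
    return true; // extract of the leading subvector
  if (Limit % VF == 0) { // every VF-sized slice is identity or poison
    for (int Base = 0; Base < Limit; Base += VF)
      if (!IsIdentitySlice(Base, VF))
        return false;
    return true;
  }
  return false;
}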
/// Tries to combine 2 different masks into single one.
@@ -6577,7 +6705,8 @@ protected:
if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
if (!IdentityOp || !SinglePermute ||
(isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
- !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask))) {
+ !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
+ IdentityMask.size()))) {
IdentityOp = SV;
// Store the current mask in the IdentityMask so that later we do not lose
// this info if IdentityOp is selected as the best candidate for the
@@ -6647,7 +6776,7 @@ protected:
}
if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
!OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
- ShuffleVectorInst::isZeroEltSplatMask(Mask)) {
+ ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
if (IdentityOp) {
V = IdentityOp;
assert(Mask.size() == IdentityMask.size() &&
@@ -6663,7 +6792,7 @@ protected:
/*IsStrict=*/true) ||
(Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
Shuffle->isZeroEltSplat() &&
- ShuffleVectorInst::isZeroEltSplatMask(Mask)));
+ ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
}
V = Op;
return false;
@@ -6768,11 +6897,9 @@ protected:
CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
}
}
- const int Limit = CombinedMask1.size() * 2;
- if (Op1 == Op2 && Limit == 2 * VF &&
- all_of(CombinedMask1, [=](int Idx) { return Idx < Limit; }) &&
- (ShuffleVectorInst::isIdentityMask(CombinedMask1) ||
- (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1) &&
+ if (Op1 == Op2 &&
+ (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
+ (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
isa<ShuffleVectorInst>(Op1) &&
cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
ArrayRef(CombinedMask1))))
@@ -6807,10 +6934,29 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
const TargetTransformInfo &TTI;
InstructionCost Cost = 0;
- ArrayRef<Value *> VectorizedVals;
+ SmallDenseSet<Value *> VectorizedVals;
BoUpSLP &R;
SmallPtrSetImpl<Value *> &CheckedExtracts;
constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ /// While set, we are still trying to estimate the cost for the same nodes and
+ /// can delay the actual cost estimation (virtual shuffle instruction emission).
+ /// This may help to better estimate the cost if the same nodes must be permuted
+ /// and allows moving most of the long shuffle cost estimation to TTI.
+ bool SameNodesEstimated = true;
+
+ static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
+ if (Ty->getScalarType()->isPointerTy()) {
+ Constant *Res = ConstantExpr::getIntToPtr(
+ ConstantInt::getAllOnesValue(
+ IntegerType::get(Ty->getContext(),
+ DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
+ Ty->getScalarType());
+ if (auto *VTy = dyn_cast<VectorType>(Ty))
+ Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
+ return Res;
+ }
+ return Constant::getAllOnesValue(Ty);
+ }
InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof))
@@ -6821,20 +6967,35 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
// Improve gather cost for gather of loads, if we can group some of the
// loads into vector loads.
InstructionsState S = getSameOpcode(VL, *R.TLI);
- if (VL.size() > 2 && S.getOpcode() == Instruction::Load &&
- !S.isAltShuffle() &&
+ const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
+ unsigned MinVF = R.getMinVF(2 * Sz);
+ if (VL.size() > 2 &&
+ ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
+ (InVectors.empty() &&
+ any_of(seq<unsigned>(0, VL.size() / MinVF),
+ [&](unsigned Idx) {
+ ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
+ InstructionsState S = getSameOpcode(SubVL, *R.TLI);
+ return S.getOpcode() == Instruction::Load &&
+ !S.isAltShuffle();
+ }))) &&
!all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
!isSplat(Gathers)) {
- BoUpSLP::ValueSet VectorizedLoads;
+ SetVector<Value *> VectorizedLoads;
+ SmallVector<LoadInst *> VectorizedStarts;
+ SmallVector<std::pair<unsigned, unsigned>> ScatterVectorized;
unsigned StartIdx = 0;
unsigned VF = VL.size() / 2;
- unsigned VectorizedCnt = 0;
- unsigned ScatterVectorizeCnt = 0;
- const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType());
- for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
+ for (; VF >= MinVF; VF /= 2) {
for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
Cnt += VF) {
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+ if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
+ InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
+ if (SliceS.getOpcode() != Instruction::Load ||
+ SliceS.isAltShuffle())
+ continue;
+ }
if (!VectorizedLoads.count(Slice.front()) &&
!VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
SmallVector<Value *> PointerOps;
@@ -6845,12 +7006,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
switch (LS) {
case LoadsState::Vectorize:
case LoadsState::ScatterVectorize:
+ case LoadsState::PossibleStridedVectorize:
// Mark the vectorized loads so that we don't vectorize them
// again.
- if (LS == LoadsState::Vectorize)
- ++VectorizedCnt;
+ // TODO: better handling of loads with reorders.
+ if (LS == LoadsState::Vectorize && CurrentOrder.empty())
+ VectorizedStarts.push_back(cast<LoadInst>(Slice.front()));
else
- ++ScatterVectorizeCnt;
+ ScatterVectorized.emplace_back(Cnt, VF);
VectorizedLoads.insert(Slice.begin(), Slice.end());
// If we vectorized initial block, no need to try to vectorize
// it again.
@@ -6881,8 +7044,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
// Exclude potentially vectorized loads from list of gathered
// scalars.
- auto *LI = cast<LoadInst>(S.MainOp);
- Gathers.assign(Gathers.size(), PoisonValue::get(LI->getType()));
+ Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
// The cost for vectorized loads.
InstructionCost ScalarsCost = 0;
for (Value *V : VectorizedLoads) {
@@ -6892,17 +7054,24 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
LI->getAlign(), LI->getPointerAddressSpace(),
CostKind, TTI::OperandValueInfo(), LI);
}
- auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
- Align Alignment = LI->getAlign();
- GatherCost +=
- VectorizedCnt *
- TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
- LI->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo(), LI);
- GatherCost += ScatterVectorizeCnt *
- TTI.getGatherScatterOpCost(
- Instruction::Load, LoadTy, LI->getPointerOperand(),
- /*VariableMask=*/false, Alignment, CostKind, LI);
+ auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
+ for (LoadInst *LI : VectorizedStarts) {
+ Align Alignment = LI->getAlign();
+ GatherCost +=
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+ LI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo(), LI);
+ }
+ for (std::pair<unsigned, unsigned> P : ScatterVectorized) {
+ auto *LI0 = cast<LoadInst>(VL[P.first]);
+ Align CommonAlignment = LI0->getAlign();
+ for (Value *V : VL.slice(P.first + 1, VF - 1))
+ CommonAlignment =
+ std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
+ GatherCost += TTI.getGatherScatterOpCost(
+ Instruction::Load, LoadTy, LI0->getPointerOperand(),
+ /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
+ }
if (NeedInsertSubvectorAnalysis) {
// Add the cost for the subvectors insert.
for (int I = VF, E = VL.size(); I < E; I += VF)
@@ -6938,77 +7107,137 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
: R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
};
- /// Compute the cost of creating a vector of type \p VecTy containing the
- /// extracted values from \p VL.
- InstructionCost computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
- TTI::ShuffleKind ShuffleKind) {
- auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
- unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
-
- if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc ||
- !NumOfParts || VecTy->getNumElements() < NumOfParts)
- return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
-
- bool AllConsecutive = true;
- unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;
- unsigned Idx = -1;
+ /// Compute the cost of creating a vector containing the extracted values from
+ /// \p VL.
+ InstructionCost
+ computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
+ ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
+ unsigned NumParts) {
+ assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
+ unsigned NumElts =
+ std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
+ auto *EE = dyn_cast<ExtractElementInst>(V);
+ if (!EE)
+ return Sz;
+ auto *VecTy = cast<FixedVectorType>(EE->getVectorOperandType());
+ return std::max(Sz, VecTy->getNumElements());
+ });
+ unsigned NumSrcRegs = TTI.getNumberOfParts(
+ FixedVectorType::get(VL.front()->getType(), NumElts));
+ if (NumSrcRegs == 0)
+ NumSrcRegs = 1;
+ // FIXME: this must be moved to TTI for better estimation.
+ unsigned EltsPerVector = PowerOf2Ceil(std::max(
+ divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
+ auto CheckPerRegistersShuffle =
+ [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
+ DenseSet<int> RegIndices;
+ // Check whether we are trying to permute the same single/two input vectors.
+ TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
+ int FirstRegId = -1;
+ for (int &I : Mask) {
+ if (I == PoisonMaskElem)
+ continue;
+ int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
+ if (FirstRegId < 0)
+ FirstRegId = RegId;
+ RegIndices.insert(RegId);
+ if (RegIndices.size() > 2)
+ return std::nullopt;
+ if (RegIndices.size() == 2)
+ ShuffleKind = TTI::SK_PermuteTwoSrc;
+ I = (I % NumElts) % EltsPerVector +
+ (RegId == FirstRegId ? 0 : EltsPerVector);
+ }
+ return ShuffleKind;
+ };
InstructionCost Cost = 0;
// Process extracts in blocks of EltsPerVector to check if the source vector
// operand can be re-used directly. If not, add the cost of creating a
// shuffle to extract the values into a vector register.
- SmallVector<int> RegMask(EltsPerVector, PoisonMaskElem);
- for (auto *V : VL) {
- ++Idx;
-
- // Reached the start of a new vector registers.
- if (Idx % EltsPerVector == 0) {
- RegMask.assign(EltsPerVector, PoisonMaskElem);
- AllConsecutive = true;
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ if (!ShuffleKinds[Part])
continue;
- }
-
- // Need to exclude undefs from analysis.
- if (isa<UndefValue>(V) || Mask[Idx] == PoisonMaskElem)
+ ArrayRef<int> MaskSlice =
+ Mask.slice(Part * EltsPerVector,
+ (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
+ ? Mask.size() % EltsPerVector
+ : EltsPerVector);
+ SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
+ copy(MaskSlice, SubMask.begin());
+ std::optional<TTI::ShuffleKind> RegShuffleKind =
+ CheckPerRegistersShuffle(SubMask);
+ if (!RegShuffleKind) {
+ Cost += TTI.getShuffleCost(
+ *ShuffleKinds[Part],
+ FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
continue;
-
- // Check all extracts for a vector register on the target directly
- // extract values in order.
- unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
- if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != PoisonMaskElem) {
- unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
- AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
- CurrentIdx % EltsPerVector == Idx % EltsPerVector;
- RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
}
-
- if (AllConsecutive)
- continue;
-
- // Skip all indices, except for the last index per vector block.
- if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
- continue;
-
- // If we have a series of extracts which are not consecutive and hence
- // cannot re-use the source vector register directly, compute the shuffle
- // cost to extract the vector with EltsPerVector elements.
- Cost += TTI.getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc,
- FixedVectorType::get(VecTy->getElementType(), EltsPerVector),
- RegMask);
+ if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
+ !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
+ Cost += TTI.getShuffleCost(
+ *RegShuffleKind,
+ FixedVectorType::get(VL.front()->getType(), EltsPerVector),
+ SubMask);
+ }
}
return Cost;
}
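// --- Illustrative sketch (not part of the upstream patch) -------------------
// Standalone version of the per-register check above, simplified to a single
// flat index space: every mask element is attributed to a register of
// EltsPerVector lanes, the slice is accepted only if it touches at most two
// registers, and the mask is rewritten into register-local lane numbers.
#include <optional>
#include <set>
#include <vector>

enum class ShuffleKindSketch { SingleSrc, TwoSrc };

static std::optional<ShuffleKindSketch>
checkPerRegisterShuffle(std::vector<int> &Mask, int EltsPerVector) {
  std::set<int> Regs;
  int FirstReg = -1;
  ShuffleKindSketch Kind = ShuffleKindSketch::SingleSrc;
  for (int &I : Mask) {
    if (I < 0)
      continue; // poison lane
    const int Reg = I / EltsPerVector;
    if (FirstReg < 0)
      FirstReg = Reg;
    Regs.insert(Reg);
    if (Regs.size() > 2)
      return std::nullopt; // would need more than two source registers
    if (Regs.size() == 2)
      Kind = ShuffleKindSketch::TwoSrc;
    I = I % EltsPerVector + (Reg == FirstReg ? 0 : EltsPerVector);
  }
  return Kind;
}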
+ /// Transforms the mask \p CommonMask per the given \p Mask so that it is
+ /// properly set up after shuffle emission.
+ static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
+ ArrayRef<int> Mask) {
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (Mask[Idx] != PoisonMaskElem)
+ CommonMask[Idx] = Idx;
+ }
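// --- Illustrative sketch (not part of the upstream patch) -------------------
// transformMaskAfterShuffle above re-points the accumulated mask at the
// freshly emitted shuffle: every lane the last shuffle produced now maps to
// itself. Standalone equivalent with -1 as the poison marker:
#include <cstddef>
#include <vector>

static void remapAfterShuffle(std::vector<int> &CommonMask,
                              const std::vector<int> &EmittedMask) {
  for (std::size_t Idx = 0; Idx < CommonMask.size(); ++Idx)
    if (EmittedMask[Idx] != -1)
      CommonMask[Idx] = static_cast<int>(Idx);
}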
+ /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
+ /// mask \p Mask, register number \p Part, that includes \p SliceSize
+ /// elements.
+ void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
+ ArrayRef<int> Mask, unsigned Part,
+ unsigned SliceSize) {
+ if (SameNodesEstimated) {
+ // Delay the cost estimation if the same nodes are being reshuffled.
+ // If we already requested the cost of reshuffling of E1 and E2 before, no
+ // need to estimate another cost with the sub-Mask, instead include this
+ // sub-Mask into the CommonMask to estimate it later and avoid double cost
+ // estimation.
+ if ((InVectors.size() == 2 &&
+ InVectors.front().get<const TreeEntry *>() == &E1 &&
+ InVectors.back().get<const TreeEntry *>() == E2) ||
+ (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
+ assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
+ [](int Idx) { return Idx == PoisonMaskElem; }) &&
+ "Expected all poisoned elements.");
+ ArrayRef<int> SubMask =
+ ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
+ copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
+ return;
+ }
+ // Found non-matching nodes - need to estimate the cost for the matched
+ // and transform mask.
+ Cost += createShuffle(InVectors.front(),
+ InVectors.size() == 1 ? nullptr : InVectors.back(),
+ CommonMask);
+ transformMaskAfterShuffle(CommonMask, CommonMask);
+ }
+ SameNodesEstimated = false;
+ Cost += createShuffle(&E1, E2, Mask);
+ transformMaskAfterShuffle(CommonMask, Mask);
+ }
class ShuffleCostBuilder {
const TargetTransformInfo &TTI;
static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
- int Limit = 2 * VF;
+ int Index = -1;
return Mask.empty() ||
(VF == Mask.size() &&
- all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
- ShuffleVectorInst::isIdentityMask(Mask));
+ ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
+ (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
+ Index == 0);
}
public:
@@ -7021,21 +7250,17 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
if (isEmptyOrIdentity(Mask, VF))
return TTI::TCC_Free;
- return TTI.getShuffleCost(
- TTI::SK_PermuteTwoSrc,
- FixedVectorType::get(
- cast<VectorType>(V1->getType())->getElementType(), Mask.size()),
- Mask);
+ return TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
+ cast<VectorType>(V1->getType()), Mask);
}
InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
// Empty mask or identity mask are free.
- if (isEmptyOrIdentity(Mask, Mask.size()))
+ unsigned VF =
+ cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
+ if (isEmptyOrIdentity(Mask, VF))
return TTI::TCC_Free;
- return TTI.getShuffleCost(
- TTI::SK_PermuteSingleSrc,
- FixedVectorType::get(
- cast<VectorType>(V1->getType())->getElementType(), Mask.size()),
- Mask);
+ return TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
+ cast<VectorType>(V1->getType()), Mask);
}
InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
InstructionCost createPoison(Type *Ty, unsigned VF) const {
@@ -7052,139 +7277,226 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
const PointerUnion<Value *, const TreeEntry *> &P2,
ArrayRef<int> Mask) {
ShuffleCostBuilder Builder(TTI);
+ SmallVector<int> CommonMask(Mask.begin(), Mask.end());
Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
- unsigned CommonVF = 0;
- if (!V1) {
+ unsigned CommonVF = Mask.size();
+ if (!V1 && !V2 && !P2.isNull()) {
+ // Shuffle 2 entry nodes.
const TreeEntry *E = P1.get<const TreeEntry *>();
unsigned VF = E->getVectorFactor();
- if (V2) {
- unsigned V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
- if (V2VF != VF && V2VF == E->Scalars.size())
- VF = E->Scalars.size();
- } else if (!P2.isNull()) {
- const TreeEntry *E2 = P2.get<const TreeEntry *>();
- if (E->Scalars.size() == E2->Scalars.size())
- CommonVF = VF = E->Scalars.size();
- } else {
- // P2 is empty, check that we have same node + reshuffle (if any).
- if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
- VF = E->Scalars.size();
- SmallVector<int> CommonMask(Mask.begin(), Mask.end());
- ::addMask(CommonMask, E->getCommonMask());
- V1 = Constant::getNullValue(
- FixedVectorType::get(E->Scalars.front()->getType(), VF));
- return BaseShuffleAnalysis::createShuffle<InstructionCost>(
- V1, nullptr, CommonMask, Builder);
+ const TreeEntry *E2 = P2.get<const TreeEntry *>();
+ CommonVF = std::max(VF, E2->getVectorFactor());
+ assert(all_of(Mask,
+ [=](int Idx) {
+ return Idx < 2 * static_cast<int>(CommonVF);
+ }) &&
+ "All elements in mask must be less than 2 * CommonVF.");
+ if (E->Scalars.size() == E2->Scalars.size()) {
+ SmallVector<int> EMask = E->getCommonMask();
+ SmallVector<int> E2Mask = E2->getCommonMask();
+ if (!EMask.empty() || !E2Mask.empty()) {
+ for (int &Idx : CommonMask) {
+ if (Idx == PoisonMaskElem)
+ continue;
+ if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
+ Idx = EMask[Idx];
+ else if (Idx >= static_cast<int>(CommonVF))
+ Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
+ E->Scalars.size();
+ }
}
+ CommonVF = E->Scalars.size();
}
V1 = Constant::getNullValue(
- FixedVectorType::get(E->Scalars.front()->getType(), VF));
- }
- if (!V2 && !P2.isNull()) {
- const TreeEntry *E = P2.get<const TreeEntry *>();
+ FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+ V2 = getAllOnesValue(
+ *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+ } else if (!V1 && P2.isNull()) {
+ // Shuffle single entry node.
+ const TreeEntry *E = P1.get<const TreeEntry *>();
unsigned VF = E->getVectorFactor();
- unsigned V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
- if (!CommonVF && V1VF == E->Scalars.size())
+ CommonVF = VF;
+ assert(
+ all_of(Mask,
+ [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
+ "All elements in mask must be less than CommonVF.");
+ if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
+ SmallVector<int> EMask = E->getCommonMask();
+ assert(!EMask.empty() && "Expected non-empty common mask.");
+ for (int &Idx : CommonMask) {
+ if (Idx != PoisonMaskElem)
+ Idx = EMask[Idx];
+ }
CommonVF = E->Scalars.size();
- if (CommonVF)
- VF = CommonVF;
- V2 = Constant::getNullValue(
- FixedVectorType::get(E->Scalars.front()->getType(), VF));
- }
- return BaseShuffleAnalysis::createShuffle<InstructionCost>(V1, V2, Mask,
- Builder);
+ }
+ V1 = Constant::getNullValue(
+ FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+ } else if (V1 && P2.isNull()) {
+ // Shuffle single vector.
+ CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
+ assert(
+ all_of(Mask,
+ [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
+ "All elements in mask must be less than CommonVF.");
+ } else if (V1 && !V2) {
+ // Shuffle vector and tree node.
+ unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+ const TreeEntry *E2 = P2.get<const TreeEntry *>();
+ CommonVF = std::max(VF, E2->getVectorFactor());
+ assert(all_of(Mask,
+ [=](int Idx) {
+ return Idx < 2 * static_cast<int>(CommonVF);
+ }) &&
+ "All elements in mask must be less than 2 * CommonVF.");
+ if (E2->Scalars.size() == VF && VF != CommonVF) {
+ SmallVector<int> E2Mask = E2->getCommonMask();
+ assert(!E2Mask.empty() && "Expected non-empty common mask.");
+ for (int &Idx : CommonMask) {
+ if (Idx == PoisonMaskElem)
+ continue;
+ if (Idx >= static_cast<int>(CommonVF))
+ Idx = E2Mask[Idx - CommonVF] + VF;
+ }
+ CommonVF = VF;
+ }
+ V1 = Constant::getNullValue(
+ FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
+ V2 = getAllOnesValue(
+ *R.DL,
+ FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
+ } else if (!V1 && V2) {
+ // Shuffle vector and tree node.
+ unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
+ const TreeEntry *E1 = P1.get<const TreeEntry *>();
+ CommonVF = std::max(VF, E1->getVectorFactor());
+ assert(all_of(Mask,
+ [=](int Idx) {
+ return Idx < 2 * static_cast<int>(CommonVF);
+ }) &&
+ "All elements in mask must be less than 2 * CommonVF.");
+ if (E1->Scalars.size() == VF && VF != CommonVF) {
+ SmallVector<int> E1Mask = E1->getCommonMask();
+ assert(!E1Mask.empty() && "Expected non-empty common mask.");
+ for (int &Idx : CommonMask) {
+ if (Idx == PoisonMaskElem)
+ continue;
+ if (Idx >= static_cast<int>(CommonVF))
+ Idx = E1Mask[Idx - CommonVF] + VF;
+ }
+ CommonVF = VF;
+ }
+ V1 = Constant::getNullValue(
+ FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
+ V2 = getAllOnesValue(
+ *R.DL,
+ FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
+ } else {
+ assert(V1 && V2 && "Expected both vectors.");
+ unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+ CommonVF =
+ std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
+ assert(all_of(Mask,
+ [=](int Idx) {
+ return Idx < 2 * static_cast<int>(CommonVF);
+ }) &&
+ "All elements in mask must be less than 2 * CommonVF.");
+ if (V1->getType() != V2->getType()) {
+ V1 = Constant::getNullValue(FixedVectorType::get(
+ cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF));
+ V2 = getAllOnesValue(
+ *R.DL, FixedVectorType::get(
+ cast<FixedVectorType>(V1->getType())->getElementType(),
+ CommonVF));
+ }
+ }
+ InVectors.front() = Constant::getNullValue(FixedVectorType::get(
+ cast<FixedVectorType>(V1->getType())->getElementType(),
+ CommonMask.size()));
+ if (InVectors.size() == 2)
+ InVectors.pop_back();
+ return BaseShuffleAnalysis::createShuffle<InstructionCost>(
+ V1, V2, CommonMask, Builder);
}
public:
ShuffleCostEstimator(TargetTransformInfo &TTI,
ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
SmallPtrSetImpl<Value *> &CheckedExtracts)
- : TTI(TTI), VectorizedVals(VectorizedVals), R(R),
- CheckedExtracts(CheckedExtracts) {}
- Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask,
- TTI::ShuffleKind ShuffleKind) {
+ : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
+ R(R), CheckedExtracts(CheckedExtracts) {}
+ Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
+ ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
+ unsigned NumParts, bool &UseVecBaseAsInput) {
+ UseVecBaseAsInput = false;
if (Mask.empty())
return nullptr;
Value *VecBase = nullptr;
ArrayRef<Value *> VL = E->Scalars;
- auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
// If the resulting type is scalarized, do not adjust the cost.
- unsigned VecNumParts = TTI.getNumberOfParts(VecTy);
- if (VecNumParts == VecTy->getNumElements())
+ if (NumParts == VL.size())
return nullptr;
- DenseMap<Value *, int> ExtractVectorsTys;
- for (auto [I, V] : enumerate(VL)) {
- // Ignore non-extractelement scalars.
- if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem))
- continue;
- // If all users of instruction are going to be vectorized and this
- // instruction itself is not going to be vectorized, consider this
- // instruction as dead and remove its cost from the final cost of the
- // vectorized tree.
- // Also, avoid adjusting the cost for extractelements with multiple uses
- // in different graph entries.
- const TreeEntry *VE = R.getTreeEntry(V);
- if (!CheckedExtracts.insert(V).second ||
- !R.areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
- (VE && VE != E))
- continue;
- auto *EE = cast<ExtractElementInst>(V);
- VecBase = EE->getVectorOperand();
- std::optional<unsigned> EEIdx = getExtractIndex(EE);
- if (!EEIdx)
- continue;
- unsigned Idx = *EEIdx;
- if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) {
- auto It =
- ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
- It->getSecond() = std::min<int>(It->second, Idx);
- }
- // Take credit for instruction that will become dead.
- if (EE->hasOneUse()) {
- Instruction *Ext = EE->user_back();
- if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) {
- return isa<GetElementPtrInst>(U);
- })) {
- // Use getExtractWithExtendCost() to calculate the cost of
- // extractelement/ext pair.
- Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
- EE->getVectorOperandType(), Idx);
- // Add back the cost of s|zext which is subtracted separately.
- Cost += TTI.getCastInstrCost(
- Ext->getOpcode(), Ext->getType(), EE->getType(),
- TTI::getCastContextHint(Ext), CostKind, Ext);
+ // Check if it can be considered reused if same extractelements were
+ // vectorized already.
+ bool PrevNodeFound = any_of(
+ ArrayRef(R.VectorizableTree).take_front(E->Idx),
+ [&](const std::unique_ptr<TreeEntry> &TE) {
+ return ((!TE->isAltShuffle() &&
+ TE->getOpcode() == Instruction::ExtractElement) ||
+ TE->State == TreeEntry::NeedToGather) &&
+ all_of(enumerate(TE->Scalars), [&](auto &&Data) {
+ return VL.size() > Data.index() &&
+ (Mask[Data.index()] == PoisonMaskElem ||
+ isa<UndefValue>(VL[Data.index()]) ||
+ Data.value() == VL[Data.index()]);
+ });
+ });
+ SmallPtrSet<Value *, 4> UniqueBases;
+ unsigned SliceSize = VL.size() / NumParts;
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
+ for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
+ // Ignore non-extractelement scalars.
+ if (isa<UndefValue>(V) ||
+ (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
continue;
- }
- }
- Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
- Idx);
- }
- // Add a cost for subvector extracts/inserts if required.
- for (const auto &Data : ExtractVectorsTys) {
- auto *EEVTy = cast<FixedVectorType>(Data.first->getType());
- unsigned NumElts = VecTy->getNumElements();
- if (Data.second % NumElts == 0)
- continue;
- if (TTI.getNumberOfParts(EEVTy) > VecNumParts) {
- unsigned Idx = (Data.second / NumElts) * NumElts;
- unsigned EENumElts = EEVTy->getNumElements();
- if (Idx % NumElts == 0)
+ // If all users of instruction are going to be vectorized and this
+ // instruction itself is not going to be vectorized, consider this
+ // instruction as dead and remove its cost from the final cost of the
+ // vectorized tree.
+ // Also, avoid adjusting the cost for extractelements with multiple uses
+ // in different graph entries.
+ auto *EE = cast<ExtractElementInst>(V);
+ VecBase = EE->getVectorOperand();
+ UniqueBases.insert(VecBase);
+ const TreeEntry *VE = R.getTreeEntry(V);
+ if (!CheckedExtracts.insert(V).second ||
+ !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
+ (VE && VE != E))
continue;
- if (Idx + NumElts <= EENumElts) {
- Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- EEVTy, std::nullopt, CostKind, Idx, VecTy);
- } else {
- // Need to round up the subvector type vectorization factor to avoid a
- // crash in cost model functions. Make SubVT so that Idx + VF of SubVT
- // <= EENumElts.
- auto *SubVT =
- FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
- Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- EEVTy, std::nullopt, CostKind, Idx, SubVT);
+ std::optional<unsigned> EEIdx = getExtractIndex(EE);
+ if (!EEIdx)
+ continue;
+ unsigned Idx = *EEIdx;
+ // Take credit for instruction that will become dead.
+ if (EE->hasOneUse() || !PrevNodeFound) {
+ Instruction *Ext = EE->user_back();
+ if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) {
+ return isa<GetElementPtrInst>(U);
+ })) {
+ // Use getExtractWithExtendCost() to calculate the cost of
+ // extractelement/ext pair.
+ Cost -=
+ TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
+ EE->getVectorOperandType(), Idx);
+ // Add back the cost of s|zext which is subtracted separately.
+ Cost += TTI.getCastInstrCost(
+ Ext->getOpcode(), Ext->getType(), EE->getType(),
+ TTI::getCastContextHint(Ext), CostKind, Ext);
+ continue;
+ }
}
- } else {
- Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
- VecTy, std::nullopt, CostKind, 0, EEVTy);
+ Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
+ CostKind, Idx);
}
}
// Check that gather of extractelements can be represented as just a
@@ -7192,31 +7504,152 @@ public:
// Found the bunch of extractelement instructions that must be gathered
// into a vector and can be represented as a permutation elements in a
// single input vector or of 2 input vectors.
- Cost += computeExtractCost(VL, Mask, ShuffleKind);
+ // Done for reused if same extractelements were vectorized already.
+ if (!PrevNodeFound)
+ Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
+ InVectors.assign(1, E);
+ CommonMask.assign(Mask.begin(), Mask.end());
+ transformMaskAfterShuffle(CommonMask, CommonMask);
+ SameNodesEstimated = false;
+ if (NumParts != 1 && UniqueBases.size() != 1) {
+ UseVecBaseAsInput = true;
+ VecBase = Constant::getNullValue(
+ FixedVectorType::get(VL.front()->getType(), CommonMask.size()));
+ }
return VecBase;
}
- void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) {
- CommonMask.assign(Mask.begin(), Mask.end());
- InVectors.assign({E1, E2});
+ /// Checks if the specified entry \p E needs to be delayed because of its
+ /// dependency nodes.
+ std::optional<InstructionCost>
+ needToDelay(const TreeEntry *,
+ ArrayRef<SmallVector<const TreeEntry *>>) const {
+ // No need to delay the cost estimation during analysis.
+ return std::nullopt;
}
- void add(const TreeEntry *E1, ArrayRef<int> Mask) {
- CommonMask.assign(Mask.begin(), Mask.end());
- InVectors.assign(1, E1);
+ void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
+ if (&E1 == &E2) {
+ assert(all_of(Mask,
+ [&](int Idx) {
+ return Idx < static_cast<int>(E1.getVectorFactor());
+ }) &&
+ "Expected single vector shuffle mask.");
+ add(E1, Mask);
+ return;
+ }
+ if (InVectors.empty()) {
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign({&E1, &E2});
+ return;
+ }
+ assert(!CommonMask.empty() && "Expected non-empty common mask.");
+ auto *MaskVecTy =
+ FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
+ unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
+ if (NumParts == 0 || NumParts >= Mask.size())
+ NumParts = 1;
+ unsigned SliceSize = Mask.size() / NumParts;
+ const auto *It =
+ find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
+ unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
+ estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
+ }
+ void add(const TreeEntry &E1, ArrayRef<int> Mask) {
+ if (InVectors.empty()) {
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign(1, &E1);
+ return;
+ }
+ assert(!CommonMask.empty() && "Expected non-empty common mask.");
+ auto *MaskVecTy =
+ FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
+ unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
+ if (NumParts == 0 || NumParts >= Mask.size())
+ NumParts = 1;
+ unsigned SliceSize = Mask.size() / NumParts;
+ const auto *It =
+ find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
+ unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
+ estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
+ if (!SameNodesEstimated && InVectors.size() == 1)
+ InVectors.emplace_back(&E1);
+ }
+ /// Adds 2 input vectors and the mask for their shuffling.
+ void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
+ // This is reached only when shuffling 2 vectors of extractelements, which
+ // was already handled in adjustExtracts, so only the invariants are checked.
+ assert(InVectors.size() == 1 &&
+ all_of(enumerate(CommonMask),
+ [&](auto P) {
+ if (P.value() == PoisonMaskElem)
+ return Mask[P.index()] == PoisonMaskElem;
+ auto *EI =
+ cast<ExtractElementInst>(InVectors.front()
+ .get<const TreeEntry *>()
+ ->Scalars[P.index()]);
+ return EI->getVectorOperand() == V1 ||
+ EI->getVectorOperand() == V2;
+ }) &&
+ "Expected extractelement vectors.");
}
/// Adds another one input vector and the mask for the shuffling.
- void add(Value *V1, ArrayRef<int> Mask) {
- assert(CommonMask.empty() && InVectors.empty() &&
- "Expected empty input mask/vectors.");
- CommonMask.assign(Mask.begin(), Mask.end());
- InVectors.assign(1, V1);
+ void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
+ if (InVectors.empty()) {
+ assert(CommonMask.empty() && !ForExtracts &&
+ "Expected empty input mask/vectors.");
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign(1, V1);
+ return;
+ }
+ if (ForExtracts) {
+ // No need to add vectors here, already handled them in adjustExtracts.
+ assert(InVectors.size() == 1 &&
+ InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
+ all_of(enumerate(CommonMask),
+ [&](auto P) {
+ Value *Scalar = InVectors.front()
+ .get<const TreeEntry *>()
+ ->Scalars[P.index()];
+ if (P.value() == PoisonMaskElem)
+ return P.value() == Mask[P.index()] ||
+ isa<UndefValue>(Scalar);
+ if (isa<Constant>(V1))
+ return true;
+ auto *EI = cast<ExtractElementInst>(Scalar);
+ return EI->getVectorOperand() == V1;
+ }) &&
+ "Expected only tree entry for extractelement vectors.");
+ return;
+ }
+ assert(!InVectors.empty() && !CommonMask.empty() &&
+ "Expected only tree entries from extracts/reused buildvectors.");
+ unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+ if (InVectors.size() == 2) {
+ Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
+ transformMaskAfterShuffle(CommonMask, CommonMask);
+ VF = std::max<unsigned>(VF, CommonMask.size());
+ } else if (const auto *InTE =
+ InVectors.front().dyn_cast<const TreeEntry *>()) {
+ VF = std::max(VF, InTE->getVectorFactor());
+ } else {
+ VF = std::max(
+ VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
+ ->getNumElements());
+ }
+ InVectors.push_back(V1);
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
+ CommonMask[Idx] = Mask[Idx] + VF;
}
- Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
+ Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
+ Value *Root = nullptr) {
Cost += getBuildVectorCost(VL, Root);
if (!Root) {
- assert(InVectors.empty() && "Unexpected input vectors for buildvector.");
// FIXME: Need to find a way to avoid use of getNullValue here.
SmallVector<Constant *> Vals;
- for (Value *V : VL) {
+ unsigned VF = VL.size();
+ if (MaskVF != 0)
+ VF = std::min(VF, MaskVF);
+ for (Value *V : VL.take_front(VF)) {
if (isa<UndefValue>(V)) {
Vals.push_back(cast<Constant>(V));
continue;
@@ -7226,9 +7659,11 @@ public:
return ConstantVector::get(Vals);
}
return ConstantVector::getSplat(
- ElementCount::getFixed(VL.size()),
- Constant::getNullValue(VL.front()->getType()));
+ ElementCount::getFixed(
+ cast<FixedVectorType>(Root->getType())->getNumElements()),
+ getAllOnesValue(*R.DL, VL.front()->getType()));
}
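+ /// Freeze of the final vector is treated as free in this cost model, so the
+ /// accumulated cost is returned unchanged.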
+ InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
/// Finalize emission of the shuffles.
InstructionCost
finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
@@ -7236,31 +7671,24 @@ public:
IsFinalized = true;
if (Action) {
const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
- if (InVectors.size() == 2) {
+ if (InVectors.size() == 2)
Cost += createShuffle(Vec, InVectors.back(), CommonMask);
- InVectors.pop_back();
- } else {
+ else
Cost += createShuffle(Vec, nullptr, CommonMask);
- }
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
if (CommonMask[Idx] != PoisonMaskElem)
CommonMask[Idx] = Idx;
assert(VF > 0 &&
"Expected vector length for the final value before action.");
- Value *V = Vec.dyn_cast<Value *>();
- if (!Vec.isNull() && !V)
- V = Constant::getNullValue(FixedVectorType::get(
- Vec.get<const TreeEntry *>()->Scalars.front()->getType(),
- CommonMask.size()));
+ Value *V = Vec.get<Value *>();
Action(V, CommonMask);
+ InVectors.front() = V;
}
::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
- if (CommonMask.empty())
- return Cost;
- int Limit = CommonMask.size() * 2;
- if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) &&
- ShuffleVectorInst::isIdentityMask(CommonMask))
+ if (CommonMask.empty()) {
+ assert(InVectors.size() == 1 && "Expected only one vector with no mask");
return Cost;
+ }
return Cost +
createShuffle(InVectors.front(),
InVectors.size() == 2 ? InVectors.back() : nullptr,
@@ -7273,28 +7701,63 @@ public:
}
};
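+/// Returns the tree entry built for operand \p Idx of node \p E: either the
+/// vectorized entry (or one of its multi-node copies) whose user edge points
+/// back at \p E with this edge index, or otherwise the gather entry recorded
+/// for that edge.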
+const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
+ unsigned Idx) const {
+ Value *Op = E->getOperand(Idx).front();
+ if (const TreeEntry *TE = getTreeEntry(Op)) {
+ if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
+ return EI.EdgeIdx == Idx && EI.UserTE == E;
+ }) != TE->UserTreeIndices.end())
+ return TE;
+ auto MIt = MultiNodeScalars.find(Op);
+ if (MIt != MultiNodeScalars.end()) {
+ for (const TreeEntry *TE : MIt->second) {
+ if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
+ return EI.EdgeIdx == Idx && EI.UserTE == E;
+ }) != TE->UserTreeIndices.end())
+ return TE;
+ }
+ }
+ }
+ const auto *It =
+ find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->State == TreeEntry::NeedToGather &&
+ find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
+ return EI.EdgeIdx == Idx && EI.UserTE == E;
+ }) != TE->UserTreeIndices.end();
+ });
+ assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
+ return It->get();
+}
+
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts) {
ArrayRef<Value *> VL = E->Scalars;
Type *ScalarTy = VL[0]->getType();
- if (auto *SI = dyn_cast<StoreInst>(VL[0]))
- ScalarTy = SI->getValueOperand()->getType();
- else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
- ScalarTy = CI->getOperand(0)->getType();
- else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
- ScalarTy = IE->getOperand(1)->getType();
+ if (E->State != TreeEntry::NeedToGather) {
+ if (auto *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
+ ScalarTy = CI->getOperand(0)->getType();
+ else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
+ ScalarTy = IE->getOperand(1)->getType();
+ }
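+ // Only element types that are valid for fixed-width vectors can be costed;
+ // bail out with an invalid cost otherwise.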
+ if (!FixedVectorType::isValidElementType(ScalarTy))
+ return InstructionCost::getInvalid();
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// If we have computed a smaller type for the expression, update VecTy so
// that the costs will be accurate.
- if (MinBWs.count(VL[0]))
- VecTy = FixedVectorType::get(
- IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+ auto It = MinBWs.find(E);
+ if (It != MinBWs.end()) {
+ ScalarTy = IntegerType::get(F->getContext(), It->second.first);
+ VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ }
unsigned EntryVF = E->getVectorFactor();
- auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
+ auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
if (E->State == TreeEntry::NeedToGather) {
@@ -7302,121 +7765,13 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
return 0;
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
- ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this,
- CheckedExtracts);
- unsigned VF = E->getVectorFactor();
- SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
- E->ReuseShuffleIndices.end());
- SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
- // Build a mask out of the reorder indices and reorder scalars per this
- // mask.
- SmallVector<int> ReorderMask;
- inversePermutation(E->ReorderIndices, ReorderMask);
- if (!ReorderMask.empty())
- reorderScalars(GatheredScalars, ReorderMask);
- SmallVector<int> Mask;
- SmallVector<int> ExtractMask;
- std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
- std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
- SmallVector<const TreeEntry *> Entries;
- Type *ScalarTy = GatheredScalars.front()->getType();
- // Check for gathered extracts.
- ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
- SmallVector<Value *> IgnoredVals;
- if (UserIgnoreList)
- IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
-
- bool Resized = false;
- if (Value *VecBase = Estimator.adjustExtracts(
- E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc)))
- if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
- if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
- Resized = true;
- GatheredScalars.append(VF - GatheredScalars.size(),
- PoisonValue::get(ScalarTy));
- }
-
- // Do not try to look for reshuffled loads for gathered loads (they will be
- // handled later), for vectorized scalars, and cases, which are definitely
- // not profitable (splats and small gather nodes.)
- if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
- E->isAltShuffle() ||
- all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
- isSplat(E->Scalars) ||
- (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
- GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
- if (GatherShuffle) {
- assert((Entries.size() == 1 || Entries.size() == 2) &&
- "Expected shuffle of 1 or 2 entries.");
- if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
- Entries.front()->isSame(E->Scalars)) {
- // Perfect match in the graph, will reuse the previously vectorized
- // node. Cost is 0.
- LLVM_DEBUG(
- dbgs()
- << "SLP: perfect diamond match for gather bundle that starts with "
- << *VL.front() << ".\n");
- // Restore the mask for previous partially matched values.
- for (auto [I, V] : enumerate(E->Scalars)) {
- if (isa<PoisonValue>(V)) {
- Mask[I] = PoisonMaskElem;
- continue;
- }
- if (Mask[I] == PoisonMaskElem)
- Mask[I] = Entries.front()->findLaneForValue(V);
- }
- Estimator.add(Entries.front(), Mask);
- return Estimator.finalize(E->ReuseShuffleIndices);
- }
- if (!Resized) {
- unsigned VF1 = Entries.front()->getVectorFactor();
- unsigned VF2 = Entries.back()->getVectorFactor();
- if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
- GatheredScalars.append(VF - GatheredScalars.size(),
- PoisonValue::get(ScalarTy));
- }
- // Remove shuffled elements from list of gathers.
- for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
- if (Mask[I] != PoisonMaskElem)
- GatheredScalars[I] = PoisonValue::get(ScalarTy);
- }
- LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
- << " entries for bundle that starts with "
- << *VL.front() << ".\n";);
- if (Entries.size() == 1)
- Estimator.add(Entries.front(), Mask);
- else
- Estimator.add(Entries.front(), Entries.back(), Mask);
- if (all_of(GatheredScalars, PoisonValue ::classof))
- return Estimator.finalize(E->ReuseShuffleIndices);
- return Estimator.finalize(
- E->ReuseShuffleIndices, E->Scalars.size(),
- [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
- Vec = Estimator.gather(GatheredScalars,
- Constant::getNullValue(FixedVectorType::get(
- GatheredScalars.front()->getType(),
- GatheredScalars.size())));
- });
- }
- if (!all_of(GatheredScalars, PoisonValue::classof)) {
- auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size());
- bool SameGathers = VL.equals(Gathers);
- Value *BV = Estimator.gather(
- Gathers, SameGathers ? nullptr
- : Constant::getNullValue(FixedVectorType::get(
- GatheredScalars.front()->getType(),
- GatheredScalars.size())));
- SmallVector<int> ReuseMask(Gathers.size(), PoisonMaskElem);
- std::iota(ReuseMask.begin(), ReuseMask.end(), 0);
- Estimator.add(BV, ReuseMask);
- }
- if (ExtractShuffle)
- Estimator.add(E, std::nullopt);
- return Estimator.finalize(E->ReuseShuffleIndices);
+ return processBuildVector<ShuffleCostEstimator, InstructionCost>(
+ E, *TTI, VectorizedVals, *this, CheckedExtracts);
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
- if (!E->ReorderIndices.empty()) {
+ if (!E->ReorderIndices.empty() &&
+ E->State != TreeEntry::PossibleStridedVectorize) {
SmallVector<int> NewMask;
if (E->getOpcode() == Instruction::Store) {
// For stores the order is actually a mask.
@@ -7429,11 +7784,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
if (NeedToShuffleReuses)
::addMask(Mask, E->ReuseShuffleIndices);
- if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask))
+ if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
CommonCost =
TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
assert((E->State == TreeEntry::Vectorize ||
- E->State == TreeEntry::ScatterVectorize) &&
+ E->State == TreeEntry::ScatterVectorize ||
+ E->State == TreeEntry::PossibleStridedVectorize) &&
"Unhandled state");
assert(E->getOpcode() &&
((allSameType(VL) && allSameBlock(VL)) ||
@@ -7443,7 +7799,34 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
- const unsigned Sz = VL.size();
+ SetVector<Value *> UniqueValues(VL.begin(), VL.end());
+ const unsigned Sz = UniqueValues.size();
+ SmallBitVector UsedScalars(Sz, false);
+ for (unsigned I = 0; I < Sz; ++I) {
+ if (getTreeEntry(UniqueValues[I]) == E)
+ continue;
+ UsedScalars.set(I);
+ }
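+ // Pick the cast context hint for a vectorized operand: GatherScatter for
+ // scatter-vectorized operands (or gathered loads), Normal/Reversed for
+ // vectorized loads depending on the reorder mask, None otherwise.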
+ auto GetCastContextHint = [&](Value *V) {
+ if (const TreeEntry *OpTE = getTreeEntry(V)) {
+ if (OpTE->State == TreeEntry::ScatterVectorize)
+ return TTI::CastContextHint::GatherScatter;
+ if (OpTE->State == TreeEntry::Vectorize &&
+ OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) {
+ if (OpTE->ReorderIndices.empty())
+ return TTI::CastContextHint::Normal;
+ SmallVector<int> Mask;
+ inversePermutation(OpTE->ReorderIndices, Mask);
+ if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
+ return TTI::CastContextHint::Reversed;
+ }
+ } else {
+ InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
+ if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
+ return TTI::CastContextHint::GatherScatter;
+ }
+ return TTI::CastContextHint::None;
+ };
auto GetCostDiff =
[=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
function_ref<InstructionCost(InstructionCost)> VectorCost) {
@@ -7453,13 +7836,49 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// For some of the instructions no need to calculate cost for each
// particular instruction, we can use the cost of the single
// instruction x total number of scalar instructions.
- ScalarCost = Sz * ScalarEltCost(0);
+ ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
} else {
- for (unsigned I = 0; I < Sz; ++I)
+ for (unsigned I = 0; I < Sz; ++I) {
+ if (UsedScalars.test(I))
+ continue;
ScalarCost += ScalarEltCost(I);
+ }
}
InstructionCost VecCost = VectorCost(CommonCost);
+ // Check if the current node must be resized because its demoted type differs
+ // from the type its parent (user) node expects.
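+ // For example, a node demoted to i8 by MinBWs whose user still operates on
+ // i32 is charged an extra vector cast here, with the matching scalar casts
+ // added to the scalar side so the comparison stays balanced.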
+ if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
+ const EdgeInfo &EI = E->UserTreeIndices.front();
+ if ((EI.UserTE->getOpcode() != Instruction::Select ||
+ EI.EdgeIdx != 0) &&
+ It != MinBWs.end()) {
+ auto UserBWIt = MinBWs.find(EI.UserTE);
+ Type *UserScalarTy =
+ EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
+ if (UserBWIt != MinBWs.end())
+ UserScalarTy = IntegerType::get(ScalarTy->getContext(),
+ UserBWIt->second.first);
+ if (ScalarTy != UserScalarTy) {
+ unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
+ unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
+ unsigned VecOpcode;
+ auto *SrcVecTy =
+ FixedVectorType::get(UserScalarTy, E->getVectorFactor());
+ if (BWSz > SrcBWSz)
+ VecOpcode = Instruction::Trunc;
+ else
+ VecOpcode =
+ It->second.second ? Instruction::SExt : Instruction::ZExt;
+ TTI::CastContextHint CCH = GetCastContextHint(VL0);
+ VecCost += TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH,
+ CostKind);
+ ScalarCost +=
+ Sz * TTI->getCastInstrCost(VecOpcode, ScalarTy, UserScalarTy,
+ CCH, CostKind);
+ }
+ }
+ }
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
ScalarCost, "Calculated costs for Tree"));
return VecCost - ScalarCost;
@@ -7550,7 +7969,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// Count reused scalars.
InstructionCost ScalarCost = 0;
SmallPtrSet<const TreeEntry *, 4> CountedOps;
- for (Value *V : VL) {
+ for (Value *V : UniqueValues) {
auto *PHI = dyn_cast<PHINode>(V);
if (!PHI)
continue;
@@ -7571,8 +7990,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- auto GetScalarCost = [=](unsigned Idx) {
- auto *I = cast<Instruction>(VL[Idx]);
+ auto GetScalarCost = [&](unsigned Idx) {
+ auto *I = cast<Instruction>(UniqueValues[Idx]);
VectorType *SrcVecTy;
if (ShuffleOrOp == Instruction::ExtractElement) {
auto *EE = cast<ExtractElementInst>(I);
@@ -7680,8 +8099,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// need to shift the vector.
// Do not calculate the cost if the actual size is the register size and
// we can merge this shuffle with the following SK_Select.
- auto *InsertVecTy =
- FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz);
+ auto *InsertVecTy = FixedVectorType::get(ScalarTy, InsertVecSz);
if (!IsIdentity)
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
InsertVecTy, Mask);
@@ -7697,8 +8115,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
if (InsertVecSz != VecSz) {
- auto *ActualVecTy =
- FixedVectorType::get(SrcVecTy->getElementType(), VecSz);
+ auto *ActualVecTy = FixedVectorType::get(ScalarTy, VecSz);
Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
std::nullopt, CostKind, OffsetBeg - Offset,
InsertVecTy);
@@ -7729,22 +8146,52 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- auto GetScalarCost = [=](unsigned Idx) {
- auto *VI = cast<Instruction>(VL[Idx]);
- return TTI->getCastInstrCost(E->getOpcode(), ScalarTy,
- VI->getOperand(0)->getType(),
+ auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+ Type *SrcScalarTy = VL0->getOperand(0)->getType();
+ auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
+ unsigned Opcode = ShuffleOrOp;
+ unsigned VecOpcode = Opcode;
+ if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
+ (SrcIt != MinBWs.end() || It != MinBWs.end())) {
+ // Check if the values are candidates to demote.
+ unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
+ if (SrcIt != MinBWs.end()) {
+ SrcBWSz = SrcIt->second.first;
+ SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
+ SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
+ }
+ unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
+ if (BWSz == SrcBWSz) {
+ VecOpcode = Instruction::BitCast;
+ } else if (BWSz < SrcBWSz) {
+ VecOpcode = Instruction::Trunc;
+ } else if (It != MinBWs.end()) {
+ assert(BWSz > SrcBWSz && "Invalid cast!");
+ VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
+ }
+ }
+ auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
+ // Do not count cost here if minimum bitwidth is in effect and it is just
+ // a bitcast (here it is just a noop).
+ if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
+ return TTI::TCC_Free;
+ auto *VI = VL0->getOpcode() == Opcode
+ ? cast<Instruction>(UniqueValues[Idx])
+ : nullptr;
+ return TTI->getCastInstrCost(Opcode, VL0->getType(),
+ VL0->getOperand(0)->getType(),
TTI::getCastContextHint(VI), CostKind, VI);
};
auto GetVectorCost = [=](InstructionCost CommonCost) {
- Type *SrcTy = VL0->getOperand(0)->getType();
- auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
- InstructionCost VecCost = CommonCost;
- // Check if the values are candidates to demote.
- if (!MinBWs.count(VL0) || VecTy != SrcVecTy)
- VecCost +=
- TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
- TTI::getCastContextHint(VL0), CostKind, VL0);
- return VecCost;
+ // Do not count cost here if minimum bitwidth is in effect and it is just
+ // a bitcast (here it is just a noop).
+ if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
+ return CommonCost;
+ auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
+ TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
+ return CommonCost +
+ TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
+ VecOpcode == Opcode ? VI : nullptr);
};
return GetCostDiff(GetScalarCost, GetVectorCost);
}
@@ -7761,7 +8208,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
? CmpInst::BAD_FCMP_PREDICATE
: CmpInst::BAD_ICMP_PREDICATE;
auto GetScalarCost = [&](unsigned Idx) {
- auto *VI = cast<Instruction>(VL[Idx]);
+ auto *VI = cast<Instruction>(UniqueValues[Idx]);
CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
? CmpInst::BAD_FCMP_PREDICATE
: CmpInst::BAD_ICMP_PREDICATE;
@@ -7821,8 +8268,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- auto GetScalarCost = [=](unsigned Idx) {
- auto *VI = cast<Instruction>(VL[Idx]);
+ auto GetScalarCost = [&](unsigned Idx) {
+ auto *VI = cast<Instruction>(UniqueValues[Idx]);
unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
TTI::OperandValueInfo Op2Info =
@@ -7833,8 +8280,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
};
auto GetVectorCost = [=](InstructionCost CommonCost) {
unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
- TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0);
- TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx);
+ TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
+ TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
Op2Info) +
CommonCost;
@@ -7845,23 +8292,25 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
return CommonCost + GetGEPCostDiff(VL, VL0);
}
case Instruction::Load: {
- auto GetScalarCost = [=](unsigned Idx) {
- auto *VI = cast<LoadInst>(VL[Idx]);
+ auto GetScalarCost = [&](unsigned Idx) {
+ auto *VI = cast<LoadInst>(UniqueValues[Idx]);
return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
VI->getPointerAddressSpace(), CostKind,
TTI::OperandValueInfo(), VI);
};
auto *LI0 = cast<LoadInst>(VL0);
- auto GetVectorCost = [=](InstructionCost CommonCost) {
+ auto GetVectorCost = [&](InstructionCost CommonCost) {
InstructionCost VecLdCost;
if (E->State == TreeEntry::Vectorize) {
VecLdCost = TTI->getMemoryOpCost(
Instruction::Load, VecTy, LI0->getAlign(),
LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
} else {
- assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
+ assert((E->State == TreeEntry::ScatterVectorize ||
+ E->State == TreeEntry::PossibleStridedVectorize) &&
+ "Unknown EntryState");
Align CommonAlignment = LI0->getAlign();
- for (Value *V : VL)
+ for (Value *V : UniqueValues)
CommonAlignment =
std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
VecLdCost = TTI->getGatherScatterOpCost(
@@ -7874,7 +8323,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
// If this node generates masked gather load then it is not a terminal node.
// Hence address operand cost is estimated separately.
- if (E->State == TreeEntry::ScatterVectorize)
+ if (E->State == TreeEntry::ScatterVectorize ||
+ E->State == TreeEntry::PossibleStridedVectorize)
return Cost;
// Estimate cost of GEPs since this tree node is a terminator.
@@ -7887,7 +8337,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
bool IsReorder = !E->ReorderIndices.empty();
auto GetScalarCost = [=](unsigned Idx) {
auto *VI = cast<StoreInst>(VL[Idx]);
- TTI::OperandValueInfo OpInfo = getOperandInfo(VI, 0);
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(),
VI->getPointerAddressSpace(), CostKind,
OpInfo, VI);
@@ -7896,7 +8346,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
auto GetVectorCost = [=](InstructionCost CommonCost) {
// We know that we can merge the stores. Calculate the cost.
- TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0);
+ TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
BaseSI->getPointerAddressSpace(), CostKind,
OpInfo) +
@@ -7912,8 +8362,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
}
case Instruction::Call: {
- auto GetScalarCost = [=](unsigned Idx) {
- auto *CI = cast<CallInst>(VL[Idx]);
+ auto GetScalarCost = [&](unsigned Idx) {
+ auto *CI = cast<CallInst>(UniqueValues[Idx]);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (ID != Intrinsic::not_intrinsic) {
IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
@@ -7954,8 +8404,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
return false;
};
- auto GetScalarCost = [=](unsigned Idx) {
- auto *VI = cast<Instruction>(VL[Idx]);
+ auto GetScalarCost = [&](unsigned Idx) {
+ auto *VI = cast<Instruction>(UniqueValues[Idx]);
assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
(void)E;
return TTI->getInstructionCost(VI, CostKind);
@@ -7995,21 +8445,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
TTI::CastContextHint::None, CostKind);
}
- if (E->ReuseShuffleIndices.empty()) {
- VecCost +=
- TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
- } else {
- SmallVector<int> Mask;
- buildShuffleEntryMask(
- E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
- [E](Instruction *I) {
- assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
- return I->getOpcode() == E->getAltOpcode();
- },
- Mask);
- VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
- FinalVecTy, Mask);
- }
+ SmallVector<int> Mask;
+ E->buildAltOpShuffleMask(
+ [E](Instruction *I) {
+ assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ return I->getOpcode() == E->getAltOpcode();
+ },
+ Mask);
+ VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+ FinalVecTy, Mask);
return VecCost;
};
return GetCostDiff(GetScalarCost, GetVectorCost);
@@ -8065,7 +8509,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
// Gathering cost would be too much for tiny trees.
if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
(VectorizableTree[1]->State == TreeEntry::NeedToGather &&
- VectorizableTree[0]->State != TreeEntry::ScatterVectorize))
+ VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
+ VectorizableTree[0]->State != TreeEntry::PossibleStridedVectorize))
return false;
return true;
@@ -8144,6 +8589,23 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
allConstant(VectorizableTree[1]->Scalars))))
return true;
+ // If the graph includes only PHI nodes and gathers, it is definitely not
+ // profitable for the vectorization, we can skip it, if the cost threshold is
+ // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
+ // gathers/buildvectors.
+ constexpr int Limit = 4;
+ if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
+ !VectorizableTree.empty() &&
+ all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ return (TE->State == TreeEntry::NeedToGather &&
+ TE->getOpcode() != Instruction::ExtractElement &&
+ count_if(TE->Scalars,
+ [](Value *V) { return isa<ExtractElementInst>(V); }) <=
+ Limit) ||
+ TE->getOpcode() == Instruction::PHI;
+ }))
+ return true;
+
// We can vectorize the tree if its size is greater than or equal to the
// minimum size specified by the MinTreeSize command line option.
if (VectorizableTree.size() >= MinTreeSize)
@@ -8435,16 +8897,6 @@ static T *performExtractsShuffleAction(
}
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
- // Build a map for gathered scalars to the nodes where they are used.
- ValueToGatherNodes.clear();
- for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
- if (EntryPtr->State != TreeEntry::NeedToGather)
- continue;
- for (Value *V : EntryPtr->Scalars)
- if (!isConstant(V))
- ValueToGatherNodes.try_emplace(V).first->getSecond().insert(
- EntryPtr.get());
- }
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
@@ -8460,8 +8912,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
E->isSame(TE.Scalars)) {
// Some gather nodes might be absolutely the same as some vectorizable
// nodes after reordering, need to handle it.
- LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle that starts with "
- << *TE.Scalars[0] << ".\n"
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
+ << shortBundleName(TE.Scalars) << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
continue;
}
@@ -8469,9 +8921,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
Cost += C;
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
- << " for bundle that starts with " << *TE.Scalars[0]
- << ".\n"
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
+ << shortBundleName(TE.Scalars) << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
}
@@ -8480,6 +8931,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
SmallVector<APInt> DemandedElts;
+ SmallDenseSet<Value *, 4> UsedInserts;
+ DenseSet<Value *> VectorCasts;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -8500,6 +8953,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// to detect it as a final shuffled/identity match.
if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
+ if (!UsedInserts.insert(VU).second)
+ continue;
std::optional<unsigned> InsertIdx = getInsertIndex(VU);
if (InsertIdx) {
const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
@@ -8546,6 +9001,28 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
FirstUsers.emplace_back(VU, ScalarTE);
DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
VecId = FirstUsers.size() - 1;
+ auto It = MinBWs.find(ScalarTE);
+ if (It != MinBWs.end() && VectorCasts.insert(EU.Scalar).second) {
+ unsigned BWSz = It->second.second;
+ unsigned SrcBWSz = DL->getTypeSizeInBits(FTy->getElementType());
+ unsigned VecOpcode;
+ if (BWSz < SrcBWSz)
+ VecOpcode = Instruction::Trunc;
+ else
+ VecOpcode =
+ It->second.second ? Instruction::SExt : Instruction::ZExt;
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost C = TTI->getCastInstrCost(
+ VecOpcode, FTy,
+ FixedVectorType::get(
+ IntegerType::get(FTy->getContext(), It->second.first),
+ FTy->getNumElements()),
+ TTI::CastContextHint::None, CostKind);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for extending externally used vector with "
+ "non-equal minimum bitwidth.\n");
+ Cost += C;
+ }
} else {
if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
It->first = VU;
@@ -8567,11 +9044,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// for the extract and the added cost of the sign extend if needed.
auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
- if (MinBWs.count(ScalarRoot)) {
- auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
- auto Extend =
- MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
+ auto It = MinBWs.find(getTreeEntry(EU.Scalar));
+ if (It != MinBWs.end()) {
+ auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
+ unsigned Extend =
+ It->second.second ? Instruction::SExt : Instruction::ZExt;
VecTy = FixedVectorType::get(MinTy, BundleWidth);
ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
VecTy, EU.Lane);
@@ -8580,6 +9057,21 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
CostKind, EU.Lane);
}
}
+ // Add reduced value cost, if resized.
+ if (!VectorizedVals.empty()) {
+ auto BWIt = MinBWs.find(VectorizableTree.front().get());
+ if (BWIt != MinBWs.end()) {
+ Type *DstTy = VectorizableTree.front()->Scalars.front()->getType();
+ unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
+ unsigned Opcode = Instruction::Trunc;
+ if (OriginalSz < BWIt->second.first)
+ Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
+ Type *SrcTy = IntegerType::get(DstTy->getContext(), BWIt->second.first);
+ Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
+ TTI::CastContextHint::None,
+ TTI::TCK_RecipThroughput);
+ }
+ }
InstructionCost SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
@@ -8590,9 +9082,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
unsigned VecVF = TE->getVectorFactor();
if (VF != VecVF &&
(any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
- (all_of(Mask,
- [VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) &&
- !ShuffleVectorInst::isIdentityMask(Mask)))) {
+ !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
OrigMask.begin());
@@ -8611,19 +9101,23 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// Calculate the cost of the reshuffled vectors, if any.
for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
- unsigned VF = ShuffleMasks[I].begin()->second.size();
- auto *FTy = FixedVectorType::get(
- cast<VectorType>(FirstUsers[I].first->getType())->getElementType(), VF);
auto Vector = ShuffleMasks[I].takeVector();
- auto &&EstimateShufflesCost = [this, FTy,
- &Cost](ArrayRef<int> Mask,
- ArrayRef<const TreeEntry *> TEs) {
+ unsigned VF = 0;
+ auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
+ ArrayRef<const TreeEntry *> TEs) {
assert((TEs.size() == 1 || TEs.size() == 2) &&
"Expected exactly 1 or 2 tree entries.");
if (TEs.size() == 1) {
- int Limit = 2 * Mask.size();
- if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) ||
- !ShuffleVectorInst::isIdentityMask(Mask)) {
+ if (VF == 0)
+ VF = TEs.front()->getVectorFactor();
+ auto *FTy =
+ FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
+ if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
+ !all_of(enumerate(Mask), [=](const auto &Data) {
+ return Data.value() == PoisonMaskElem ||
+ (Data.index() < VF &&
+ static_cast<int>(Data.index()) == Data.value());
+ })) {
InstructionCost C =
TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
@@ -8634,6 +9128,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
Cost += C;
}
} else {
+ if (VF == 0) {
+ if (TEs.front() &&
+ TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
+ VF = TEs.front()->getVectorFactor();
+ else
+ VF = Mask.size();
+ }
+ auto *FTy =
+ FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
InstructionCost C =
TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
@@ -8643,6 +9146,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
dbgs() << "SLP: Current total cost = " << Cost << "\n");
Cost += C;
}
+ VF = Mask.size();
return TEs.back();
};
(void)performExtractsShuffleAction<const TreeEntry>(
@@ -8671,54 +9175,198 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
return Cost;
}
-std::optional<TargetTransformInfo::ShuffleKind>
-BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
- SmallVectorImpl<int> &Mask,
- SmallVectorImpl<const TreeEntry *> &Entries) {
- Entries.clear();
- // No need to check for the topmost gather node.
- if (TE == VectorizableTree.front().get())
+/// Tries to find extractelement instructions with constant indices from fixed
+/// vector type and gather such instructions into a bunch, which highly likely
+/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
+/// successful, the matched scalars are replaced by poison values in \p VL for
+/// future analysis.
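+/// For example, if every scalar in \p VL is an extractelement with a constant
+/// index from the same fixed-width vector, the whole gather can be modeled as
+/// a single-source shuffle whose mask is built from those indices.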
+std::optional<TTI::ShuffleKind>
+BoUpSLP::tryToGatherSingleRegisterExtractElements(
+ MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
+ // Scan list of gathered scalars for extractelements that can be represented
+ // as shuffles.
+ MapVector<Value *, SmallVector<int>> VectorOpToIdx;
+ SmallVector<int> UndefVectorExtracts;
+ for (int I = 0, E = VL.size(); I < E; ++I) {
+ auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+ if (!EI) {
+ if (isa<UndefValue>(VL[I]))
+ UndefVectorExtracts.push_back(I);
+ continue;
+ }
+ auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
+ if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
+ continue;
+ std::optional<unsigned> Idx = getExtractIndex(EI);
+ // Undefined index.
+ if (!Idx) {
+ UndefVectorExtracts.push_back(I);
+ continue;
+ }
+ SmallBitVector ExtractMask(VecTy->getNumElements(), true);
+ ExtractMask.reset(*Idx);
+ if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
+ UndefVectorExtracts.push_back(I);
+ continue;
+ }
+ VectorOpToIdx[EI->getVectorOperand()].push_back(I);
+ }
+ // Sort the vector operands by the maximum number of uses in extractelements.
+ MapVector<unsigned, SmallVector<Value *>> VFToVector;
+ for (const auto &Data : VectorOpToIdx)
+ VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
+ .push_back(Data.first);
+ for (auto &Data : VFToVector) {
+ stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
+ return VectorOpToIdx.find(V1)->second.size() >
+ VectorOpToIdx.find(V2)->second.size();
+ });
+ }
+ // Find the best pair of the vectors with the same number of elements or a
+ // single vector.
+ const int UndefSz = UndefVectorExtracts.size();
+ unsigned SingleMax = 0;
+ Value *SingleVec = nullptr;
+ unsigned PairMax = 0;
+ std::pair<Value *, Value *> PairVec(nullptr, nullptr);
+ for (auto &Data : VFToVector) {
+ Value *V1 = Data.second.front();
+ if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
+ SingleMax = VectorOpToIdx[V1].size() + UndefSz;
+ SingleVec = V1;
+ }
+ Value *V2 = nullptr;
+ if (Data.second.size() > 1)
+ V2 = *std::next(Data.second.begin());
+ if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
+ UndefSz) {
+ PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
+ PairVec = std::make_pair(V1, V2);
+ }
+ }
+ if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
+ return std::nullopt;
+  // Check whether it is better to perform a shuffle of two vectors or of
+  // just a single vector.
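+  // A single-source shuffle is preferred when it covers at least as many
+  // lanes as the best pair, i.e. when SingleMax >= PairMax below.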
+ SmallVector<Value *> SavedVL(VL.begin(), VL.end());
+ SmallVector<Value *> GatheredExtracts(
+ VL.size(), PoisonValue::get(VL.front()->getType()));
+ if (SingleMax >= PairMax && SingleMax) {
+ for (int Idx : VectorOpToIdx[SingleVec])
+ std::swap(GatheredExtracts[Idx], VL[Idx]);
+ } else {
+ for (Value *V : {PairVec.first, PairVec.second})
+ for (int Idx : VectorOpToIdx[V])
+ std::swap(GatheredExtracts[Idx], VL[Idx]);
+ }
+ // Add extracts from undefs too.
+ for (int Idx : UndefVectorExtracts)
+ std::swap(GatheredExtracts[Idx], VL[Idx]);
+  // Check that the gather of extractelements can be represented as just a
+  // shuffle of one or two vectors from which the scalars are extracted.
+ std::optional<TTI::ShuffleKind> Res =
+ isFixedVectorShuffle(GatheredExtracts, Mask);
+ if (!Res) {
+ // TODO: try to check other subsets if possible.
+ // Restore the original VL if attempt was not successful.
+ copy(SavedVL, VL.begin());
return std::nullopt;
+ }
+ // Restore unused scalars from mask, if some of the extractelements were not
+ // selected for shuffle.
+ for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
+ if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
+ isa<UndefValue>(GatheredExtracts[I])) {
+ std::swap(VL[I], GatheredExtracts[I]);
+ continue;
+ }
+ auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+ if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
+ !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
+ is_contained(UndefVectorExtracts, I))
+ continue;
+ }
+ return Res;
+}
+
+/// Tries to find extractelement instructions with constant indices from a
+/// fixed vector type and gathers such instructions into a bunch that can
+/// likely be represented as a shuffle of one or two input vectors per
+/// register-sized part of \p VL. If this attempt is successful, the matched
+/// scalars are replaced by poison values in \p VL for future analysis.
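+///
+/// Unlike the single-register variant above, \p VL is processed in \p NumParts
+/// equally sized slices; e.g. with VL.size() == 8 and NumParts == 2, lanes 0-3
+/// and lanes 4-7 are matched independently, and one shuffle kind (or
+/// std::nullopt) is returned per slice.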
+SmallVector<std::optional<TTI::ShuffleKind>>
+BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
+ SmallVectorImpl<int> &Mask,
+ unsigned NumParts) const {
+  assert(NumParts > 0 &&
+         "NumParts expected to be greater than or equal to 1.");
+ SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
Mask.assign(VL.size(), PoisonMaskElem);
- assert(TE->UserTreeIndices.size() == 1 &&
- "Expected only single user of the gather node.");
+ unsigned SliceSize = VL.size() / NumParts;
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ // Scan list of gathered scalars for extractelements that can be represented
+ // as shuffles.
+ MutableArrayRef<Value *> SubVL =
+ MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
+ SmallVector<int> SubMask;
+ std::optional<TTI::ShuffleKind> Res =
+ tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
+ ShufflesRes[Part] = Res;
+ copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
+ }
+ if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
+ return Res.has_value();
+ }))
+ ShufflesRes.clear();
+ return ShufflesRes;
+}
+
+std::optional<TargetTransformInfo::ShuffleKind>
+BoUpSLP::isGatherShuffledSingleRegisterEntry(
+ const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
+ SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part) {
+ Entries.clear();
// TODO: currently checking only for Scalars in the tree entry, need to count
// reused elements too for better cost estimation.
- Instruction &UserInst =
- getLastInstructionInBundle(TE->UserTreeIndices.front().UserTE);
- BasicBlock *ParentBB = nullptr;
+ const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
+ const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
+ const BasicBlock *TEInsertBlock = nullptr;
// Main node of PHI entries keeps the correct order of operands/incoming
// blocks.
- if (auto *PHI =
- dyn_cast<PHINode>(TE->UserTreeIndices.front().UserTE->getMainOp())) {
- ParentBB = PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx);
+ if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
+ TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
+ TEInsertPt = TEInsertBlock->getTerminator();
} else {
- ParentBB = UserInst.getParent();
+ TEInsertBlock = TEInsertPt->getParent();
}
- auto *NodeUI = DT->getNode(ParentBB);
+ auto *NodeUI = DT->getNode(TEInsertBlock);
assert(NodeUI && "Should only process reachable instructions");
SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
- auto CheckOrdering = [&](Instruction *LastEI) {
- // Check if the user node of the TE comes after user node of EntryPtr,
- // otherwise EntryPtr depends on TE.
- // Gather nodes usually are not scheduled and inserted before their first
- // user node. So, instead of checking dependency between the gather nodes
- // themselves, we check the dependency between their user nodes.
- // If one user node comes before the second one, we cannot use the second
- // gather node as the source vector for the first gather node, because in
- // the list of instructions it will be emitted later.
- auto *EntryParent = LastEI->getParent();
- auto *NodeEUI = DT->getNode(EntryParent);
+ auto CheckOrdering = [&](const Instruction *InsertPt) {
+    // Argument InsertPt is an instruction where the vector code for some
+    // other tree entry (one that shares one or more scalars with TE) is going
+    // to be generated. This lambda returns true if that insertion point
+    // dominates the insertion point of the vector code for TE, so the other
+    // entry can be used as a source for TE; otherwise the dependency goes the
+    // other way around. The other node is not limited to a gather kind.
+    // Gather nodes are not scheduled and their vector code is inserted before
+    // their first user. If the user is a PHI, the insertion point is the
+    // terminator of the corresponding predecessor block; otherwise it is the
+    // last instruction among the scalars of the user node. So, instead of
+    // checking the dependency between the instructions themselves, we check
+    // the dependency between their insertion points for the vector code
+    // (since each scalar instruction ends up as a lane of a vector
+    // instruction).
+ const BasicBlock *InsertBlock = InsertPt->getParent();
+ auto *NodeEUI = DT->getNode(InsertBlock);
if (!NodeEUI)
return false;
assert((NodeUI == NodeEUI) ==
(NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
"Different nodes should have different DFS numbers");
// Check the order of the gather nodes users.
- if (UserInst.getParent() != EntryParent &&
+ if (TEInsertPt->getParent() != InsertBlock &&
(DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
return false;
- if (UserInst.getParent() == EntryParent && UserInst.comesBefore(LastEI))
+ if (TEInsertPt->getParent() == InsertBlock &&
+ TEInsertPt->comesBefore(InsertPt))
return false;
return true;
};
@@ -8743,43 +9391,42 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
[&](Value *V) { return GatheredScalars.contains(V); }) &&
"Must contain at least single gathered value.");
assert(TEPtr->UserTreeIndices.size() == 1 &&
- "Expected only single user of the gather node.");
- PHINode *EntryPHI =
- dyn_cast<PHINode>(TEPtr->UserTreeIndices.front().UserTE->getMainOp());
- Instruction *EntryUserInst =
- EntryPHI ? nullptr
- : &getLastInstructionInBundle(
- TEPtr->UserTreeIndices.front().UserTE);
- if (&UserInst == EntryUserInst) {
- assert(!EntryPHI && "Unexpected phi node entry.");
- // If 2 gathers are operands of the same entry, compare operands
- // indices, use the earlier one as the base.
- if (TE->UserTreeIndices.front().UserTE ==
- TEPtr->UserTreeIndices.front().UserTE &&
- TE->UserTreeIndices.front().EdgeIdx <
- TEPtr->UserTreeIndices.front().EdgeIdx)
+ "Expected only single user of a gather node.");
+ const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
+
+ PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
+ const Instruction *InsertPt =
+ UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
+ : &getLastInstructionInBundle(UseEI.UserTE);
+ if (TEInsertPt == InsertPt) {
+ // If 2 gathers are operands of the same entry (regardless of whether
+ // user is PHI or else), compare operands indices, use the earlier one
+ // as the base.
+ if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
+ continue;
+        // If the user instruction is, for some reason, used in different
+        // vectorized nodes, break the tie by the node index.
+ if (TEUseEI.UserTE != UseEI.UserTE &&
+ TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
continue;
}
- // Check if the user node of the TE comes after user node of EntryPtr,
- // otherwise EntryPtr depends on TE.
- auto *EntryI =
- EntryPHI
- ? EntryPHI
- ->getIncomingBlock(TEPtr->UserTreeIndices.front().EdgeIdx)
- ->getTerminator()
- : EntryUserInst;
- if ((ParentBB != EntryI->getParent() ||
- TE->UserTreeIndices.front().EdgeIdx <
- TEPtr->UserTreeIndices.front().EdgeIdx ||
- TE->UserTreeIndices.front().UserTE !=
- TEPtr->UserTreeIndices.front().UserTE) &&
- !CheckOrdering(EntryI))
+
+ // Check if the user node of the TE comes after user node of TEPtr,
+ // otherwise TEPtr depends on TE.
+ if ((TEInsertBlock != InsertPt->getParent() ||
+ TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
+ !CheckOrdering(InsertPt))
continue;
VToTEs.insert(TEPtr);
}
if (const TreeEntry *VTE = getTreeEntry(V)) {
- Instruction &EntryUserInst = getLastInstructionInBundle(VTE);
- if (&EntryUserInst == &UserInst || !CheckOrdering(&EntryUserInst))
+ Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
+ if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
+ continue;
+ auto It = MinBWs.find(VTE);
+      // If the vectorized node is demoted, do not match.
+ if (It != MinBWs.end() &&
+ It->second.first != DL->getTypeSizeInBits(V->getType()))
continue;
VToTEs.insert(VTE);
}
@@ -8823,8 +9470,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
}
}
- if (UsedTEs.empty())
+ if (UsedTEs.empty()) {
+ Entries.clear();
return std::nullopt;
+ }
unsigned VF = 0;
if (UsedTEs.size() == 1) {
@@ -8838,9 +9487,19 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
});
- if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) {
+ if (It != FirstEntries.end() &&
+ ((*It)->getVectorFactor() == VL.size() ||
+ ((*It)->getVectorFactor() == TE->Scalars.size() &&
+ TE->ReuseShuffleIndices.size() == VL.size() &&
+ (*It)->isSame(TE->Scalars)))) {
Entries.push_back(*It);
- std::iota(Mask.begin(), Mask.end(), 0);
+ if ((*It)->getVectorFactor() == VL.size()) {
+ std::iota(std::next(Mask.begin(), Part * VL.size()),
+ std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
+ } else {
+ SmallVector<int> CommonMask = TE->getCommonMask();
+ copy(CommonMask, Mask.begin());
+ }
// Clear undef scalars.
for (int I = 0, Sz = VL.size(); I < Sz; ++I)
if (isa<PoisonValue>(VL[I]))
@@ -8923,12 +9582,9 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
// by extractelements processing) or may form vector node in future.
auto MightBeIgnored = [=](Value *V) {
auto *I = dyn_cast<Instruction>(V);
- SmallVector<Value *> IgnoredVals;
- if (UserIgnoreList)
- IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
!isVectorLikeInstWithConstOps(I) &&
- !areAllUsersVectorized(I, IgnoredVals) && isSimple(I);
+ !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
};
// Check that the neighbor instruction may form a full vector node with the
// current instruction V. It is possible, if they have same/alternate opcode
@@ -8980,7 +9636,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
TempEntries.push_back(Entries[I]);
}
Entries.swap(TempEntries);
- if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) {
+ if (EntryLanes.size() == Entries.size() &&
+ !VL.equals(ArrayRef(TE->Scalars)
+ .slice(Part * VL.size(),
+ std::min<int>(VL.size(), TE->Scalars.size())))) {
// We may have here 1 or 2 entries only. If the number of scalars is equal
// to the number of entries, no need to do the analysis, it is not very
// profitable. Since VL is not the same as TE->Scalars, it means we already
@@ -8993,9 +9652,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
// Pair.first is the offset to the vector, while Pair.second is the index of
// scalar in the list.
for (const std::pair<unsigned, int> &Pair : EntryLanes) {
- Mask[Pair.second] = Pair.first * VF +
- Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
- IsIdentity &= Mask[Pair.second] == Pair.second;
+ unsigned Idx = Part * VL.size() + Pair.second;
+ Mask[Idx] = Pair.first * VF +
+ Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
+ IsIdentity &= Mask[Idx] == Pair.second;
}
switch (Entries.size()) {
case 1:
@@ -9010,9 +9670,64 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
break;
}
Entries.clear();
+ // Clear the corresponding mask elements.
+ std::fill(std::next(Mask.begin(), Part * VL.size()),
+ std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
return std::nullopt;
}
+SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
+BoUpSLP::isGatherShuffledEntry(
+ const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
+ unsigned NumParts) {
+ assert(NumParts > 0 && NumParts < VL.size() &&
+ "Expected positive number of registers.");
+ Entries.clear();
+ // No need to check for the topmost gather node.
+ if (TE == VectorizableTree.front().get())
+ return {};
+ Mask.assign(VL.size(), PoisonMaskElem);
+ assert(TE->UserTreeIndices.size() == 1 &&
+ "Expected only single user of the gather node.");
+ assert(VL.size() % NumParts == 0 &&
+ "Number of scalars must be divisible by NumParts.");
+ unsigned SliceSize = VL.size() / NumParts;
+ SmallVector<std::optional<TTI::ShuffleKind>> Res;
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
+ SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
+ std::optional<TTI::ShuffleKind> SubRes =
+ isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part);
+ if (!SubRes)
+ SubEntries.clear();
+ Res.push_back(SubRes);
+ if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
+ SubEntries.front()->getVectorFactor() == VL.size() &&
+ (SubEntries.front()->isSame(TE->Scalars) ||
+ SubEntries.front()->isSame(VL))) {
+ SmallVector<const TreeEntry *> LocalSubEntries;
+ LocalSubEntries.swap(SubEntries);
+ Entries.clear();
+ Res.clear();
+ std::iota(Mask.begin(), Mask.end(), 0);
+ // Clear undef scalars.
+ for (int I = 0, Sz = VL.size(); I < Sz; ++I)
+ if (isa<PoisonValue>(VL[I]))
+ Mask[I] = PoisonMaskElem;
+ Entries.emplace_back(1, LocalSubEntries.front());
+ Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
+ return Res;
+ }
+ }
+ if (all_of(Res,
+ [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
+ Entries.clear();
+ return {};
+ }
+ return Res;
+}
+
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
bool ForPoisonSrc) const {
// Find the type of the operands in VL.
@@ -9224,18 +9939,20 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
auto *Front = E->getMainOp();
Instruction *LastInst = &getLastInstructionInBundle(E);
assert(LastInst && "Failed to find last instruction in bundle");
+ BasicBlock::iterator LastInstIt = LastInst->getIterator();
// If the instruction is PHI, set the insert point after all the PHIs.
bool IsPHI = isa<PHINode>(LastInst);
if (IsPHI)
- LastInst = LastInst->getParent()->getFirstNonPHI();
+ LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
if (IsPHI || (E->State != TreeEntry::NeedToGather &&
doesNotNeedToSchedule(E->Scalars))) {
- Builder.SetInsertPoint(LastInst);
+ Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
} else {
// Set the insertion point after the last instruction in the bundle. Set the
// debug location to Front.
- Builder.SetInsertPoint(LastInst->getParent(),
- std::next(LastInst->getIterator()));
+ Builder.SetInsertPoint(
+ LastInst->getParent(),
+ LastInst->getNextNonDebugInstruction()->getIterator());
}
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
@@ -9271,10 +9988,12 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
GatherShuffleExtractSeq.insert(InsElt);
CSEBlocks.insert(InsElt->getParent());
// Add to our 'need-to-extract' list.
- if (TreeEntry *Entry = getTreeEntry(V)) {
- // Find which lane we need to extract.
- unsigned FoundLane = Entry->findLaneForValue(V);
- ExternalUses.emplace_back(V, InsElt, FoundLane);
+ if (isa<Instruction>(V)) {
+ if (TreeEntry *Entry = getTreeEntry(V)) {
+ // Find which lane we need to extract.
+ unsigned FoundLane = Entry->findLaneForValue(V);
+ ExternalUses.emplace_back(V, InsElt, FoundLane);
+ }
}
return Vec;
};
@@ -9367,12 +10086,12 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> &GatherShuffleExtractSeq;
/// A list of blocks that we are going to CSE.
- SetVector<BasicBlock *> &CSEBlocks;
+ DenseSet<BasicBlock *> &CSEBlocks;
public:
ShuffleIRBuilder(IRBuilderBase &Builder,
SetVector<Instruction *> &GatherShuffleExtractSeq,
- SetVector<BasicBlock *> &CSEBlocks)
+ DenseSet<BasicBlock *> &CSEBlocks)
: Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
CSEBlocks(CSEBlocks) {}
~ShuffleIRBuilder() = default;
@@ -9392,7 +10111,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
return V1;
unsigned VF = Mask.size();
unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
- if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask))
+ if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
return V1;
Value *Vec = Builder.CreateShuffleVector(V1, Mask);
if (auto *I = dyn_cast<Instruction>(Vec)) {
@@ -9455,7 +10174,11 @@ public:
: Builder(Builder), R(R) {}
/// Adjusts extractelements after reusing them.
- Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask) {
+ Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
+ ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
+ unsigned NumParts, bool &UseVecBaseAsInput) {
+ UseVecBaseAsInput = false;
+ SmallPtrSet<Value *, 4> UniqueBases;
Value *VecBase = nullptr;
for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
int Idx = Mask[I];
@@ -9463,6 +10186,10 @@ public:
continue;
auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
VecBase = EI->getVectorOperand();
+ if (const TreeEntry *TE = R.getTreeEntry(VecBase))
+ VecBase = TE->VectorizedValue;
+ assert(VecBase && "Expected vectorized value.");
+ UniqueBases.insert(VecBase);
// If the only one use is vectorized - can delete the extractelement
// itself.
if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) {
@@ -9471,14 +10198,97 @@ public:
continue;
R.eraseInstruction(EI);
}
- return VecBase;
+ if (NumParts == 1 || UniqueBases.size() == 1)
+ return VecBase;
+ UseVecBaseAsInput = true;
+ auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
+ for (auto [I, Idx] : enumerate(Mask))
+ if (Idx != PoisonMaskElem)
+ Idx = I;
+ };
+    // Perform a multi-register vector shuffle, joining the parts into a
+    // single virtual long vector.
+    // Each part needs to be shuffled independently, and then all these parts
+    // are inserted into the long virtual vector register, forming the
+    // original vector.
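+    // E.g., with E->Scalars.size() == 8 and NumParts == 2, lanes 0-3 and
+    // lanes 4-7 are each shuffled out of their (at most two) source vectors,
+    // and the resulting sub-vectors are then combined by a final shuffle into
+    // one wide vector covering all 8 lanes.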
+ Value *Vec = nullptr;
+ SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
+ unsigned SliceSize = E->Scalars.size() / NumParts;
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ ArrayRef<Value *> VL =
+ ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
+ MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
+ constexpr int MaxBases = 2;
+ SmallVector<Value *, MaxBases> Bases(MaxBases);
+#ifndef NDEBUG
+ int PrevSize = 0;
+#endif // NDEBUG
+      for (const auto [I, V] : enumerate(VL)) {
+ if (SubMask[I] == PoisonMaskElem)
+ continue;
+ Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
+ if (const TreeEntry *TE = R.getTreeEntry(VecOp))
+ VecOp = TE->VectorizedValue;
+ assert(VecOp && "Expected vectorized value.");
+ const int Size =
+ cast<FixedVectorType>(VecOp->getType())->getNumElements();
+#ifndef NDEBUG
+ assert((PrevSize == Size || PrevSize == 0) &&
+ "Expected vectors of the same size.");
+ PrevSize = Size;
+#endif // NDEBUG
+ Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
+ }
+ if (!Bases.front())
+ continue;
+ Value *SubVec;
+ if (Bases.back()) {
+ SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
+ TransformToIdentity(SubMask);
+ } else {
+ SubVec = Bases.front();
+ }
+ if (!Vec) {
+ Vec = SubVec;
+ assert((Part == 0 || all_of(seq<unsigned>(0, Part),
+ [&](unsigned P) {
+ ArrayRef<int> SubMask =
+ Mask.slice(P * SliceSize, SliceSize);
+ return all_of(SubMask, [](int Idx) {
+ return Idx == PoisonMaskElem;
+ });
+ })) &&
+ "Expected first part or all previous parts masked.");
+ copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
+ } else {
+ unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
+ if (Vec->getType() != SubVec->getType()) {
+ unsigned SubVecVF =
+ cast<FixedVectorType>(SubVec->getType())->getNumElements();
+ VF = std::max(VF, SubVecVF);
+ }
+ // Adjust SubMask.
+ for (auto [I, Idx] : enumerate(SubMask))
+ if (Idx != PoisonMaskElem)
+ Idx += VF;
+ copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
+ Vec = createShuffle(Vec, SubVec, VecMask);
+ TransformToIdentity(VecMask);
+ }
+ }
+ copy(VecMask, Mask.begin());
+ return Vec;
}
/// Checks if the specified entry \p E needs to be delayed because of its
/// dependency nodes.
- Value *needToDelay(const TreeEntry *E, ArrayRef<const TreeEntry *> Deps) {
+ std::optional<Value *>
+ needToDelay(const TreeEntry *E,
+ ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
// No need to delay emission if all deps are ready.
- if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; }))
- return nullptr;
+ if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
+ return all_of(
+ TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
+ }))
+ return std::nullopt;
// Postpone gather emission, will be emitted after the end of the
// process to keep correct order.
auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
@@ -9487,6 +10297,16 @@ public:
VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())),
MaybeAlign());
}
+  /// Adds 2 input vectors (in the form of tree entries) and the mask for
+  /// their shuffling.
+ void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
+ add(E1.VectorizedValue, E2.VectorizedValue, Mask);
+ }
+  /// Adds a single input vector (in the form of a tree entry) and the mask
+  /// for its shuffling.
+ void add(const TreeEntry &E1, ArrayRef<int> Mask) {
+ add(E1.VectorizedValue, Mask);
+ }
/// Adds 2 input vectors and the mask for their shuffling.
void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
@@ -9516,7 +10336,7 @@ public:
InVectors.push_back(V1);
}
/// Adds another one input vector and the mask for the shuffling.
- void add(Value *V1, ArrayRef<int> Mask) {
+ void add(Value *V1, ArrayRef<int> Mask, bool = false) {
if (InVectors.empty()) {
if (!isa<FixedVectorType>(V1->getType())) {
V1 = createShuffle(V1, nullptr, CommonMask);
@@ -9578,7 +10398,8 @@ public:
inversePermutation(Order, NewMask);
add(V1, NewMask);
}
- Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
+ Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
+ Value *Root = nullptr) {
return R.gather(VL, Root);
}
Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
@@ -9639,8 +10460,14 @@ public:
}
};
-Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
- ArrayRef<Value *> VL = E->getOperand(NodeIdx);
+Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
+ bool PostponedPHIs) {
+ ValueList &VL = E->getOperand(NodeIdx);
+ if (E->State == TreeEntry::PossibleStridedVectorize &&
+ !E->ReorderIndices.empty()) {
+ SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
+ reorderScalars(VL, Mask);
+ }
const unsigned VF = VL.size();
InstructionsState S = getSameOpcode(VL, *TLI);
// Special processing for GEPs bundle, which may include non-gep values.
@@ -9651,23 +10478,39 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
S = getSameOpcode(*It, *TLI);
}
if (S.getOpcode()) {
- if (TreeEntry *VE = getTreeEntry(S.OpValue);
- VE && VE->isSame(VL) &&
- (any_of(VE->UserTreeIndices,
- [E, NodeIdx](const EdgeInfo &EI) {
- return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
- }) ||
- any_of(VectorizableTree,
- [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
- return TE->isOperandGatherNode({E, NodeIdx}) &&
- VE->isSame(TE->Scalars);
- }))) {
+ auto CheckSameVE = [&](const TreeEntry *VE) {
+ return VE->isSame(VL) &&
+ (any_of(VE->UserTreeIndices,
+ [E, NodeIdx](const EdgeInfo &EI) {
+ return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
+ }) ||
+ any_of(VectorizableTree,
+ [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->isOperandGatherNode({E, NodeIdx}) &&
+ VE->isSame(TE->Scalars);
+ }));
+ };
+ TreeEntry *VE = getTreeEntry(S.OpValue);
+ bool IsSameVE = VE && CheckSameVE(VE);
+ if (!IsSameVE) {
+ auto It = MultiNodeScalars.find(S.OpValue);
+ if (It != MultiNodeScalars.end()) {
+ auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
+ return TE != VE && CheckSameVE(TE);
+ });
+ if (I != It->getSecond().end()) {
+ VE = *I;
+ IsSameVE = true;
+ }
+ }
+ }
+ if (IsSameVE) {
auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
ShuffleBuilder.add(V, Mask);
return ShuffleBuilder.finalize(std::nullopt);
};
- Value *V = vectorizeTree(VE);
+ Value *V = vectorizeTree(VE, PostponedPHIs);
if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
if (!VE->ReuseShuffleIndices.empty()) {
// Reshuffle to get only unique values.
@@ -9740,14 +10583,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
assert(I->get()->UserTreeIndices.size() == 1 &&
"Expected only single user for the gather node.");
assert(I->get()->isSame(VL) && "Expected same list of scalars.");
- IRBuilder<>::InsertPointGuard Guard(Builder);
- if (E->getOpcode() != Instruction::InsertElement &&
- E->getOpcode() != Instruction::PHI) {
- Instruction *LastInst = &getLastInstructionInBundle(E);
- assert(LastInst && "Failed to find last instruction in bundle");
- Builder.SetInsertPoint(LastInst);
- }
- return vectorizeTree(I->get());
+ return vectorizeTree(I->get(), PostponedPHIs);
}
template <typename BVTy, typename ResTy, typename... Args>
@@ -9765,7 +10601,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
inversePermutation(E->ReorderIndices, ReorderMask);
if (!ReorderMask.empty())
reorderScalars(GatheredScalars, ReorderMask);
- auto FindReusedSplat = [&](SmallVectorImpl<int> &Mask) {
+ auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF) {
if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
return isa<UndefValue>(V) && !isa<PoisonValue>(V);
}))
@@ -9782,70 +10618,102 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
});
if (It == VectorizableTree.end())
return false;
- unsigned I =
- *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
- int Sz = Mask.size();
- if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) &&
- ShuffleVectorInst::isIdentityMask(Mask))
+ int Idx;
+ if ((Mask.size() < InputVF &&
+ ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
+ Idx == 0) ||
+ (Mask.size() == InputVF &&
+ ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
std::iota(Mask.begin(), Mask.end(), 0);
- else
+ } else {
+ unsigned I =
+ *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
std::fill(Mask.begin(), Mask.end(), I);
+ }
return true;
};
BVTy ShuffleBuilder(Params...);
ResTy Res = ResTy();
SmallVector<int> Mask;
- SmallVector<int> ExtractMask;
- std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
- std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
- SmallVector<const TreeEntry *> Entries;
+ SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
+ SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
+ Value *ExtractVecBase = nullptr;
+ bool UseVecBaseAsInput = false;
+ SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
+ SmallVector<SmallVector<const TreeEntry *>> Entries;
Type *ScalarTy = GatheredScalars.front()->getType();
+ auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
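+  // getNumberOfParts() reports how many register-sized pieces VecTy is split
+  // into; e.g. a <8 x i32> gather on a target with 128-bit vector registers
+  // would give NumParts == 2, so the scalars are analyzed in two 4-lane
+  // slices.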
+ unsigned NumParts = TTI->getNumberOfParts(VecTy);
+ if (NumParts == 0 || NumParts >= GatheredScalars.size())
+ NumParts = 1;
if (!all_of(GatheredScalars, UndefValue::classof)) {
// Check for gathered extracts.
- ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
- SmallVector<Value *> IgnoredVals;
- if (UserIgnoreList)
- IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
bool Resized = false;
- if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask))
- if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
- if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
- Resized = true;
- GatheredScalars.append(VF - GatheredScalars.size(),
- PoisonValue::get(ScalarTy));
- }
+ ExtractShuffles =
+ tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
+ if (!ExtractShuffles.empty()) {
+ SmallVector<const TreeEntry *> ExtractEntries;
+ for (auto [Idx, I] : enumerate(ExtractMask)) {
+ if (I == PoisonMaskElem)
+ continue;
+ if (const auto *TE = getTreeEntry(
+ cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
+ ExtractEntries.push_back(TE);
+ }
+ if (std::optional<ResTy> Delayed =
+ ShuffleBuilder.needToDelay(E, ExtractEntries)) {
+ // Delay emission of gathers which are not ready yet.
+ PostponedGathers.insert(E);
+ // Postpone gather emission, will be emitted after the end of the
+ // process to keep correct order.
+ return *Delayed;
+ }
+ if (Value *VecBase = ShuffleBuilder.adjustExtracts(
+ E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
+ ExtractVecBase = VecBase;
+ if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+ if (VF == VecBaseTy->getNumElements() &&
+ GatheredScalars.size() != VF) {
+ Resized = true;
+ GatheredScalars.append(VF - GatheredScalars.size(),
+ PoisonValue::get(ScalarTy));
+ }
+ }
+ }
// Gather extracts after we check for full matched gathers only.
- if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
+ if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
E->isAltShuffle() ||
all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
isSplat(E->Scalars) ||
(E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
- GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
+ GatherShuffles =
+ isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
}
- if (GatherShuffle) {
- if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) {
+ if (!GatherShuffles.empty()) {
+ if (std::optional<ResTy> Delayed =
+ ShuffleBuilder.needToDelay(E, Entries)) {
// Delay emission of gathers which are not ready yet.
PostponedGathers.insert(E);
// Postpone gather emission, will be emitted after the end of the
// process to keep correct order.
- return Delayed;
+ return *Delayed;
}
- assert((Entries.size() == 1 || Entries.size() == 2) &&
- "Expected shuffle of 1 or 2 entries.");
- if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
- Entries.front()->isSame(E->Scalars)) {
+ if (GatherShuffles.size() == 1 &&
+ *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
+ Entries.front().front()->isSame(E->Scalars)) {
// Perfect match in the graph, will reuse the previously vectorized
// node. Cost is 0.
LLVM_DEBUG(
dbgs()
- << "SLP: perfect diamond match for gather bundle that starts with "
- << *E->Scalars.front() << ".\n");
+ << "SLP: perfect diamond match for gather bundle "
+ << shortBundleName(E->Scalars) << ".\n");
// Restore the mask for previous partially matched values.
- if (Entries.front()->ReorderIndices.empty() &&
- ((Entries.front()->ReuseShuffleIndices.empty() &&
- E->Scalars.size() == Entries.front()->Scalars.size()) ||
- (E->Scalars.size() ==
- Entries.front()->ReuseShuffleIndices.size()))) {
+ Mask.resize(E->Scalars.size());
+ const TreeEntry *FrontTE = Entries.front().front();
+ if (FrontTE->ReorderIndices.empty() &&
+ ((FrontTE->ReuseShuffleIndices.empty() &&
+ E->Scalars.size() == FrontTE->Scalars.size()) ||
+ (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
std::iota(Mask.begin(), Mask.end(), 0);
} else {
for (auto [I, V] : enumerate(E->Scalars)) {
@@ -9853,17 +10721,20 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
Mask[I] = PoisonMaskElem;
continue;
}
- Mask[I] = Entries.front()->findLaneForValue(V);
+ Mask[I] = FrontTE->findLaneForValue(V);
}
}
- ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
+ ShuffleBuilder.add(*FrontTE, Mask);
Res = ShuffleBuilder.finalize(E->getCommonMask());
return Res;
}
if (!Resized) {
- unsigned VF1 = Entries.front()->getVectorFactor();
- unsigned VF2 = Entries.back()->getVectorFactor();
- if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
+ if (GatheredScalars.size() != VF &&
+ any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
+ return any_of(TEs, [&](const TreeEntry *TE) {
+ return TE->getVectorFactor() == VF;
+ });
+ }))
GatheredScalars.append(VF - GatheredScalars.size(),
PoisonValue::get(ScalarTy));
}
@@ -9943,78 +10814,108 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
if (It != Scalars.end()) {
// Replace undefs by the non-poisoned scalars and emit broadcast.
int Pos = std::distance(Scalars.begin(), It);
- for_each(UndefPos, [&](int I) {
+ for (int I : UndefPos) {
// Set the undef position to the non-poisoned scalar.
ReuseMask[I] = Pos;
// Replace the undef by the poison, in the mask it is replaced by
// non-poisoned scalar already.
if (I != Pos)
Scalars[I] = PoisonValue::get(ScalarTy);
- });
+ }
} else {
// Replace undefs by the poisons, emit broadcast and then emit
// freeze.
- for_each(UndefPos, [&](int I) {
+ for (int I : UndefPos) {
ReuseMask[I] = PoisonMaskElem;
if (isa<UndefValue>(Scalars[I]))
Scalars[I] = PoisonValue::get(ScalarTy);
- });
+ }
NeedFreeze = true;
}
}
};
- if (ExtractShuffle || GatherShuffle) {
+ if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
bool IsNonPoisoned = true;
- bool IsUsedInExpr = false;
+ bool IsUsedInExpr = true;
Value *Vec1 = nullptr;
- if (ExtractShuffle) {
+ if (!ExtractShuffles.empty()) {
// Gather of extractelements can be represented as just a shuffle of
// a single/two vectors the scalars are extracted from.
// Find input vectors.
Value *Vec2 = nullptr;
for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
- if (ExtractMask[I] == PoisonMaskElem ||
- (!Mask.empty() && Mask[I] != PoisonMaskElem)) {
+ if (!Mask.empty() && Mask[I] != PoisonMaskElem)
ExtractMask[I] = PoisonMaskElem;
- continue;
- }
- if (isa<UndefValue>(E->Scalars[I]))
- continue;
- auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
- if (!Vec1) {
- Vec1 = EI->getVectorOperand();
- } else if (Vec1 != EI->getVectorOperand()) {
- assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
- "Expected only 1 or 2 vectors shuffle.");
- Vec2 = EI->getVectorOperand();
+ }
+ if (UseVecBaseAsInput) {
+ Vec1 = ExtractVecBase;
+ } else {
+ for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
+ if (ExtractMask[I] == PoisonMaskElem)
+ continue;
+ if (isa<UndefValue>(E->Scalars[I]))
+ continue;
+ auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+ Value *VecOp = EI->getVectorOperand();
+ if (const auto *TE = getTreeEntry(VecOp))
+ if (TE->VectorizedValue)
+ VecOp = TE->VectorizedValue;
+ if (!Vec1) {
+ Vec1 = VecOp;
+ } else if (Vec1 != EI->getVectorOperand()) {
+ assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
+ "Expected only 1 or 2 vectors shuffle.");
+ Vec2 = VecOp;
+ }
}
}
if (Vec2) {
+ IsUsedInExpr = false;
IsNonPoisoned &=
isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
} else if (Vec1) {
- IsUsedInExpr = FindReusedSplat(ExtractMask);
- ShuffleBuilder.add(Vec1, ExtractMask);
+ IsUsedInExpr &= FindReusedSplat(
+ ExtractMask,
+ cast<FixedVectorType>(Vec1->getType())->getNumElements());
+ ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
} else {
+ IsUsedInExpr = false;
ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
ScalarTy, GatheredScalars.size())),
- ExtractMask);
+ ExtractMask, /*ForExtracts=*/true);
}
}
- if (GatherShuffle) {
- if (Entries.size() == 1) {
- IsUsedInExpr = FindReusedSplat(Mask);
- ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
- IsNonPoisoned &=
- isGuaranteedNotToBePoison(Entries.front()->VectorizedValue);
- } else {
- ShuffleBuilder.add(Entries.front()->VectorizedValue,
- Entries.back()->VectorizedValue, Mask);
- IsNonPoisoned &=
- isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) &&
- isGuaranteedNotToBePoison(Entries.back()->VectorizedValue);
+ if (!GatherShuffles.empty()) {
+ unsigned SliceSize = E->Scalars.size() / NumParts;
+ SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
+ for (const auto [I, TEs] : enumerate(Entries)) {
+ if (TEs.empty()) {
+ assert(!GatherShuffles[I] &&
+ "No shuffles with empty entries list expected.");
+ continue;
+ }
+ assert((TEs.size() == 1 || TEs.size() == 2) &&
+ "Expected shuffle of 1 or 2 entries.");
+ auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
+ VecMask.assign(VecMask.size(), PoisonMaskElem);
+ copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
+ if (TEs.size() == 1) {
+ IsUsedInExpr &=
+ FindReusedSplat(VecMask, TEs.front()->getVectorFactor());
+ ShuffleBuilder.add(*TEs.front(), VecMask);
+ if (TEs.front()->VectorizedValue)
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
+ } else {
+ IsUsedInExpr = false;
+ ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
+ if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
+ isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
+ }
}
}
// Try to figure out best way to combine values: build a shuffle and insert
@@ -10025,16 +10926,24 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
int MSz = Mask.size();
// Try to build constant vector and shuffle with it only if currently we
// have a single permutation and more than 1 scalar constants.
- bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle;
+ bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
bool IsIdentityShuffle =
- (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
- TTI::SK_PermuteSingleSrc &&
+ ((UseVecBaseAsInput ||
+ all_of(ExtractShuffles,
+ [](const std::optional<TTI::ShuffleKind> &SK) {
+ return SK.value_or(TTI::SK_PermuteTwoSrc) ==
+ TTI::SK_PermuteSingleSrc;
+ })) &&
none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
- ShuffleVectorInst::isIdentityMask(ExtractMask)) ||
- (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
- TTI::SK_PermuteSingleSrc &&
+ ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
+ (!GatherShuffles.empty() &&
+ all_of(GatherShuffles,
+ [](const std::optional<TTI::ShuffleKind> &SK) {
+ return SK.value_or(TTI::SK_PermuteTwoSrc) ==
+ TTI::SK_PermuteSingleSrc;
+ }) &&
none_of(Mask, [&](int I) { return I >= MSz; }) &&
- ShuffleVectorInst::isIdentityMask(Mask));
+ ShuffleVectorInst::isIdentityMask(Mask, MSz));
bool EnoughConstsForShuffle =
IsSingleShuffle &&
(none_of(GatheredScalars,
@@ -10064,7 +10973,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
if (!all_of(GatheredScalars, PoisonValue::classof)) {
SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
- Value *BV = ShuffleBuilder.gather(GatheredScalars);
+ Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
ShuffleBuilder.add(BV, BVMask);
}
if (all_of(NonConstants, [=](Value *V) {
@@ -10078,13 +10987,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
E->ReuseShuffleIndices, E->Scalars.size(),
[&](Value *&Vec, SmallVectorImpl<int> &Mask) {
TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
- Vec = ShuffleBuilder.gather(NonConstants, Vec);
+ Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
});
} else if (!allConstant(GatheredScalars)) {
// Gather unique scalars and all constants.
SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
- Value *BV = ShuffleBuilder.gather(GatheredScalars);
+ Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
ShuffleBuilder.add(BV, ReuseMask);
Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
} else {
@@ -10109,10 +11018,12 @@ Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
*this);
}
-Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
+Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
IRBuilder<>::InsertPointGuard Guard(Builder);
- if (E->VectorizedValue) {
+ if (E->VectorizedValue &&
+ (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
+ E->isAltShuffle())) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
return E->VectorizedValue;
}
@@ -10126,13 +11037,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return Vec;
}
- auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
+ auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy,
+ bool IsSigned) {
+ if (V->getType() != VecTy)
+ V = Builder.CreateIntCast(V, VecTy, IsSigned);
ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
if (E->getOpcode() == Instruction::Store) {
ArrayRef<int> Mask =
ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
E->ReorderIndices.size());
ShuffleBuilder.add(V, Mask);
+ } else if (E->State == TreeEntry::PossibleStridedVectorize) {
+ ShuffleBuilder.addOrdered(V, std::nullopt);
} else {
ShuffleBuilder.addOrdered(V, E->ReorderIndices);
}
@@ -10140,7 +11056,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
};
assert((E->State == TreeEntry::Vectorize ||
- E->State == TreeEntry::ScatterVectorize) &&
+ E->State == TreeEntry::ScatterVectorize ||
+ E->State == TreeEntry::PossibleStridedVectorize) &&
"Unhandled state");
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
@@ -10150,6 +11067,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ScalarTy = Store->getValueOperand()->getType();
else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
ScalarTy = IE->getOperand(1)->getType();
+ bool IsSigned = false;
+ auto It = MinBWs.find(E);
+ if (It != MinBWs.end()) {
+ ScalarTy = IntegerType::get(F->getContext(), It->second.first);
+ IsSigned = It->second.second;
+ }
auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
switch (ShuffleOrOp) {
case Instruction::PHI: {
@@ -10157,32 +11080,45 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
E != VectorizableTree.front().get() ||
!E->UserTreeIndices.empty()) &&
"PHI reordering is free.");
+ if (PostponedPHIs && E->VectorizedValue)
+ return E->VectorizedValue;
auto *PH = cast<PHINode>(VL0);
- Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
- Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
- Value *V = NewPhi;
-
- // Adjust insertion point once all PHI's have been generated.
- Builder.SetInsertPoint(&*PH->getParent()->getFirstInsertionPt());
+ Builder.SetInsertPoint(PH->getParent(),
+ PH->getParent()->getFirstNonPHIIt());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+ if (PostponedPHIs || !E->VectorizedValue) {
+ PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
+ E->PHI = NewPhi;
+ Value *V = NewPhi;
+
+ // Adjust insertion point once all PHI's have been generated.
+ Builder.SetInsertPoint(PH->getParent(),
+ PH->getParent()->getFirstInsertionPt());
+ Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
- E->VectorizedValue = V;
+ E->VectorizedValue = V;
+ if (PostponedPHIs)
+ return V;
+ }
+ PHINode *NewPhi = cast<PHINode>(E->PHI);
+ // If phi node is fully emitted - exit.
+ if (NewPhi->getNumIncomingValues() != 0)
+ return NewPhi;
// PHINodes may have multiple entries from the same block. We want to
// visit every block once.
SmallPtrSet<BasicBlock *, 4> VisitedBBs;
- for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
ValueList Operands;
- BasicBlock *IBB = PH->getIncomingBlock(i);
+ BasicBlock *IBB = PH->getIncomingBlock(I);
// Stop emission if all incoming values are generated.
if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
- return V;
+ return NewPhi;
}
if (!VisitedBBs.insert(IBB).second) {
@@ -10192,37 +11128,54 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- Value *Vec = vectorizeOperand(E, i);
+ Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
+ if (VecTy != Vec->getType()) {
+ assert(MinBWs.contains(getOperandEntry(E, I)) &&
+ "Expected item in MinBWs.");
+ Vec = Builder.CreateIntCast(Vec, VecTy, It->second.second);
+ }
NewPhi->addIncoming(Vec, IBB);
}
assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
"Invalid number of incoming values");
- return V;
+ return NewPhi;
}
case Instruction::ExtractElement: {
Value *V = E->getSingleOperand(0);
setInsertPointAfterBundle(E);
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
auto *LI = cast<LoadInst>(E->getSingleOperand(0));
Builder.SetInsertPoint(LI);
- auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
- Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+ Value *Ptr = LI->getPointerOperand();
LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
Value *NewV = propagateMetadata(V, E->Scalars);
- NewV = FinalShuffle(NewV, E);
+ NewV = FinalShuffle(NewV, E, VecTy, IsSigned);
E->VectorizedValue = NewV;
return NewV;
}
case Instruction::InsertElement: {
assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
- Value *V = vectorizeOperand(E, 1);
+ Value *V = vectorizeOperand(E, 1, PostponedPHIs);
+ ArrayRef<Value *> Op = E->getOperand(1);
+ Type *ScalarTy = Op.front()->getType();
+ if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
+ assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
+ std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
+ assert(Res.first > 0 && "Expected item in MinBWs.");
+ V = Builder.CreateIntCast(
+ V,
+ FixedVectorType::get(
+ ScalarTy,
+ cast<FixedVectorType>(V->getType())->getNumElements()),
+ Res.second);
+ }
// Create InsertVector shuffle if necessary
auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
@@ -10255,7 +11208,57 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Mask[InsertIdx - Offset] = I;
}
if (!IsIdentity || NumElts != NumScalars) {
- V = Builder.CreateShuffleVector(V, Mask);
+ Value *V2 = nullptr;
+ bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
+ SmallVector<int> InsertMask(Mask);
+ if (NumElts != NumScalars && Offset == 0) {
+ // Follow all insert element instructions from the current buildvector
+ // sequence.
+ InsertElementInst *Ins = cast<InsertElementInst>(VL0);
+ do {
+ std::optional<unsigned> InsertIdx = getInsertIndex(Ins);
+ if (!InsertIdx)
+ break;
+ if (InsertMask[*InsertIdx] == PoisonMaskElem)
+ InsertMask[*InsertIdx] = *InsertIdx;
+ if (!Ins->hasOneUse())
+ break;
+ Ins = dyn_cast_or_null<InsertElementInst>(
+ Ins->getUniqueUndroppableUser());
+ } while (Ins);
+ SmallBitVector UseMask =
+ buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
+ SmallBitVector IsFirstPoison =
+ isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
+ SmallBitVector IsFirstUndef =
+ isUndefVector(FirstInsert->getOperand(0), UseMask);
+ if (!IsFirstPoison.all()) {
+ unsigned Idx = 0;
+ for (unsigned I = 0; I < NumElts; I++) {
+ if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
+ IsFirstUndef.test(I)) {
+ if (IsVNonPoisonous) {
+ InsertMask[I] = I < NumScalars ? I : 0;
+ continue;
+ }
+ if (!V2)
+ V2 = UndefValue::get(V->getType());
+ if (Idx >= NumScalars)
+ Idx = NumScalars - 1;
+ InsertMask[I] = NumScalars + Idx;
+ ++Idx;
+ } else if (InsertMask[I] != PoisonMaskElem &&
+ Mask[I] == PoisonMaskElem) {
+ InsertMask[I] = PoisonMaskElem;
+ }
+ }
+ } else {
+ InsertMask = Mask;
+ }
+ }
+ if (!V2)
+ V2 = PoisonValue::get(V->getType());
+ V = Builder.CreateShuffleVector(V, V2, InsertMask);
if (auto *I = dyn_cast<Instruction>(V)) {
GatherShuffleExtractSeq.insert(I);
CSEBlocks.insert(I->getParent());
@@ -10274,15 +11277,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
NumElts != NumScalars) {
if (IsFirstUndef.all()) {
- if (!ShuffleVectorInst::isIdentityMask(InsertMask)) {
- SmallBitVector IsFirstPoison =
- isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
- if (!IsFirstPoison.all()) {
- for (unsigned I = 0; I < NumElts; I++) {
- if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
- InsertMask[I] = I + NumElts;
+ if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
+ SmallBitVector IsFirstPoison =
+ isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
+ if (!IsFirstPoison.all()) {
+ for (unsigned I = 0; I < NumElts; I++) {
+ if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
+ InsertMask[I] = I + NumElts;
+ }
}
- }
V = Builder.CreateShuffleVector(
V,
IsFirstPoison.all() ? PoisonValue::get(V->getType())
@@ -10330,15 +11333,36 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::BitCast: {
setInsertPointAfterBundle(E);
- Value *InVec = vectorizeOperand(E, 0);
+ Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
auto *CI = cast<CastInst>(VL0);
- Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
- V = FinalShuffle(V, E);
+ Instruction::CastOps VecOpcode = CI->getOpcode();
+ Type *SrcScalarTy = VL0->getOperand(0)->getType();
+ auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+ if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
+ (SrcIt != MinBWs.end() || It != MinBWs.end())) {
+ // Check if the values are candidates to demote.
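+        // E.g., a source narrowed to i16 with a destination narrowed to i8
+        // becomes a trunc of the narrowed vectors, while a destination wider
+        // than the narrowed source re-extends with sext or zext depending on
+        // the recorded signedness; equal widths degenerate to a bitcast.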
+ unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
+ if (SrcIt != MinBWs.end())
+ SrcBWSz = SrcIt->second.first;
+ unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
+ if (BWSz == SrcBWSz) {
+ VecOpcode = Instruction::BitCast;
+ } else if (BWSz < SrcBWSz) {
+ VecOpcode = Instruction::Trunc;
+ } else if (It != MinBWs.end()) {
+ assert(BWSz > SrcBWSz && "Invalid cast!");
+ VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
+ }
+ }
+ Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
+ ? InVec
+ : Builder.CreateCast(VecOpcode, InVec, VecTy);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -10348,21 +11372,30 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::ICmp: {
setInsertPointAfterBundle(E);
- Value *L = vectorizeOperand(E, 0);
+ Value *L = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
- Value *R = vectorizeOperand(E, 1);
+ Value *R = vectorizeOperand(E, 1, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
+ if (L->getType() != R->getType()) {
+ assert((MinBWs.contains(getOperandEntry(E, 0)) ||
+ MinBWs.contains(getOperandEntry(E, 1))) &&
+ "Expected item in MinBWs.");
+ L = Builder.CreateIntCast(L, VecTy, IsSigned);
+ R = Builder.CreateIntCast(R, VecTy, IsSigned);
+ }
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V = Builder.CreateCmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
- V = FinalShuffle(V, E);
+ // Do not cast for cmps.
+ VecTy = cast<FixedVectorType>(V->getType());
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -10371,24 +11404,31 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Select: {
setInsertPointAfterBundle(E);
- Value *Cond = vectorizeOperand(E, 0);
+ Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
- Value *True = vectorizeOperand(E, 1);
+ Value *True = vectorizeOperand(E, 1, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
- Value *False = vectorizeOperand(E, 2);
+ Value *False = vectorizeOperand(E, 2, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
+ if (True->getType() != False->getType()) {
+ assert((MinBWs.contains(getOperandEntry(E, 1)) ||
+ MinBWs.contains(getOperandEntry(E, 2))) &&
+ "Expected item in MinBWs.");
+ True = Builder.CreateIntCast(True, VecTy, IsSigned);
+ False = Builder.CreateIntCast(False, VecTy, IsSigned);
+ }
Value *V = Builder.CreateSelect(Cond, True, False);
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -10397,7 +11437,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::FNeg: {
setInsertPointAfterBundle(E);
- Value *Op = vectorizeOperand(E, 0);
+ Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -10410,7 +11450,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -10437,16 +11477,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Xor: {
setInsertPointAfterBundle(E);
- Value *LHS = vectorizeOperand(E, 0);
+ Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
- Value *RHS = vectorizeOperand(E, 1);
+ Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
+ if (LHS->getType() != RHS->getType()) {
+ assert((MinBWs.contains(getOperandEntry(E, 0)) ||
+ MinBWs.contains(getOperandEntry(E, 1))) &&
+ "Expected item in MinBWs.");
+ LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned);
+ RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned);
+ }
Value *V = Builder.CreateBinOp(
static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
@@ -10455,7 +11502,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -10476,14 +11523,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// The pointer operand uses an in-tree scalar so we add the new
// LoadInst to ExternalUses list to make sure that an extract will
// be generated in the future.
- if (TreeEntry *Entry = getTreeEntry(PO)) {
- // Find which lane we need to extract.
- unsigned FoundLane = Entry->findLaneForValue(PO);
- ExternalUses.emplace_back(PO, NewLI, FoundLane);
+ if (isa<Instruction>(PO)) {
+ if (TreeEntry *Entry = getTreeEntry(PO)) {
+ // Find which lane we need to extract.
+ unsigned FoundLane = Entry->findLaneForValue(PO);
+ ExternalUses.emplace_back(PO, NewLI, FoundLane);
+ }
}
} else {
- assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
- Value *VecPtr = vectorizeOperand(E, 0);
+ assert((E->State == TreeEntry::ScatterVectorize ||
+ E->State == TreeEntry::PossibleStridedVectorize) &&
+ "Unhandled state");
+ Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
@@ -10497,35 +11548,32 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *V = propagateMetadata(NewLI, E->Scalars);
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::Store: {
auto *SI = cast<StoreInst>(VL0);
- unsigned AS = SI->getPointerAddressSpace();
setInsertPointAfterBundle(E);
- Value *VecValue = vectorizeOperand(E, 0);
- VecValue = FinalShuffle(VecValue, E);
+ Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
+ VecValue = FinalShuffle(VecValue, E, VecTy, IsSigned);
- Value *ScalarPtr = SI->getPointerOperand();
- Value *VecPtr = Builder.CreateBitCast(
- ScalarPtr, VecValue->getType()->getPointerTo(AS));
+ Value *Ptr = SI->getPointerOperand();
StoreInst *ST =
- Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign());
+ Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
- // The pointer operand uses an in-tree scalar, so add the new BitCast or
- // StoreInst to ExternalUses to make sure that an extract will be
- // generated in the future.
- if (TreeEntry *Entry = getTreeEntry(ScalarPtr)) {
- // Find which lane we need to extract.
- unsigned FoundLane = Entry->findLaneForValue(ScalarPtr);
- ExternalUses.push_back(ExternalUser(
- ScalarPtr, ScalarPtr != VecPtr ? cast<User>(VecPtr) : ST,
- FoundLane));
+ // The pointer operand uses an in-tree scalar, so add the new StoreInst to
+ // ExternalUses to make sure that an extract will be generated in the
+ // future.
+ if (isa<Instruction>(Ptr)) {
+ if (TreeEntry *Entry = getTreeEntry(Ptr)) {
+ // Find which lane we need to extract.
+ unsigned FoundLane = Entry->findLaneForValue(Ptr);
+ ExternalUses.push_back(ExternalUser(Ptr, ST, FoundLane));
+ }
}
Value *V = propagateMetadata(ST, E->Scalars);
@@ -10538,7 +11586,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
auto *GEP0 = cast<GetElementPtrInst>(VL0);
setInsertPointAfterBundle(E);
- Value *Op0 = vectorizeOperand(E, 0);
+ Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
@@ -10546,7 +11594,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<Value *> OpVecs;
for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
- Value *OpVec = vectorizeOperand(E, J);
+ Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
@@ -10564,7 +11612,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
V = propagateMetadata(I, GEPs);
}
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -10586,41 +11634,42 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
VecCallCosts.first <= VecCallCosts.second;
Value *ScalarArg = nullptr;
- std::vector<Value *> OpVecs;
+ SmallVector<Value *> OpVecs;
SmallVector<Type *, 2> TysForDecl;
// Add return type if intrinsic is overloaded on it.
if (isVectorIntrinsicWithOverloadTypeAtArg(IID, -1))
TysForDecl.push_back(
FixedVectorType::get(CI->getType(), E->Scalars.size()));
- for (int j = 0, e = CI->arg_size(); j < e; ++j) {
+ for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
- if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) {
+ if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, I)) {
CallInst *CEI = cast<CallInst>(VL0);
- ScalarArg = CEI->getArgOperand(j);
- OpVecs.push_back(CEI->getArgOperand(j));
- if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j))
+ ScalarArg = CEI->getArgOperand(I);
+ OpVecs.push_back(CEI->getArgOperand(I));
+ if (isVectorIntrinsicWithOverloadTypeAtArg(IID, I))
TysForDecl.push_back(ScalarArg->getType());
continue;
}
- Value *OpVec = vectorizeOperand(E, j);
+ Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
- LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
- if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j))
+ if (isVectorIntrinsicWithOverloadTypeAtArg(IID, I))
TysForDecl.push_back(OpVec->getType());
}
Function *CF;
if (!UseIntrinsic) {
VFShape Shape =
- VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
- VecTy->getNumElements())),
+ VFShape::get(CI->getFunctionType(),
+ ElementCount::getFixed(
+ static_cast<unsigned>(VecTy->getNumElements())),
false /*HasGlobalPred*/);
CF = VFDatabase(*CI).getVectorizedFunction(Shape);
} else {
@@ -10634,7 +11683,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// The scalar argument uses an in-tree scalar so we add the new vectorized
// call to ExternalUses list to make sure that an extract will be
// generated in the future.
- if (ScalarArg) {
+ if (isa_and_present<Instruction>(ScalarArg)) {
if (TreeEntry *Entry = getTreeEntry(ScalarArg)) {
// Find which lane we need to extract.
unsigned FoundLane = Entry->findLaneForValue(ScalarArg);
@@ -10644,7 +11693,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
propagateIRFlags(V, E->Scalars, VL0);
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -10662,20 +11711,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *LHS = nullptr, *RHS = nullptr;
if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
setInsertPointAfterBundle(E);
- LHS = vectorizeOperand(E, 0);
+ LHS = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
- RHS = vectorizeOperand(E, 1);
+ RHS = vectorizeOperand(E, 1, PostponedPHIs);
} else {
setInsertPointAfterBundle(E);
- LHS = vectorizeOperand(E, 0);
+ LHS = vectorizeOperand(E, 0, PostponedPHIs);
}
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
}
+ if (LHS && RHS && LHS->getType() != RHS->getType()) {
+ assert((MinBWs.contains(getOperandEntry(E, 0)) ||
+ MinBWs.contains(getOperandEntry(E, 1))) &&
+ "Expected item in MinBWs.");
+ LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned);
+ RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned);
+ }
Value *V0, *V1;
if (Instruction::isBinaryOp(E->getOpcode())) {
@@ -10708,8 +11764,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// each vector operation.
ValueList OpScalars, AltScalars;
SmallVector<int> Mask;
- buildShuffleEntryMask(
- E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
+ E->buildAltOpShuffleMask(
[E, this](Instruction *I) {
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
@@ -10727,6 +11782,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
CSEBlocks.insert(I->getParent());
}
+ if (V->getType() != VecTy && !isa<CmpInst>(VL0))
+ V = Builder.CreateIntCast(
+ V, FixedVectorType::get(ScalarTy, E->getVectorFactor()), IsSigned);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -10767,9 +11825,19 @@ Value *BoUpSLP::vectorizeTree(
// need to rebuild it.
EntryToLastInstruction.clear();
- Builder.SetInsertPoint(ReductionRoot ? ReductionRoot
- : &F->getEntryBlock().front());
- auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
+ if (ReductionRoot)
+ Builder.SetInsertPoint(ReductionRoot->getParent(),
+ ReductionRoot->getIterator());
+ else
+ Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
+
+  // Postpone emission of PHI operands to avoid cyclic dependency issues.
+ (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
+ for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
+ if (TE->State == TreeEntry::Vectorize &&
+ TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
+ TE->VectorizedValue)
+ (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
// Run through the list of postponed gathers and emit them, replacing the temp
// emitted allocas with actual vector instructions.
ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
@@ -10786,9 +11854,32 @@ Value *BoUpSLP::vectorizeTree(
TE->VectorizedValue = nullptr;
auto *UserI =
cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
- Builder.SetInsertPoint(PrevVec);
+  // If the user is a PHI node, its vector code has to be inserted right
+  // before the block terminator. Since the node was delayed, there were some
+  // unresolved dependencies at the moment when the stub instruction was
+  // emitted. If any of these dependencies turns out to be an operand of
+  // another PHI coming from this same block, the position of the stub
+  // instruction becomes invalid. This is because the source vector that is
+  // supposed to feed this gather node was inserted at the end of the block
+  // [after the stub instruction]. So we need to adjust the insertion point
+  // again to the end of the block.
+ if (isa<PHINode>(UserI)) {
+ // Insert before all users.
+ Instruction *InsertPt = PrevVec->getParent()->getTerminator();
+ for (User *U : PrevVec->users()) {
+ if (U == UserI)
+ continue;
+ auto *UI = dyn_cast<Instruction>(U);
+ if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
+ continue;
+ if (UI->comesBefore(InsertPt))
+ InsertPt = UI;
+ }
+ Builder.SetInsertPoint(InsertPt);
+ } else {
+ Builder.SetInsertPoint(PrevVec);
+ }
Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
- Value *Vec = vectorizeTree(TE);
+ Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
PrevVec->replaceAllUsesWith(Vec);
PostponedValues.try_emplace(Vec).first->second.push_back(TE);
// Replace the stub vector node, if it was used before for one of the
@@ -10801,26 +11892,6 @@ Value *BoUpSLP::vectorizeTree(
eraseInstruction(PrevVec);
}
- // If the vectorized tree can be rewritten in a smaller type, we truncate the
- // vectorized root. InstCombine will then rewrite the entire expression. We
- // sign extend the extracted values below.
- auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
- if (MinBWs.count(ScalarRoot)) {
- if (auto *I = dyn_cast<Instruction>(VectorRoot)) {
- // If current instr is a phi and not the last phi, insert it after the
- // last phi node.
- if (isa<PHINode>(I))
- Builder.SetInsertPoint(&*I->getParent()->getFirstInsertionPt());
- else
- Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
- }
- auto BundleWidth = VectorizableTree[0]->Scalars.size();
- auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
- auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
- auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
- VectorizableTree[0]->VectorizedValue = Trunc;
- }
-
LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
<< " values .\n");
@@ -10830,6 +11901,8 @@ Value *BoUpSLP::vectorizeTree(
// Maps extract Scalar to the corresponding extractelement instruction in the
// basic block. Only one extractelement per block should be emitted.
DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
+ SmallDenseSet<Value *, 4> UsedInserts;
+ DenseMap<Value *, Value *> VectorCasts;
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
@@ -10864,7 +11937,8 @@ Value *BoUpSLP::vectorizeTree(
Instruction *I = EEIt->second;
if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
Builder.GetInsertPoint()->comesBefore(I))
- I->moveBefore(&*Builder.GetInsertPoint());
+ I->moveBefore(*Builder.GetInsertPoint()->getParent(),
+ Builder.GetInsertPoint());
Ex = I;
}
}
@@ -10887,11 +11961,10 @@ Value *BoUpSLP::vectorizeTree(
}
// If necessary, sign-extend or zero-extend ScalarRoot
// to the larger type.
- if (!MinBWs.count(ScalarRoot))
- return Ex;
- if (MinBWs[ScalarRoot].second)
- return Builder.CreateSExt(Ex, Scalar->getType());
- return Builder.CreateZExt(Ex, Scalar->getType());
+ if (Scalar->getType() != Ex->getType())
+ return Builder.CreateIntCast(Ex, Scalar->getType(),
+ MinBWs.find(E)->second.second);
+ return Ex;
}
assert(isa<FixedVectorType>(Scalar->getType()) &&
isa<InsertElementInst>(Scalar) &&
@@ -10909,12 +11982,13 @@ Value *BoUpSLP::vectorizeTree(
"ExternallyUsedValues map");
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
if (auto *PHI = dyn_cast<PHINode>(VecI))
- Builder.SetInsertPoint(PHI->getParent()->getFirstNonPHI());
+ Builder.SetInsertPoint(PHI->getParent(),
+ PHI->getParent()->getFirstNonPHIIt());
else
Builder.SetInsertPoint(VecI->getParent(),
std::next(VecI->getIterator()));
} else {
- Builder.SetInsertPoint(&F->getEntryBlock().front());
+ Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
}
Value *NewInst = ExtractAndExtendIfNeeded(Vec);
// Required to update internally referenced instructions.
@@ -10927,12 +12001,26 @@ Value *BoUpSLP::vectorizeTree(
// Skip if the scalar is another vector op or Vec is not an instruction.
if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
+ if (!UsedInserts.insert(VU).second)
+ continue;
+          // Need to use the original vector if the root is truncated.
+ auto BWIt = MinBWs.find(E);
+ if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
+ auto VecIt = VectorCasts.find(Scalar);
+ if (VecIt == VectorCasts.end()) {
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ if (auto *IVec = dyn_cast<Instruction>(Vec))
+ Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
+ Vec = Builder.CreateIntCast(Vec, VU->getType(),
+ BWIt->second.second);
+ VectorCasts.try_emplace(Scalar, Vec);
+ } else {
+ Vec = VecIt->second;
+ }
+ }
+
std::optional<unsigned> InsertIdx = getInsertIndex(VU);
if (InsertIdx) {
- // Need to use original vector, if the root is truncated.
- if (MinBWs.count(Scalar) &&
- VectorizableTree[0]->VectorizedValue == Vec)
- Vec = VectorRoot;
auto *It =
find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
// Checks if 2 insertelements are from the same buildvector.
@@ -10992,18 +12080,18 @@ Value *BoUpSLP::vectorizeTree(
// Find the insertion point for the extractelement lane.
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
if (PHINode *PH = dyn_cast<PHINode>(User)) {
- for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
- if (PH->getIncomingValue(i) == Scalar) {
+ for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
+ if (PH->getIncomingValue(I) == Scalar) {
Instruction *IncomingTerminator =
- PH->getIncomingBlock(i)->getTerminator();
+ PH->getIncomingBlock(I)->getTerminator();
if (isa<CatchSwitchInst>(IncomingTerminator)) {
Builder.SetInsertPoint(VecI->getParent(),
std::next(VecI->getIterator()));
} else {
- Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
+ Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
}
Value *NewInst = ExtractAndExtendIfNeeded(Vec);
- PH->setOperand(i, NewInst);
+ PH->setOperand(I, NewInst);
}
}
} else {
@@ -11012,7 +12100,7 @@ Value *BoUpSLP::vectorizeTree(
User->replaceUsesOfWith(Scalar, NewInst);
}
} else {
- Builder.SetInsertPoint(&F->getEntryBlock().front());
+ Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
Value *NewInst = ExtractAndExtendIfNeeded(Vec);
User->replaceUsesOfWith(Scalar, NewInst);
}
@@ -11085,7 +12173,7 @@ Value *BoUpSLP::vectorizeTree(
// non-resizing mask.
if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
->getNumElements() ||
- !ShuffleVectorInst::isIdentityMask(Mask))
+ !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
return CreateShuffle(Vals.front(), nullptr, Mask);
return Vals.front();
}
@@ -11676,7 +12764,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
}
- auto makeControlDependent = [&](Instruction *I) {
+ auto MakeControlDependent = [&](Instruction *I) {
auto *DepDest = getScheduleData(I);
assert(DepDest && "must be in schedule window");
DepDest->ControlDependencies.push_back(BundleMember);
@@ -11698,7 +12786,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
continue;
// Add the dependency
- makeControlDependent(I);
+ MakeControlDependent(I);
if (!isGuaranteedToTransferExecutionToSuccessor(I))
// Everything past here must be control dependent on I.
@@ -11724,7 +12812,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
continue;
// Add the dependency
- makeControlDependent(I);
+ MakeControlDependent(I);
}
}
@@ -11742,7 +12830,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
continue;
// Add the dependency
- makeControlDependent(I);
+ MakeControlDependent(I);
break;
}
}
@@ -11757,7 +12845,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
"NextLoadStore list for non memory effecting bundle?");
MemoryLocation SrcLoc = getLocation(SrcInst);
bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
- unsigned numAliased = 0;
+ unsigned NumAliased = 0;
unsigned DistToSrc = 1;
for (; DepDest; DepDest = DepDest->NextLoadStore) {
@@ -11772,13 +12860,13 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
// check this limit even between two read-only instructions.
if (DistToSrc >= MaxMemDepDistance ||
((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
- (numAliased >= AliasedCheckLimit ||
+ (NumAliased >= AliasedCheckLimit ||
SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
// We increment the counter only if the locations are aliased
// (instead of counting all alias checks). This gives a better
// balance between reduced runtime and accurate dependencies.
- numAliased++;
+ NumAliased++;
DepDest->MemoryDependencies.push_back(BundleMember);
BundleMember->Dependencies++;
@@ -11880,20 +12968,20 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
// Do the "real" scheduling.
while (!ReadyInsts.empty()) {
- ScheduleData *picked = *ReadyInsts.begin();
+ ScheduleData *Picked = *ReadyInsts.begin();
ReadyInsts.erase(ReadyInsts.begin());
// Move the scheduled instruction(s) to their dedicated places, if not
// there yet.
- for (ScheduleData *BundleMember = picked; BundleMember;
+ for (ScheduleData *BundleMember = Picked; BundleMember;
BundleMember = BundleMember->NextInBundle) {
- Instruction *pickedInst = BundleMember->Inst;
- if (pickedInst->getNextNode() != LastScheduledInst)
- pickedInst->moveBefore(LastScheduledInst);
- LastScheduledInst = pickedInst;
+ Instruction *PickedInst = BundleMember->Inst;
+ if (PickedInst->getNextNode() != LastScheduledInst)
+ PickedInst->moveBefore(LastScheduledInst);
+ LastScheduledInst = PickedInst;
}
- BS->schedule(picked, ReadyInsts);
+ BS->schedule(Picked, ReadyInsts);
}
// Check that we didn't break any of our invariants.
@@ -11994,21 +13082,22 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
// Determine if a value V in a vectorizable expression Expr can be demoted to a
// smaller type with a truncation. We collect the values that will be demoted
// in ToDemote and additional roots that require investigating in Roots.
-static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
- SmallVectorImpl<Value *> &ToDemote,
- SmallVectorImpl<Value *> &Roots) {
+bool BoUpSLP::collectValuesToDemote(
+ Value *V, SmallVectorImpl<Value *> &ToDemote,
+ DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
+ SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const {
// We can always demote constants.
- if (isa<Constant>(V)) {
- ToDemote.push_back(V);
+ if (isa<Constant>(V))
return true;
- }
- // If the value is not an instruction in the expression with only one use, it
- // cannot be demoted.
+ // If the value is not a vectorized instruction in the expression with only
+ // one use, it cannot be demoted.
auto *I = dyn_cast<Instruction>(V);
- if (!I || !I->hasOneUse() || !Expr.count(I))
+ if (!I || !I->hasOneUse() || !getTreeEntry(I) || !Visited.insert(I).second)
return false;
+ unsigned Start = 0;
+ unsigned End = I->getNumOperands();
switch (I->getOpcode()) {
// We can always demote truncations and extensions. Since truncations can
@@ -12030,16 +13119,21 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
- !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
+ if (!collectValuesToDemote(I->getOperand(0), ToDemote, DemotedConsts, Roots,
+ Visited) ||
+ !collectValuesToDemote(I->getOperand(1), ToDemote, DemotedConsts, Roots,
+ Visited))
return false;
break;
// We can demote selects if we can demote their true and false values.
case Instruction::Select: {
+ Start = 1;
SelectInst *SI = cast<SelectInst>(I);
- if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
- !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
+ if (!collectValuesToDemote(SI->getTrueValue(), ToDemote, DemotedConsts,
+ Roots, Visited) ||
+ !collectValuesToDemote(SI->getFalseValue(), ToDemote, DemotedConsts,
+ Roots, Visited))
return false;
break;
}
@@ -12049,7 +13143,8 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
case Instruction::PHI: {
PHINode *PN = cast<PHINode>(I);
for (Value *IncValue : PN->incoming_values())
- if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
+ if (!collectValuesToDemote(IncValue, ToDemote, DemotedConsts, Roots,
+ Visited))
return false;
break;
}
@@ -12059,6 +13154,10 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
return false;
}
+ // Gather demoted constant operands.
+ for (unsigned Idx : seq<unsigned>(Start, End))
+ if (isa<Constant>(I->getOperand(Idx)))
+ DemotedConsts.try_emplace(I).first->getSecond().push_back(Idx);
// Record the value that we can demote.
ToDemote.push_back(V);
return true;
@@ -12076,44 +13175,26 @@ void BoUpSLP::computeMinimumValueSizes() {
if (!TreeRootIT)
return;
- // If the expression is not rooted by a store, these roots should have
- // external uses. We will rely on InstCombine to rewrite the expression in
- // the narrower type. However, InstCombine only rewrites single-use values.
- // This means that if a tree entry other than a root is used externally, it
- // must have multiple uses and InstCombine will not rewrite it. The code
- // below ensures that only the roots are used externally.
- SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
- for (auto &EU : ExternalUses)
- if (!Expr.erase(EU.Scalar))
- return;
- if (!Expr.empty())
+ // Ensure the roots of the vectorizable tree don't form a cycle.
+ if (!VectorizableTree.front()->UserTreeIndices.empty())
return;
- // Collect the scalar values of the vectorizable expression. We will use this
- // context to determine which values can be demoted. If we see a truncation,
- // we mark it as seeding another demotion.
- for (auto &EntryPtr : VectorizableTree)
- Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());
-
- // Ensure the roots of the vectorizable tree don't form a cycle. They must
- // have a single external user that is not in the vectorizable tree.
- for (auto *Root : TreeRoot)
- if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
- return;
-
// Conservatively determine if we can actually truncate the roots of the
// expression. Collect the values that can be demoted in ToDemote and
// additional roots that require investigating in Roots.
SmallVector<Value *, 32> ToDemote;
+ DenseMap<Instruction *, SmallVector<unsigned>> DemotedConsts;
SmallVector<Value *, 4> Roots;
- for (auto *Root : TreeRoot)
- if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
+ for (auto *Root : TreeRoot) {
+ DenseSet<Value *> Visited;
+ if (!collectValuesToDemote(Root, ToDemote, DemotedConsts, Roots, Visited))
return;
+ }
// The maximum bit width required to represent all the values that can be
// demoted without loss of precision. It would be safe to truncate the roots
// of the expression to this width.
- auto MaxBitWidth = 8u;
+ auto MaxBitWidth = 1u;
// We first check if all the bits of the roots are demanded. If they're not,
// we can truncate the roots to this narrower type.
@@ -12138,9 +13219,9 @@ void BoUpSLP::computeMinimumValueSizes() {
// maximum bit width required to store the scalar by using ValueTracking to
// compute the number of high-order bits we can truncate.
if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
- llvm::all_of(TreeRoot, [](Value *R) {
- assert(R->hasOneUse() && "Root should have only one use!");
- return isa<GetElementPtrInst>(R->user_back());
+ all_of(TreeRoot, [](Value *V) {
+ return all_of(V->users(),
+ [](User *U) { return isa<GetElementPtrInst>(U); });
})) {
MaxBitWidth = 8u;
@@ -12189,12 +13270,39 @@ void BoUpSLP::computeMinimumValueSizes() {
// If we can truncate the root, we must collect additional values that might
// be demoted as a result. That is, those seeded by truncations we will
// modify.
- while (!Roots.empty())
- collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
+ while (!Roots.empty()) {
+ DenseSet<Value *> Visited;
+ collectValuesToDemote(Roots.pop_back_val(), ToDemote, DemotedConsts, Roots,
+ Visited);
+ }
// Finally, map the values we can demote to the maximum bit width we computed.
- for (auto *Scalar : ToDemote)
- MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
+ for (auto *Scalar : ToDemote) {
+ auto *TE = getTreeEntry(Scalar);
+ assert(TE && "Expected vectorized scalar.");
+ if (MinBWs.contains(TE))
+ continue;
+ bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
+ KnownBits Known = computeKnownBits(R, *DL);
+ return !Known.isNonNegative();
+ });
+ MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
+ const auto *I = cast<Instruction>(Scalar);
+ auto DCIt = DemotedConsts.find(I);
+ if (DCIt != DemotedConsts.end()) {
+ for (unsigned Idx : DCIt->getSecond()) {
+        // Check that this constant operand is demoted in all the scalars.
+ if (all_of(TE->Scalars, [&](Value *V) {
+ auto SIt = DemotedConsts.find(cast<Instruction>(V));
+ return SIt != DemotedConsts.end() &&
+ is_contained(SIt->getSecond(), Idx);
+ })) {
+ const TreeEntry *CTE = getOperandEntry(TE, Idx);
+ MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned);
+ }
+ }
+ }
+ }
}
PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
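The rewritten computeMinimumValueSizes() keys MinBWs by tree entry and records, for each demotable entry, the narrowest bit width together with a signedness flag derived from known-bits analysis. The following is a minimal standalone sketch of that bookkeeping only, assuming plain int64_t values stand in for the scalars of an entry and a simple sign check stands in for computeKnownBits(); the helper name is illustrative, not part of the patch.

#include <algorithm>
#include <bit>
#include <cstdint>
#include <utility>
#include <vector>

// Returns the pair an entry would get in MinBWs: the narrowest common bit
// width and whether widening back requires a signed (sext) rather than an
// unsigned (zext) cast.
std::pair<unsigned, bool> minBitWidthFor(const std::vector<int64_t> &Scalars) {
  bool IsSigned = std::any_of(Scalars.begin(), Scalars.end(),
                              [](int64_t V) { return V < 0; });
  unsigned MaxBitWidth = 1;
  for (int64_t V : Scalars) {
    // Magnitude bits of the two's-complement value; negatives are measured
    // through their bitwise complement, plus one bit when the entry is signed.
    uint64_t Mag = V < 0 ? ~static_cast<uint64_t>(V) : static_cast<uint64_t>(V);
    unsigned Width = std::bit_width(Mag) + (IsSigned ? 1u : 0u);
    MaxBitWidth = std::max(MaxBitWidth, Width);
  }
  return {MaxBitWidth, IsSigned};
}

// minBitWidthFor({100, 27, 3}) == {7, false} -> zero-extend when widening.
// minBitWidthFor({100, -1, 3}) == {8, true}  -> sign-extend when widening.

The real pass additionally rounds the chosen width up and cross-checks it with DemandedBits/ValueTracking before committing; the sketch only shows the per-entry (width, signedness) pair that the CreateIntCast calls added elsewhere in this patch consume.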
@@ -12348,139 +13456,206 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
BoUpSLP::ValueSet VectorizedStores;
bool Changed = false;
- int E = Stores.size();
- SmallBitVector Tails(E, false);
- int MaxIter = MaxStoreLookup.getValue();
- SmallVector<std::pair<int, int>, 16> ConsecutiveChain(
- E, std::make_pair(E, INT_MAX));
- SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false));
- int IterCnt;
- auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
- &CheckedPairs,
- &ConsecutiveChain](int K, int Idx) {
- if (IterCnt >= MaxIter)
- return true;
- if (CheckedPairs[Idx].test(K))
- return ConsecutiveChain[K].second == 1 &&
- ConsecutiveChain[K].first == Idx;
- ++IterCnt;
- CheckedPairs[Idx].set(K);
- CheckedPairs[K].set(Idx);
- std::optional<int> Diff = getPointersDiff(
- Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(),
- Stores[Idx]->getValueOperand()->getType(),
- Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true);
- if (!Diff || *Diff == 0)
- return false;
- int Val = *Diff;
- if (Val < 0) {
- if (ConsecutiveChain[Idx].second > -Val) {
- Tails.set(K);
- ConsecutiveChain[Idx] = std::make_pair(K, -Val);
- }
- return false;
+  // Stores the pairs of stores (first_store, last_store) in a range for which
+  // vectorization was already attempted. This allows skipping store ranges
+  // whose earlier vectorization attempts were unsuccessful.
+ DenseSet<std::pair<Value *, Value *>> TriedSequences;
+ struct StoreDistCompare {
+ bool operator()(const std::pair<unsigned, int> &Op1,
+ const std::pair<unsigned, int> &Op2) const {
+ return Op1.second < Op2.second;
}
- if (ConsecutiveChain[K].second <= Val)
- return false;
-
- Tails.set(Idx);
- ConsecutiveChain[K] = std::make_pair(Idx, Val);
- return Val == 1;
};
- // Do a quadratic search on all of the given stores in reverse order and find
- // all of the pairs of stores that follow each other.
- for (int Idx = E - 1; Idx >= 0; --Idx) {
- // If a store has multiple consecutive store candidates, search according
- // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
- // This is because usually pairing with immediate succeeding or preceding
- // candidate create the best chance to find slp vectorization opportunity.
- const int MaxLookDepth = std::max(E - Idx, Idx + 1);
- IterCnt = 0;
- for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset)
- if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
- (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
- break;
- }
-
- // Tracks if we tried to vectorize stores starting from the given tail
- // already.
- SmallBitVector TriedTails(E, false);
- // For stores that start but don't end a link in the chain:
- for (int Cnt = E; Cnt > 0; --Cnt) {
- int I = Cnt - 1;
- if (ConsecutiveChain[I].first == E || Tails.test(I))
- continue;
- // We found a store instr that starts a chain. Now follow the chain and try
- // to vectorize it.
+  // A set of pairs (index of the store in the Stores array, distance of the
+  // store address relative to the base store address, in units of the stored
+  // value type).
+ using StoreIndexToDistSet =
+ std::set<std::pair<unsigned, int>, StoreDistCompare>;
+ auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
+ int PrevDist = -1;
BoUpSLP::ValueList Operands;
// Collect the chain into a list.
- while (I != E && !VectorizedStores.count(Stores[I])) {
- Operands.push_back(Stores[I]);
- Tails.set(I);
- if (ConsecutiveChain[I].second != 1) {
- // Mark the new end in the chain and go back, if required. It might be
- // required if the original stores come in reversed order, for example.
- if (ConsecutiveChain[I].first != E &&
- Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) &&
- !VectorizedStores.count(Stores[ConsecutiveChain[I].first])) {
- TriedTails.set(I);
- Tails.reset(ConsecutiveChain[I].first);
- if (Cnt < ConsecutiveChain[I].first + 2)
- Cnt = ConsecutiveChain[I].first + 2;
+ for (auto [Idx, Data] : enumerate(Set)) {
+ if (Operands.empty() || Data.second - PrevDist == 1) {
+ Operands.push_back(Stores[Data.first]);
+ PrevDist = Data.second;
+ if (Idx != Set.size() - 1)
+ continue;
+ }
+ if (Operands.size() <= 1) {
+ Operands.clear();
+ Operands.push_back(Stores[Data.first]);
+ PrevDist = Data.second;
+ continue;
+ }
+
+ unsigned MaxVecRegSize = R.getMaxVecRegSize();
+ unsigned EltSize = R.getVectorElementSize(Operands[0]);
+ unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
+
+ unsigned MaxVF =
+ std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
+ auto *Store = cast<StoreInst>(Operands[0]);
+ Type *StoreTy = Store->getValueOperand()->getType();
+ Type *ValueTy = StoreTy;
+ if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
+ ValueTy = Trunc->getSrcTy();
+ unsigned MinVF = TTI->getStoreMinimumVF(
+ R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
+
+ if (MaxVF <= MinVF) {
+ LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
+ << ") <= "
+ << "MinVF (" << MinVF << ")\n");
+ }
+
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
+ unsigned StartIdx = 0;
+ for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
+ for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
+ ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
+ assert(
+ all_of(
+ Slice,
+ [&](Value *V) {
+ return cast<StoreInst>(V)->getValueOperand()->getType() ==
+ cast<StoreInst>(Slice.front())
+ ->getValueOperand()
+ ->getType();
+ }) &&
+ "Expected all operands of same type.");
+ if (!VectorizedStores.count(Slice.front()) &&
+ !VectorizedStores.count(Slice.back()) &&
+ TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
+ .second &&
+ vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
+ // Mark the vectorized stores so that we don't vectorize them again.
+ VectorizedStores.insert(Slice.begin(), Slice.end());
+ Changed = true;
+ // If we vectorized initial block, no need to try to vectorize it
+ // again.
+ if (Cnt == StartIdx)
+ StartIdx += Size;
+ Cnt += Size;
+ continue;
+ }
+ ++Cnt;
}
- break;
+ // Check if the whole array was vectorized already - exit.
+ if (StartIdx >= Operands.size())
+ break;
}
- // Move to the next value in the chain.
- I = ConsecutiveChain[I].first;
+ Operands.clear();
+ Operands.push_back(Stores[Data.first]);
+ PrevDist = Data.second;
}
- assert(!Operands.empty() && "Expected non-empty list of stores.");
+ };
- unsigned MaxVecRegSize = R.getMaxVecRegSize();
- unsigned EltSize = R.getVectorElementSize(Operands[0]);
- unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
-
- unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
- MaxElts);
- auto *Store = cast<StoreInst>(Operands[0]);
- Type *StoreTy = Store->getValueOperand()->getType();
- Type *ValueTy = StoreTy;
- if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
- ValueTy = Trunc->getSrcTy();
- unsigned MinVF = TTI->getStoreMinimumVF(
- R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
-
- if (MaxVF <= MinVF) {
- LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF << ") <= "
- << "MinVF (" << MinVF << ")\n");
- }
-
- // FIXME: Is division-by-2 the correct step? Should we assert that the
- // register size is a power-of-2?
- unsigned StartIdx = 0;
- for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
- for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
- ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
- if (!VectorizedStores.count(Slice.front()) &&
- !VectorizedStores.count(Slice.back()) &&
- vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
- // Mark the vectorized stores so that we don't vectorize them again.
- VectorizedStores.insert(Slice.begin(), Slice.end());
- Changed = true;
- // If we vectorized initial block, no need to try to vectorize it
- // again.
- if (Cnt == StartIdx)
- StartIdx += Size;
- Cnt += Size;
- continue;
- }
- ++Cnt;
+  // Stores pairs (first: index of the store in the Stores array whose
+  // address is taken as the base, second: sorted set of {index, dist} pairs,
+  // i.e. the indices of stores in the set and their store-location distances
+  // relative to the base address).
+
+  // The index of the very first store needs to be kept separately, since the
+  // set may be reordered after insertion and the first store may move. This
+  // container reduces the number of calls to the getPointersDiff() function.
+ SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
+  // Inserts the specified store SI with the given index Idx into the set of
+  // stores. If a store with the same distance is already present, stop the
+  // insertion and try to vectorize the stores found so far. If some stores
+  // from this sequence were not vectorized, try to vectorize them together
+  // with the new store later. This logic is applied only to the stores that
+  // come before the previous store with the same distance.
+ // Example:
+ // 1. store x, %p
+ // 2. store y, %p+1
+ // 3. store z, %p+2
+ // 4. store a, %p
+ // 5. store b, %p+3
+  // - Scan this from the last store to the first. The very first bunch of
+  //  stores is {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in the
+  //  SortedStores vector).
+  // - The next store in the list - #1 - has the same distance from store #5
+  //  as store #4.
+  // - Try to vectorize the sequence of stores 4, 2, 3, 5.
+  // - If all these stores are vectorized - just drop them.
+  // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
+  // - Start a new store sequence.
+  //  The new bunch of stores is {1, {1, 0}}.
+  // - Add the stores from the previous sequence that were not vectorized.
+  //  Here we consider the stores in reversed order rather than in the order
+  //  they are used in the IR (Stores is already reversed, see the
+  //  vectorizeStoreChains() function).
+  //  Store #3 can be added -> it comes after store #4 with the same distance
+  //  as store #1.
+  //  Store #5 cannot be added - it comes before store #4.
+  // This logic improves compile time: we assume that the stores coming after
+  // the previous store with the same distance most likely have memory
+  // dependencies, so there is no need to waste compile time trying to
+  // vectorize them.
+  // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
+ auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
+ for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
+ std::optional<int> Diff = getPointersDiff(
+ Stores[Set.first]->getValueOperand()->getType(),
+ Stores[Set.first]->getPointerOperand(),
+ SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
+ /*StrictCheck=*/true);
+ if (!Diff)
+ continue;
+ auto It = Set.second.find(std::make_pair(Idx, *Diff));
+ if (It == Set.second.end()) {
+ Set.second.emplace(Idx, *Diff);
+ return;
}
- // Check if the whole array was vectorized already - exit.
- if (StartIdx >= Operands.size())
- break;
+ // Try to vectorize the first found set to avoid duplicate analysis.
+ TryToVectorize(Set.second);
+ StoreIndexToDistSet PrevSet;
+ PrevSet.swap(Set.second);
+ Set.first = Idx;
+ Set.second.emplace(Idx, 0);
+      // Insert the stores that followed the previous match to try to
+      // vectorize them together with this store.
+ unsigned StartIdx = It->first + 1;
+ SmallBitVector UsedStores(Idx - StartIdx);
+      // Distances to the previously found dup store (or to this store, since
+      // they store to the same address).
+ SmallVector<int> Dists(Idx - StartIdx, 0);
+ for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
+        // Do not try to vectorize sequences we already tried.
+ if (Pair.first <= It->first ||
+ VectorizedStores.contains(Stores[Pair.first]))
+ break;
+ unsigned BI = Pair.first - StartIdx;
+ UsedStores.set(BI);
+ Dists[BI] = Pair.second - It->second;
+ }
+ for (unsigned I = StartIdx; I < Idx; ++I) {
+ unsigned BI = I - StartIdx;
+ if (UsedStores.test(BI))
+ Set.second.emplace(I, Dists[BI]);
+ }
+ return;
}
+ auto &Res = SortedStores.emplace_back();
+ Res.first = Idx;
+ Res.second.emplace(Idx, 0);
+ };
+ StoreInst *PrevStore = Stores.front();
+ for (auto [I, SI] : enumerate(Stores)) {
+ // Check that we do not try to vectorize stores of different types.
+ if (PrevStore->getValueOperand()->getType() !=
+ SI->getValueOperand()->getType()) {
+ for (auto &Set : SortedStores)
+ TryToVectorize(Set.second);
+ SortedStores.clear();
+ PrevStore = SI;
+ }
+ FillStoresSet(I, SI);
}
+ // Final vectorization attempt.
+ for (auto &Set : SortedStores)
+ TryToVectorize(Set.second);
+
return Changed;
}
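The new vectorizeStores() drops the quadratic consecutive-chain search in favor of per-base buckets: every store is keyed by its pointer distance to the bucket's base store, and runs of consecutive distances become the candidate chains handed to vectorizeStoreChain(). Below is a compilable sketch of just that grouping step, assuming precomputed element distances stand in for getPointersDiff() and plain indices stand in for StoreInst pointers; all names are illustrative.

#include <cstdio>
#include <set>
#include <utility>
#include <vector>

using IdxDist = std::pair<unsigned, int>; // (store index, distance in elements)

// Order the bucket by distance only, like StoreDistCompare above. A second
// store with an equal distance is rejected by the set; the real pass uses
// that event as the trigger to vectorize the current bucket first.
struct DistLess {
  bool operator()(const IdxDist &A, const IdxDist &B) const {
    return A.second < B.second;
  }
};

int main() {
  // Element-sized distances of five stores relative to the first one.
  std::vector<int> Dists = {0, 1, 2, 4, 3};
  std::set<IdxDist, DistLess> Sorted;
  for (unsigned I = 0; I < Dists.size(); ++I)
    Sorted.emplace(I, Dists[I]);

  // Walk the bucket in distance order; each run of consecutive distances is a
  // candidate chain, regardless of the order the stores had in the IR.
  std::vector<std::vector<unsigned>> Chains;
  int PrevDist = 0;
  for (const auto &[Idx, Dist] : Sorted) {
    if (Chains.empty() || Dist - PrevDist != 1)
      Chains.emplace_back();
    Chains.back().push_back(Idx);
    PrevDist = Dist;
  }

  for (const auto &Chain : Chains) {
    std::printf("chain:");
    for (unsigned Idx : Chain)
      std::printf(" %u", Idx);
    std::printf("\n"); // prints "chain: 0 1 2 4 3"
  }
}

The TriedSequences bookkeeping and the MinVF/MaxVF sizing from the hunk are orthogonal to this grouping and are left out of the sketch.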
@@ -12507,8 +13682,10 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
// constant index, or a pointer operand that doesn't point to a scalar
// type.
else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- auto Idx = GEP->idx_begin()->get();
- if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
+ if (GEP->getNumIndices() != 1)
+ continue;
+ Value *Idx = GEP->idx_begin()->get();
+ if (isa<Constant>(Idx))
continue;
if (!isValidElementType(Idx->getType()))
continue;
@@ -12542,8 +13719,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
// NOTE: the following will give user internal llvm type name, which may
// not be useful.
R.getORE()->emit([&]() {
- std::string type_str;
- llvm::raw_string_ostream rso(type_str);
+ std::string TypeStr;
+ llvm::raw_string_ostream rso(TypeStr);
Ty->print(rso);
return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
<< "Cannot SLP vectorize list: type "
@@ -12878,10 +14055,12 @@ class HorizontalReduction {
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
Value *RHS, const Twine &Name,
const ReductionOpsListType &ReductionOps) {
- bool UseSelect = ReductionOps.size() == 2 ||
- // Logical or/and.
- (ReductionOps.size() == 1 &&
- isa<SelectInst>(ReductionOps.front().front()));
+ bool UseSelect =
+ ReductionOps.size() == 2 ||
+ // Logical or/and.
+ (ReductionOps.size() == 1 && any_of(ReductionOps.front(), [](Value *V) {
+ return isa<SelectInst>(V);
+ }));
assert((!UseSelect || ReductionOps.size() != 2 ||
isa<SelectInst>(ReductionOps[1][0])) &&
"Expected cmp + select pairs for reduction");
@@ -13315,12 +14494,26 @@ public:
// Update the final value in the reduction.
Builder.SetCurrentDebugLocation(
cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
+ if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
+ (isGuaranteedNotToBePoison(Res) &&
+ !isGuaranteedNotToBePoison(VectorizedTree))) {
+ auto It = ReducedValsToOps.find(Res);
+ if (It != ReducedValsToOps.end() &&
+ any_of(It->getSecond(),
+ [](Instruction *I) { return isBoolLogicOp(I); }))
+ std::swap(VectorizedTree, Res);
+ }
+
return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
ReductionOps);
}
// Initialize the final value in the reduction.
return Res;
};
+ bool AnyBoolLogicOp =
+ any_of(ReductionOps.back(), [](Value *V) {
+ return isBoolLogicOp(cast<Instruction>(V));
+ });
// The reduction root is used as the insertion point for new instructions,
// so set it as externally used to prevent it from being deleted.
ExternallyUsedValues[ReductionRoot];
@@ -13364,10 +14557,12 @@ public:
// Check if the reduction value was not overridden by the extractelement
// instruction because of the vectorization and exclude it, if it is not
// compatible with other values.
- if (auto *Inst = dyn_cast<Instruction>(RdxVal))
- if (isVectorLikeInstWithConstOps(Inst) &&
- (!S.getOpcode() || !S.isOpcodeOrAlt(Inst)))
- continue;
+      // Also check if the instruction was folded to a constant/other value.
+ auto *Inst = dyn_cast<Instruction>(RdxVal);
+ if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
+ (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
+ (S.getOpcode() && !Inst))
+ continue;
Candidates.push_back(RdxVal);
TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
}
@@ -13543,11 +14738,9 @@ public:
for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
continue;
- for_each(ReducedVals[Cnt],
- [&LocalExternallyUsedValues, &TrackedVals](Value *V) {
- if (isa<Instruction>(V))
- LocalExternallyUsedValues[TrackedVals[V]];
- });
+ for (Value *V : ReducedVals[Cnt])
+ if (isa<Instruction>(V))
+ LocalExternallyUsedValues[TrackedVals[V]];
}
if (!IsSupportedHorRdxIdentityOp) {
// Number of uses of the candidates in the vector of values.
@@ -13591,7 +14784,7 @@ public:
// Update LocalExternallyUsedValues for the scalar, replaced by
// extractelement instructions.
for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
- auto It = ExternallyUsedValues.find(Pair.first);
+ auto *It = ExternallyUsedValues.find(Pair.first);
if (It == ExternallyUsedValues.end())
continue;
LocalExternallyUsedValues[Pair.second].append(It->second);
@@ -13605,7 +14798,8 @@ public:
InstructionCost ReductionCost =
getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
InstructionCost Cost = TreeCost + ReductionCost;
- LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
+ << " for reduction\n");
if (!Cost.isValid())
return nullptr;
if (Cost >= -SLPCostThreshold) {
@@ -13652,7 +14846,9 @@ public:
// To prevent poison from leaking across what used to be sequential,
// safe, scalar boolean logic operations, the reduction operand must be
// frozen.
- if (isBoolLogicOp(RdxRootInst))
+ if ((isBoolLogicOp(RdxRootInst) ||
+ (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
+ !isGuaranteedNotToBePoison(VectorizedRoot))
VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
// Emit code to correctly handle reused reduced values, if required.
@@ -13664,6 +14860,16 @@ public:
Value *ReducedSubTree =
emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
+ if (ReducedSubTree->getType() != VL.front()->getType()) {
+ ReducedSubTree = Builder.CreateIntCast(
+ ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
+ KnownBits Known = computeKnownBits(
+ R, cast<Instruction>(ReductionOps.front().front())
+ ->getModule()
+ ->getDataLayout());
+ return !Known.isNonNegative();
+ }));
+ }
// Improved analysis for add/fadd/xor reductions with same scale factor
// for all operands of reductions. We can emit scalar ops for them
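The hunk above widens a demoted reduction result back to the original scalar type and picks a signed cast only when some reduced value may be negative (per computeKnownBits). A minimal sketch of that decision, with concrete integer types standing in for the demoted element type and the original scalar type; the helper below is hypothetical, not LLVM API.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Widen an i8-sized reduction result back to i32. Sign-extend only if any of
// the original reduced values might be negative, otherwise zero-extend.
int32_t widenDemotedResult(uint8_t NarrowResult,
                           const std::vector<int32_t> &ReducedVals) {
  bool AnyMaybeNegative = std::any_of(ReducedVals.begin(), ReducedVals.end(),
                                      [](int32_t V) { return V < 0; });
  if (AnyMaybeNegative)
    return static_cast<int32_t>(static_cast<int8_t>(NarrowResult)); // sext
  return static_cast<int32_t>(NarrowResult);                        // zext
}

int main() {
  // 0xFF stands for whatever the narrow (i8) reduction happened to produce.
  std::cout << widenDemotedResult(0xFF, {1, -2, 3}) << '\n'; // -1  (signed)
  std::cout << widenDemotedResult(0xFF, {1, 2, 3}) << '\n';  // 255 (unsigned)
}

The same MinBWs-driven sext/zext choice appears in the ExtractAndExtendIfNeeded change earlier in this diff.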
@@ -13716,31 +14922,33 @@ public:
// RedOp2 = select i1 ?, i1 RHS, i1 false
// Then, we must freeze LHS in the new op.
- auto &&FixBoolLogicalOps =
- [&Builder, VectorizedTree](Value *&LHS, Value *&RHS,
- Instruction *RedOp1, Instruction *RedOp2) {
- if (!isBoolLogicOp(RedOp1))
- return;
- if (LHS == VectorizedTree || getRdxOperand(RedOp1, 0) == LHS ||
- isGuaranteedNotToBePoison(LHS))
- return;
- if (!isBoolLogicOp(RedOp2))
- return;
- if (RHS == VectorizedTree || getRdxOperand(RedOp2, 0) == RHS ||
- isGuaranteedNotToBePoison(RHS)) {
- std::swap(LHS, RHS);
- return;
- }
- LHS = Builder.CreateFreeze(LHS);
- };
+ auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
+ Instruction *RedOp1,
+ Instruction *RedOp2,
+ bool InitStep) {
+ if (!AnyBoolLogicOp)
+ return;
+ if (isBoolLogicOp(RedOp1) &&
+ ((!InitStep && LHS == VectorizedTree) ||
+ getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
+ return;
+ if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
+ getRdxOperand(RedOp2, 0) == RHS ||
+ isGuaranteedNotToBePoison(RHS))) {
+ std::swap(LHS, RHS);
+ return;
+ }
+ if (LHS != VectorizedTree)
+ LHS = Builder.CreateFreeze(LHS);
+ };
// Finish the reduction.
// Need to add extra arguments and not vectorized possible reduction
// values.
// Try to avoid dependencies between the scalar remainders after
// reductions.
- auto &&FinalGen =
- [this, &Builder, &TrackedVals, &FixBoolLogicalOps](
- ArrayRef<std::pair<Instruction *, Value *>> InstVals) {
+ auto FinalGen =
+ [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
+ bool InitStep) {
unsigned Sz = InstVals.size();
SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
Sz % 2);
@@ -13761,7 +14969,7 @@ public:
// sequential, safe, scalar boolean logic operations, the
// reduction operand must be frozen.
FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
- RedOp);
+ RedOp, InitStep);
Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
StableRdxVal2, "op.rdx", ReductionOps);
ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
@@ -13791,11 +14999,13 @@ public:
ExtraReductions.emplace_back(I, Pair.first);
}
// Iterate through all not-vectorized reduction values/extra arguments.
+ bool InitStep = true;
while (ExtraReductions.size() > 1) {
VectorizedTree = ExtraReductions.front().second;
SmallVector<std::pair<Instruction *, Value *>> NewReds =
- FinalGen(ExtraReductions);
+ FinalGen(ExtraReductions, InitStep);
ExtraReductions.swap(NewReds);
+ InitStep = false;
}
VectorizedTree = ExtraReductions.front().second;
@@ -13842,8 +15052,7 @@ private:
bool IsCmpSelMinMax, unsigned ReduxWidth,
FastMathFlags FMF) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- Value *FirstReducedVal = ReducedVals.front();
- Type *ScalarTy = FirstReducedVal->getType();
+ Type *ScalarTy = ReducedVals.front()->getType();
FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
InstructionCost VectorCost = 0, ScalarCost;
// If all of the reduced values are constant, the vector cost is 0, since
@@ -13917,7 +15126,7 @@ private:
}
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
- << " for reduction that starts with " << *FirstReducedVal
+ << " for reduction of " << shortBundleName(ReducedVals)
<< " (It is a splitting reduction)\n");
return VectorCost - ScalarCost;
}
@@ -13932,7 +15141,7 @@ private:
"A call to the llvm.fmuladd intrinsic is not handled yet");
++NumVectorInstructions;
- return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind);
+ return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
}
/// Emits optimized code for unique scalar value reused \p Cnt times.
@@ -13979,8 +15188,8 @@ private:
case RecurKind::Mul:
case RecurKind::FMul:
case RecurKind::FMulAdd:
- case RecurKind::SelectICmp:
- case RecurKind::SelectFCmp:
+ case RecurKind::IAnyOf:
+ case RecurKind::FAnyOf:
case RecurKind::None:
llvm_unreachable("Unexpected reduction kind for repeated scalar.");
}
@@ -14068,8 +15277,8 @@ private:
case RecurKind::Mul:
case RecurKind::FMul:
case RecurKind::FMulAdd:
- case RecurKind::SelectICmp:
- case RecurKind::SelectFCmp:
+ case RecurKind::IAnyOf:
+ case RecurKind::FAnyOf:
case RecurKind::None:
llvm_unreachable("Unexpected reduction kind for reused scalars.");
}
@@ -14164,8 +15373,8 @@ static bool findBuildAggregate(Instruction *LastInsertInst,
InsertElts.resize(*AggregateSize);
findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
- llvm::erase_value(BuildVectorOpds, nullptr);
- llvm::erase_value(InsertElts, nullptr);
+ llvm::erase(BuildVectorOpds, nullptr);
+ llvm::erase(InsertElts, nullptr);
if (BuildVectorOpds.size() >= 2)
return true;
@@ -14401,8 +15610,7 @@ bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
BasicBlock *BB, BoUpSLP &R) {
- const DataLayout &DL = BB->getModule()->getDataLayout();
- if (!R.canMapToVector(IVI->getType(), DL))
+ if (!R.canMapToVector(IVI->getType()))
return false;
SmallVector<Value *, 16> BuildVectorOpds;
@@ -14541,11 +15749,11 @@ static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
if (BasePred1 > BasePred2)
return false;
// Compare operands.
- bool LEPreds = Pred1 <= Pred2;
- bool GEPreds = Pred1 >= Pred2;
+ bool CI1Preds = Pred1 == BasePred1;
+ bool CI2Preds = Pred2 == BasePred1;
for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
- auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1);
- auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1);
+ auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
+ auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
if (Op1->getValueID() < Op2->getValueID())
return !IsCompatibility;
if (Op1->getValueID() > Op2->getValueID())
@@ -14691,14 +15899,20 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
return true;
if (Opcodes1.size() > Opcodes2.size())
return false;
- std::optional<bool> ConstOrder;
for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
// Undefs are compatible with any other value.
if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) {
- if (!ConstOrder)
- ConstOrder =
- !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]);
- continue;
+ if (isa<Instruction>(Opcodes1[I]))
+ return true;
+ if (isa<Instruction>(Opcodes2[I]))
+ return false;
+ if (isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]))
+ return true;
+ if (isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]))
+ return false;
+ if (isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]))
+ continue;
+ return isa<UndefValue>(Opcodes2[I]);
}
if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
@@ -14714,21 +15928,26 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (NodeI1 != NodeI2)
return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
InstructionsState S = getSameOpcode({I1, I2}, *TLI);
- if (S.getOpcode())
+ if (S.getOpcode() && !S.isAltShuffle())
continue;
return I1->getOpcode() < I2->getOpcode();
}
- if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) {
- if (!ConstOrder)
- ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID();
- continue;
- }
+ if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+ return Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID();
+ if (isa<Instruction>(Opcodes1[I]))
+ return true;
+ if (isa<Instruction>(Opcodes2[I]))
+ return false;
+ if (isa<Constant>(Opcodes1[I]))
+ return true;
+ if (isa<Constant>(Opcodes2[I]))
+ return false;
if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
return true;
if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
return false;
}
- return ConstOrder && *ConstOrder;
+ return false;
};
auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
if (V1 == V2)
@@ -14776,6 +15995,9 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
Incoming.push_back(P);
}
+ if (Incoming.size() <= 1)
+ break;
+
// Find the corresponding non-phi nodes for better matching when trying to
// build the tree.
for (Value *V : Incoming) {
@@ -14838,41 +16060,41 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
return I->use_empty() &&
(I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
};
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
// Skip instructions with scalable type. The num of elements is unknown at
// compile-time for scalable type.
- if (isa<ScalableVectorType>(it->getType()))
+ if (isa<ScalableVectorType>(It->getType()))
continue;
// Skip instructions marked for the deletion.
- if (R.isDeleted(&*it))
+ if (R.isDeleted(&*It))
continue;
// We may go through BB multiple times so skip the one we have checked.
- if (!VisitedInstrs.insert(&*it).second) {
- if (HasNoUsers(&*it) &&
- VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator())) {
+ if (!VisitedInstrs.insert(&*It).second) {
+ if (HasNoUsers(&*It) &&
+ VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
Changed = true;
- it = BB->begin();
- e = BB->end();
+ It = BB->begin();
+ E = BB->end();
}
continue;
}
- if (isa<DbgInfoIntrinsic>(it))
+ if (isa<DbgInfoIntrinsic>(It))
continue;
// Try to vectorize reductions that use PHINodes.
- if (PHINode *P = dyn_cast<PHINode>(it)) {
+ if (PHINode *P = dyn_cast<PHINode>(It)) {
// Check that the PHI is a reduction PHI.
if (P->getNumIncomingValues() == 2) {
// Try to match and vectorize a horizontal reduction.
Instruction *Root = getReductionInstr(DT, P, BB, LI);
if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
Changed = true;
- it = BB->begin();
- e = BB->end();
+ It = BB->begin();
+ E = BB->end();
continue;
}
}
@@ -14897,23 +16119,23 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
}
- if (HasNoUsers(&*it)) {
+ if (HasNoUsers(&*It)) {
bool OpsChanged = false;
- auto *SI = dyn_cast<StoreInst>(it);
+ auto *SI = dyn_cast<StoreInst>(It);
bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
if (SI) {
- auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
+ auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
// Try to vectorize chain in store, if this is the only store to the
// address in the block.
// TODO: This is just a temporary solution to save compile time. Need
// to investigate if we can safely turn on slp-vectorize-hor-store
// instead to allow lookup for reduction chains in all non-vectorized
// stores (need to check side effects and compile time).
- TryToVectorizeRoot = (I == Stores.end() || I->second.size() == 1) &&
- SI->getValueOperand()->hasOneUse();
+ TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
+ SI->getValueOperand()->hasOneUse();
}
if (TryToVectorizeRoot) {
- for (auto *V : it->operand_values()) {
+ for (auto *V : It->operand_values()) {
// Postponed instructions should not be vectorized here, delay their
// vectorization.
if (auto *VI = dyn_cast<Instruction>(V);
@@ -14926,21 +16148,21 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// top-tree instructions to try to vectorize as many instructions as
// possible.
OpsChanged |=
- VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator());
+ VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
if (OpsChanged) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
Changed = true;
- it = BB->begin();
- e = BB->end();
+ It = BB->begin();
+ E = BB->end();
continue;
}
}
- if (isa<InsertElementInst, InsertValueInst>(it))
- PostProcessInserts.insert(&*it);
- else if (isa<CmpInst>(it))
- PostProcessCmps.insert(cast<CmpInst>(&*it));
+ if (isa<InsertElementInst, InsertValueInst>(It))
+ PostProcessInserts.insert(&*It);
+ else if (isa<CmpInst>(It))
+ PostProcessCmps.insert(cast<CmpInst>(&*It));
}
return Changed;
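
The scan above restarts from the beginning of the block whenever a successful vectorization may have deleted instructions and invalidated the iterators (the It/E reset to BB->begin()/BB->end()). A minimal plain-C++ sketch of that pattern, not the LLVM code; the real pass additionally skips already-handled instructions via VisitedInstrs, and termination relies on the transformation eventually reporting no more changes:

#include <functional>
#include <list>

// Rescan the container from the start whenever a transformation may have erased
// elements and invalidated iterators.
bool processAll(std::list<int> &Items,
                const std::function<bool(std::list<int> &, int)> &TryTransform) {
  bool Changed = false;
  bool Restart = true;
  while (Restart) {
    Restart = false;
    for (int Item : Items) {
      if (TryTransform(Items, Item)) {
        Changed = true;
        Restart = true; // iterators may be stale; start the scan over
        break;
      }
    }
  }
  return Changed;
}
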
@@ -15044,6 +16266,12 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
// compatible (have the same opcode, same parent), otherwise it is
// definitely not profitable to try to vectorize them.
auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
+ if (V->getValueOperand()->getType()->getTypeID() <
+ V2->getValueOperand()->getType()->getTypeID())
+ return true;
+ if (V->getValueOperand()->getType()->getTypeID() >
+ V2->getValueOperand()->getType()->getTypeID())
+ return false;
if (V->getPointerOperandType()->getTypeID() <
V2->getPointerOperandType()->getTypeID())
return true;
@@ -15082,6 +16310,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
if (V1 == V2)
return true;
+ if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
+ return false;
if (V1->getPointerOperandType() != V2->getPointerOperandType())
return false;
// Undefs are compatible with any other value.
@@ -15113,8 +16343,13 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
continue;
+  // Reverse stores to do bottom-to-top analysis. This is important if values
+  // are stored to the same address several times; in that case we need to
+  // follow the store order (reversed to meet the memory dependencies).
+ SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
+ Pair.second.rend());
Changed |= tryToVectorizeSequence<StoreInst>(
- Pair.second, StoreSorter, AreCompatibleStores,
+ ReversedStores, StoreSorter, AreCompatibleStores,
[this, &R](ArrayRef<StoreInst *> Candidates, bool) {
return vectorizeStores(Candidates, R);
},
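
The store preparation above has two parts: StoreSorter/AreCompatibleStores group candidates by cheap type comparisons so that only compatible stores end up in the same vectorization attempt, and ReversedStores flips each per-base list so the analysis runs bottom-to-top. A simplified sketch of that preparation step; StoreCand and its fields are illustrative stand-ins for llvm::StoreInst and the type-ID queries, not the actual API:

#include <algorithm>
#include <vector>

struct StoreCand {
  int ValueTypeID;   // stands in for getValueOperand()->getType()->getTypeID()
  int PointerTypeID; // stands in for getPointerOperandType()->getTypeID()
};

void prepareStores(std::vector<StoreCand> &Stores) {
  // Bottom-to-top: later stores to the same address are considered first so
  // memory dependencies are respected, as ReversedStores does above.
  std::reverse(Stores.begin(), Stores.end());
  // Group by cheap type IDs so incompatible candidates never share an attempt.
  std::stable_sort(Stores.begin(), Stores.end(),
                   [](const StoreCand &A, const StoreCand &B) {
                     if (A.ValueTypeID != B.ValueTypeID)
                       return A.ValueTypeID < B.ValueTypeID;
                     return A.PointerTypeID < B.PointerTypeID;
                   });
}
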
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 1271d1424c03..7ff6749a0908 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -133,9 +133,12 @@ public:
Ingredient2Recipe[I] = R;
}
+ /// Create the mask for the vector loop header block.
+ void createHeaderMask(VPlan &Plan);
+
/// A helper function that computes the predicate of the block BB, assuming
- /// that the header block of the loop is set to True. It returns the *entry*
- /// mask for the block BB.
+  /// that the header block of the loop is set to True, or to the loop mask
+  /// when tail folding. It returns the *entry* mask for the block BB.
VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan);
/// A helper function that computes the predicate of the edge between SRC
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
index e81b88fd8099..1d7df9c9575a 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -19,7 +19,6 @@
#include "VPlan.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
-#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -234,6 +233,99 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
// set(Def, Extract, Instance);
return Extract;
}
+
+Value *VPTransformState::get(VPValue *Def, unsigned Part) {
+ // If Values have been set for this Def return the one relevant for \p Part.
+ if (hasVectorValue(Def, Part))
+ return Data.PerPartOutput[Def][Part];
+
+ auto GetBroadcastInstrs = [this, Def](Value *V) {
+ bool SafeToHoist = Def->isDefinedOutsideVectorRegions();
+ if (VF.isScalar())
+ return V;
+ // Place the code for broadcasting invariant variables in the new preheader.
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ if (SafeToHoist) {
+ BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>(
+ Plan->getVectorLoopRegion()->getSinglePredecessor())];
+ if (LoopVectorPreHeader)
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ }
+
+ // Broadcast the scalar into all locations in the vector.
+ Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
+
+ return Shuf;
+ };
+
+ if (!hasScalarValue(Def, {Part, 0})) {
+ assert(Def->isLiveIn() && "expected a live-in");
+ if (Part != 0)
+ return get(Def, 0);
+ Value *IRV = Def->getLiveInIRValue();
+ Value *B = GetBroadcastInstrs(IRV);
+ set(Def, B, Part);
+ return B;
+ }
+
+ Value *ScalarValue = get(Def, {Part, 0});
+ // If we aren't vectorizing, we can just copy the scalar map values over
+ // to the vector map.
+ if (VF.isScalar()) {
+ set(Def, ScalarValue, Part);
+ return ScalarValue;
+ }
+
+ bool IsUniform = vputils::isUniformAfterVectorization(Def);
+
+ unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
+ // Check if there is a scalar value for the selected lane.
+ if (!hasScalarValue(Def, {Part, LastLane})) {
+ // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
+ // VPExpandSCEVRecipes can also be uniform.
+ assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
+ isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) ||
+ isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
+ "unexpected recipe found to be invariant");
+ IsUniform = true;
+ LastLane = 0;
+ }
+
+ auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
+ // Set the insert point after the last scalarized instruction or after the
+ // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
+ // will directly follow the scalar definitions.
+ auto OldIP = Builder.saveIP();
+ auto NewIP =
+ isa<PHINode>(LastInst)
+ ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
+ : std::next(BasicBlock::iterator(LastInst));
+ Builder.SetInsertPoint(&*NewIP);
+
+ // However, if we are vectorizing, we need to construct the vector values.
+ // If the value is known to be uniform after vectorization, we can just
+ // broadcast the scalar value corresponding to lane zero for each unroll
+ // iteration. Otherwise, we construct the vector values using
+ // insertelement instructions. Since the resulting vectors are stored in
+ // State, we will only generate the insertelements once.
+ Value *VectorValue = nullptr;
+ if (IsUniform) {
+ VectorValue = GetBroadcastInstrs(ScalarValue);
+ set(Def, VectorValue, Part);
+ } else {
+ // Initialize packing with insertelements to start from undef.
+ assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+ Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
+ set(Def, Undef, Part);
+ for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
+ packScalarIntoVectorValue(Def, {Part, Lane});
+ VectorValue = get(Def, Part);
+ }
+ Builder.restoreIP(OldIP);
+ return VectorValue;
+}
+
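
VPTransformState::get above picks between three ways of producing a per-part vector value: reuse a cached one, broadcast lane 0 when the value is uniform after vectorization, or pack all VF scalar lanes via insertelements. A rough plain-C++ analogue of that decision, illustrative only; std::vector<int> stands in for an IR vector value and the caller is assumed to supply at least VF scalar lanes:

#include <vector>

std::vector<int> materializePart(bool IsUniform,
                                 const std::vector<int> &ScalarLanes,
                                 unsigned VF) {
  if (IsUniform)
    return std::vector<int>(VF, ScalarLanes[0]); // broadcast ("splat") lane 0
  // Otherwise every lane's scalar is packed individually, mirroring the
  // insertelement chain built via packScalarIntoVectorValue.
  return std::vector<int>(ScalarLanes.begin(), ScalarLanes.begin() + VF);
}
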
BasicBlock *VPTransformState::CFGState::getPreheaderBBFor(VPRecipeBase *R) {
VPRegionBlock *LoopRegion = R->getParent()->getEnclosingLoopRegion();
return VPBB2IRBB[LoopRegion->getPreheaderVPBB()];
@@ -267,18 +359,15 @@ void VPTransformState::addMetadata(ArrayRef<Value *> To, Instruction *From) {
}
}
-void VPTransformState::setDebugLocFromInst(const Value *V) {
- const Instruction *Inst = dyn_cast<Instruction>(V);
- if (!Inst) {
- Builder.SetCurrentDebugLocation(DebugLoc());
- return;
- }
-
- const DILocation *DIL = Inst->getDebugLoc();
+void VPTransformState::setDebugLocFrom(DebugLoc DL) {
+ const DILocation *DIL = DL;
// When a FSDiscriminator is enabled, we don't need to add the multiply
// factors to the discriminators.
- if (DIL && Inst->getFunction()->shouldEmitDebugInfoForProfiling() &&
- !Inst->isDebugOrPseudoInst() && !EnableFSDiscriminator) {
+ if (DIL &&
+ Builder.GetInsertBlock()
+ ->getParent()
+ ->shouldEmitDebugInfoForProfiling() &&
+ !EnableFSDiscriminator) {
// FIXME: For scalable vectors, assume vscale=1.
auto NewDIL =
DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
@@ -291,6 +380,15 @@ void VPTransformState::setDebugLocFromInst(const Value *V) {
Builder.SetCurrentDebugLocation(DIL);
}
+void VPTransformState::packScalarIntoVectorValue(VPValue *Def,
+ const VPIteration &Instance) {
+ Value *ScalarInst = get(Def, Instance);
+ Value *VectorValue = get(Def, Instance.Part);
+ VectorValue = Builder.CreateInsertElement(
+ VectorValue, ScalarInst, Instance.Lane.getAsRuntimeExpr(Builder, VF));
+ set(Def, VectorValue, Instance.Part);
+}
+
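
packScalarIntoVectorValue above grows a partially-built vector value by inserting one scalar lane at a time. In plain C++ terms, and only as a sketch of the packing loop driven from get(), the whole sequence amounts to:

#include <vector>

// Fill a VF-wide vector lane by lane; each loop iteration corresponds to one
// insertelement, and the freshly constructed vector stands in for the initial
// poison value.
std::vector<int> packLanes(const std::vector<int> &Scalars, unsigned VF) {
  std::vector<int> Vec(VF);
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    Vec[Lane] = Scalars[Lane];
  return Vec;
}
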
BasicBlock *
VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
// BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
@@ -616,22 +714,17 @@ VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE) {
auto Plan = std::make_unique<VPlan>(Preheader, VecPreheader);
Plan->TripCount =
vputils::getOrCreateVPValueForSCEVExpr(*Plan, TripCount, SE);
+ // Create empty VPRegionBlock, to be filled during processing later.
+ auto *TopRegion = new VPRegionBlock("vector loop", false /*isReplicator*/);
+ VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader);
+ VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
+ VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
return Plan;
}
-VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() {
- VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
- for (VPRecipeBase &R : Header->phis()) {
- if (isa<VPActiveLaneMaskPHIRecipe>(&R))
- return cast<VPActiveLaneMaskPHIRecipe>(&R);
- }
- return nullptr;
-}
-
void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
Value *CanonicalIVStartValue,
- VPTransformState &State,
- bool IsEpilogueVectorization) {
+ VPTransformState &State) {
// Check if the backedge taken count is needed, and if so build it.
if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
@@ -648,6 +741,12 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
State.set(&VectorTripCount, VectorTripCountV, Part);
+ IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
+ // FIXME: Model VF * UF computation completely in VPlan.
+ State.set(&VFxUF,
+ createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF),
+ 0);
+
// When vectorizing the epilogue loop, the canonical induction start value
// needs to be changed from zero to the value after the main vector loop.
// FIXME: Improve modeling for canonical IV start values in the epilogue loop.
@@ -656,16 +755,12 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
auto *IV = getCanonicalIV();
assert(all_of(IV->users(),
[](const VPUser *U) {
- if (isa<VPScalarIVStepsRecipe>(U) ||
- isa<VPDerivedIVRecipe>(U))
- return true;
- auto *VPI = cast<VPInstruction>(U);
- return VPI->getOpcode() ==
- VPInstruction::CanonicalIVIncrement ||
- VPI->getOpcode() ==
- VPInstruction::CanonicalIVIncrementNUW;
+ return isa<VPScalarIVStepsRecipe>(U) ||
+ isa<VPDerivedIVRecipe>(U) ||
+ cast<VPInstruction>(U)->getOpcode() ==
+ Instruction::Add;
}) &&
- "the canonical IV should only be used by its increments or "
+ "the canonical IV should only be used by its increment or "
"ScalarIVSteps when resetting the start value");
IV->setOperand(0, VPV);
}
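
The VFxUF live-in set up above is, broadly, the loop-invariant step by which the vector loop advances per iteration. A hedged sketch of the resulting loop shape; names are illustrative and a fixed vectorization factor is assumed (a scalable VF would scale the step by vscale, as createStepForVF does):

void vectorLoopShape(int N, int VF, int UF) {
  int Step = VF * UF;                   // the VFxUF live-in, computed once
  int VectorTripCount = N - (N % Step); // bound of the main vector loop
  int IV = 0;
  for (; IV < VectorTripCount; IV += Step) {
    // each iteration handles UF unrolled parts of VF lanes each
  }
  for (; IV < N; ++IV) {
    // remaining iterations run in the scalar epilogue
  }
}
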
@@ -754,11 +849,14 @@ void VPlan::execute(VPTransformState *State) {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD
-void VPlan::print(raw_ostream &O) const {
+void VPlan::printLiveIns(raw_ostream &O) const {
VPSlotTracker SlotTracker(this);
- O << "VPlan '" << getName() << "' {";
+ if (VFxUF.getNumUsers() > 0) {
+ O << "\nLive-in ";
+ VFxUF.printAsOperand(O, SlotTracker);
+ O << " = VF * UF";
+ }
if (VectorTripCount.getNumUsers() > 0) {
O << "\nLive-in ";
@@ -778,6 +876,15 @@ void VPlan::print(raw_ostream &O) const {
TripCount->printAsOperand(O, SlotTracker);
O << " = original trip-count";
O << "\n";
+}
+
+LLVM_DUMP_METHOD
+void VPlan::print(raw_ostream &O) const {
+ VPSlotTracker SlotTracker(this);
+
+ O << "VPlan '" << getName() << "' {";
+
+ printLiveIns(O);
if (!getPreheader()->empty()) {
O << "\n";
@@ -895,11 +1002,18 @@ void VPlanPrinter::dump() {
OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
if (!Plan.getName().empty())
OS << "\\n" << DOT::EscapeString(Plan.getName());
- if (Plan.BackedgeTakenCount) {
- OS << ", where:\\n";
- Plan.BackedgeTakenCount->print(OS, SlotTracker);
- OS << " := BackedgeTakenCount";
+
+ {
+ // Print live-ins.
+ std::string Str;
+ raw_string_ostream SS(Str);
+ Plan.printLiveIns(SS);
+ SmallVector<StringRef, 0> Lines;
+ StringRef(Str).rtrim('\n').split(Lines, "\n");
+ for (auto Line : Lines)
+ OS << DOT::EscapeString(Line.str()) << "\\n";
}
+
OS << "\"]\n";
OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
OS << "edge [fontname=Courier, fontsize=30]\n";
@@ -1021,16 +1135,43 @@ void VPlanIngredient::print(raw_ostream &O) const {
template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
void VPValue::replaceAllUsesWith(VPValue *New) {
+ if (this == New)
+ return;
for (unsigned J = 0; J < getNumUsers();) {
VPUser *User = Users[J];
- unsigned NumUsers = getNumUsers();
+ bool RemovedUser = false;
for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
- if (User->getOperand(I) == this)
+ if (User->getOperand(I) == this) {
User->setOperand(I, New);
+ RemovedUser = true;
+ }
// If a user got removed after updating the current user, the next user to
// update will be moved to the current position, so we only need to
// increment the index if the number of users did not change.
- if (NumUsers == getNumUsers())
+ if (!RemovedUser)
+ J++;
+ }
+}
+
+void VPValue::replaceUsesWithIf(
+ VPValue *New,
+ llvm::function_ref<bool(VPUser &U, unsigned Idx)> ShouldReplace) {
+ if (this == New)
+ return;
+ for (unsigned J = 0; J < getNumUsers();) {
+ VPUser *User = Users[J];
+ bool RemovedUser = false;
+ for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) {
+ if (User->getOperand(I) != this || !ShouldReplace(*User, I))
+ continue;
+
+ RemovedUser = true;
+ User->setOperand(I, New);
+ }
+ // If a user got removed after updating the current user, the next user to
+ // update will be moved to the current position, so we only need to
+ // increment the index if the number of users did not change.
+ if (!RemovedUser)
J++;
}
}
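
Both replaceAllUsesWith and replaceUsesWithIf above walk the user list by index and only advance when the current user was not detached, because replacing the last use of this value removes that user from the list and shifts the next user into the current slot. A minimal sketch of that iteration pattern with a simplified, illustrative user model (not the VPlan classes):

#include <vector>

struct SketchUser { std::vector<int> Operands; };

// Replace 'Old' in every user's operand list; in this simplified model a user
// whose operands were rewritten is erased from the list (as setOperand() removes
// it from Old's users in VPlan), so the index only advances when nothing changed.
void replaceAllUses(std::vector<SketchUser *> &Users, int Old, int New) {
  for (unsigned J = 0; J < Users.size();) {
    SketchUser *U = Users[J];
    bool Replaced = false;
    for (int &Op : U->Operands)
      if (Op == Old) {
        Op = New;
        Replaced = true;
      }
    if (Replaced)
      Users.erase(Users.begin() + J); // later users move down into slot J
    else
      ++J;
  }
}
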
@@ -1116,6 +1257,8 @@ void VPSlotTracker::assignSlot(const VPValue *V) {
}
void VPSlotTracker::assignSlots(const VPlan &Plan) {
+ if (Plan.VFxUF.getNumUsers() > 0)
+ assignSlot(&Plan.VFxUF);
assignSlot(&Plan.VectorTripCount);
if (Plan.BackedgeTakenCount)
assignSlot(Plan.BackedgeTakenCount);
@@ -1139,6 +1282,11 @@ bool vputils::onlyFirstLaneUsed(VPValue *Def) {
[Def](VPUser *U) { return U->onlyFirstLaneUsed(Def); });
}
+bool vputils::onlyFirstPartUsed(VPValue *Def) {
+ return all_of(Def->users(),
+ [Def](VPUser *U) { return U->onlyFirstPartUsed(Def); });
+}
+
VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
ScalarEvolution &SE) {
if (auto *Expanded = Plan.getSCEVExpansion(Expr))
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
index 73313465adea..94cb76889813 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -23,6 +23,7 @@
#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
+#include "VPlanAnalysis.h"
#include "VPlanValue.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
@@ -233,9 +234,9 @@ struct VPIteration {
struct VPTransformState {
VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
DominatorTree *DT, IRBuilderBase &Builder,
- InnerLoopVectorizer *ILV, VPlan *Plan)
+ InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx)
: VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan),
- LVer(nullptr) {}
+ LVer(nullptr), TypeAnalysis(Ctx) {}
/// The chosen Vectorization and Unroll Factors of the loop being vectorized.
ElementCount VF;
@@ -274,10 +275,6 @@ struct VPTransformState {
I->second[Part];
}
- bool hasAnyVectorValue(VPValue *Def) const {
- return Data.PerPartOutput.contains(Def);
- }
-
bool hasScalarValue(VPValue *Def, VPIteration Instance) {
auto I = Data.PerPartScalars.find(Def);
if (I == Data.PerPartScalars.end())
@@ -349,8 +346,11 @@ struct VPTransformState {
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
- /// Set the debug location in the builder using the debug location in \p V.
- void setDebugLocFromInst(const Value *V);
+ /// Set the debug location in the builder using the debug location \p DL.
+ void setDebugLocFrom(DebugLoc DL);
+
+ /// Construct the vector value of a scalarized value \p V one lane at a time.
+ void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance);
/// Hold state information used when constructing the CFG of the output IR,
/// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
@@ -410,6 +410,9 @@ struct VPTransformState {
/// Map SCEVs to their expanded values. Populated when executing
/// VPExpandSCEVRecipes.
DenseMap<const SCEV *, Value *> ExpandedSCEVs;
+
+ /// VPlan-based type analysis.
+ VPTypeAnalysis TypeAnalysis;
};
/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
@@ -582,6 +585,8 @@ public:
/// This VPBlockBase must have no successors.
void setOneSuccessor(VPBlockBase *Successor) {
assert(Successors.empty() && "Setting one successor when others exist.");
+ assert(Successor->getParent() == getParent() &&
+ "connected blocks must have the same parent");
appendSuccessor(Successor);
}
@@ -693,7 +698,7 @@ public:
};
/// VPRecipeBase is a base class modeling a sequence of one or more output IR
-/// instructions. VPRecipeBase owns the the VPValues it defines through VPDef
+/// instructions. VPRecipeBase owns the VPValues it defines through VPDef
/// and is responsible for deleting its defined values. Single-value
/// VPRecipeBases that also inherit from VPValue must make sure to inherit from
/// VPRecipeBase before VPValue.
@@ -706,13 +711,18 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
/// Each VPRecipe belongs to a single VPBasicBlock.
VPBasicBlock *Parent = nullptr;
+ /// The debug location for the recipe.
+ DebugLoc DL;
+
public:
- VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands)
- : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe) {}
+ VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands,
+ DebugLoc DL = {})
+ : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {}
template <typename IterT>
- VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands)
- : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe) {}
+ VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands,
+ DebugLoc DL = {})
+ : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {}
virtual ~VPRecipeBase() = default;
/// \return the VPBasicBlock which this VPRecipe belongs to.
@@ -789,6 +799,9 @@ public:
bool mayReadOrWriteMemory() const {
return mayReadFromMemory() || mayWriteToMemory();
}
+
+ /// Returns the debug location of the recipe.
+ DebugLoc getDebugLoc() const { return DL; }
};
// Helper macro to define common classof implementations for recipes.
@@ -808,153 +821,30 @@ public:
return R->getVPDefID() == VPDefID; \
}
-/// This is a concrete Recipe that models a single VPlan-level instruction.
-/// While as any Recipe it may generate a sequence of IR instructions when
-/// executed, these instructions would always form a single-def expression as
-/// the VPInstruction is also a single def-use vertex.
-class VPInstruction : public VPRecipeBase, public VPValue {
- friend class VPlanSlp;
-
-public:
- /// VPlan opcodes, extending LLVM IR with idiomatics instructions.
- enum {
- FirstOrderRecurrenceSplice =
- Instruction::OtherOpsEnd + 1, // Combines the incoming and previous
- // values of a first-order recurrence.
- Not,
- ICmpULE,
- SLPLoad,
- SLPStore,
- ActiveLaneMask,
- CalculateTripCountMinusVF,
- CanonicalIVIncrement,
- CanonicalIVIncrementNUW,
- // The next two are similar to the above, but instead increment the
- // canonical IV separately for each unrolled part.
- CanonicalIVIncrementForPart,
- CanonicalIVIncrementForPartNUW,
- BranchOnCount,
- BranchOnCond
- };
-
-private:
- typedef unsigned char OpcodeTy;
- OpcodeTy Opcode;
- FastMathFlags FMF;
- DebugLoc DL;
-
- /// An optional name that can be used for the generated IR instruction.
- const std::string Name;
-
- /// Utility method serving execute(): generates a single instance of the
- /// modeled instruction. \returns the generated value for \p Part.
- /// In some cases an existing value is returned rather than a generated
- /// one.
- Value *generateInstruction(VPTransformState &State, unsigned Part);
-
-protected:
- void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
-
-public:
- VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
- const Twine &Name = "")
- : VPRecipeBase(VPDef::VPInstructionSC, Operands), VPValue(this),
- Opcode(Opcode), DL(DL), Name(Name.str()) {}
-
- VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
- DebugLoc DL = {}, const Twine &Name = "")
- : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {}
-
- VP_CLASSOF_IMPL(VPDef::VPInstructionSC)
-
- VPInstruction *clone() const {
- SmallVector<VPValue *, 2> Operands(operands());
- return new VPInstruction(Opcode, Operands, DL, Name);
- }
-
- unsigned getOpcode() const { return Opcode; }
-
- /// Generate the instruction.
- /// TODO: We currently execute only per-part unless a specific instance is
- /// provided.
- void execute(VPTransformState &State) override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Print the VPInstruction to \p O.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-
- /// Print the VPInstruction to dbgs() (for debugging).
- LLVM_DUMP_METHOD void dump() const;
-#endif
-
- /// Return true if this instruction may modify memory.
- bool mayWriteToMemory() const {
- // TODO: we can use attributes of the called function to rule out memory
- // modifications.
- return Opcode == Instruction::Store || Opcode == Instruction::Call ||
- Opcode == Instruction::Invoke || Opcode == SLPStore;
- }
-
- bool hasResult() const {
- // CallInst may or may not have a result, depending on the called function.
- // Conservatively return calls have results for now.
- switch (getOpcode()) {
- case Instruction::Ret:
- case Instruction::Br:
- case Instruction::Store:
- case Instruction::Switch:
- case Instruction::IndirectBr:
- case Instruction::Resume:
- case Instruction::CatchRet:
- case Instruction::Unreachable:
- case Instruction::Fence:
- case Instruction::AtomicRMW:
- case VPInstruction::BranchOnCond:
- case VPInstruction::BranchOnCount:
- return false;
- default:
- return true;
- }
- }
-
- /// Set the fast-math flags.
- void setFastMathFlags(FastMathFlags FMFNew);
-
- /// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
- assert(is_contained(operands(), Op) &&
- "Op must be an operand of the recipe");
- if (getOperand(0) != Op)
- return false;
- switch (getOpcode()) {
- default:
- return false;
- case VPInstruction::ActiveLaneMask:
- case VPInstruction::CalculateTripCountMinusVF:
- case VPInstruction::CanonicalIVIncrement:
- case VPInstruction::CanonicalIVIncrementNUW:
- case VPInstruction::CanonicalIVIncrementForPart:
- case VPInstruction::CanonicalIVIncrementForPartNUW:
- case VPInstruction::BranchOnCount:
- return true;
- };
- llvm_unreachable("switch should return");
- }
-};
-
/// Class to record LLVM IR flag for a recipe along with it.
class VPRecipeWithIRFlags : public VPRecipeBase {
enum class OperationType : unsigned char {
+ Cmp,
OverflowingBinOp,
+ DisjointOp,
PossiblyExactOp,
GEPOp,
FPMathOp,
+ NonNegOp,
Other
};
+
+public:
struct WrapFlagsTy {
char HasNUW : 1;
char HasNSW : 1;
+
+ WrapFlagsTy(bool HasNUW, bool HasNSW) : HasNUW(HasNUW), HasNSW(HasNSW) {}
+ };
+
+private:
+ struct DisjointFlagsTy {
+ char IsDisjoint : 1;
};
struct ExactFlagsTy {
char IsExact : 1;
@@ -962,6 +852,9 @@ class VPRecipeWithIRFlags : public VPRecipeBase {
struct GEPFlagsTy {
char IsInBounds : 1;
};
+ struct NonNegFlagsTy {
+ char NonNeg : 1;
+ };
struct FastMathFlagsTy {
char AllowReassoc : 1;
char NoNaNs : 1;
@@ -970,56 +863,81 @@ class VPRecipeWithIRFlags : public VPRecipeBase {
char AllowReciprocal : 1;
char AllowContract : 1;
char ApproxFunc : 1;
+
+ FastMathFlagsTy(const FastMathFlags &FMF);
};
OperationType OpType;
union {
+ CmpInst::Predicate CmpPredicate;
WrapFlagsTy WrapFlags;
+ DisjointFlagsTy DisjointFlags;
ExactFlagsTy ExactFlags;
GEPFlagsTy GEPFlags;
+ NonNegFlagsTy NonNegFlags;
FastMathFlagsTy FMFs;
- unsigned char AllFlags;
+ unsigned AllFlags;
};
public:
template <typename IterT>
- VPRecipeWithIRFlags(const unsigned char SC, iterator_range<IterT> Operands)
- : VPRecipeBase(SC, Operands) {
+ VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, DebugLoc DL = {})
+ : VPRecipeBase(SC, Operands, DL) {
OpType = OperationType::Other;
AllFlags = 0;
}
template <typename IterT>
- VPRecipeWithIRFlags(const unsigned char SC, iterator_range<IterT> Operands,
- Instruction &I)
- : VPRecipeWithIRFlags(SC, Operands) {
- if (auto *Op = dyn_cast<OverflowingBinaryOperator>(&I)) {
+ VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, Instruction &I)
+ : VPRecipeWithIRFlags(SC, Operands, I.getDebugLoc()) {
+ if (auto *Op = dyn_cast<CmpInst>(&I)) {
+ OpType = OperationType::Cmp;
+ CmpPredicate = Op->getPredicate();
+ } else if (auto *Op = dyn_cast<PossiblyDisjointInst>(&I)) {
+ OpType = OperationType::DisjointOp;
+ DisjointFlags.IsDisjoint = Op->isDisjoint();
+ } else if (auto *Op = dyn_cast<OverflowingBinaryOperator>(&I)) {
OpType = OperationType::OverflowingBinOp;
- WrapFlags.HasNUW = Op->hasNoUnsignedWrap();
- WrapFlags.HasNSW = Op->hasNoSignedWrap();
+ WrapFlags = {Op->hasNoUnsignedWrap(), Op->hasNoSignedWrap()};
} else if (auto *Op = dyn_cast<PossiblyExactOperator>(&I)) {
OpType = OperationType::PossiblyExactOp;
ExactFlags.IsExact = Op->isExact();
} else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
OpType = OperationType::GEPOp;
GEPFlags.IsInBounds = GEP->isInBounds();
+ } else if (auto *PNNI = dyn_cast<PossiblyNonNegInst>(&I)) {
+ OpType = OperationType::NonNegOp;
+ NonNegFlags.NonNeg = PNNI->hasNonNeg();
} else if (auto *Op = dyn_cast<FPMathOperator>(&I)) {
OpType = OperationType::FPMathOp;
- FastMathFlags FMF = Op->getFastMathFlags();
- FMFs.AllowReassoc = FMF.allowReassoc();
- FMFs.NoNaNs = FMF.noNaNs();
- FMFs.NoInfs = FMF.noInfs();
- FMFs.NoSignedZeros = FMF.noSignedZeros();
- FMFs.AllowReciprocal = FMF.allowReciprocal();
- FMFs.AllowContract = FMF.allowContract();
- FMFs.ApproxFunc = FMF.approxFunc();
+ FMFs = Op->getFastMathFlags();
}
}
+ template <typename IterT>
+ VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ CmpInst::Predicate Pred, DebugLoc DL = {})
+ : VPRecipeBase(SC, Operands, DL), OpType(OperationType::Cmp),
+ CmpPredicate(Pred) {}
+
+ template <typename IterT>
+ VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ WrapFlagsTy WrapFlags, DebugLoc DL = {})
+ : VPRecipeBase(SC, Operands, DL), OpType(OperationType::OverflowingBinOp),
+ WrapFlags(WrapFlags) {}
+
+ template <typename IterT>
+ VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ FastMathFlags FMFs, DebugLoc DL = {})
+ : VPRecipeBase(SC, Operands, DL), OpType(OperationType::FPMathOp),
+ FMFs(FMFs) {}
+
static inline bool classof(const VPRecipeBase *R) {
- return R->getVPDefID() == VPRecipeBase::VPWidenSC ||
+ return R->getVPDefID() == VPRecipeBase::VPInstructionSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenSC ||
R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC;
}
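
VPRecipeWithIRFlags keeps exactly one family of IR flags per recipe in a tagged union, selected by OpType and populated from the ingredient instruction as shown above. A reduced sketch of the same storage idea; only two of the families are modeled and all names here are illustrative:

// Keep exactly one flag family per recipe in a discriminated union.
struct WrapFlagsSk { bool HasNUW, HasNSW; };
struct FPFlagsSk { bool NoNaNs, NoInfs; };

struct RecipeFlagsSk {
  enum class Kind { Wrap, FP, Other } OpKind;
  union {
    WrapFlagsSk Wrap;
    FPFlagsSk FP;
  };

  RecipeFlagsSk() : OpKind(Kind::Other), Wrap{false, false} {}
  explicit RecipeFlagsSk(WrapFlagsSk W) : OpKind(Kind::Wrap), Wrap(W) {}
  explicit RecipeFlagsSk(FPFlagsSk F) : OpKind(Kind::FP), FP(F) {}

  // Clear only the flags that could introduce poison, as
  // dropPoisonGeneratingFlags() does for the active family.
  void dropPoisonGeneratingFlags() {
    if (OpKind == Kind::Wrap)
      Wrap = {false, false};
  }
};
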
@@ -1032,6 +950,9 @@ public:
WrapFlags.HasNUW = false;
WrapFlags.HasNSW = false;
break;
+ case OperationType::DisjointOp:
+ DisjointFlags.IsDisjoint = false;
+ break;
case OperationType::PossiblyExactOp:
ExactFlags.IsExact = false;
break;
@@ -1042,6 +963,10 @@ public:
FMFs.NoNaNs = false;
FMFs.NoInfs = false;
break;
+ case OperationType::NonNegOp:
+ NonNegFlags.NonNeg = false;
+ break;
+ case OperationType::Cmp:
case OperationType::Other:
break;
}
@@ -1054,6 +979,9 @@ public:
I->setHasNoUnsignedWrap(WrapFlags.HasNUW);
I->setHasNoSignedWrap(WrapFlags.HasNSW);
break;
+ case OperationType::DisjointOp:
+ cast<PossiblyDisjointInst>(I)->setIsDisjoint(DisjointFlags.IsDisjoint);
+ break;
case OperationType::PossiblyExactOp:
I->setIsExact(ExactFlags.IsExact);
break;
@@ -1069,43 +997,209 @@ public:
I->setHasAllowContract(FMFs.AllowContract);
I->setHasApproxFunc(FMFs.ApproxFunc);
break;
+ case OperationType::NonNegOp:
+ I->setNonNeg(NonNegFlags.NonNeg);
+ break;
+ case OperationType::Cmp:
case OperationType::Other:
break;
}
}
+ CmpInst::Predicate getPredicate() const {
+ assert(OpType == OperationType::Cmp &&
+ "recipe doesn't have a compare predicate");
+ return CmpPredicate;
+ }
+
bool isInBounds() const {
assert(OpType == OperationType::GEPOp &&
"recipe doesn't have inbounds flag");
return GEPFlags.IsInBounds;
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- FastMathFlags getFastMathFlags() const {
- FastMathFlags Res;
- Res.setAllowReassoc(FMFs.AllowReassoc);
- Res.setNoNaNs(FMFs.NoNaNs);
- Res.setNoInfs(FMFs.NoInfs);
- Res.setNoSignedZeros(FMFs.NoSignedZeros);
- Res.setAllowReciprocal(FMFs.AllowReciprocal);
- Res.setAllowContract(FMFs.AllowContract);
- Res.setApproxFunc(FMFs.ApproxFunc);
- return Res;
+ /// Returns true if the recipe has fast-math flags.
+ bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; }
+
+ FastMathFlags getFastMathFlags() const;
+
+ bool hasNoUnsignedWrap() const {
+ assert(OpType == OperationType::OverflowingBinOp &&
+ "recipe doesn't have a NUW flag");
+ return WrapFlags.HasNUW;
}
+ bool hasNoSignedWrap() const {
+ assert(OpType == OperationType::OverflowingBinOp &&
+ "recipe doesn't have a NSW flag");
+ return WrapFlags.HasNSW;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void printFlags(raw_ostream &O) const;
#endif
};
+/// This is a concrete Recipe that models a single VPlan-level instruction.
+/// While as any Recipe it may generate a sequence of IR instructions when
+/// executed, these instructions would always form a single-def expression as
+/// the VPInstruction is also a single def-use vertex.
+class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
+ friend class VPlanSlp;
+
+public:
+  /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
+ enum {
+ FirstOrderRecurrenceSplice =
+ Instruction::OtherOpsEnd + 1, // Combines the incoming and previous
+ // values of a first-order recurrence.
+ Not,
+ SLPLoad,
+ SLPStore,
+ ActiveLaneMask,
+ CalculateTripCountMinusVF,
+ // Increment the canonical IV separately for each unrolled part.
+ CanonicalIVIncrementForPart,
+ BranchOnCount,
+ BranchOnCond
+ };
+
+private:
+ typedef unsigned char OpcodeTy;
+ OpcodeTy Opcode;
+
+ /// An optional name that can be used for the generated IR instruction.
+ const std::string Name;
+
+ /// Utility method serving execute(): generates a single instance of the
+ /// modeled instruction. \returns the generated value for \p Part.
+ /// In some cases an existing value is returned rather than a generated
+ /// one.
+ Value *generateInstruction(VPTransformState &State, unsigned Part);
+
+#if !defined(NDEBUG)
+ /// Return true if the VPInstruction is a floating point math operation, i.e.
+ /// has fast-math flags.
+ bool isFPMathOp() const;
+#endif
+
+protected:
+ void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
+
+public:
+ VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
+ const Twine &Name = "")
+ : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
+ VPValue(this), Opcode(Opcode), Name(Name.str()) {}
+
+ VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
+ DebugLoc DL = {}, const Twine &Name = "")
+ : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {}
+
+ VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, VPValue *A,
+ VPValue *B, DebugLoc DL = {}, const Twine &Name = "");
+
+ VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
+ WrapFlagsTy WrapFlags, DebugLoc DL = {}, const Twine &Name = "")
+ : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, WrapFlags, DL),
+ VPValue(this), Opcode(Opcode), Name(Name.str()) {}
+
+ VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
+ FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = "");
+
+ VP_CLASSOF_IMPL(VPDef::VPInstructionSC)
+
+ unsigned getOpcode() const { return Opcode; }
+
+ /// Generate the instruction.
+ /// TODO: We currently execute only per-part unless a specific instance is
+ /// provided.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the VPInstruction to \p O.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+
+ /// Print the VPInstruction to dbgs() (for debugging).
+ LLVM_DUMP_METHOD void dump() const;
+#endif
+
+ /// Return true if this instruction may modify memory.
+ bool mayWriteToMemory() const {
+ // TODO: we can use attributes of the called function to rule out memory
+ // modifications.
+ return Opcode == Instruction::Store || Opcode == Instruction::Call ||
+ Opcode == Instruction::Invoke || Opcode == SLPStore;
+ }
+
+ bool hasResult() const {
+ // CallInst may or may not have a result, depending on the called function.
+    // Conservatively assume calls have results for now.
+ switch (getOpcode()) {
+ case Instruction::Ret:
+ case Instruction::Br:
+ case Instruction::Store:
+ case Instruction::Switch:
+ case Instruction::IndirectBr:
+ case Instruction::Resume:
+ case Instruction::CatchRet:
+ case Instruction::Unreachable:
+ case Instruction::Fence:
+ case Instruction::AtomicRMW:
+ case VPInstruction::BranchOnCond:
+ case VPInstruction::BranchOnCount:
+ return false;
+ default:
+ return true;
+ }
+ }
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ if (getOperand(0) != Op)
+ return false;
+ switch (getOpcode()) {
+ default:
+ return false;
+ case VPInstruction::ActiveLaneMask:
+ case VPInstruction::CalculateTripCountMinusVF:
+ case VPInstruction::CanonicalIVIncrementForPart:
+ case VPInstruction::BranchOnCount:
+ return true;
+ };
+ llvm_unreachable("switch should return");
+ }
+
+ /// Returns true if the recipe only uses the first part of operand \p Op.
+ bool onlyFirstPartUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ if (getOperand(0) != Op)
+ return false;
+ switch (getOpcode()) {
+ default:
+ return false;
+ case VPInstruction::BranchOnCount:
+ return true;
+ };
+ llvm_unreachable("switch should return");
+ }
+};
+
/// VPWidenRecipe is a recipe for producing a copy of vector type its
/// ingredient. This recipe covers most of the traditional vectorization cases
/// where each ingredient transforms into a vectorized version of itself.
class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue {
+ unsigned Opcode;
public:
template <typename IterT>
VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
- : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPValue(this, &I) {}
+ : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPValue(this, &I),
+ Opcode(I.getOpcode()) {}
~VPWidenRecipe() override = default;
@@ -1114,6 +1208,8 @@ public:
/// Produce widened copies of all Ingredients.
void execute(VPTransformState &State) override;
+ unsigned getOpcode() const { return Opcode; }
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
@@ -1122,7 +1218,7 @@ public:
};
/// VPWidenCastRecipe is a recipe to create vector cast instructions.
-class VPWidenCastRecipe : public VPRecipeBase, public VPValue {
+class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPValue {
/// Cast instruction opcode.
Instruction::CastOps Opcode;
@@ -1131,15 +1227,19 @@ class VPWidenCastRecipe : public VPRecipeBase, public VPValue {
public:
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
- CastInst *UI = nullptr)
- : VPRecipeBase(VPDef::VPWidenCastSC, Op), VPValue(this, UI),
+ CastInst &UI)
+ : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), VPValue(this, &UI),
Opcode(Opcode), ResultTy(ResultTy) {
- assert((!UI || UI->getOpcode() == Opcode) &&
+ assert(UI.getOpcode() == Opcode &&
"opcode of underlying cast doesn't match");
- assert((!UI || UI->getType() == ResultTy) &&
+ assert(UI.getType() == ResultTy &&
"result type of underlying cast doesn't match");
}
+ VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
+ : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), VPValue(this, nullptr),
+ Opcode(Opcode), ResultTy(ResultTy) {}
+
~VPWidenCastRecipe() override = default;
VP_CLASSOF_IMPL(VPDef::VPWidenCastSC)
@@ -1196,7 +1296,8 @@ public:
struct VPWidenSelectRecipe : public VPRecipeBase, public VPValue {
template <typename IterT>
VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands)
- : VPRecipeBase(VPDef::VPWidenSelectSC, Operands), VPValue(this, &I) {}
+ : VPRecipeBase(VPDef::VPWidenSelectSC, Operands, I.getDebugLoc()),
+ VPValue(this, &I) {}
~VPWidenSelectRecipe() override = default;
@@ -1282,8 +1383,8 @@ public:
class VPHeaderPHIRecipe : public VPRecipeBase, public VPValue {
protected:
VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr,
- VPValue *Start = nullptr)
- : VPRecipeBase(VPDefID, {}), VPValue(this, UnderlyingInstr) {
+ VPValue *Start = nullptr, DebugLoc DL = {})
+ : VPRecipeBase(VPDefID, {}, DL), VPValue(this, UnderlyingInstr) {
if (Start)
addOperand(Start);
}
@@ -1404,7 +1505,7 @@ public:
bool isCanonical() const;
/// Returns the scalar type of the induction.
- const Type *getScalarType() const {
+ Type *getScalarType() const {
return Trunc ? Trunc->getType() : IV->getType();
}
};
@@ -1565,14 +1666,13 @@ public:
/// A recipe for vectorizing a phi-node as a sequence of mask-based select
/// instructions.
class VPBlendRecipe : public VPRecipeBase, public VPValue {
- PHINode *Phi;
-
public:
/// The blend operation is a User of the incoming values and of their
/// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value
/// might be incoming with a full mask for which there is no VPValue.
VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
- : VPRecipeBase(VPDef::VPBlendSC, Operands), VPValue(this, Phi), Phi(Phi) {
+ : VPRecipeBase(VPDef::VPBlendSC, Operands, Phi->getDebugLoc()),
+ VPValue(this, Phi) {
assert(Operands.size() > 0 &&
((Operands.size() == 1) || (Operands.size() % 2 == 0)) &&
"Expected either a single incoming value or a positive even number "
@@ -1701,16 +1801,13 @@ public:
/// The Operands are {ChainOp, VecOp, [Condition]}.
class VPReductionRecipe : public VPRecipeBase, public VPValue {
  /// The recurrence descriptor for the reduction in question.
- const RecurrenceDescriptor *RdxDesc;
- /// Pointer to the TTI, needed to create the target reduction
- const TargetTransformInfo *TTI;
+ const RecurrenceDescriptor &RdxDesc;
public:
- VPReductionRecipe(const RecurrenceDescriptor *R, Instruction *I,
- VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
- const TargetTransformInfo *TTI)
+ VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I,
+ VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp)
: VPRecipeBase(VPDef::VPReductionSC, {ChainOp, VecOp}), VPValue(this, I),
- RdxDesc(R), TTI(TTI) {
+ RdxDesc(R) {
if (CondOp)
addOperand(CondOp);
}
@@ -2008,11 +2105,9 @@ public:
/// loop). VPWidenCanonicalIVRecipe represents the vector version of the
/// canonical induction variable.
class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
- DebugLoc DL;
-
public:
VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL)
- : VPHeaderPHIRecipe(VPDef::VPCanonicalIVPHISC, nullptr, StartV), DL(DL) {}
+ : VPHeaderPHIRecipe(VPDef::VPCanonicalIVPHISC, nullptr, StartV, DL) {}
~VPCanonicalIVPHIRecipe() override = default;
@@ -2032,8 +2127,8 @@ public:
#endif
/// Returns the scalar type of the induction.
- const Type *getScalarType() const {
- return getOperand(0)->getLiveInIRValue()->getType();
+ Type *getScalarType() const {
+ return getStartValue()->getLiveInIRValue()->getType();
}
/// Returns true if the recipe only uses the first lane of operand \p Op.
@@ -2043,6 +2138,13 @@ public:
return true;
}
+ /// Returns true if the recipe only uses the first part of operand \p Op.
+ bool onlyFirstPartUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
+
/// Check if the induction described by \p Kind, /p Start and \p Step is
/// canonical, i.e. has the same start, step (of 1), and type as the
/// canonical IV.
@@ -2055,12 +2157,10 @@ public:
/// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
/// remove VPActiveLaneMaskPHIRecipe.
class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
- DebugLoc DL;
-
public:
VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
- : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask),
- DL(DL) {}
+ : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask,
+ DL) {}
~VPActiveLaneMaskPHIRecipe() override = default;
@@ -2113,19 +2213,24 @@ public:
/// an IV with different start and step values, using Start + CanonicalIV *
/// Step.
class VPDerivedIVRecipe : public VPRecipeBase, public VPValue {
- /// The type of the result value. It may be smaller than the type of the
- /// induction and in this case it will get truncated to ResultTy.
- Type *ResultTy;
+ /// If not nullptr, the result of the induction will get truncated to
+ /// TruncResultTy.
+ Type *TruncResultTy;
- /// Induction descriptor for the induction the canonical IV is transformed to.
- const InductionDescriptor &IndDesc;
+ /// Kind of the induction.
+ const InductionDescriptor::InductionKind Kind;
+ /// If not nullptr, the floating point induction binary operator. Must be set
+ /// for floating point inductions.
+ const FPMathOperator *FPBinOp;
public:
VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step,
- Type *ResultTy)
+ Type *TruncResultTy)
: VPRecipeBase(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}),
- VPValue(this), ResultTy(ResultTy), IndDesc(IndDesc) {}
+ VPValue(this), TruncResultTy(TruncResultTy), Kind(IndDesc.getKind()),
+ FPBinOp(dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())) {
+ }
~VPDerivedIVRecipe() override = default;
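
As the comment above states, the derived IV is computed from the canonical IV as Start + CanonicalIV * Step, optionally truncated to TruncResultTy. A small numeric sketch of that formula, integer case only; an FP induction would instead use the recorded FPMathOperator's opcode and flags:

#include <cstdint>

// Start + CanonicalIV * Step, keeping only the low TruncBits to model the
// truncation; unsigned arithmetic keeps wrap-around well defined in the example.
uint64_t deriveIV(uint64_t Start, uint64_t CanonicalIV, uint64_t Step,
                  unsigned TruncBits = 64) {
  uint64_t Derived = Start + CanonicalIV * Step;
  if (TruncBits < 64)
    Derived &= (uint64_t{1} << TruncBits) - 1;
  return Derived;
}
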
@@ -2141,6 +2246,11 @@ public:
VPSlotTracker &SlotTracker) const override;
#endif
+ Type *getScalarType() const {
+ return TruncResultTy ? TruncResultTy
+ : getStartValue()->getLiveInIRValue()->getType();
+ }
+
VPValue *getStartValue() const { return getOperand(0); }
VPValue *getCanonicalIV() const { return getOperand(1); }
VPValue *getStepValue() const { return getOperand(2); }
@@ -2155,14 +2265,23 @@ public:
/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their scalar values.
-class VPScalarIVStepsRecipe : public VPRecipeBase, public VPValue {
- const InductionDescriptor &IndDesc;
+class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags, public VPValue {
+ Instruction::BinaryOps InductionOpcode;
public:
+ VPScalarIVStepsRecipe(VPValue *IV, VPValue *Step,
+ Instruction::BinaryOps Opcode, FastMathFlags FMFs)
+ : VPRecipeWithIRFlags(VPDef::VPScalarIVStepsSC,
+ ArrayRef<VPValue *>({IV, Step}), FMFs),
+ VPValue(this), InductionOpcode(Opcode) {}
+
VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV,
VPValue *Step)
- : VPRecipeBase(VPDef::VPScalarIVStepsSC, {IV, Step}), VPValue(this),
- IndDesc(IndDesc) {}
+ : VPScalarIVStepsRecipe(
+ IV, Step, IndDesc.getInductionOpcode(),
+ dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())
+ ? IndDesc.getInductionBinOp()->getFastMathFlags()
+ : FastMathFlags()) {}
~VPScalarIVStepsRecipe() override = default;
@@ -2445,6 +2564,9 @@ class VPlan {
/// Represents the vector trip count.
VPValue VectorTripCount;
+ /// Represents the loop-invariant VF * UF of the vector loop region.
+ VPValue VFxUF;
+
/// Holds a mapping between Values and their corresponding VPValue inside
/// VPlan.
Value2VPValueTy Value2VPValue;
@@ -2490,15 +2612,17 @@ public:
~VPlan();
- /// Create an initial VPlan with preheader and entry blocks. Creates a
- /// VPExpandSCEVRecipe for \p TripCount and uses it as plan's trip count.
+ /// Create initial VPlan skeleton, having an "entry" VPBasicBlock (wrapping
+ /// original scalar pre-header) which contains SCEV expansions that need to
+ /// happen before the CFG is modified; a VPBasicBlock for the vector
+ /// pre-header, followed by a region for the vector loop, followed by the
+ /// middle VPBasicBlock.
static VPlanPtr createInitialVPlan(const SCEV *TripCount,
ScalarEvolution &PSE);
/// Prepare the plan for execution, setting up the required live-in values.
void prepareToExecute(Value *TripCount, Value *VectorTripCount,
- Value *CanonicalIVStartValue, VPTransformState &State,
- bool IsEpilogueVectorization);
+ Value *CanonicalIVStartValue, VPTransformState &State);
/// Generate the IR code for this VPlan.
void execute(VPTransformState *State);
@@ -2522,6 +2646,9 @@ public:
/// The vector trip count.
VPValue &getVectorTripCount() { return VectorTripCount; }
+ /// Returns VF * UF of the vector loop region.
+ VPValue &getVFxUF() { return VFxUF; }
+
/// Mark the plan to indicate that using Value2VPValue is not safe any
/// longer, because it may be stale.
void disableValue2VPValue() { Value2VPValueEnabled = false; }
@@ -2583,13 +2710,10 @@ public:
return getVPValue(V);
}
- void removeVPValueFor(Value *V) {
- assert(Value2VPValueEnabled &&
- "IR value to VPValue mapping may be out of date!");
- Value2VPValue.erase(V);
- }
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the live-ins of this VPlan to \p O.
+ void printLiveIns(raw_ostream &O) const;
+
/// Print this VPlan to \p O.
void print(raw_ostream &O) const;
@@ -2628,10 +2752,6 @@ public:
return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
}
- /// Find and return the VPActiveLaneMaskPHIRecipe from the header - there
- /// be only one at most. If there isn't one, then return nullptr.
- VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi();
-
void addLiveOut(PHINode *PN, VPValue *V);
void removeLiveOut(PHINode *PN) {
@@ -2959,6 +3079,9 @@ namespace vputils {
/// Returns true if only the first lane of \p Def is used.
bool onlyFirstLaneUsed(VPValue *Def);
+/// Returns true if only the first part of \p Def is used.
+bool onlyFirstPartUsed(VPValue *Def);
+
/// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p
/// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in
/// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
new file mode 100644
index 000000000000..97a8a1803bbf
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -0,0 +1,237 @@
+//===- VPlanAnalysis.cpp - Various Analyses working on VPlan ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "VPlanAnalysis.h"
+#include "VPlan.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan"
+
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPBlendRecipe *R) {
+ Type *ResTy = inferScalarType(R->getIncomingValue(0));
+ for (unsigned I = 1, E = R->getNumIncomingValues(); I != E; ++I) {
+ VPValue *Inc = R->getIncomingValue(I);
+ assert(inferScalarType(Inc) == ResTy &&
+ "different types inferred for different incoming values");
+ CachedTypes[Inc] = ResTy;
+ }
+ return ResTy;
+}
+
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
+ switch (R->getOpcode()) {
+ case Instruction::Select: {
+ Type *ResTy = inferScalarType(R->getOperand(1));
+ VPValue *OtherV = R->getOperand(2);
+ assert(inferScalarType(OtherV) == ResTy &&
+ "different types inferred for different operands");
+ CachedTypes[OtherV] = ResTy;
+ return ResTy;
+ }
+ case VPInstruction::FirstOrderRecurrenceSplice: {
+ Type *ResTy = inferScalarType(R->getOperand(0));
+ VPValue *OtherV = R->getOperand(1);
+ assert(inferScalarType(OtherV) == ResTy &&
+ "different types inferred for different operands");
+ CachedTypes[OtherV] = ResTy;
+ return ResTy;
+ }
+ default:
+ break;
+ }
+ // Type inference not implemented for opcode.
+ LLVM_DEBUG({
+ dbgs() << "LV: Found unhandled opcode for: ";
+ R->getVPSingleValue()->dump();
+ });
+ llvm_unreachable("Unhandled opcode!");
+}
+
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) {
+ unsigned Opcode = R->getOpcode();
+ switch (Opcode) {
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return IntegerType::get(Ctx, 1);
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ Type *ResTy = inferScalarType(R->getOperand(0));
+ assert(ResTy == inferScalarType(R->getOperand(1)) &&
+ "types for both operands must match for binary op");
+ CachedTypes[R->getOperand(1)] = ResTy;
+ return ResTy;
+ }
+ case Instruction::FNeg:
+ case Instruction::Freeze:
+ return inferScalarType(R->getOperand(0));
+ default:
+ break;
+ }
+
+ // Type inference not implemented for opcode.
+ LLVM_DEBUG({
+ dbgs() << "LV: Found unhandled opcode for: ";
+ R->getVPSingleValue()->dump();
+ });
+ llvm_unreachable("Unhandled opcode!");
+}
+
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
+ auto &CI = *cast<CallInst>(R->getUnderlyingInstr());
+ return CI.getType();
+}
+
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(
+ const VPWidenMemoryInstructionRecipe *R) {
+ assert(!R->isStore() && "Store recipes should not define any values");
+ return cast<LoadInst>(&R->getIngredient())->getType();
+}
+
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenSelectRecipe *R) {
+ Type *ResTy = inferScalarType(R->getOperand(1));
+ VPValue *OtherV = R->getOperand(2);
+ assert(inferScalarType(OtherV) == ResTy &&
+ "different types inferred for different operands");
+ CachedTypes[OtherV] = ResTy;
+ return ResTy;
+}
+
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) {
+ switch (R->getUnderlyingInstr()->getOpcode()) {
+ case Instruction::Call: {
+ unsigned CallIdx = R->getNumOperands() - (R->isPredicated() ? 2 : 1);
+ return cast<Function>(R->getOperand(CallIdx)->getLiveInIRValue())
+ ->getReturnType();
+ }
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ Type *ResTy = inferScalarType(R->getOperand(0));
+ assert(ResTy == inferScalarType(R->getOperand(1)) &&
+ "inferred types for operands of binary op don't match");
+ CachedTypes[R->getOperand(1)] = ResTy;
+ return ResTy;
+ }
+ case Instruction::Select: {
+ Type *ResTy = inferScalarType(R->getOperand(1));
+ assert(ResTy == inferScalarType(R->getOperand(2)) &&
+ "inferred types for operands of select op don't match");
+ CachedTypes[R->getOperand(2)] = ResTy;
+ return ResTy;
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return IntegerType::get(Ctx, 1);
+ case Instruction::Alloca:
+ case Instruction::BitCast:
+ case Instruction::Trunc:
+ case Instruction::SExt:
+ case Instruction::ZExt:
+ case Instruction::FPExt:
+ case Instruction::FPTrunc:
+ case Instruction::ExtractValue:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::FPToSI:
+ case Instruction::FPToUI:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ return R->getUnderlyingInstr()->getType();
+ case Instruction::Freeze:
+ case Instruction::FNeg:
+ case Instruction::GetElementPtr:
+ return inferScalarType(R->getOperand(0));
+ case Instruction::Load:
+ return cast<LoadInst>(R->getUnderlyingInstr())->getType();
+ case Instruction::Store:
+ // FIXME: VPReplicateRecipes with store opcodes still define a result
+ // VPValue, so we need to handle them here. Remove the code here once this
+ // is modeled accurately in VPlan.
+ return Type::getVoidTy(Ctx);
+ default:
+ break;
+ }
+ // Type inference not implemented for opcode.
+ LLVM_DEBUG({
+ dbgs() << "LV: Found unhandled opcode for: ";
+ R->getVPSingleValue()->dump();
+ });
+ llvm_unreachable("Unhandled opcode");
+}
+
+Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
+ if (Type *CachedTy = CachedTypes.lookup(V))
+ return CachedTy;
+
+ if (V->isLiveIn())
+ return V->getLiveInIRValue()->getType();
+
+ Type *ResultTy =
+ TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
+ .Case<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe,
+ VPReductionPHIRecipe, VPWidenPointerInductionRecipe>(
+ [this](const auto *R) {
+                // Handle header phi recipes, except VPWidenIntOrFpInduction
+                // which needs special handling due to it being possibly truncated.
+ // TODO: consider inferring/caching type of siblings, e.g.,
+ // backedge value, here and in cases below.
+ return inferScalarType(R->getStartValue());
+ })
+ .Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
+ [](const auto *R) { return R->getScalarType(); })
+ .Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe,
+ VPWidenGEPRecipe>([this](const VPRecipeBase *R) {
+ return inferScalarType(R->getOperand(0));
+ })
+ .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
+ VPWidenCallRecipe, VPWidenMemoryInstructionRecipe,
+ VPWidenSelectRecipe>(
+ [this](const auto *R) { return inferScalarTypeForRecipe(R); })
+ .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
+ // TODO: Use info from interleave group.
+ return V->getUnderlyingValue()->getType();
+ })
+ .Case<VPWidenCastRecipe>(
+ [](const VPWidenCastRecipe *R) { return R->getResultType(); });
+ assert(ResultTy && "could not infer type for the given VPValue");
+ CachedTypes[V] = ResultTy;
+ return ResultTy;
+}
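
inferScalarType above works bottom-up: it returns cached or live-in types immediately and otherwise dispatches on the defining recipe, caching every result so repeated queries stay cheap. A compact sketch of the same memoized recursion over a toy expression graph, not the VPlan classes; non-live-in nodes are assumed to have at least one operand and to take their type from operand 0:

#include <string>
#include <unordered_map>
#include <vector>

struct Node {
  bool IsLiveIn = false;
  std::string LiveInType;              // known type for live-ins
  std::vector<const Node *> Operands;  // defining operands otherwise
};

class TypeInferenceSketch {
  std::unordered_map<const Node *, std::string> Cache;

public:
  const std::string &infer(const Node *N) {
    auto It = Cache.find(N);
    if (It != Cache.end())
      return It->second; // memoized result

    // Root nodes (live-ins) have known types; everything else propagates the
    // type of its first operand in this simplified model.
    std::string Ty = N->IsLiveIn ? N->LiveInType : infer(N->Operands.front());
    return Cache[N] = Ty;
  }
};
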
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
new file mode 100644
index 000000000000..7276641551ae
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -0,0 +1,61 @@
+//===- VPlanAnalysis.h - Various Analyses working on VPlan ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H
+
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+
+class LLVMContext;
+class VPValue;
+class VPBlendRecipe;
+class VPInstruction;
+class VPWidenRecipe;
+class VPWidenCallRecipe;
+class VPWidenIntOrFpInductionRecipe;
+class VPWidenMemoryInstructionRecipe;
+struct VPWidenSelectRecipe;
+class VPReplicateRecipe;
+class Type;
+
+/// An analysis for type-inference for VPValues.
+/// It infers the scalar type for a given VPValue by traversing bottom-up
+/// through its defining recipes until root nodes with known types are reached
+/// (e.g. live-ins or load recipes). The types are then propagated top-down
+/// through operations.
+/// Note that the analysis caches the inferred types. A new analysis object must
+/// be constructed once a VPlan has been modified in a way that invalidates any
+/// of the previously inferred types.
+class VPTypeAnalysis {
+ DenseMap<const VPValue *, Type *> CachedTypes;
+ LLVMContext &Ctx;
+
+ Type *inferScalarTypeForRecipe(const VPBlendRecipe *R);
+ Type *inferScalarTypeForRecipe(const VPInstruction *R);
+ Type *inferScalarTypeForRecipe(const VPWidenCallRecipe *R);
+ Type *inferScalarTypeForRecipe(const VPWidenRecipe *R);
+ Type *inferScalarTypeForRecipe(const VPWidenIntOrFpInductionRecipe *R);
+ Type *inferScalarTypeForRecipe(const VPWidenMemoryInstructionRecipe *R);
+ Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R);
+ Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R);
+
+public:
+ VPTypeAnalysis(LLVMContext &Ctx) : Ctx(Ctx) {}
+
+ /// Infer the type of \p V. Returns the scalar type of \p V.
+ Type *inferScalarType(const VPValue *V);
+
+ /// Return the LLVMContext used by the analysis.
+ LLVMContext &getContext() { return Ctx; }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H
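Based only on the interface above, a minimal usage sketch (editorial, not part of the patch; Ctx and Def stand in for whatever LLVMContext and VPValue the caller already has):

  #include "VPlanAnalysis.h"

  // One analysis per VPlan snapshot; results are memoized in CachedTypes.
  VPTypeAnalysis TypeInfo(Ctx);                   // Ctx: the function's LLVMContext
  Type *ScalarTy = TypeInfo.inferScalarType(Def); // Def: any VPValue in the plan
  // After a transform that may change inferred types, construct a fresh
  // VPTypeAnalysis instead of reusing TypeInfo; cached entries are not
  // invalidated automatically.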
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index f6e3a2a16db8..f950d4740e41 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -61,6 +61,7 @@ private:
// Utility functions.
void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
+ void setRegionPredsFromBB(VPRegionBlock *VPBB, BasicBlock *BB);
void fixPhiNodes();
VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
#ifndef NDEBUG
@@ -81,14 +82,43 @@ public:
// Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
// must have no predecessors.
void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
- SmallVector<VPBlockBase *, 8> VPBBPreds;
+ auto GetLatchOfExit = [this](BasicBlock *BB) -> BasicBlock * {
+ auto *SinglePred = BB->getSinglePredecessor();
+ Loop *LoopForBB = LI->getLoopFor(BB);
+ if (!SinglePred || LI->getLoopFor(SinglePred) == LoopForBB)
+ return nullptr;
+ // The input IR must be in loop-simplify form, ensuring a single predecessor
+ // for exit blocks.
+ assert(SinglePred == LI->getLoopFor(SinglePred)->getLoopLatch() &&
+ "SinglePred must be the only loop latch");
+ return SinglePred;
+ };
+ if (auto *LatchBB = GetLatchOfExit(BB)) {
+ auto *PredRegion = getOrCreateVPBB(LatchBB)->getParent();
+ assert(VPBB == cast<VPBasicBlock>(PredRegion->getSingleSuccessor()) &&
+ "successor must already be set for PredRegion; it must have VPBB "
+ "as single successor");
+ VPBB->setPredecessors({PredRegion});
+ return;
+ }
// Collect VPBB predecessors.
+ SmallVector<VPBlockBase *, 2> VPBBPreds;
for (BasicBlock *Pred : predecessors(BB))
VPBBPreds.push_back(getOrCreateVPBB(Pred));
-
VPBB->setPredecessors(VPBBPreds);
}
+static bool isHeaderBB(BasicBlock *BB, Loop *L) {
+ return L && BB == L->getHeader();
+}
+
+void PlainCFGBuilder::setRegionPredsFromBB(VPRegionBlock *Region,
+ BasicBlock *BB) {
+ // BB is a loop header block. Connect the region to the loop preheader.
+ Loop *LoopOfBB = LI->getLoopFor(BB);
+ Region->setPredecessors({getOrCreateVPBB(LoopOfBB->getLoopPredecessor())});
+}
+
// Add operands to VPInstructions representing phi nodes from the input IR.
void PlainCFGBuilder::fixPhiNodes() {
for (auto *Phi : PhisToFix) {
@@ -100,38 +130,85 @@ void PlainCFGBuilder::fixPhiNodes() {
assert(VPPhi->getNumOperands() == 0 &&
"Expected VPInstruction with no operands.");
+ Loop *L = LI->getLoopFor(Phi->getParent());
+ if (isHeaderBB(Phi->getParent(), L)) {
+ // For header phis, make sure the incoming value from the loop
+ // predecessor is the first operand of the recipe.
+ assert(Phi->getNumOperands() == 2);
+ BasicBlock *LoopPred = L->getLoopPredecessor();
+ VPPhi->addIncoming(
+ getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopPred)),
+ BB2VPBB[LoopPred]);
+ BasicBlock *LoopLatch = L->getLoopLatch();
+ VPPhi->addIncoming(
+ getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopLatch)),
+ BB2VPBB[LoopLatch]);
+ continue;
+ }
+
for (unsigned I = 0; I != Phi->getNumOperands(); ++I)
VPPhi->addIncoming(getOrCreateVPOperand(Phi->getIncomingValue(I)),
BB2VPBB[Phi->getIncomingBlock(I)]);
}
}
+static bool isHeaderVPBB(VPBasicBlock *VPBB) {
+ return VPBB->getParent() && VPBB->getParent()->getEntry() == VPBB;
+}
+
+/// Return true if loop \p L is contained within \p OuterLoop.
+static bool doesContainLoop(const Loop *L, const Loop *OuterLoop) {
+ if (L->getLoopDepth() < OuterLoop->getLoopDepth())
+ return false;
+ const Loop *P = L;
+ while (P) {
+ if (P == OuterLoop)
+ return true;
+ P = P->getParentLoop();
+ }
+ return false;
+}
+
// Create a new empty VPBasicBlock for an incoming BasicBlock in the region
// corresponding to the containing loop or retrieve an existing one if it was
// already created. If no region exists yet for the loop containing \p BB, a new
// one is created.
VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
- auto BlockIt = BB2VPBB.find(BB);
- if (BlockIt != BB2VPBB.end())
+ if (auto *VPBB = BB2VPBB.lookup(BB)) {
// Retrieve existing VPBB.
- return BlockIt->second;
-
- // Get or create a region for the loop containing BB.
- Loop *CurrentLoop = LI->getLoopFor(BB);
- VPRegionBlock *ParentR = nullptr;
- if (CurrentLoop) {
- auto Iter = Loop2Region.insert({CurrentLoop, nullptr});
- if (Iter.second)
- Iter.first->second = new VPRegionBlock(
- CurrentLoop->getHeader()->getName().str(), false /*isReplicator*/);
- ParentR = Iter.first->second;
+ return VPBB;
}
// Create new VPBB.
- LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << BB->getName() << "\n");
- VPBasicBlock *VPBB = new VPBasicBlock(BB->getName());
+ StringRef Name = isHeaderBB(BB, TheLoop) ? "vector.body" : BB->getName();
+ LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n");
+ VPBasicBlock *VPBB = new VPBasicBlock(Name);
BB2VPBB[BB] = VPBB;
- VPBB->setParent(ParentR);
+
+ // Get or create a region for the loop containing BB.
+ Loop *LoopOfBB = LI->getLoopFor(BB);
+ if (!LoopOfBB || !doesContainLoop(LoopOfBB, TheLoop))
+ return VPBB;
+
+ auto *RegionOfVPBB = Loop2Region.lookup(LoopOfBB);
+ if (!isHeaderBB(BB, LoopOfBB)) {
+ assert(RegionOfVPBB &&
+ "Region should have been created by visiting header earlier");
+ VPBB->setParent(RegionOfVPBB);
+ return VPBB;
+ }
+
+ assert(!RegionOfVPBB &&
+ "First visit of a header basic block expects to register its region.");
+ // Handle a header - take care of its Region.
+ if (LoopOfBB == TheLoop) {
+ RegionOfVPBB = Plan.getVectorLoopRegion();
+ } else {
+ RegionOfVPBB = new VPRegionBlock(Name.str(), false /*isReplicator*/);
+ RegionOfVPBB->setParent(Loop2Region[LoopOfBB->getParentLoop()]);
+ }
+ RegionOfVPBB->setEntry(VPBB);
+ Loop2Region[LoopOfBB] = RegionOfVPBB;
return VPBB;
}
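For orientation, an editorial sketch (not part of the patch) of how a two-deep loop nest in loop-simplify form is expected to map after the logic above, with the outermost loop reusing the skeleton's vector loop region and the inner loop getting a freshly created region parented to it:

  preheader          -> vector-preheader VPBB, single predecessor of the top region
  outer header/latch -> entry/exiting VPBBs of the top-level VPRegionBlock
  inner header/latch -> entry/exiting VPBBs of a nested VPRegionBlock whose
                        parent is the top-level region
  loop exit          -> VPBB that is the single successor of the top region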
@@ -254,6 +331,25 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
// Main interface to build the plain CFG.
void PlainCFGBuilder::buildPlainCFG() {
+ // 0. Reuse the top-level region, vector-preheader and exit VPBBs from the
+  // skeleton. These were created directly rather than via getOrCreateVPBB(),
+  // so revisit them now to update BB2VPBB. Note that the header/entry and
+  // latch/exiting VPBBs of the top-level region have yet to be created.
+ VPRegionBlock *TheRegion = Plan.getVectorLoopRegion();
+ BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader();
+ assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
+ "Unexpected loop preheader");
+ auto *VectorPreheaderVPBB =
+ cast<VPBasicBlock>(TheRegion->getSinglePredecessor());
+ // ThePreheaderBB conceptually corresponds to both Plan.getPreheader() (which
+ // wraps the original preheader BB) and Plan.getEntry() (which represents the
+ // new vector preheader); here we're interested in setting BB2VPBB to the
+ // latter.
+ BB2VPBB[ThePreheaderBB] = VectorPreheaderVPBB;
+ BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock();
+ assert(LoopExitBB && "Loops with multiple exits are not supported.");
+ BB2VPBB[LoopExitBB] = cast<VPBasicBlock>(TheRegion->getSingleSuccessor());
+
// 1. Scan the body of the loop in a topological order to visit each basic
// block after having visited its predecessor basic blocks. Create a VPBB for
// each BB and link it to its successor and predecessor VPBBs. Note that
@@ -263,21 +359,11 @@ void PlainCFGBuilder::buildPlainCFG() {
// Loop PH needs to be explicitly visited since it's not taken into account by
// LoopBlocksDFS.
- BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader();
- assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
- "Unexpected loop preheader");
- VPBasicBlock *ThePreheaderVPBB = Plan.getEntry();
- BB2VPBB[ThePreheaderBB] = ThePreheaderVPBB;
- ThePreheaderVPBB->setName("vector.ph");
for (auto &I : *ThePreheaderBB) {
if (I.getType()->isVoidTy())
continue;
IRDef2VPValue[&I] = Plan.getVPValueOrAddLiveIn(&I);
}
- // Create empty VPBB for Loop H so that we can link PH->H.
- VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader());
- HeaderVPBB->setName("vector.body");
- ThePreheaderVPBB->setOneSuccessor(HeaderVPBB);
LoopBlocksRPO RPO(TheLoop);
RPO.perform(LI);
@@ -286,88 +372,55 @@ void PlainCFGBuilder::buildPlainCFG() {
// Create or retrieve the VPBasicBlock for this BB and create its
// VPInstructions.
VPBasicBlock *VPBB = getOrCreateVPBB(BB);
+ VPRegionBlock *Region = VPBB->getParent();
createVPInstructionsForVPBB(VPBB, BB);
+ Loop *LoopForBB = LI->getLoopFor(BB);
+ // Set VPBB predecessors in the same order as they are in the incoming BB.
+ if (!isHeaderBB(BB, LoopForBB)) {
+ setVPBBPredsFromBB(VPBB, BB);
+ } else {
+ // BB is a loop header, set the predecessor for the region, except for the
+ // top region, whose predecessor was set when creating VPlan's skeleton.
+ assert(isHeaderVPBB(VPBB) && "isHeaderBB and isHeaderVPBB disagree");
+ if (TheRegion != Region)
+ setRegionPredsFromBB(Region, BB);
+ }
// Set VPBB successors. We create empty VPBBs for successors if they don't
// exist already. Recipes will be created when the successor is visited
// during the RPO traversal.
- Instruction *TI = BB->getTerminator();
- assert(TI && "Terminator expected.");
- unsigned NumSuccs = TI->getNumSuccessors();
-
+ auto *BI = cast<BranchInst>(BB->getTerminator());
+ unsigned NumSuccs = succ_size(BB);
if (NumSuccs == 1) {
- VPBasicBlock *SuccVPBB = getOrCreateVPBB(TI->getSuccessor(0));
- assert(SuccVPBB && "VPBB Successor not found.");
- VPBB->setOneSuccessor(SuccVPBB);
- } else if (NumSuccs == 2) {
- VPBasicBlock *SuccVPBB0 = getOrCreateVPBB(TI->getSuccessor(0));
- assert(SuccVPBB0 && "Successor 0 not found.");
- VPBasicBlock *SuccVPBB1 = getOrCreateVPBB(TI->getSuccessor(1));
- assert(SuccVPBB1 && "Successor 1 not found.");
-
- // Get VPBB's condition bit.
- assert(isa<BranchInst>(TI) && "Unsupported terminator!");
- // Look up the branch condition to get the corresponding VPValue
- // representing the condition bit in VPlan (which may be in another VPBB).
- assert(IRDef2VPValue.count(cast<BranchInst>(TI)->getCondition()) &&
- "Missing condition bit in IRDef2VPValue!");
-
- // Link successors.
- VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1);
- } else
- llvm_unreachable("Number of successors not supported.");
-
- // Set VPBB predecessors in the same order as they are in the incoming BB.
- setVPBBPredsFromBB(VPBB, BB);
+ auto *Successor = getOrCreateVPBB(BB->getSingleSuccessor());
+ VPBB->setOneSuccessor(isHeaderVPBB(Successor)
+ ? Successor->getParent()
+ : static_cast<VPBlockBase *>(Successor));
+ continue;
+ }
+    assert(BI->isConditional() && NumSuccs == 2 &&
+           "block must have conditional branch with 2 successors");
+ // Look up the branch condition to get the corresponding VPValue
+ // representing the condition bit in VPlan (which may be in another VPBB).
+ assert(IRDef2VPValue.contains(BI->getCondition()) &&
+ "Missing condition bit in IRDef2VPValue!");
+ VPBasicBlock *Successor0 = getOrCreateVPBB(BI->getSuccessor(0));
+ VPBasicBlock *Successor1 = getOrCreateVPBB(BI->getSuccessor(1));
+ if (!LoopForBB || BB != LoopForBB->getLoopLatch()) {
+ VPBB->setTwoSuccessors(Successor0, Successor1);
+ continue;
+ }
+ // For a latch we need to set the successor of the region rather than that
+ // of VPBB and it should be set to the exit, i.e., non-header successor,
+ // except for the top region, whose successor was set when creating VPlan's
+ // skeleton.
+ if (TheRegion != Region)
+ Region->setOneSuccessor(isHeaderVPBB(Successor0) ? Successor1
+ : Successor0);
+ Region->setExiting(VPBB);
}
- // 2. Process outermost loop exit. We created an empty VPBB for the loop
- // single exit BB during the RPO traversal of the loop body but Instructions
- // weren't visited because it's not part of the the loop.
- BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock();
- assert(LoopExitBB && "Loops with multiple exits are not supported.");
- VPBasicBlock *LoopExitVPBB = BB2VPBB[LoopExitBB];
- // Loop exit was already set as successor of the loop exiting BB.
- // We only set its predecessor VPBB now.
- setVPBBPredsFromBB(LoopExitVPBB, LoopExitBB);
-
- // 3. Fix up region blocks for loops. For each loop,
- // * use the header block as entry to the corresponding region,
- // * use the latch block as exit of the corresponding region,
- // * set the region as successor of the loop pre-header, and
- // * set the exit block as successor to the region.
- SmallVector<Loop *> LoopWorkList;
- LoopWorkList.push_back(TheLoop);
- while (!LoopWorkList.empty()) {
- Loop *L = LoopWorkList.pop_back_val();
- BasicBlock *Header = L->getHeader();
- BasicBlock *Exiting = L->getLoopLatch();
- assert(Exiting == L->getExitingBlock() &&
- "Latch must be the only exiting block");
- VPRegionBlock *Region = Loop2Region[L];
- VPBasicBlock *HeaderVPBB = getOrCreateVPBB(Header);
- VPBasicBlock *ExitingVPBB = getOrCreateVPBB(Exiting);
-
- // Disconnect backedge and pre-header from header.
- VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(L->getLoopPreheader());
- VPBlockUtils::disconnectBlocks(PreheaderVPBB, HeaderVPBB);
- VPBlockUtils::disconnectBlocks(ExitingVPBB, HeaderVPBB);
-
- Region->setParent(PreheaderVPBB->getParent());
- Region->setEntry(HeaderVPBB);
- VPBlockUtils::connectBlocks(PreheaderVPBB, Region);
-
- // Disconnect exit block from exiting (=latch) block, set exiting block and
- // connect region to exit block.
- VPBasicBlock *ExitVPBB = getOrCreateVPBB(L->getExitBlock());
- VPBlockUtils::disconnectBlocks(ExitingVPBB, ExitVPBB);
- Region->setExiting(ExitingVPBB);
- VPBlockUtils::connectBlocks(Region, ExitVPBB);
-
- // Queue sub-loops for processing.
- LoopWorkList.append(L->begin(), L->end());
- }
- // 4. The whole CFG has been built at this point so all the input Values must
+ // 2. The whole CFG has been built at this point so all the input Values must
// have a VPlan counterpart. Fix VPlan phi nodes by adding their corresponding
// VPlan operands.
fixPhiNodes();
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 26c309eed800..02e400d590be 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "VPlan.h"
+#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
@@ -43,6 +44,8 @@ extern cl::opt<bool> EnableVPlanNativePath;
bool VPRecipeBase::mayWriteToMemory() const {
switch (getVPDefID()) {
+ case VPInterleaveSC:
+ return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
case VPWidenMemoryInstructionSC: {
return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
}
@@ -114,6 +117,16 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPDerivedIVSC:
case VPPredInstPHISC:
return false;
+ case VPInstructionSC:
+ switch (cast<VPInstruction>(this)->getOpcode()) {
+ case Instruction::ICmp:
+ case VPInstruction::Not:
+ case VPInstruction::CalculateTripCountMinusVF:
+ case VPInstruction::CanonicalIVIncrementForPart:
+ return false;
+ default:
+ return true;
+ }
case VPWidenCallSC:
return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
->mayHaveSideEffects();
@@ -135,6 +148,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
"underlying instruction has side-effects");
return false;
}
+ case VPInterleaveSC:
+ return mayWriteToMemory();
case VPWidenMemoryInstructionSC:
assert(cast<VPWidenMemoryInstructionRecipe>(this)
->getIngredient()
@@ -156,8 +171,13 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
VPValue *ExitValue = getOperand(0);
if (vputils::isUniformAfterVectorization(ExitValue))
Lane = VPLane::getFirstLane();
+ VPBasicBlock *MiddleVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+ assert(MiddleVPBB->getNumSuccessors() == 0 &&
+ "the middle block must not have any successors");
+ BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
- State.Builder.GetInsertBlock());
+ MiddleBB);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -216,15 +236,55 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
insertBefore(BB, I);
}
+FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
+ assert(OpType == OperationType::FPMathOp &&
+ "recipe doesn't have fast math flags");
+ FastMathFlags Res;
+ Res.setAllowReassoc(FMFs.AllowReassoc);
+ Res.setNoNaNs(FMFs.NoNaNs);
+ Res.setNoInfs(FMFs.NoInfs);
+ Res.setNoSignedZeros(FMFs.NoSignedZeros);
+ Res.setAllowReciprocal(FMFs.AllowReciprocal);
+ Res.setAllowContract(FMFs.AllowContract);
+ Res.setApproxFunc(FMFs.ApproxFunc);
+ return Res;
+}
+
+VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
+ VPValue *A, VPValue *B, DebugLoc DL,
+ const Twine &Name)
+ : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
+ Pred, DL),
+ VPValue(this), Opcode(Opcode), Name(Name.str()) {
+ assert(Opcode == Instruction::ICmp &&
+ "only ICmp predicates supported at the moment");
+}
+
+VPInstruction::VPInstruction(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands,
+ FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
+ : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
+ VPValue(this), Opcode(Opcode), Name(Name.str()) {
+ // Make sure the VPInstruction is a floating-point operation.
+ assert(isFPMathOp() && "this op can't take fast-math flags");
+}
+
Value *VPInstruction::generateInstruction(VPTransformState &State,
unsigned Part) {
IRBuilderBase &Builder = State.Builder;
- Builder.SetCurrentDebugLocation(DL);
+ Builder.SetCurrentDebugLocation(getDebugLoc());
if (Instruction::isBinaryOp(getOpcode())) {
+ if (Part != 0 && vputils::onlyFirstPartUsed(this))
+ return State.get(this, 0);
+
Value *A = State.get(getOperand(0), Part);
Value *B = State.get(getOperand(1), Part);
- return Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
+ auto *Res =
+ Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
+ if (auto *I = dyn_cast<Instruction>(Res))
+ setFlags(I);
+ return Res;
}
switch (getOpcode()) {
@@ -232,10 +292,10 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
Value *A = State.get(getOperand(0), Part);
return Builder.CreateNot(A, Name);
}
- case VPInstruction::ICmpULE: {
- Value *IV = State.get(getOperand(0), Part);
- Value *TC = State.get(getOperand(1), Part);
- return Builder.CreateICmpULE(IV, TC, Name);
+ case Instruction::ICmp: {
+ Value *A = State.get(getOperand(0), Part);
+ Value *B = State.get(getOperand(1), Part);
+ return Builder.CreateCmp(getPredicate(), A, B, Name);
}
case Instruction::Select: {
Value *Cond = State.get(getOperand(0), Part);
@@ -285,23 +345,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
return Builder.CreateSelect(Cmp, Sub, Zero);
}
- case VPInstruction::CanonicalIVIncrement:
- case VPInstruction::CanonicalIVIncrementNUW: {
- if (Part == 0) {
- bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
- auto *Phi = State.get(getOperand(0), 0);
- // The loop step is equal to the vectorization factor (num of SIMD
- // elements) times the unroll factor (num of SIMD instructions).
- Value *Step =
- createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
- return Builder.CreateAdd(Phi, Step, Name, IsNUW, false);
- }
- return State.get(this, 0);
- }
-
- case VPInstruction::CanonicalIVIncrementForPart:
- case VPInstruction::CanonicalIVIncrementForPartNUW: {
- bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW;
+ case VPInstruction::CanonicalIVIncrementForPart: {
auto *IV = State.get(getOperand(0), VPIteration(0, 0));
if (Part == 0)
return IV;
@@ -309,7 +353,8 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
// The canonical IV is incremented by the vectorization factor (num of SIMD
// elements) times the unroll part.
Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
- return Builder.CreateAdd(IV, Step, Name, IsNUW, false);
+ return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
+ hasNoSignedWrap());
}
case VPInstruction::BranchOnCond: {
if (Part != 0)
@@ -361,10 +406,25 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
}
}
+#if !defined(NDEBUG)
+bool VPInstruction::isFPMathOp() const {
+ // Inspired by FPMathOperator::classof. Notable differences are that we don't
+ // support Call, PHI and Select opcodes here yet.
+ return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
+ Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
+ Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
+ Opcode == Instruction::FCmp || Opcode == Instruction::Select;
+}
+#endif
+
void VPInstruction::execute(VPTransformState &State) {
assert(!State.Instance && "VPInstruction executing an Instance");
IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
- State.Builder.setFastMathFlags(FMF);
+ assert((hasFastMathFlags() == isFPMathOp() ||
+ getOpcode() == Instruction::Select) &&
+ "Recipe not a FPMathOp but has fast-math flags?");
+ if (hasFastMathFlags())
+ State.Builder.setFastMathFlags(getFastMathFlags());
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *GeneratedValue = generateInstruction(State, Part);
if (!hasResult())
@@ -393,9 +453,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::Not:
O << "not";
break;
- case VPInstruction::ICmpULE:
- O << "icmp ule";
- break;
case VPInstruction::SLPLoad:
O << "combined load";
break;
@@ -408,12 +465,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::FirstOrderRecurrenceSplice:
O << "first-order splice";
break;
- case VPInstruction::CanonicalIVIncrement:
- O << "VF * UF + ";
- break;
- case VPInstruction::CanonicalIVIncrementNUW:
- O << "VF * UF +(nuw) ";
- break;
case VPInstruction::BranchOnCond:
O << "branch-on-cond";
break;
@@ -421,49 +472,35 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
O << "TC > VF ? TC - VF : 0";
break;
case VPInstruction::CanonicalIVIncrementForPart:
- O << "VF * Part + ";
- break;
- case VPInstruction::CanonicalIVIncrementForPartNUW:
- O << "VF * Part +(nuw) ";
+ O << "VF * Part +";
break;
case VPInstruction::BranchOnCount:
- O << "branch-on-count ";
+ O << "branch-on-count";
break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
- O << FMF;
-
- for (const VPValue *Operand : operands()) {
- O << " ";
- Operand->printAsOperand(O, SlotTracker);
- }
+ printFlags(O);
+ printOperands(O, SlotTracker);
- if (DL) {
+ if (auto DL = getDebugLoc()) {
O << ", !dbg ";
DL.print(O);
}
}
#endif
-void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
- // Make sure the VPInstruction is a floating-point operation.
- assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
- Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
- Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
- Opcode == Instruction::FCmp) &&
- "this op can't take fast-math flags");
- FMF = FMFNew;
-}
-
void VPWidenCallRecipe::execute(VPTransformState &State) {
assert(State.VF.isVector() && "not widening");
auto &CI = *cast<CallInst>(getUnderlyingInstr());
assert(!isa<DbgInfoIntrinsic>(CI) &&
"DbgInfoIntrinsic should have been dropped during VPlan construction");
- State.setDebugLocFromInst(&CI);
+ State.setDebugLocFrom(CI.getDebugLoc());
+ FunctionType *VFTy = nullptr;
+ if (Variant)
+ VFTy = Variant->getFunctionType();
for (unsigned Part = 0; Part < State.UF; ++Part) {
SmallVector<Type *, 2> TysForDecl;
// Add return type if intrinsic is overloaded on it.
@@ -475,12 +512,15 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
for (const auto &I : enumerate(operands())) {
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
+ // Some vectorized function variants may also take a scalar argument,
+ // e.g. linear parameters for pointers.
Value *Arg;
- if (VectorIntrinsicID == Intrinsic::not_intrinsic ||
- !isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
- Arg = State.get(I.value(), Part);
- else
+ if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) ||
+ (VectorIntrinsicID != Intrinsic::not_intrinsic &&
+ isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())))
Arg = State.get(I.value(), VPIteration(0, 0));
+ else
+ Arg = State.get(I.value(), Part);
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
TysForDecl.push_back(Arg->getType());
Args.push_back(Arg);
@@ -553,8 +593,7 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
#endif
void VPWidenSelectRecipe::execute(VPTransformState &State) {
- auto &I = *cast<SelectInst>(getUnderlyingInstr());
- State.setDebugLocFromInst(&I);
+ State.setDebugLocFrom(getDebugLoc());
// The condition can be loop invariant but still defined inside the
// loop. This means that we can't just use the original 'cond' value.
@@ -569,13 +608,31 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) {
Value *Op1 = State.get(getOperand(2), Part);
Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
State.set(this, Sel, Part);
- State.addMetadata(Sel, &I);
+ State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}
}
+VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
+ const FastMathFlags &FMF) {
+ AllowReassoc = FMF.allowReassoc();
+ NoNaNs = FMF.noNaNs();
+ NoInfs = FMF.noInfs();
+ NoSignedZeros = FMF.noSignedZeros();
+ AllowReciprocal = FMF.allowReciprocal();
+ AllowContract = FMF.allowContract();
+ ApproxFunc = FMF.approxFunc();
+}
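Together with VPRecipeWithIRFlags::getFastMathFlags() above, this constructor forms a pack/unpack pair for the seven fast-math bits. A self-contained sketch of the same pattern (PackedFMF, packFMF and unpackFMF are illustrative names, not the recipe's actual members):

  #include "llvm/IR/FMF.h"

  struct PackedFMF { // stand-in for the recipe's FastMathFlagsTy bitfield
    unsigned AllowReassoc : 1, NoNaNs : 1, NoInfs : 1, NoSignedZeros : 1,
        AllowReciprocal : 1, AllowContract : 1, ApproxFunc : 1;
  };

  static PackedFMF packFMF(llvm::FastMathFlags F) {
    PackedFMF P;
    P.AllowReassoc = F.allowReassoc();
    P.NoNaNs = F.noNaNs();
    P.NoInfs = F.noInfs();
    P.NoSignedZeros = F.noSignedZeros();
    P.AllowReciprocal = F.allowReciprocal();
    P.AllowContract = F.allowContract();
    P.ApproxFunc = F.approxFunc();
    return P;
  }

  static llvm::FastMathFlags unpackFMF(const PackedFMF &P) {
    llvm::FastMathFlags F;
    F.setAllowReassoc(P.AllowReassoc);
    F.setNoNaNs(P.NoNaNs);
    F.setNoInfs(P.NoInfs);
    F.setNoSignedZeros(P.NoSignedZeros);
    F.setAllowReciprocal(P.AllowReciprocal);
    F.setAllowContract(P.AllowContract);
    F.setApproxFunc(P.ApproxFunc);
    return F; // unpackFMF(packFMF(F)) reproduces all seven flags
  }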
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
switch (OpType) {
+ case OperationType::Cmp:
+ O << " " << CmpInst::getPredicateName(getPredicate());
+ break;
+ case OperationType::DisjointOp:
+ if (DisjointFlags.IsDisjoint)
+ O << " disjoint";
+ break;
case OperationType::PossiblyExactOp:
if (ExactFlags.IsExact)
O << " exact";
@@ -593,17 +650,22 @@ void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
if (GEPFlags.IsInBounds)
O << " inbounds";
break;
+ case OperationType::NonNegOp:
+ if (NonNegFlags.NonNeg)
+ O << " nneg";
+ break;
case OperationType::Other:
break;
}
- O << " ";
+ if (getNumOperands() > 0)
+ O << " ";
}
#endif
void VPWidenRecipe::execute(VPTransformState &State) {
- auto &I = *cast<Instruction>(getUnderlyingValue());
+ State.setDebugLocFrom(getDebugLoc());
auto &Builder = State.Builder;
- switch (I.getOpcode()) {
+ switch (Opcode) {
case Instruction::Call:
case Instruction::Br:
case Instruction::PHI:
@@ -630,28 +692,24 @@ void VPWidenRecipe::execute(VPTransformState &State) {
case Instruction::Or:
case Instruction::Xor: {
// Just widen unops and binops.
- State.setDebugLocFromInst(&I);
-
for (unsigned Part = 0; Part < State.UF; ++Part) {
SmallVector<Value *, 2> Ops;
for (VPValue *VPOp : operands())
Ops.push_back(State.get(VPOp, Part));
- Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
+ Value *V = Builder.CreateNAryOp(Opcode, Ops);
if (auto *VecOp = dyn_cast<Instruction>(V))
setFlags(VecOp);
// Use this vector value for all users of the original instruction.
State.set(this, V, Part);
- State.addMetadata(V, &I);
+ State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}
break;
}
case Instruction::Freeze: {
- State.setDebugLocFromInst(&I);
-
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *Op = State.get(getOperand(0), Part);
@@ -663,9 +721,7 @@ void VPWidenRecipe::execute(VPTransformState &State) {
case Instruction::ICmp:
case Instruction::FCmp: {
// Widen compares. Generate vector compares.
- bool FCmp = (I.getOpcode() == Instruction::FCmp);
- auto *Cmp = cast<CmpInst>(&I);
- State.setDebugLocFromInst(Cmp);
+ bool FCmp = Opcode == Instruction::FCmp;
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *A = State.get(getOperand(0), Part);
Value *B = State.get(getOperand(1), Part);
@@ -673,51 +729,64 @@ void VPWidenRecipe::execute(VPTransformState &State) {
if (FCmp) {
// Propagate fast math flags.
IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- Builder.setFastMathFlags(Cmp->getFastMathFlags());
- C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+ if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
+ Builder.setFastMathFlags(I->getFastMathFlags());
+ C = Builder.CreateFCmp(getPredicate(), A, B);
} else {
- C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+ C = Builder.CreateICmp(getPredicate(), A, B);
}
State.set(this, C, Part);
- State.addMetadata(C, &I);
+ State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}
break;
}
default:
// This instruction is not vectorized by simple widening.
- LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode: "
+ << Instruction::getOpcodeName(Opcode));
llvm_unreachable("Unhandled instruction!");
} // end of switch.
+
+#if !defined(NDEBUG)
+ // Verify that VPlan type inference results agree with the type of the
+ // generated values.
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
+ State.VF) == State.get(this, Part)->getType() &&
+ "inferred type and type from generated instructions do not match");
+ }
+#endif
}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN ";
printAsOperand(O, SlotTracker);
- const Instruction *UI = getUnderlyingInstr();
- O << " = " << UI->getOpcodeName();
+ O << " = " << Instruction::getOpcodeName(Opcode);
printFlags(O);
- if (auto *Cmp = dyn_cast<CmpInst>(UI))
- O << Cmp->getPredicate() << " ";
printOperands(O, SlotTracker);
}
#endif
void VPWidenCastRecipe::execute(VPTransformState &State) {
- auto *I = cast_or_null<Instruction>(getUnderlyingValue());
- if (I)
- State.setDebugLocFromInst(I);
+ State.setDebugLocFrom(getDebugLoc());
auto &Builder = State.Builder;
/// Vectorize casts.
assert(State.VF.isVector() && "Not vectorizing?");
Type *DestTy = VectorType::get(getResultType(), State.VF);
-
+ VPValue *Op = getOperand(0);
for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *A = State.get(getOperand(0), Part);
+ if (Part > 0 && Op->isLiveIn()) {
+ // FIXME: Remove once explicit unrolling is implemented using VPlan.
+ State.set(this, State.get(this, 0), Part);
+ continue;
+ }
+ Value *A = State.get(Op, Part);
Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
State.set(this, Cast, Part);
- State.addMetadata(Cast, I);
+ State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
}
}
@@ -727,10 +796,182 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
O << Indent << "WIDEN-CAST ";
printAsOperand(O, SlotTracker);
O << " = " << Instruction::getOpcodeName(Opcode) << " ";
+ printFlags(O);
printOperands(O, SlotTracker);
O << " to " << *getResultType();
}
+#endif
+
+/// This function adds
+/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
+/// to each vector element of \p Val. The sequence starts at StartIdx.
+/// \p BinOp is relevant for FP induction variables.
+static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
+ Instruction::BinaryOps BinOp, ElementCount VF,
+ IRBuilderBase &Builder) {
+ assert(VF.isVector() && "only vector VFs are supported");
+
+ // Create and check the types.
+ auto *ValVTy = cast<VectorType>(Val->getType());
+ ElementCount VLen = ValVTy->getElementCount();
+
+ Type *STy = Val->getType()->getScalarType();
+ assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
+ "Induction Step must be an integer or FP");
+ assert(Step->getType() == STy && "Step has wrong type");
+
+ SmallVector<Constant *, 8> Indices;
+
+ // Create a vector of consecutive numbers from zero to VF.
+ VectorType *InitVecValVTy = ValVTy;
+ if (STy->isFloatingPointTy()) {
+ Type *InitVecValSTy =
+ IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
+ InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
+ }
+ Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
+
+ // Splat the StartIdx
+ Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
+
+ if (STy->isIntegerTy()) {
+ InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ assert(Step->getType() == Val->getType() && "Invalid step vec");
+ // FIXME: The newly created binary instructions should contain nsw/nuw
+ // flags, which can be found from the original scalar operations.
+ Step = Builder.CreateMul(InitVec, Step);
+ return Builder.CreateAdd(Val, Step, "induction");
+ }
+
+ // Floating point induction.
+ assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
+ "Binary Opcode should be specified for FP induction");
+ InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
+ InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
+
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ Value *MulOp = Builder.CreateFMul(InitVec, Step);
+ return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
+}
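Element-wise, the integer path above computes result[i] = Val[i] + (StartIdx + i) * Step. A tiny scalar model for illustration (editorial sketch, not the IR-emitting code):

  #include <cstdint>
  #include <vector>

  // result[i] = Val[i] + (StartIdx + i) * Step, for i = 0 .. VF-1.
  static std::vector<int64_t> stepVectorModel(const std::vector<int64_t> &Val,
                                              int64_t StartIdx, int64_t Step) {
    std::vector<int64_t> Result(Val.size());
    for (size_t I = 0; I != Val.size(); ++I)
      Result[I] = Val[I] + (StartIdx + static_cast<int64_t>(I)) * Step;
    return Result;
  }

  // E.g. Val = <10,10,10,10>, StartIdx = 0, Step = 3 yields <10,13,16,19>; the
  // floating-point path follows the same shape via uitofp, fmul and the
  // induction's FAdd/FSub.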
+
+/// A helper function that returns an integer or floating-point constant with
+/// value C.
+static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
+ return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
+ : ConstantFP::get(Ty, C);
+}
+
+static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
+ ElementCount VF) {
+ assert(FTy->isFloatingPointTy() && "Expected floating point type!");
+ Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
+ Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
+ return B.CreateUIToFP(RuntimeVF, FTy);
+}
+
+void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Int or FP induction being replicated.");
+
+ Value *Start = getStartValue()->getLiveInIRValue();
+ const InductionDescriptor &ID = getInductionDescriptor();
+ TruncInst *Trunc = getTruncInst();
+ IRBuilderBase &Builder = State.Builder;
+ assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
+ assert(State.VF.isVector() && "must have vector VF");
+
+ // The value from the original loop to which we are mapping the new induction
+ // variable.
+ Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
+
+ // Fast-math-flags propagate from the original induction instruction.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
+ Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
+
+ // Now do the actual transformations, and start with fetching the step value.
+ Value *Step = State.get(getStepValue(), VPIteration(0, 0));
+
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
+
+ // Construct the initial value of the vector IV in the vector loop preheader
+ auto CurrIP = Builder.saveIP();
+ BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+ Builder.SetInsertPoint(VectorPH->getTerminator());
+ if (isa<TruncInst>(EntryVal)) {
+ assert(Start->getType()->isIntegerTy() &&
+ "Truncation requires an integer type");
+ auto *TruncType = cast<IntegerType>(EntryVal->getType());
+ Step = Builder.CreateTrunc(Step, TruncType);
+ Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
+ }
+
+ Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
+ Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
+ Value *SteppedStart = getStepVector(
+ SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
+
+ // We create vector phi nodes for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (Step->getType()->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = ID.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
+
+ // Multiply the vectorization factor by the step using integer or
+ // floating-point arithmetic as appropriate.
+ Type *StepType = Step->getType();
+ Value *RuntimeVF;
+ if (Step->getType()->isFloatingPointTy())
+ RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
+ else
+ RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
+ Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
+
+ // Create a vector splat to use in the induction update.
+ //
+ // FIXME: If the step is non-constant, we create the vector splat with
+ // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
+ // handle a constant vector splat.
+ Value *SplatVF = isa<Constant>(Mul)
+ ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
+ : Builder.CreateVectorSplat(State.VF, Mul);
+ Builder.restoreIP(CurrIP);
+
+ // We may need to add the step a number of times, depending on the unroll
+ // factor. The last of those goes into the PHI.
+ PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
+ VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
+ VecInd->setDebugLoc(EntryVal->getDebugLoc());
+ Instruction *LastInduction = VecInd;
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ State.set(this, LastInduction, Part);
+
+ if (isa<TruncInst>(EntryVal))
+ State.addMetadata(LastInduction, EntryVal);
+ LastInduction = cast<Instruction>(
+ Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
+ LastInduction->setDebugLoc(EntryVal->getDebugLoc());
+ }
+
+ LastInduction->setName("vec.ind.next");
+ VecInd->addIncoming(SteppedStart, VectorPH);
+ // Add induction update using an incorrect block temporarily. The phi node
+ // will be fixed after VPlan execution. Note that at this point the latch
+ // block cannot be used, as it does not exist yet.
+ // TODO: Model increment value in VPlan, by turning the recipe into a
+ // multi-def and a subclass of VPHeaderPHIRecipe.
+ VecInd->addIncoming(LastInduction, VectorPH);
+}
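A worked example of what the code above produces, assuming an integer induction with start 0 and step 1, fixed VF = 4 and UF = 2 (illustrative numbers, not from the patch): SteppedStart is <0,1,2,3> and SplatVF is <4,4,4,4>; part 0 is the vec.ind phi itself, i.e. <0,1,2,3> on the first vector iteration, part 1 is the first step.add <4,5,6,7>, and the final vec.ind.next <8,9,10,11> (VF * UF = 8 ahead of the phi) is the value that flows around the backedge once the incoming block is fixed up after VPlan execution.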
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-INDUCTION";
@@ -770,17 +1011,112 @@ void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
O << " * ";
getStepValue()->printAsOperand(O, SlotTracker);
- if (IndDesc.getStep()->getType() != ResultTy)
- O << " (truncated to " << *ResultTy << ")";
+ if (TruncResultTy)
+ O << " (truncated to " << *TruncResultTy << ")";
}
#endif
+void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
+ // Fast-math-flags propagate from the original induction instruction.
+ IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
+ if (hasFastMathFlags())
+ State.Builder.setFastMathFlags(getFastMathFlags());
+
+  // Compute scalar induction steps: BaseIV is the scalar induction variable on
+  // which to base the steps and Step is the size of the step.
+
+ Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
+ Value *Step = State.get(getStepValue(), VPIteration(0, 0));
+ IRBuilderBase &Builder = State.Builder;
+
+ // Ensure step has the same type as that of scalar IV.
+ Type *BaseIVTy = BaseIV->getType()->getScalarType();
+ if (BaseIVTy != Step->getType()) {
+ // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
+ // avoid separate truncate here.
+ assert(Step->getType()->isIntegerTy() &&
+ "Truncation requires an integer step");
+ Step = State.Builder.CreateTrunc(Step, BaseIVTy);
+ }
+
+ // We build scalar steps for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (BaseIVTy->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = InductionOpcode;
+ MulOp = Instruction::FMul;
+ }
+
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration.
+ bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
+ // Compute the scalar steps and save the results in State.
+ Type *IntStepTy =
+ IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
+ Type *VecIVTy = nullptr;
+ Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
+ if (!FirstLaneOnly && State.VF.isScalable()) {
+ VecIVTy = VectorType::get(BaseIVTy, State.VF);
+ UnitStepVec =
+ Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
+ SplatStep = Builder.CreateVectorSplat(State.VF, Step);
+ SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
+ }
+
+ unsigned StartPart = 0;
+ unsigned EndPart = State.UF;
+ unsigned StartLane = 0;
+ unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
+ if (State.Instance) {
+ StartPart = State.Instance->Part;
+ EndPart = StartPart + 1;
+ StartLane = State.Instance->Lane.getKnownLane();
+ EndLane = StartLane + 1;
+ }
+ for (unsigned Part = StartPart; Part < EndPart; ++Part) {
+ Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
+
+ if (!FirstLaneOnly && State.VF.isScalable()) {
+ auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
+ auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
+ if (BaseIVTy->isFloatingPointTy())
+ InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
+ auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
+ auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
+ State.set(this, Add, Part);
+ // It's useful to record the lane values too for the known minimum number
+ // of elements so we do those below. This improves the code quality when
+ // trying to extract the first element, for example.
+ }
+
+ if (BaseIVTy->isFloatingPointTy())
+ StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
+
+ for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
+ Value *StartIdx = Builder.CreateBinOp(
+ AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
+ // The step returned by `createStepForVF` is a runtime-evaluated value
+ // when VF is scalable. Otherwise, it should be folded into a Constant.
+ assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
+ "Expected StartIdx to be folded to a constant when VF is not "
+ "scalable");
+ auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
+ auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
+ State.set(this, Add, VPIteration(Part, Lane));
+ }
+ }
+}
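In the common fixed-VF integer case, the scalar value produced above for a given (Part, Lane) is BaseIV + (Part * VF + Lane) * Step, since StartIdx0 folds to the constant VF * Part. For example (illustrative numbers), BaseIV = 0, Step = 2, VF = 4, UF = 2 yields 0, 2, 4, 6 for part 0 and 8, 10, 12, 14 for part 1; floating-point inductions follow the same shape using the induction's FAdd/FSub and FMul.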
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent;
printAsOperand(O, SlotTracker);
- O << Indent << "= SCALAR-STEPS ";
+ O << " = SCALAR-STEPS ";
printOperands(O, SlotTracker);
}
#endif
@@ -874,7 +1210,7 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
#endif
void VPBlendRecipe::execute(VPTransformState &State) {
- State.setDebugLocFromInst(Phi);
+ State.setDebugLocFrom(getDebugLoc());
// We know that all PHIs in non-header blocks are converted into
// selects, so we don't have to worry about the insertion order and we
// can just use the builder.
@@ -916,7 +1252,7 @@ void VPBlendRecipe::execute(VPTransformState &State) {
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "BLEND ";
- Phi->printAsOperand(O, false);
+ printAsOperand(O, SlotTracker);
O << " =";
if (getNumIncomingValues() == 1) {
// Not a User of any mask: not really blending, this is a
@@ -942,14 +1278,14 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
O << " +";
if (isa<FPMathOperator>(getUnderlyingInstr()))
O << getUnderlyingInstr()->getFastMathFlags();
- O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " (";
+ O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
getVecOp()->printAsOperand(O, SlotTracker);
if (getCondOp()) {
O << ", ";
getCondOp()->printAsOperand(O, SlotTracker);
}
O << ")";
- if (RdxDesc->IntermediateStore)
+ if (RdxDesc.IntermediateStore)
O << " (with final reduction value stored in invariant address sank "
"outside of loop)";
}
@@ -1093,12 +1429,12 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
Value *Start = getStartValue()->getLiveInIRValue();
- PHINode *EntryPart = PHINode::Create(
- Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt());
+ PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
+ EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
EntryPart->addIncoming(Start, VectorPH);
- EntryPart->setDebugLoc(DL);
+ EntryPart->setDebugLoc(getDebugLoc());
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
State.set(this, EntryPart, Part);
}
@@ -1108,7 +1444,8 @@ void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EMIT ";
printAsOperand(O, SlotTracker);
- O << " = CANONICAL-INDUCTION";
+ O << " = CANONICAL-INDUCTION ";
+ printOperands(O, SlotTracker);
}
#endif
@@ -1221,8 +1558,8 @@ void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
}
// Create a phi node for the new recurrence.
- PHINode *EntryPart = PHINode::Create(
- VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt());
+ PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
+ EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
EntryPart->addIncoming(VectorInit, VectorPH);
State.set(this, EntryPart, 0);
}
@@ -1254,8 +1591,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
"recipe must be in the vector loop header");
unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
- Value *EntryPart =
- PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt());
+ Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
+ EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
State.set(this, EntryPart, Part);
}
@@ -1269,8 +1606,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
Value *Iden = nullptr;
RecurKind RK = RdxDesc.getRecurrenceKind();
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
- RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) {
- // MinMax reduction have the start value as their identify.
+ RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
+ // MinMax and AnyOf reductions have the start value as their identity.
if (ScalarPHI) {
Iden = StartV;
} else {
@@ -1316,23 +1653,7 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) {
assert(EnableVPlanNativePath &&
"Non-native vplans are not expected to have VPWidenPHIRecipes.");
- // Currently we enter here in the VPlan-native path for non-induction
- // PHIs where all control flow is uniform. We simply widen these PHIs.
- // Create a vector phi with no operands - the vector phi operands will be
- // set at the end of vector code generation.
- VPBasicBlock *Parent = getParent();
- VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion();
- unsigned StartIdx = 0;
- // For phis in header blocks of loop regions, use the index of the value
- // coming from the preheader.
- if (LoopRegion->getEntryBasicBlock() == Parent) {
- for (unsigned I = 0; I < getNumOperands(); ++I) {
- if (getIncomingBlock(I) ==
- LoopRegion->getSinglePredecessor()->getExitingBasicBlock())
- StartIdx = I;
- }
- }
- Value *Op0 = State.get(getOperand(StartIdx), 0);
+ Value *Op0 = State.get(getOperand(0), 0);
Type *VecTy = Op0->getType();
Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
State.set(this, VecPhi, 0);
@@ -1368,7 +1689,7 @@ void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
PHINode *EntryPart =
State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
EntryPart->addIncoming(StartMask, VectorPH);
- EntryPart->setDebugLoc(DL);
+ EntryPart->setDebugLoc(getDebugLoc());
State.set(this, EntryPart, Part);
}
}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 83bfdfd09d19..33132880d5a4 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -12,17 +12,22 @@
//===----------------------------------------------------------------------===//
#include "VPlanTransforms.h"
-#include "VPlanDominatorTree.h"
#include "VPRecipeBuilder.h"
+#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
+#include "VPlanDominatorTree.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
using namespace llvm;
+using namespace llvm::PatternMatch;
+
void VPlanTransforms::VPInstructionsToVPRecipes(
VPlanPtr &Plan,
function_ref<const InductionDescriptor *(PHINode *)>
@@ -76,7 +81,7 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
} else if (auto *CI = dyn_cast<CastInst>(Inst)) {
NewRecipe = new VPWidenCastRecipe(
- CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI);
+ CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
} else {
NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
}
@@ -158,17 +163,10 @@ static bool sinkScalarOperands(VPlan &Plan) {
// TODO: add ".cloned" suffix to name of Clone's VPValue.
Clone->insertBefore(SinkCandidate);
- for (auto *U : to_vector(SinkCandidate->getVPSingleValue()->users())) {
- auto *UI = cast<VPRecipeBase>(U);
- if (UI->getParent() == SinkTo)
- continue;
-
- for (unsigned Idx = 0; Idx != UI->getNumOperands(); Idx++) {
- if (UI->getOperand(Idx) != SinkCandidate->getVPSingleValue())
- continue;
- UI->setOperand(Idx, Clone);
- }
- }
+ SinkCandidate->getVPSingleValue()->replaceUsesWithIf(
+ Clone, [SinkTo](VPUser &U, unsigned) {
+ return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
+ });
}
SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
for (VPValue *Op : SinkCandidate->operands())
@@ -273,16 +271,10 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
VPValue *PredInst1 =
cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
- for (VPUser *U : to_vector(Phi1ToMoveV->users())) {
- auto *UI = dyn_cast<VPRecipeBase>(U);
- if (!UI || UI->getParent() != Then2)
- continue;
- for (unsigned I = 0, E = U->getNumOperands(); I != E; ++I) {
- if (Phi1ToMoveV != U->getOperand(I))
- continue;
- U->setOperand(I, PredInst1);
- }
- }
+ Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
+ auto *UI = dyn_cast<VPRecipeBase>(&U);
+ return UI && UI->getParent() == Then2;
+ });
Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
}
@@ -479,15 +471,45 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
// The recipes in the block are processed in reverse order, to catch chains
// of dead recipes.
for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
- if (R.mayHaveSideEffects() || any_of(R.definedValues(), [](VPValue *V) {
- return V->getNumUsers() > 0;
- }))
+ // A user keeps R alive:
+ if (any_of(R.definedValues(),
+ [](VPValue *V) { return V->getNumUsers(); }))
+ continue;
+
+ // Having side effects keeps R alive, but do remove conditional assume
+ // instructions as their conditions may be flattened.
+ auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+ bool IsConditionalAssume =
+ RepR && RepR->isPredicated() &&
+ match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
+ if (R.mayHaveSideEffects() && !IsConditionalAssume)
continue;
+
R.eraseFromParent();
}
}
}
+static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
+ ScalarEvolution &SE, Instruction *TruncI,
+ Type *IVTy, VPValue *StartV,
+ VPValue *Step) {
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ auto IP = HeaderVPBB->getFirstNonPhi();
+ VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+ Type *TruncTy = TruncI ? TruncI->getType() : IVTy;
+ VPValue *BaseIV = CanonicalIV;
+ if (!CanonicalIV->isCanonical(ID.getKind(), StartV, Step, TruncTy)) {
+ BaseIV = new VPDerivedIVRecipe(ID, StartV, CanonicalIV, Step,
+ TruncI ? TruncI->getType() : nullptr);
+ HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
+ }
+
+ VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step);
+ HeaderVPBB->insert(Steps, IP);
+ return Steps;
+}
+
void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
SmallVector<VPRecipeBase *> ToRemove;
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
@@ -501,36 +523,18 @@ void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
}))
continue;
- auto IP = HeaderVPBB->getFirstNonPhi();
- VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
- Type *ResultTy = WideIV->getPHINode()->getType();
- if (Instruction *TruncI = WideIV->getTruncInst())
- ResultTy = TruncI->getType();
const InductionDescriptor &ID = WideIV->getInductionDescriptor();
- VPValue *Step = WideIV->getStepValue();
- VPValue *BaseIV = CanonicalIV;
- if (!CanonicalIV->isCanonical(ID.getKind(), WideIV->getStartValue(), Step,
- ResultTy)) {
- BaseIV = new VPDerivedIVRecipe(ID, WideIV->getStartValue(), CanonicalIV,
- Step, ResultTy);
- HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
- }
-
- VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step);
- HeaderVPBB->insert(Steps, IP);
+ VPValue *Steps = createScalarIVSteps(
+ Plan, ID, SE, WideIV->getTruncInst(), WideIV->getPHINode()->getType(),
+ WideIV->getStartValue(), WideIV->getStepValue());
- // Update scalar users of IV to use Step instead. Use SetVector to ensure
- // the list of users doesn't contain duplicates.
- SetVector<VPUser *> Users(WideIV->user_begin(), WideIV->user_end());
- for (VPUser *U : Users) {
- if (HasOnlyVectorVFs && !U->usesScalars(WideIV))
- continue;
- for (unsigned I = 0, E = U->getNumOperands(); I != E; I++) {
- if (U->getOperand(I) != WideIV)
- continue;
- U->setOperand(I, Steps);
- }
- }
+ // Update scalar users of IV to use Step instead.
+ if (!HasOnlyVectorVFs)
+ WideIV->replaceAllUsesWith(Steps);
+ else
+ WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
+ return U.usesScalars(WideIV);
+ });
}
}
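
The values the extracted helper is meant to produce can be modelled in plain scalar code: the derived IV is the induction start plus the canonical IV (the count of scalar iterations already executed) times the step, and each lane of a part adds a further multiple of the step. A rough illustration, assuming a simple integer induction; all concrete numbers are arbitrary:

// Rough scalar model of VPDerivedIVRecipe + VPScalarIVStepsRecipe for an
// integer induction (numbers are made-up examples).
#include <cstdio>

int main() {
  const long Start = 10, Step = 3;   // induction descriptor
  const long VF = 4, Part = 0;       // vectorization factor, unroll part
  const long CanIV = 8;              // canonical IV: scalar iterations done
  const long Derived = Start + CanIV * Step;   // derived IV for this iteration
  for (long Lane = 0; Lane < VF; ++Lane)       // per-lane scalar steps
    std::printf("lane %ld -> %ld\n", Lane, Derived + (Part * VF + Lane) * Step);
  return 0;
}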
@@ -778,3 +782,375 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
}
}
}
+
+/// Returns true if \p V is a constant one.
+static bool isConstantOne(VPValue *V) {
+ if (!V->isLiveIn())
+ return false;
+ auto *C = dyn_cast<ConstantInt>(V->getLiveInIRValue());
+ return C && C->isOne();
+}
+
+/// Returns the llvm::Instruction opcode for \p R.
+static unsigned getOpcodeForRecipe(VPRecipeBase &R) {
+ if (auto *WidenR = dyn_cast<VPWidenRecipe>(&R))
+ return WidenR->getUnderlyingInstr()->getOpcode();
+ if (auto *WidenC = dyn_cast<VPWidenCastRecipe>(&R))
+ return WidenC->getOpcode();
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R))
+ return RepR->getUnderlyingInstr()->getOpcode();
+ if (auto *VPI = dyn_cast<VPInstruction>(&R))
+ return VPI->getOpcode();
+ return 0;
+}
+
+/// Try to simplify recipe \p R.
+static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
+ switch (getOpcodeForRecipe(R)) {
+ case Instruction::Mul: {
+ VPValue *A = R.getOperand(0);
+ VPValue *B = R.getOperand(1);
+ if (isConstantOne(A))
+ return R.getVPSingleValue()->replaceAllUsesWith(B);
+ if (isConstantOne(B))
+ return R.getVPSingleValue()->replaceAllUsesWith(A);
+ break;
+ }
+ case Instruction::Trunc: {
+ VPRecipeBase *Ext = R.getOperand(0)->getDefiningRecipe();
+ if (!Ext)
+ break;
+ unsigned ExtOpcode = getOpcodeForRecipe(*Ext);
+ if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
+ break;
+ VPValue *A = Ext->getOperand(0);
+ VPValue *Trunc = R.getVPSingleValue();
+ Type *TruncTy = TypeInfo.inferScalarType(Trunc);
+ Type *ATy = TypeInfo.inferScalarType(A);
+ if (TruncTy == ATy) {
+ Trunc->replaceAllUsesWith(A);
+ } else if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
+ auto *VPC =
+ new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
+ VPC->insertBefore(&R);
+ Trunc->replaceAllUsesWith(VPC);
+ } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
+ auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
+ VPC->insertBefore(&R);
+ Trunc->replaceAllUsesWith(VPC);
+ }
+#ifndef NDEBUG
+  // Verify that the cached type info for both A and its users is still
+ // accurate by comparing it to freshly computed types.
+ VPTypeAnalysis TypeInfo2(TypeInfo.getContext());
+ assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
+ for (VPUser *U : A->users()) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ if (!R)
+ continue;
+ for (VPValue *VPV : R->definedValues())
+ assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
+ }
+#endif
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+/// Try to simplify the recipes in \p Plan.
+static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) {
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+ Plan.getEntry());
+ VPTypeAnalysis TypeInfo(Ctx);
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ simplifyRecipe(R, TypeInfo);
+ }
+ }
+}
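
The trunc-of-extend folding above has three cases, depending on how the width of A compares to the truncated type; sign extension folds the same way in the second case. The equivalences can be checked with ordinary integer casts (the 8/16/32-bit widths below are arbitrary):

// Integer-cast check of the trunc(zext(A)) cases folded by simplifyRecipe.
#include <cstdint>
#include <cstdio>

int main() {
  uint8_t A8 = 0xAB;
  uint32_t A32 = 0xDEADBEEFu;
  // Case 1: trunc type == type of A -> trunc(ext(A)) folds to A.
  uint8_t C1 = (uint8_t)(uint32_t)A8;
  // Case 2: A is narrower than the trunc type -> a single extend suffices.
  uint16_t C2 = (uint16_t)(uint32_t)A8;
  // Case 3: A is wider than the trunc type -> a single trunc suffices.
  uint16_t C3 = (uint16_t)(uint64_t)A32;
  std::printf("%d %d %d\n", C1 == A8, C2 == (uint16_t)A8, C3 == (uint16_t)A32);
  return 0;
}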
+
+void VPlanTransforms::truncateToMinimalBitwidths(
+ VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs,
+ LLVMContext &Ctx) {
+#ifndef NDEBUG
+ // Count the processed recipes and cross check the count later with MinBWs
+ // size, to make sure all entries in MinBWs have been handled.
+ unsigned NumProcessedRecipes = 0;
+#endif
+ // Keep track of created truncates, so they can be re-used. Note that we
+  // cannot use RAUW after creating a new truncate, as this could make
+ // other uses have different types for their operands, making them invalidly
+ // typed.
+ DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
+ VPTypeAnalysis TypeInfo(Ctx);
+ VPBasicBlock *PH = Plan.getEntry();
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
+ VPWidenSelectRecipe>(&R))
+ continue;
+
+ VPValue *ResultVPV = R.getVPSingleValue();
+ auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
+ unsigned NewResSizeInBits = MinBWs.lookup(UI);
+ if (!NewResSizeInBits)
+ continue;
+
+#ifndef NDEBUG
+ NumProcessedRecipes++;
+#endif
+ // If the value wasn't vectorized, we must maintain the original scalar
+ // type. Skip those here, after incrementing NumProcessedRecipes. Also
+ // skip casts which do not need to be handled explicitly here, as
+ // redundant casts will be removed during recipe simplification.
+ if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R)) {
+#ifndef NDEBUG
+ // If any of the operands is a live-in and not used by VPWidenRecipe or
+ // VPWidenSelectRecipe, but in MinBWs, make sure it is counted as
+      // processed as well. As MinBWs is currently constructed, there is no
+      // information about whether recipes are widened or replicated and in
+      // case they are replicated the operands are not truncated. Counting
+      // them here ensures we do not miss any recipes in MinBWs.
+ // TODO: Remove once the analysis is done on VPlan.
+ for (VPValue *Op : R.operands()) {
+ if (!Op->isLiveIn())
+ continue;
+ auto *UV = dyn_cast_or_null<Instruction>(Op->getUnderlyingValue());
+ if (UV && MinBWs.contains(UV) && !ProcessedTruncs.contains(Op) &&
+ all_of(Op->users(), [](VPUser *U) {
+ return !isa<VPWidenRecipe, VPWidenSelectRecipe>(U);
+ })) {
+ // Add an entry to ProcessedTruncs to avoid counting the same
+ // operand multiple times.
+ ProcessedTruncs[Op] = nullptr;
+ NumProcessedRecipes += 1;
+ }
+ }
+#endif
+ continue;
+ }
+
+ Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
+ unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
+ assert(OldResTy->isIntegerTy() && "only integer types supported");
+ if (OldResSizeInBits == NewResSizeInBits)
+ continue;
+ assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
+ (void)OldResSizeInBits;
+
+ auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);
+
+ // Shrink operands by introducing truncates as needed.
+ unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
+ for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
+ auto *Op = R.getOperand(Idx);
+ unsigned OpSizeInBits =
+ TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
+ if (OpSizeInBits == NewResSizeInBits)
+ continue;
+ assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
+ auto [ProcessedIter, IterIsEmpty] =
+ ProcessedTruncs.insert({Op, nullptr});
+ VPWidenCastRecipe *NewOp =
+ IterIsEmpty
+ ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
+ : ProcessedIter->second;
+ R.setOperand(Idx, NewOp);
+ if (!IterIsEmpty)
+ continue;
+ ProcessedIter->second = NewOp;
+ if (!Op->isLiveIn()) {
+ NewOp->insertBefore(&R);
+ } else {
+ PH->appendRecipe(NewOp);
+#ifndef NDEBUG
+ auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue());
+ bool IsContained = MinBWs.contains(OpInst);
+ NumProcessedRecipes += IsContained;
+#endif
+ }
+ }
+
+ // Any wrapping introduced by shrinking this operation shouldn't be
+ // considered undefined behavior. So, we can't unconditionally copy
+ // arithmetic wrapping flags to VPW.
+ if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
+ VPW->dropPoisonGeneratingFlags();
+
+ // Extend result to original width.
+ auto *Ext = new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
+ Ext->insertAfter(&R);
+ ResultVPV->replaceAllUsesWith(Ext);
+ Ext->setOperand(0, ResultVPV);
+ }
+ }
+
+ assert(MinBWs.size() == NumProcessedRecipes &&
+ "some entries in MinBWs haven't been processed");
+}
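
The narrowing is sound because, for the operations involved, the low NewResSizeInBits bits of the result depend only on the low bits of the operands, so computing in the narrow type and zero-extending back preserves exactly the bits MinBWs says are demanded; the high bits may differ, which is also why poison-generating wrap flags are dropped. A small numeric check with arbitrary values:

// Narrow-then-zext preserves the demanded low bits of an add (values arbitrary).
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t A = 300, B = 7;
  uint32_t Wide = A + B;                                           // 0x133
  uint32_t Narrow = (uint32_t)(uint8_t)((uint8_t)A + (uint8_t)B);  // trunc, add, zext
  std::printf("low 8 bits equal: %d (wide=0x%x, narrow=0x%x)\n",
              (Wide & 0xffu) == (Narrow & 0xffu), Wide, Narrow);
  return 0;
}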
+
+void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
+ removeRedundantCanonicalIVs(Plan);
+ removeRedundantInductionCasts(Plan);
+
+ optimizeInductions(Plan, SE);
+ simplifyRecipes(Plan, SE.getContext());
+ removeDeadRecipes(Plan);
+
+ createAndOptimizeReplicateRegions(Plan);
+
+ removeRedundantExpandSCEVRecipes(Plan);
+ mergeBlocksIntoPredecessors(Plan);
+}
+
+// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
+// the loop terminator with a branch-on-cond recipe with the negated
+// active-lane-mask as operand. Note that this turns the loop into an
+// uncountable one. Only the existing terminator is replaced, all other existing
+// recipes/users remain unchanged, except for poison-generating flags being
+// dropped from the canonical IV increment. Return the created
+// VPActiveLaneMaskPHIRecipe.
+//
+// The function uses the following definitions:
+//
+// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
+// calculate-trip-count-minus-VF (original TC) : original TC
+// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
+// CanonicalIVPhi : CanonicalIVIncrement
+// %StartV is the canonical induction start value.
+//
+// The function adds the following recipes:
+//
+// vector.ph:
+// %TripCount = calculate-trip-count-minus-VF (original TC)
+// [if DataWithControlFlowWithoutRuntimeCheck]
+// %EntryInc = canonical-iv-increment-for-part %StartV
+// %EntryALM = active-lane-mask %EntryInc, %TripCount
+//
+// vector.body:
+// ...
+// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
+// ...
+// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
+// %ALM = active-lane-mask %InLoopInc, TripCount
+// %Negated = Not %ALM
+// branch-on-cond %Negated
+//
+static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
+ VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
+ VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
+ auto *CanonicalIVPHI = Plan.getCanonicalIV();
+ VPValue *StartV = CanonicalIVPHI->getStartValue();
+
+ auto *CanonicalIVIncrement =
+ cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
+ // TODO: Check if dropping the flags is needed if
+ // !DataAndControlFlowWithoutRuntimeCheck.
+ CanonicalIVIncrement->dropPoisonGeneratingFlags();
+ DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
+ // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
+ // we have to take unrolling into account. Each part needs to start at
+ // Part * VF
+ auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
+ VPBuilder Builder(VecPreheader);
+
+ // Create the ActiveLaneMask instruction using the correct start values.
+ VPValue *TC = Plan.getTripCount();
+
+ VPValue *TripCount, *IncrementValue;
+ if (!DataAndControlFlowWithoutRuntimeCheck) {
+ // When the loop is guarded by a runtime overflow check for the loop
+ // induction variable increment by VF, we can increment the value before
+    // the get.active.lane.mask intrinsic and use the unmodified trip count.
+ IncrementValue = CanonicalIVIncrement;
+ TripCount = TC;
+ } else {
+ // When avoiding a runtime check, the active.lane.mask inside the loop
+ // uses a modified trip count and the induction variable increment is
+ // done after the active.lane.mask intrinsic is called.
+ IncrementValue = CanonicalIVPHI;
+ TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
+ {TC}, DL);
+ }
+ auto *EntryIncrement = Builder.createOverflowingOp(
+ VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
+ "index.part.next");
+
+ // Create the active lane mask instruction in the VPlan preheader.
+ auto *EntryALM =
+ Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
+ DL, "active.lane.mask.entry");
+
+ // Now create the ActiveLaneMaskPhi recipe in the main loop using the
+ // preheader ActiveLaneMask instruction.
+ auto LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+ LaneMaskPhi->insertAfter(CanonicalIVPHI);
+
+ // Create the active lane mask for the next iteration of the loop before the
+ // original terminator.
+ VPRecipeBase *OriginalTerminator = EB->getTerminator();
+ Builder.setInsertPoint(OriginalTerminator);
+ auto *InLoopIncrement =
+ Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
+ {IncrementValue}, {false, false}, DL);
+ auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+ {InLoopIncrement, TripCount}, DL,
+ "active.lane.mask.next");
+ LaneMaskPhi->addOperand(ALM);
+
+ // Replace the original terminator with BranchOnCond. We have to invert the
+ // mask here because a true condition means jumping to the exit block.
+ auto *NotMask = Builder.createNot(ALM, DL);
+ Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
+ OriginalTerminator->eraseFromParent();
+ return LaneMaskPhi;
+}
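
The recipes created here are easier to follow with the scalar semantics of the active-lane-mask in mind: lane L of active-lane-mask(Base, TripCount) is true exactly when Base + L < TripCount (unsigned, evaluated without wrapping), and the loop exits once the mask computed for the next iteration has no active lanes. A small model with made-up numbers:

// Scalar model of the per-lane predicate behind the active-lane-mask recipes
// (TripCount and VF are arbitrary example values).
#include <cstdio>

int main() {
  const unsigned TripCount = 10, VF = 4;
  for (unsigned Base = 0; Base < TripCount + VF; Base += VF) {
    std::printf("base %2u mask:", Base);
    for (unsigned L = 0; L < VF; ++L)
      std::printf(" %d", Base + L < TripCount);  // lane L active?
    std::printf("\n");                           // all-false mask -> loop exit
  }
  return 0;
}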
+
+void VPlanTransforms::addActiveLaneMask(
+ VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
+ bool DataAndControlFlowWithoutRuntimeCheck) {
+ assert((!DataAndControlFlowWithoutRuntimeCheck ||
+ UseActiveLaneMaskForControlFlow) &&
+ "DataAndControlFlowWithoutRuntimeCheck implies "
+ "UseActiveLaneMaskForControlFlow");
+
+ auto FoundWidenCanonicalIVUser =
+ find_if(Plan.getCanonicalIV()->users(),
+ [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+ assert(FoundWidenCanonicalIVUser &&
+ "Must have widened canonical IV when tail folding!");
+ auto *WideCanonicalIV =
+ cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
+ VPRecipeBase *LaneMask;
+ if (UseActiveLaneMaskForControlFlow) {
+ LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
+ Plan, DataAndControlFlowWithoutRuntimeCheck);
+ } else {
+ LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask,
+ {WideCanonicalIV, Plan.getTripCount()},
+ nullptr, "active.lane.mask");
+ LaneMask->insertAfter(WideCanonicalIV);
+ }
+
+ // Walk users of WideCanonicalIV and replace all compares of the form
+ // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
+ // active-lane-mask.
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+ for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
+ auto *CompareToReplace = dyn_cast<VPInstruction>(U);
+ if (!CompareToReplace ||
+ CompareToReplace->getOpcode() != Instruction::ICmp ||
+ CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
+ CompareToReplace->getOperand(1) != BTC)
+ continue;
+
+ assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
+ "WidenCanonicalIV must be the first operand of the compare");
+ CompareToReplace->replaceAllUsesWith(LaneMask->getVPSingleValue());
+ CompareToReplace->eraseFromParent();
+ }
+}
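
The compare being replaced tests the wide canonical IV against the backedge-taken count (trip count minus one) with ULE, which for any loop that executes at least once is the same per-lane predicate as the active-lane-mask's IV < trip-count test. A brute-force check over small unsigned values:

// (iv ule TC - 1) == (iv ult TC) for unsigned iv and TC >= 1.
#include <cstdio>

int main() {
  bool AllMatch = true;
  for (unsigned TC = 1; TC < 64; ++TC)
    for (unsigned IV = 0; IV < 128; ++IV)
      AllMatch &= ((IV <= TC - 1) == (IV < TC));
  std::printf("equivalent: %d\n", AllMatch);
  return 0;
}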
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 3eccf6e9600d..3bf91115debb 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -22,11 +22,9 @@ class InductionDescriptor;
class Instruction;
class PHINode;
class ScalarEvolution;
-class Loop;
class PredicatedScalarEvolution;
class TargetLibraryInfo;
class VPBuilder;
-class VPRecipeBuilder;
struct VPlanTransforms {
/// Replaces the VPInstructions in \p Plan with corresponding
@@ -37,12 +35,56 @@ struct VPlanTransforms {
GetIntOrFpInductionDescriptor,
ScalarEvolution &SE, const TargetLibraryInfo &TLI);
+ /// Sink users of fixed-order recurrences after the recipe defining their
+ /// previous value. Then introduce FirstOrderRecurrenceSplice VPInstructions
+ /// to combine the value from the recurrence phis and previous values. The
+ /// current implementation assumes all users can be sunk after the previous
+ /// value, which is enforced by earlier legality checks.
+ /// \returns true if all users of fixed-order recurrences could be re-arranged
+ /// as needed or false if it is not possible. In the latter case, \p Plan is
+ /// not valid.
+ static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder);
+
+ /// Clear NSW/NUW flags from reduction instructions if necessary.
+ static void clearReductionWrapFlags(VPlan &Plan);
+
+ /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
+ /// resulting plan to \p BestVF and \p BestUF.
+ static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
+ unsigned BestUF,
+ PredicatedScalarEvolution &PSE);
+
+ /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
+ /// optimizations, dead recipe removal, replicate region optimizations and
+ /// block merging.
+ static void optimize(VPlan &Plan, ScalarEvolution &SE);
+
/// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
/// region block and remove the mask operand. Optimize the created regions by
/// iteratively sinking scalar operands into the region, followed by merging
/// regions until no improvements are remaining.
static void createAndOptimizeReplicateRegions(VPlan &Plan);
+ /// Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an
+ /// (active-lane-mask recipe, wide canonical IV, trip-count). If \p
+  /// UseActiveLaneMaskForControlFlow is true, introduce a
+  /// VPActiveLaneMaskPHIRecipe. If \p DataAndControlFlowWithoutRuntimeCheck is
+ /// true, no minimum-iteration runtime check will be created (during skeleton
+ /// creation) and instead it is handled using active-lane-mask. \p
+ /// DataAndControlFlowWithoutRuntimeCheck implies \p
+ /// UseActiveLaneMaskForControlFlow.
+ static void addActiveLaneMask(VPlan &Plan,
+ bool UseActiveLaneMaskForControlFlow,
+ bool DataAndControlFlowWithoutRuntimeCheck);
+
+ /// Insert truncates and extends for any truncated recipe. Redundant casts
+ /// will be folded later.
+ static void
+ truncateToMinimalBitwidths(VPlan &Plan,
+ const MapVector<Instruction *, uint64_t> &MinBWs,
+ LLVMContext &Ctx);
+
+private:
/// Remove redundant VPBasicBlocks by merging them into their predecessor if
/// the predecessor has a single successor.
static bool mergeBlocksIntoPredecessors(VPlan &Plan);
@@ -71,24 +113,6 @@ struct VPlanTransforms {
/// them with already existing recipes expanding the same SCEV expression.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan);
- /// Sink users of fixed-order recurrences after the recipe defining their
- /// previous value. Then introduce FirstOrderRecurrenceSplice VPInstructions
- /// to combine the value from the recurrence phis and previous values. The
- /// current implementation assumes all users can be sunk after the previous
- /// value, which is enforced by earlier legality checks.
- /// \returns true if all users of fixed-order recurrences could be re-arranged
- /// as needed or false if it is not possible. In the latter case, \p Plan is
- /// not valid.
- static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder);
-
- /// Clear NSW/NUW flags from reduction instructions if necessary.
- static void clearReductionWrapFlags(VPlan &Plan);
-
- /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
- /// resulting plan to \p BestVF and \p BestUF.
- static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
- unsigned BestUF,
- PredicatedScalarEvolution &PSE);
};
} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
index ac110bb3b0ef..116acad8e8f3 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -121,18 +121,11 @@ public:
/// Remove a single \p User from the list of users.
void removeUser(VPUser &User) {
- bool Found = false;
// The same user can be added multiple times, e.g. because the same VPValue
// is used twice by the same VPUser. Remove a single one.
- erase_if(Users, [&User, &Found](VPUser *Other) {
- if (Found)
- return false;
- if (Other == &User) {
- Found = true;
- return true;
- }
- return false;
- });
+ auto *I = find(Users, &User);
+ if (I != Users.end())
+ Users.erase(I);
}
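
The simplified removeUser relies on find returning the first matching entry, so when a user is registered more than once only a single occurrence disappears. The behaviour in miniature, with a plain std::vector standing in for the use list:

// Erasing a single occurrence of a repeated entry (std::vector stands in for
// the VPValue use list).
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  int A = 1, B = 2;
  std::vector<int *> Users = {&A, &B, &A};  // the same "user" appears twice
  auto I = std::find(Users.begin(), Users.end(), &A);
  if (I != Users.end())
    Users.erase(I);                         // removes one &A, keeps the other
  std::printf("size=%zu, last is still A: %d\n", Users.size(),
              Users.back() == &A);
  return 0;
}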
typedef SmallVectorImpl<VPUser *>::iterator user_iterator;
@@ -163,6 +156,13 @@ public:
void replaceAllUsesWith(VPValue *New);
+ /// Go through the uses list for this VPValue and make each use point to \p
+ /// New if the callback ShouldReplace returns true for the given use specified
+ /// by a pair of (VPUser, the use index).
+ void replaceUsesWithIf(
+ VPValue *New,
+ llvm::function_ref<bool(VPUser &U, unsigned Idx)> ShouldReplace);
+
/// Returns the recipe defining this VPValue or nullptr if it is not defined
/// by a recipe, i.e. is a live-in.
VPRecipeBase *getDefiningRecipe();
@@ -296,6 +296,14 @@ public:
"Op must be an operand of the recipe");
return false;
}
+
+ /// Returns true if the VPUser only uses the first part of operand \p Op.
+ /// Conservatively returns false.
+ virtual bool onlyFirstPartUsed(const VPValue *Op) const {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return false;
+ }
};
/// This class augments a recipe with a set of VPValues defined by the recipe.
@@ -325,7 +333,7 @@ class VPDef {
assert(V->Def == this && "can only remove VPValue linked with this VPDef");
assert(is_contained(DefinedValues, V) &&
"VPValue to remove must be in DefinedValues");
- erase_value(DefinedValues, V);
+ llvm::erase(DefinedValues, V);
V->Def = nullptr;
}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 13464c9d3496..f18711ba30b7 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -13,6 +13,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/VectorCombine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -28,6 +30,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include <numeric>
+#include <queue>
#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -100,8 +103,9 @@ private:
Instruction &I);
bool foldExtractExtract(Instruction &I);
bool foldInsExtFNeg(Instruction &I);
- bool foldBitcastShuf(Instruction &I);
+ bool foldBitcastShuffle(Instruction &I);
bool scalarizeBinopOrCmp(Instruction &I);
+ bool scalarizeVPIntrinsic(Instruction &I);
bool foldExtractedCmps(Instruction &I);
bool foldSingleElementStore(Instruction &I);
bool scalarizeLoadExtract(Instruction &I);
@@ -258,8 +262,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// It is safe and potentially profitable to load a vector directly:
// inselt undef, load Scalar, 0 --> load VecPtr
IRBuilder<> Builder(Load);
- Value *CastedPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
- SrcPtr, MinVecTy->getPointerTo(AS));
+ Value *CastedPtr =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
VecLd = Builder.CreateShuffleVector(VecLd, Mask);
@@ -321,7 +325,7 @@ bool VectorCombine::widenSubvectorLoad(Instruction &I) {
IRBuilder<> Builder(Load);
Value *CastedPtr =
- Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Ty->getPointerTo(AS));
+ Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
replaceValue(I, *VecLd);
++NumVecLoad;
@@ -677,7 +681,7 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
-bool VectorCombine::foldBitcastShuf(Instruction &I) {
+bool VectorCombine::foldBitcastShuffle(Instruction &I) {
Value *V;
ArrayRef<int> Mask;
if (!match(&I, m_BitCast(
@@ -687,35 +691,43 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
// 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
// scalable type is unknown; Second, we cannot reason if the narrowed shuffle
// mask for scalable type is a splat or not.
- // 2) Disallow non-vector casts and length-changing shuffles.
+ // 2) Disallow non-vector casts.
// TODO: We could allow any shuffle.
+ auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
- if (!SrcTy || I.getOperand(0)->getType() != SrcTy)
+ if (!DestTy || !SrcTy)
+ return false;
+
+ unsigned DestEltSize = DestTy->getScalarSizeInBits();
+ unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
+ if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
return false;
- auto *DestTy = cast<FixedVectorType>(I.getType());
- unsigned DestNumElts = DestTy->getNumElements();
- unsigned SrcNumElts = SrcTy->getNumElements();
SmallVector<int, 16> NewMask;
- if (SrcNumElts <= DestNumElts) {
+ if (DestEltSize <= SrcEltSize) {
// The bitcast is from wide to narrow/equal elements. The shuffle mask can
// always be expanded to the equivalent form choosing narrower elements.
- assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
- unsigned ScaleFactor = DestNumElts / SrcNumElts;
+ assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
+ unsigned ScaleFactor = SrcEltSize / DestEltSize;
narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
} else {
// The bitcast is from narrow elements to wide elements. The shuffle mask
// must choose consecutive elements to allow casting first.
- assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask");
- unsigned ScaleFactor = SrcNumElts / DestNumElts;
+ assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
+ unsigned ScaleFactor = DestEltSize / SrcEltSize;
if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
return false;
}
+  // Bitcast the shuffle src - keep its original width but use the destination
+  // scalar type.
+ unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
+ auto *ShuffleTy = FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
+
// The new shuffle must not cost more than the old shuffle. The bitcast is
// moved ahead of the shuffle, so assume that it has the same cost as before.
InstructionCost DestCost = TTI.getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc, DestTy, NewMask);
+ TargetTransformInfo::SK_PermuteSingleSrc, ShuffleTy, NewMask);
InstructionCost SrcCost =
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy, Mask);
if (DestCost > SrcCost || !DestCost.isValid())
@@ -723,12 +735,131 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
// bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
++NumShufOfBitcast;
- Value *CastV = Builder.CreateBitCast(V, DestTy);
+ Value *CastV = Builder.CreateBitCast(V, ShuffleTy);
Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
replaceValue(I, *Shuf);
return true;
}
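
The element-size based scaling can be followed on a concrete shape. Take a shuffle of <4 x i32> whose result is bitcast to <8 x i16>: the destination elements are narrower, so ScaleFactor is 2, every original mask index expands to two consecutive narrow indices, and the pre-shuffle bitcast target keeps the source's 128 bits as <8 x i16>. A sketch of the arithmetic, spelling out the mask narrowing for defined (non-undef) mask elements only:

// Worked example of the scaling in foldBitcastShuffle for <4 x i32> -> <8 x i16>
// (undef mask elements are handled separately by the real helpers).
#include <cstdio>
#include <vector>

int main() {
  const unsigned SrcEltSize = 32, DestEltSize = 16;
  const unsigned SrcBits = 4 * SrcEltSize;                 // 128-bit source vector
  const unsigned ScaleFactor = SrcEltSize / DestEltSize;   // 2
  const unsigned NumSrcElts = SrcBits / DestEltSize;       // 8 -> shuffle on <8 x i16>
  std::vector<int> Mask = {3, 1, 2, 0};                    // original i32 mask
  std::vector<int> NewMask;                                // narrowed i16 mask
  for (int M : Mask)
    for (unsigned I = 0; I < ScaleFactor; ++I)
      NewMask.push_back(M * (int)ScaleFactor + (int)I);    // 3 -> 6,7  1 -> 2,3 ...
  std::printf("shuffle type: <%u x i16>, new mask:", NumSrcElts);
  for (int M : NewMask)
    std::printf(" %d", M);
  std::printf("\n");
  return 0;
}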
+/// VP Intrinsics whose vector operands are both splat values may be simplified
+/// into the scalar version of the operation and the result splatted. This
+/// can lead to scalarization down the line.
+bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
+ if (!isa<VPIntrinsic>(I))
+ return false;
+ VPIntrinsic &VPI = cast<VPIntrinsic>(I);
+ Value *Op0 = VPI.getArgOperand(0);
+ Value *Op1 = VPI.getArgOperand(1);
+
+ if (!isSplatValue(Op0) || !isSplatValue(Op1))
+ return false;
+
+ // Check getSplatValue early in this function, to avoid doing unnecessary
+ // work.
+ Value *ScalarOp0 = getSplatValue(Op0);
+ Value *ScalarOp1 = getSplatValue(Op1);
+ if (!ScalarOp0 || !ScalarOp1)
+ return false;
+
+ // For the binary VP intrinsics supported here, the result on disabled lanes
+ // is a poison value. For now, only do this simplification if all lanes
+ // are active.
+ // TODO: Relax the condition that all lanes are active by using insertelement
+ // on inactive lanes.
+ auto IsAllTrueMask = [](Value *MaskVal) {
+ if (Value *SplattedVal = getSplatValue(MaskVal))
+ if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
+ return ConstValue->isAllOnesValue();
+ return false;
+ };
+ if (!IsAllTrueMask(VPI.getArgOperand(2)))
+ return false;
+
+ // Check to make sure we support scalarization of the intrinsic
+ Intrinsic::ID IntrID = VPI.getIntrinsicID();
+ if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
+ return false;
+
+ // Calculate cost of splatting both operands into vectors and the vector
+ // intrinsic
+ VectorType *VecTy = cast<VectorType>(VPI.getType());
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost SplatCost =
+ TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
+
+ // Calculate the cost of the VP Intrinsic
+ SmallVector<Type *, 4> Args;
+ for (Value *V : VPI.args())
+ Args.push_back(V->getType());
+ IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
+ InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
+ InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
+
+ // Determine scalar opcode
+ std::optional<unsigned> FunctionalOpcode =
+ VPI.getFunctionalOpcode();
+ std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
+ if (!FunctionalOpcode) {
+ ScalarIntrID = VPI.getFunctionalIntrinsicID();
+ if (!ScalarIntrID)
+ return false;
+ }
+
+ // Calculate cost of scalarizing
+ InstructionCost ScalarOpCost = 0;
+ if (ScalarIntrID) {
+ IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
+ ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
+ } else {
+ ScalarOpCost =
+ TTI.getArithmeticInstrCost(*FunctionalOpcode, VecTy->getScalarType());
+ }
+
+ // The existing splats may be kept around if other instructions use them.
+ InstructionCost CostToKeepSplats =
+ (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
+ InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
+
+ LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
+ << "\n");
+ LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
+ << ", Cost of scalarizing:" << NewCost << "\n");
+
+ // We want to scalarize unless the vector variant actually has lower cost.
+ if (OldCost < NewCost || !NewCost.isValid())
+ return false;
+
+ // Scalarize the intrinsic
+ ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
+ Value *EVL = VPI.getArgOperand(3);
+ const DataLayout &DL = VPI.getModule()->getDataLayout();
+
+ // If the VP op might introduce UB or poison, we can scalarize it provided
+ // that we know the EVL > 0: If the EVL is zero, then the original VP op
+ // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
+ // scalarizing it.
+ bool SafeToSpeculate;
+ if (ScalarIntrID)
+ SafeToSpeculate = Intrinsic::getAttributes(I.getContext(), *ScalarIntrID)
+ .hasFnAttr(Attribute::AttrKind::Speculatable);
+ else
+ SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
+ *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
+ if (!SafeToSpeculate && !isKnownNonZero(EVL, DL, 0, &AC, &VPI, &DT))
+ return false;
+
+ Value *ScalarVal =
+ ScalarIntrID
+ ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
+ {ScalarOp0, ScalarOp1})
+ : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
+ ScalarOp0, ScalarOp1);
+
+ replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
+ return true;
+}
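
The profitability test compares building two splats plus the vector VP operation against one scalar operation, one splat of the result, and the cost of any operand splats that must be kept because they have other users. With made-up unit costs (the real numbers come from TTI) the comparison looks like:

// Shape of the scalarizeVPIntrinsic cost comparison; all costs are invented
// placeholders, not TTI results.
#include <cstdio>

int main() {
  const int SplatCost = 3;      // insertelement + broadcast shuffle
  const int VectorOpCost = 4;   // the vector VP intrinsic
  const int ScalarOpCost = 1;   // the scalar binop / scalar intrinsic
  const bool Op0HasOtherUses = false, Op1HasOtherUses = false;
  const int OldCost = 2 * SplatCost + VectorOpCost;
  const int CostToKeepSplats =
      SplatCost * Op0HasOtherUses + SplatCost * Op1HasOtherUses;
  const int NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
  std::printf("old=%d new=%d -> scalarize: %d\n", OldCost, NewCost,
              !(OldCost < NewCost));
  return 0;
}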
+
/// Match a vector binop or compare instruction with at least one inserted
/// scalar operand and convert to scalar binop/cmp followed by insertelement.
bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
@@ -1013,19 +1144,24 @@ public:
/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
/// Idx. \p Idx must access a valid vector element.
-static ScalarizationResult canScalarizeAccess(FixedVectorType *VecTy,
- Value *Idx, Instruction *CtxI,
+static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
+ Instruction *CtxI,
AssumptionCache &AC,
const DominatorTree &DT) {
+ // We do checks for both fixed vector types and scalable vector types.
+ // This is the number of elements of fixed vector types,
+ // or the minimum number of elements of scalable vector types.
+ uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
+
if (auto *C = dyn_cast<ConstantInt>(Idx)) {
- if (C->getValue().ult(VecTy->getNumElements()))
+ if (C->getValue().ult(NumElements))
return ScalarizationResult::safe();
return ScalarizationResult::unsafe();
}
unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
APInt Zero(IntWidth, 0);
- APInt MaxElts(IntWidth, VecTy->getNumElements());
+ APInt MaxElts(IntWidth, NumElements);
ConstantRange ValidIndices(Zero, MaxElts);
ConstantRange IdxRange(IntWidth, true);
@@ -1074,8 +1210,7 @@ static Align computeAlignmentAfterScalarization(Align VectorAlignment,
// store i32 %b, i32* %1
bool VectorCombine::foldSingleElementStore(Instruction &I) {
auto *SI = cast<StoreInst>(&I);
- if (!SI->isSimple() ||
- !isa<FixedVectorType>(SI->getValueOperand()->getType()))
+ if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
return false;
// TODO: Combine more complicated patterns (multiple insert) by referencing
@@ -1089,13 +1224,13 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
return false;
if (auto *Load = dyn_cast<LoadInst>(Source)) {
- auto VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
+ auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
const DataLayout &DL = I.getModule()->getDataLayout();
Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
// Don't optimize for atomic/volatile load or store. Ensure memory is not
// modified between, vector type matches store size, and index is inbounds.
if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
- !DL.typeSizeEqualsStoreSize(Load->getType()) ||
+ !DL.typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
SrcAddr != SI->getPointerOperand()->stripPointerCasts())
return false;
@@ -1130,19 +1265,26 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
if (!match(&I, m_Load(m_Value(Ptr))))
return false;
- auto *FixedVT = cast<FixedVectorType>(I.getType());
+ auto *VecTy = cast<VectorType>(I.getType());
auto *LI = cast<LoadInst>(&I);
const DataLayout &DL = I.getModule()->getDataLayout();
- if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(FixedVT))
+ if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(VecTy->getScalarType()))
return false;
InstructionCost OriginalCost =
- TTI.getMemoryOpCost(Instruction::Load, FixedVT, LI->getAlign(),
+ TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
LI->getPointerAddressSpace());
InstructionCost ScalarizedCost = 0;
Instruction *LastCheckedInst = LI;
unsigned NumInstChecked = 0;
+ DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
+ auto FailureGuard = make_scope_exit([&]() {
+ // If the transform is aborted, discard the ScalarizationResults.
+ for (auto &Pair : NeedFreeze)
+ Pair.second.discard();
+ });
+
// Check if all users of the load are extracts with no memory modifications
// between the load and the extract. Compute the cost of both the original
// code and the scalarized version.
@@ -1151,9 +1293,6 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
if (!UI || UI->getParent() != LI->getParent())
return false;
- if (!isGuaranteedNotToBePoison(UI->getOperand(1), &AC, LI, &DT))
- return false;
-
// Check if any instruction between the load and the extract may modify
// memory.
if (LastCheckedInst->comesBefore(UI)) {
@@ -1168,22 +1307,23 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
LastCheckedInst = UI;
}
- auto ScalarIdx = canScalarizeAccess(FixedVT, UI->getOperand(1), &I, AC, DT);
- if (!ScalarIdx.isSafe()) {
- // TODO: Freeze index if it is safe to do so.
- ScalarIdx.discard();
+ auto ScalarIdx = canScalarizeAccess(VecTy, UI->getOperand(1), &I, AC, DT);
+ if (ScalarIdx.isUnsafe())
return false;
+ if (ScalarIdx.isSafeWithFreeze()) {
+ NeedFreeze.try_emplace(UI, ScalarIdx);
+ ScalarIdx.discard();
}
auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
OriginalCost +=
- TTI.getVectorInstrCost(Instruction::ExtractElement, FixedVT, CostKind,
+ TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
Index ? Index->getZExtValue() : -1);
ScalarizedCost +=
- TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(),
+ TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
Align(1), LI->getPointerAddressSpace());
- ScalarizedCost += TTI.getAddressComputationCost(FixedVT->getElementType());
+ ScalarizedCost += TTI.getAddressComputationCost(VecTy->getElementType());
}
if (ScalarizedCost >= OriginalCost)
@@ -1192,21 +1332,27 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
// Replace extracts with narrow scalar loads.
for (User *U : LI->users()) {
auto *EI = cast<ExtractElementInst>(U);
- Builder.SetInsertPoint(EI);
-
Value *Idx = EI->getOperand(1);
+
+ // Insert 'freeze' for poison indexes.
+ auto It = NeedFreeze.find(EI);
+ if (It != NeedFreeze.end())
+ It->second.freeze(Builder, *cast<Instruction>(Idx));
+
+ Builder.SetInsertPoint(EI);
Value *GEP =
- Builder.CreateInBoundsGEP(FixedVT, Ptr, {Builder.getInt32(0), Idx});
+ Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
auto *NewLoad = cast<LoadInst>(Builder.CreateLoad(
- FixedVT->getElementType(), GEP, EI->getName() + ".scalar"));
+ VecTy->getElementType(), GEP, EI->getName() + ".scalar"));
Align ScalarOpAlignment = computeAlignmentAfterScalarization(
- LI->getAlign(), FixedVT->getElementType(), Idx, DL);
+ LI->getAlign(), VecTy->getElementType(), Idx, DL);
NewLoad->setAlignment(ScalarOpAlignment);
replaceValue(*EI, *NewLoad);
}
+ FailureGuard.release();
return true;
}
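
In scalar terms the rewrite trades one wide load feeding per-element extracts for one narrow load per extracted element at that element's address, which is what the cost comparison above weighs. A plain-C++ analogy, assuming a four-element i32 vector in memory and an in-bounds (or frozen) index:

// Scalar analogy of scalarizeLoadExtract (a 4 x i32 array stands in for the
// vector; the index is assumed in bounds).
#include <cstdint>
#include <cstdio>

static int32_t extractAfterVectorLoad(const int32_t *Ptr, unsigned Idx) {
  int32_t Lanes[4];
  for (unsigned I = 0; I < 4; ++I)  // models the wide vector load
    Lanes[I] = Ptr[I];
  return Lanes[Idx];                // models the extractelement
}

static int32_t scalarizedAccess(const int32_t *Ptr, unsigned Idx) {
  return Ptr[Idx];                  // models GEP + narrow scalar load
}

int main() {
  int32_t Mem[4] = {10, 20, 30, 40};
  std::printf("%d %d\n", extractAfterVectorLoad(Mem, 2), scalarizedAccess(Mem, 2));
  return 0;
}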
@@ -1340,21 +1486,28 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
if (!ShuffleInputType)
return false;
- int NumInputElts = ShuffleInputType->getNumElements();
+ unsigned NumInputElts = ShuffleInputType->getNumElements();
// Find the mask from sorting the lanes into order. This is most likely to
  // become an identity or concat mask. Undef elements are pushed to the end.
SmallVector<int> ConcatMask;
Shuffle->getShuffleMask(ConcatMask);
sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
+ // In the case of a truncating shuffle it's possible for the mask
+ // to have an index greater than the size of the resulting vector.
+ // This requires special handling.
+ bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts;
bool UsesSecondVec =
- any_of(ConcatMask, [&](int M) { return M >= NumInputElts; });
+ any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
+
+ FixedVectorType *VecTyForCost =
+ (UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType;
InstructionCost OldCost = TTI.getShuffleCost(
- UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
- Shuffle->getShuffleMask());
+ UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
+ VecTyForCost, Shuffle->getShuffleMask());
InstructionCost NewCost = TTI.getShuffleCost(
- UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
- ConcatMask);
+ UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
+ VecTyForCost, ConcatMask);
LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
<< "\n");
@@ -1657,16 +1810,16 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
return SSV->getOperand(Op);
return SV->getOperand(Op);
};
- Builder.SetInsertPoint(SVI0A->getInsertionPointAfterDef());
+ Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
GetShuffleOperand(SVI0A, 1), V1A);
- Builder.SetInsertPoint(SVI0B->getInsertionPointAfterDef());
+ Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
GetShuffleOperand(SVI0B, 1), V1B);
- Builder.SetInsertPoint(SVI1A->getInsertionPointAfterDef());
+ Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
GetShuffleOperand(SVI1A, 1), V2A);
- Builder.SetInsertPoint(SVI1B->getInsertionPointAfterDef());
+ Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
GetShuffleOperand(SVI1B, 1), V2B);
Builder.SetInsertPoint(Op0);
@@ -1723,9 +1876,6 @@ bool VectorCombine::run() {
case Instruction::ShuffleVector:
MadeChange |= widenSubvectorLoad(I);
break;
- case Instruction::Load:
- MadeChange |= scalarizeLoadExtract(I);
- break;
default:
break;
}
@@ -1733,13 +1883,15 @@ bool VectorCombine::run() {
// This transform works with scalable and fixed vectors
// TODO: Identify and allow other scalable transforms
- if (isa<VectorType>(I.getType()))
+ if (isa<VectorType>(I.getType())) {
MadeChange |= scalarizeBinopOrCmp(I);
+ MadeChange |= scalarizeLoadExtract(I);
+ MadeChange |= scalarizeVPIntrinsic(I);
+ }
if (Opcode == Instruction::Store)
MadeChange |= foldSingleElementStore(I);
-
// If this is an early pipeline invocation of this pass, we are done.
if (TryEarlyFoldsOnly)
return;
@@ -1758,7 +1910,7 @@ bool VectorCombine::run() {
MadeChange |= foldSelectShuffle(I);
break;
case Instruction::BitCast:
- MadeChange |= foldBitcastShuf(I);
+ MadeChange |= foldBitcastShuffle(I);
break;
}
} else {