author    Dimitry Andric <dim@FreeBSD.org>    2021-12-25 22:36:56 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2022-05-14 11:44:01 +0000
commit    0eae32dcef82f6f06de6419a0d623d7def0cc8f6 (patch)
tree      55b7e05be47b835fd137915bee1e64026c35e71c /contrib/llvm-project/llvm/lib/Transforms/Vectorize
parent    4824e7fd18a1223177218d4aec1b3c6c5c4a444e (diff)
parent    77fc4c146f0870ffb09c1afb823ccbe742c5e6ff (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 89
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h | 38
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 463
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 633
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp | 9
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h | 118
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp | 6
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp | 5
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 49
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 21
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 26
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 6
12 files changed, 912 insertions, 551 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 805011191da0..81e5aa223c07 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -55,22 +55,23 @@ static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
cl::desc("The maximum number of SCEV checks allowed with a "
"vectorize(enable) pragma"));
-// FIXME: When scalable vectorization is stable enough, change the default
-// to SK_PreferFixedWidth.
-static cl::opt<LoopVectorizeHints::ScalableForceKind> ScalableVectorization(
- "scalable-vectorization", cl::init(LoopVectorizeHints::SK_FixedWidthOnly),
- cl::Hidden,
- cl::desc("Control whether the compiler can use scalable vectors to "
- "vectorize a loop"),
- cl::values(
- clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
- "Scalable vectorization is disabled."),
- clEnumValN(LoopVectorizeHints::SK_PreferFixedWidth, "on",
- "Scalable vectorization is available, but favor fixed-width "
- "vectorization when the cost is inconclusive."),
- clEnumValN(LoopVectorizeHints::SK_PreferScalable, "preferred",
- "Scalable vectorization is available and favored when the "
- "cost is inconclusive.")));
+static cl::opt<LoopVectorizeHints::ScalableForceKind>
+ ForceScalableVectorization(
+ "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified),
+ cl::Hidden,
+ cl::desc("Control whether the compiler can use scalable vectors to "
+ "vectorize a loop"),
+ cl::values(
+ clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
+ "Scalable vectorization is disabled."),
+ clEnumValN(
+ LoopVectorizeHints::SK_PreferScalable, "preferred",
+ "Scalable vectorization is available and favored when the "
+ "cost is inconclusive."),
+ clEnumValN(
+ LoopVectorizeHints::SK_PreferScalable, "on",
+ "Scalable vectorization is available and favored when the "
+ "cost is inconclusive.")));
/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
@@ -95,7 +96,8 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) {
LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
bool InterleaveOnlyWhenForced,
- OptimizationRemarkEmitter &ORE)
+ OptimizationRemarkEmitter &ORE,
+ const TargetTransformInfo *TTI)
: Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
Force("vectorize.enable", FK_Undefined, HK_FORCE),
@@ -110,14 +112,32 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
if (VectorizerParams::isInterleaveForced())
Interleave.Value = VectorizerParams::VectorizationInterleave;
+ // If the metadata doesn't explicitly specify whether to enable scalable
+ // vectorization, then decide based on the following criteria (increasing
+ // level of priority):
+ // - Target default
+ // - Metadata width
+ // - Force option (always overrides)
+ if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
+ if (TTI)
+ Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
+ : SK_FixedWidthOnly;
+
+ if (Width.Value)
+ // If the width is set, but the metadata says nothing about the scalable
+ // property, then assume it concerns only a fixed-width UserVF.
+ // If width is not set, the flag takes precedence.
+ Scalable.Value = SK_FixedWidthOnly;
+ }
+
+ // If the flag is set to force any use of scalable vectors, override the loop
+ // hints.
+ if (ForceScalableVectorization.getValue() !=
+ LoopVectorizeHints::SK_Unspecified)
+ Scalable.Value = ForceScalableVectorization.getValue();
+
+ // Scalable vectorization is disabled if no preference is specified.
if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
- // If the width is set, but the metadata says nothing about the scalable
- // property, then assume it concerns only a fixed-width UserVF.
- // If width is not set, the flag takes precedence.
- Scalable.Value = Width.Value ? SK_FixedWidthOnly : ScalableVectorization;
- else if (ScalableVectorization == SK_FixedWidthOnly)
- // If the flag is set to disable any use of scalable vectors, override the
- // loop hint.
Scalable.Value = SK_FixedWidthOnly;
if (IsVectorized.Value != 1)
@@ -929,7 +949,7 @@ bool LoopVectorizationLegality::canVectorizeFPMath(
}));
}
-bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
+bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
Value *In0 = const_cast<Value *>(V);
PHINode *PN = dyn_cast_or_null<PHINode>(In0);
if (!PN)
@@ -938,16 +958,29 @@ bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
return Inductions.count(PN);
}
-bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
+const InductionDescriptor *
+LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
+ if (!isInductionPhi(Phi))
+ return nullptr;
+ auto &ID = getInductionVars().find(Phi)->second;
+ if (ID.getKind() == InductionDescriptor::IK_IntInduction ||
+ ID.getKind() == InductionDescriptor::IK_FpInduction)
+ return &ID;
+ return nullptr;
+}
+
+bool LoopVectorizationLegality::isCastedInductionVariable(
+ const Value *V) const {
auto *Inst = dyn_cast<Instruction>(V);
return (Inst && InductionCastsToIgnore.count(Inst));
}
-bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+bool LoopVectorizationLegality::isInductionVariable(const Value *V) const {
return isInductionPhi(V) || isCastedInductionVariable(V);
}
-bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
+bool LoopVectorizationLegality::isFirstOrderRecurrence(
+ const PHINode *Phi) const {
return FirstOrderRecurrences.count(Phi);
}
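
The LoopVectorizationLegality.cpp hunks above move the scalable-vectorization decision into the LoopVectorizeHints constructor: the preference is now resolved from, in increasing priority, the target default (TTI->enableScalableVectorization()), the loop-metadata width, and the -scalable-vectorization= flag, whose default becomes SK_Unspecified and whose "on" and "preferred" values both map to SK_PreferScalable. The following is a minimal standalone sketch of that resolution order; resolveScalable() and the trimmed-down enum are hypothetical stand-ins, not LLVM code.

#include <cassert>

// Hypothetical helper modelling the priority order used by the patched
// LoopVectorizeHints constructor; the enum is a simplified stand-in for
// LoopVectorizeHints::ScalableForceKind.
enum ScalableForceKind { SK_Unspecified, SK_FixedWidthOnly, SK_PreferScalable };

static ScalableForceKind resolveScalable(ScalableForceKind FromMetadata,
                                         unsigned MetadataWidth,
                                         bool TargetEnablesScalable,
                                         ScalableForceKind FromFlag) {
  ScalableForceKind Result = FromMetadata;
  if (Result == SK_Unspecified) {
    // Lowest priority: the target default.
    Result = TargetEnablesScalable ? SK_PreferScalable : SK_FixedWidthOnly;
    // A metadata width with no scalable property implies a fixed-width UserVF.
    if (MetadataWidth)
      Result = SK_FixedWidthOnly;
  }
  // Highest priority: an explicit -scalable-vectorization= flag.
  if (FromFlag != SK_Unspecified)
    Result = FromFlag;
  return Result;
}

int main() {
  // Metadata is silent and the target enables scalable vectors.
  assert(resolveScalable(SK_Unspecified, 0, true, SK_Unspecified) ==
         SK_PreferScalable);
  // A user width without a scalable hint stays fixed-width...
  assert(resolveScalable(SK_Unspecified, 4, true, SK_Unspecified) ==
         SK_FixedWidthOnly);
  // ...unless the command-line flag overrides it.
  assert(resolveScalable(SK_Unspecified, 4, true, SK_PreferScalable) ==
         SK_PreferScalable);
  return 0;
}
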
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index a7d6609f8c56..71eb39a18d2f 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -45,16 +45,17 @@ class VPBuilder {
VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
VPInstruction *createInstruction(unsigned Opcode,
- ArrayRef<VPValue *> Operands) {
- VPInstruction *Instr = new VPInstruction(Opcode, Operands);
+ ArrayRef<VPValue *> Operands, DebugLoc DL) {
+ VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL);
if (BB)
BB->insert(Instr, InsertPt);
return Instr;
}
VPInstruction *createInstruction(unsigned Opcode,
- std::initializer_list<VPValue *> Operands) {
- return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
+ std::initializer_list<VPValue *> Operands,
+ DebugLoc DL) {
+ return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL);
}
public:
@@ -123,30 +124,33 @@ public:
/// its underlying Instruction.
VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
Instruction *Inst = nullptr) {
- VPInstruction *NewVPInst = createInstruction(Opcode, Operands);
+ DebugLoc DL;
+ if (Inst)
+ DL = Inst->getDebugLoc();
+ VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL);
NewVPInst->setUnderlyingValue(Inst);
return NewVPInst;
}
- VPValue *createNaryOp(unsigned Opcode,
- std::initializer_list<VPValue *> Operands,
- Instruction *Inst = nullptr) {
- return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst);
+ VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
+ DebugLoc DL) {
+ return createInstruction(Opcode, Operands, DL);
}
- VPValue *createNot(VPValue *Operand) {
- return createInstruction(VPInstruction::Not, {Operand});
+ VPValue *createNot(VPValue *Operand, DebugLoc DL) {
+ return createInstruction(VPInstruction::Not, {Operand}, DL);
}
- VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
- return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
+ VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL) {
+ return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL);
}
- VPValue *createOr(VPValue *LHS, VPValue *RHS) {
- return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
+ VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL) {
+ return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL);
}
- VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) {
- return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal});
+ VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal,
+ DebugLoc DL) {
+ return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL);
}
//===--------------------------------------------------------------------===//
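
The LoopVectorizationPlanner.h hunks above thread an explicit DebugLoc through every VPBuilder helper instead of creating VPInstructions without a location. Below is a simplified, self-contained sketch of that pattern; the classes are stand-ins for the real VPlan types, not the LLVM definitions.

#include <initializer_list>
#include <memory>
#include <utility>
#include <vector>

// Stand-ins only; the point is the shape of the API: every creation helper
// now takes an explicit DebugLoc.
struct DebugLoc { unsigned Line = 0; };
struct VPValue { virtual ~VPValue() = default; };

struct VPInstruction : VPValue {
  unsigned Opcode;
  std::vector<VPValue *> Operands;
  DebugLoc DL; // each new instruction records where it came from
  VPInstruction(unsigned Opc, std::vector<VPValue *> Ops, DebugLoc D)
      : Opcode(Opc), Operands(std::move(Ops)), DL(D) {}
};

class VPBuilder {
  std::vector<std::unique_ptr<VPInstruction>> BB; // current "basic block"

  VPInstruction *createInstruction(unsigned Opcode,
                                   std::vector<VPValue *> Operands,
                                   DebugLoc DL) {
    BB.push_back(
        std::make_unique<VPInstruction>(Opcode, std::move(Operands), DL));
    return BB.back().get();
  }

public:
  // Callers such as createEdgeMask() pass the location of the IR being
  // widened (e.g. the branch's debug location), so generated mask
  // instructions keep a meaningful source location.
  VPValue *createNot(VPValue *Op, DebugLoc DL) {
    return createInstruction(/*Opcode=*/0, {Op}, DL);
  }
  VPValue *createAnd(VPValue *L, VPValue *R, DebugLoc DL) {
    return createInstruction(/*Opcode=*/1, {L, R}, DL);
  }
};

int main() {
  VPBuilder Builder;
  VPValue A, B;
  DebugLoc Loc;
  Loc.Line = 42; // pretend this came from the original branch instruction
  Builder.createAnd(Builder.createNot(&A, Loc), &B, Loc);
  return 0;
}
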
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5ca0adb4242c..4747f34fcc62 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -428,6 +428,8 @@ class GeneratedRTChecks;
namespace llvm {
+AnalysisKey ShouldRunExtraVectorPasses::Key;
+
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
@@ -506,8 +508,8 @@ public:
/// Widen an integer or floating-point induction variable \p IV. If \p Trunc
/// is provided, the integer induction variable will first be truncated to
/// the corresponding type.
- void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
- VPValue *Def, VPValue *CastDef,
+ void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
+ Value *Start, TruncInst *Trunc, VPValue *Def,
VPTransformState &State);
/// Construct the vector value of a scalarized value \p V one lane at a time.
@@ -534,7 +536,7 @@ public:
/// Returns true if the reordering of FP operations is not allowed, but we are
/// able to vectorize with strict in-order reductions for the given RdxDesc.
- bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);
+ bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
/// Create a broadcast instruction. This method generates a broadcast
/// instruction (shuffle) for loop invariant values and for the induction
@@ -619,7 +621,7 @@ protected:
/// can also be a truncate instruction.
void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
const InductionDescriptor &ID, VPValue *Def,
- VPValue *CastDef, VPTransformState &State);
+ VPTransformState &State);
/// Create a vector induction phi node based on an existing scalar one. \p
/// EntryVal is the value from the original loop that maps to the vector phi
@@ -629,7 +631,6 @@ protected:
void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
Value *Step, Value *Start,
Instruction *EntryVal, VPValue *Def,
- VPValue *CastDef,
VPTransformState &State);
/// Returns true if an instruction \p I should be scalarized instead of
@@ -639,29 +640,6 @@ protected:
/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;
- /// If there is a cast involved in the induction variable \p ID, which should
- /// be ignored in the vectorized loop body, this function records the
- /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
- /// cast. We had already proved that the casted Phi is equal to the uncasted
- /// Phi in the vectorized loop (under a runtime guard), and therefore
- /// there is no need to vectorize the cast - the same value can be used in the
- /// vector loop for both the Phi and the cast.
- /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
- /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
- ///
- /// \p EntryVal is the value from the original loop that maps to the vector
- /// phi node and is used to distinguish what is the IV currently being
- /// processed - original one (if \p EntryVal is a phi corresponding to the
- /// original IV) or the "newly-created" one based on the proof mentioned above
- /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
- /// latter case \p EntryVal is a TruncInst and we must not record anything for
- /// that IV, but it's error-prone to expect callers of this routine to care
- /// about that, hence this explicit parameter.
- void recordVectorLoopValueForInductionCast(
- const InductionDescriptor &ID, const Instruction *EntryVal,
- Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
- unsigned Part, unsigned Lane = UINT_MAX);
-
/// Generate a shuffle sequence that will reverse the vector Vec.
virtual Value *reverseVector(Value *Vec);
@@ -698,7 +676,8 @@ protected:
/// flags, which can be found from the original scalar operations.
Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
const DataLayout &DL,
- const InductionDescriptor &ID) const;
+ const InductionDescriptor &ID,
+ BasicBlock *VectorHeader) const;
/// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
/// vector loop preheader, middle block and scalar preheader. Also
@@ -1728,7 +1707,8 @@ private:
/// disabled or unsupported, then the scalable part will be equal to
/// ElementCount::getScalable(0).
FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
- ElementCount UserVF);
+ ElementCount UserVF,
+ bool FoldTailByMasking);
/// \return the maximized element count based on the targets vector
/// registers and the loop trip-count, but limited to a maximum safe VF.
@@ -1741,7 +1721,8 @@ private:
ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
unsigned SmallestType,
unsigned WidestType,
- const ElementCount &MaxSafeVF);
+ const ElementCount &MaxSafeVF,
+ bool FoldTailByMasking);
/// \return the maximum legal scalable VF, based on the safe max number
/// of elements.
@@ -2356,8 +2337,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
const InductionDescriptor &II, Value *Step, Value *Start,
- Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
- VPTransformState &State) {
+ Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
+ IRBuilder<> &Builder = State.Builder;
assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
"Expected either an induction phi-node or a truncate of it!");
@@ -2373,7 +2354,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
}
Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
- Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
+ Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
Value *SteppedStart =
getStepVector(SplatStart, Zero, Step, II.getInductionOpcode());
@@ -2394,9 +2375,9 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
Type *StepType = Step->getType();
Value *RuntimeVF;
if (Step->getType()->isFloatingPointTy())
- RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF);
+ RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
else
- RuntimeVF = getRuntimeVF(Builder, StepType, VF);
+ RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
// Create a vector splat to use in the induction update.
@@ -2405,8 +2386,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
// handle a constant vector splat.
Value *SplatVF = isa<Constant>(Mul)
- ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
- : Builder.CreateVectorSplat(VF, Mul);
+ ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
+ : Builder.CreateVectorSplat(State.VF, Mul);
Builder.restoreIP(CurrIP);
// We may need to add the step a number of times, depending on the unroll
@@ -2420,8 +2401,6 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
if (isa<TruncInst>(EntryVal))
addMetadata(LastInduction, EntryVal);
- recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
- State, Part);
LastInduction = cast<Instruction>(
Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
@@ -2455,56 +2434,21 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
return llvm::any_of(IV->users(), isScalarInst);
}
-void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
- const InductionDescriptor &ID, const Instruction *EntryVal,
- Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
- unsigned Part, unsigned Lane) {
- assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
- "Expected either an induction phi-node or a truncate of it!");
-
- // This induction variable is not the phi from the original loop but the
- // newly-created IV based on the proof that casted Phi is equal to the
- // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
- // re-uses the same InductionDescriptor that original IV uses but we don't
- // have to do any recording in this case - that is done when original IV is
- // processed.
- if (isa<TruncInst>(EntryVal))
- return;
-
- if (!CastDef) {
- assert(ID.getCastInsts().empty() &&
- "there are casts for ID, but no CastDef");
- return;
- }
- assert(!ID.getCastInsts().empty() &&
- "there is a CastDef, but no casts for ID");
- // Only the first Cast instruction in the Casts vector is of interest.
- // The rest of the Casts (if exist) have no uses outside the
- // induction update chain itself.
- if (Lane < UINT_MAX)
- State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
- else
- State.set(CastDef, VectorLoopVal, Part);
-}
-
-void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
- TruncInst *Trunc, VPValue *Def,
- VPValue *CastDef,
+void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
+ const InductionDescriptor &ID,
+ Value *Start, TruncInst *Trunc,
+ VPValue *Def,
VPTransformState &State) {
+ IRBuilder<> &Builder = State.Builder;
assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
"Primary induction variable must have an integer type");
-
- auto II = Legal->getInductionVars().find(IV);
- assert(II != Legal->getInductionVars().end() && "IV is not an induction");
-
- auto ID = II->second;
assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
// The value from the original loop to which we are mapping the new induction
// variable.
Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
- auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ auto &DL = EntryVal->getModule()->getDataLayout();
// Generate code for the induction step. Note that induction steps are
// required to be loop-invariant
@@ -2514,7 +2458,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
if (PSE.getSE()->isSCEVable(IV->getType())) {
SCEVExpander Exp(*PSE.getSE(), DL, "induction");
return Exp.expandCodeFor(Step, Step->getType(),
- LoopVectorPreHeader->getTerminator());
+ State.CFG.VectorPreHeader->getTerminator());
}
return cast<SCEVUnknown>(Step)->getValue();
};
@@ -2530,7 +2474,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
? Builder.CreateSExtOrTrunc(Induction, IV->getType())
: Builder.CreateCast(Instruction::SIToFP, Induction,
IV->getType());
- ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
+ ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
+ State.CFG.PrevBB);
ScalarIV->setName("offset.idx");
}
if (Trunc) {
@@ -2548,20 +2493,19 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part) {
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
Value *StartIdx;
if (Step->getType()->isFloatingPointTy())
- StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part);
+ StartIdx =
+ getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part);
else
- StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part);
+ StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part);
Value *EntryPart =
getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode());
State.set(Def, EntryPart, Part);
if (Trunc)
addMetadata(EntryPart, Trunc);
- recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
- State, Part);
}
};
@@ -2572,7 +2516,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
// Now do the actual transformations, and start with creating the step value.
Value *Step = CreateStepValue(ID.getStep());
- if (VF.isZero() || VF.isScalar()) {
+ if (State.VF.isZero() || State.VF.isScalar()) {
Value *ScalarIV = CreateScalarIV(Step);
CreateSplatIV(ScalarIV, Step);
return;
@@ -2583,8 +2527,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
// least one user in the loop that is not widened.
auto NeedsScalarIV = needsScalarInduction(EntryVal);
if (!NeedsScalarIV) {
- createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
- State);
+ createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
return;
}
@@ -2592,14 +2535,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
if (!shouldScalarizeInstruction(EntryVal)) {
- createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
- State);
+ createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
Value *ScalarIV = CreateScalarIV(Step);
// Create scalar steps that can be used by instructions we will later
// scalarize. Note that the addition of the scalar steps will not increase
// the number of instructions in the loop in the common case prior to
// InstCombine. We will be trading one vector extract for each scalar step.
- buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
return;
}
@@ -2609,7 +2551,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
Value *ScalarIV = CreateScalarIV(Step);
if (!Cost->isScalarEpilogueAllowed())
CreateSplatIV(ScalarIV, Step);
- buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
}
Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
@@ -2663,10 +2605,11 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
Instruction *EntryVal,
const InductionDescriptor &ID,
- VPValue *Def, VPValue *CastDef,
+ VPValue *Def,
VPTransformState &State) {
+ IRBuilder<> &Builder = State.Builder;
// We shouldn't have to build scalar steps if we aren't vectorizing.
- assert(VF.isVector() && "VF should be greater than one");
+ assert(State.VF.isVector() && "VF should be greater than one");
// Get the value type and ensure it and the step have the same integer type.
Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
assert(ScalarIVTy == Step->getType() &&
@@ -2688,33 +2631,32 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
// iteration. If EntryVal is uniform, we only need to generate the first
// lane. Otherwise, we generate all VF values.
bool IsUniform =
- Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
- unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
+ Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF);
+ unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
// Compute the scalar steps and save the results in State.
Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
ScalarIVTy->getScalarSizeInBits());
Type *VecIVTy = nullptr;
Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
- if (!IsUniform && VF.isScalable()) {
- VecIVTy = VectorType::get(ScalarIVTy, VF);
- UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
- SplatStep = Builder.CreateVectorSplat(VF, Step);
- SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
+ if (!IsUniform && State.VF.isScalable()) {
+ VecIVTy = VectorType::get(ScalarIVTy, State.VF);
+ UnitStepVec =
+ Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
+ SplatStep = Builder.CreateVectorSplat(State.VF, Step);
+ SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
}
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part);
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
- if (!IsUniform && VF.isScalable()) {
- auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
+ if (!IsUniform && State.VF.isScalable()) {
+ auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
if (ScalarIVTy->isFloatingPointTy())
InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
State.set(Def, Add, Part);
- recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
- Part);
// It's useful to record the lane values too for the known minimum number
// of elements so we do those below. This improves the code quality when
// trying to extract the first element, for example.
@@ -2728,14 +2670,12 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
// The step returned by `createStepForVF` is a runtime-evaluated value
// when VF is scalable. Otherwise, it should be folded into a Constant.
- assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
+ assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
"Expected StartIdx to be folded to a constant when VF is not "
"scalable");
auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
State.set(Def, Add, VPIteration(Part, Lane));
- recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
- Part, Lane);
}
}
}
@@ -3023,21 +2963,19 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
// instruction could feed a poison value to the base address of the widen
// load/store.
- if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0)
+ if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
Cloned->dropPoisonGeneratingFlags();
State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
Builder.GetInsertPoint());
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
- for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) {
- auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
+ for (auto &I : enumerate(RepRecipe->operands())) {
auto InputInstance = Instance;
- if (!Operand || !OrigLoop->contains(Operand) ||
- (Cost->isUniformAfterVectorization(Operand, State.VF)))
+ VPValue *Operand = I.value();
+ if (State.Plan->isUniformAfterVectorization(Operand))
InputInstance.Lane = VPLane::getFirstLane();
- auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance);
- Cloned->setOperand(op, NewOp);
+ Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
}
addNewMetadata(Cloned, Instr);
@@ -3339,7 +3277,7 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
Value *InnerLoopVectorizer::emitTransformedIndex(
IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
- const InductionDescriptor &ID) const {
+ const InductionDescriptor &ID, BasicBlock *VectorHeader) const {
SCEVExpander Exp(*SE, DL, "induction");
auto Step = ID.getStep();
@@ -3382,15 +3320,15 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
};
// Get a suitable insert point for SCEV expansion. For blocks in the vector
- // loop, choose the end of the vector loop header (=LoopVectorBody), because
+ // loop, choose the end of the vector loop header (=VectorHeader), because
// the DomTree is not kept up-to-date for additional blocks generated in the
// vector loop. By using the header as insertion point, we guarantee that the
// expanded instructions dominate all their uses.
- auto GetInsertPoint = [this, &B]() {
+ auto GetInsertPoint = [this, &B, VectorHeader]() {
BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
if (InsertBB != LoopVectorBody &&
- LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
- return LoopVectorBody->getTerminator();
+ LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB))
+ return VectorHeader->getTerminator();
return &*B.GetInsertPoint();
};
@@ -3538,7 +3476,8 @@ void InnerLoopVectorizer::createInductionResumeValues(
CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
- EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
+ EndValue =
+ emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
EndValue->setName("ind.end");
// Compute the end value for the additional bypass (if applicable).
@@ -3549,7 +3488,7 @@ void InnerLoopVectorizer::createInductionResumeValues(
CRD =
B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
EndValueFromAdditionalBypass =
- emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
+ emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
EndValueFromAdditionalBypass->setName("ind.end");
}
}
@@ -3623,7 +3562,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
if (MDNode *LID = OrigLoop->getLoopID())
L->setLoopID(LID);
- LoopVectorizeHints Hints(L, true, *ORE);
+ LoopVectorizeHints Hints(L, true, *ORE, TTI);
Hints.setAlreadyVectorized();
#ifdef EXPENSIVE_CHECKS
@@ -3780,7 +3719,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
II.getStep()->getType())
: B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
CMO->setName("cast.cmo");
- Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
+ Value *Escape =
+ emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody);
Escape->setName("ind.escape");
MissingVals[UI] = Escape;
}
@@ -4573,7 +4513,8 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
}
}
-bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
+bool InnerLoopVectorizer::useOrderedReductions(
+ const RecurrenceDescriptor &RdxDesc) {
return Cost->useOrderedReductions(RdxDesc);
}
@@ -4648,8 +4589,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
Value *Idx = Builder.CreateAdd(
PartStart, ConstantInt::get(PtrInd->getType(), Lane));
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
- Value *SclrGep =
- emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
+ Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(),
+ DL, II, State.CFG.PrevBB);
SclrGep->setName("next.gep");
State.set(PhiR, SclrGep, VPIteration(Part, Lane));
}
@@ -5368,13 +5309,9 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
// Limit MaxScalableVF by the maximum safe dependence distance.
Optional<unsigned> MaxVScale = TTI.getMaxVScale();
- if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
- unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange)
- .getVScaleRangeArgs()
- .second;
- if (VScaleMax > 0)
- MaxVScale = VScaleMax;
- }
+ if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
+ MaxVScale =
+ TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
MaxScalableVF = ElementCount::getScalable(
MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
if (!MaxScalableVF)
@@ -5386,9 +5323,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
return MaxScalableVF;
}
-FixedScalableVFPair
-LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
- ElementCount UserVF) {
+FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
+ unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -5475,12 +5411,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
FixedScalableVFPair Result(ElementCount::getFixed(1),
ElementCount::getScalable(0));
- if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
- WidestType, MaxSafeFixedVF))
+ if (auto MaxVF =
+ getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
+ MaxSafeFixedVF, FoldTailByMasking))
Result.FixedVF = MaxVF;
- if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
- WidestType, MaxSafeScalableVF))
+ if (auto MaxVF =
+ getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
+ MaxSafeScalableVF, FoldTailByMasking))
if (MaxVF.isScalable()) {
Result.ScalableVF = MaxVF;
LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
@@ -5513,7 +5451,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed:
- return computeFeasibleMaxVF(TC, UserVF);
+ return computeFeasibleMaxVF(TC, UserVF, false);
case CM_ScalarEpilogueNotAllowedUsePredicate:
LLVM_FALLTHROUGH;
case CM_ScalarEpilogueNotNeededUsePredicate:
@@ -5551,7 +5489,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n");
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
- return computeFeasibleMaxVF(TC, UserVF);
+ return computeFeasibleMaxVF(TC, UserVF, false);
}
return FixedScalableVFPair::getNone();
}
@@ -5568,7 +5506,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
- FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
+ FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
// Avoid tail folding if the trip count is known to be a multiple of any VF
// we chose.
// FIXME: The condition below pessimises the case for fixed-width vectors,
@@ -5641,7 +5579,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
- const ElementCount &MaxSafeVF) {
+ const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
TypeSize WidestRegister = TTI.getRegisterBitWidth(
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
@@ -5673,14 +5611,17 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
if (ConstTripCount &&
ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
- isPowerOf2_32(ConstTripCount)) {
- // We need to clamp the VF to be the ConstTripCount. There is no point in
- // choosing a higher viable VF as done in the loop below. If
- // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
- // the TC is less than or equal to the known number of lanes.
- LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
- << ConstTripCount << "\n");
- return TripCountEC;
+ (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
+ // If loop trip count (TC) is known at compile time there is no point in
+ // choosing VF greater than TC (as done in the loop below). Select maximum
+ // power of two which doesn't exceed TC.
+ // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
+ // when the TC is less than or equal to the known number of lanes.
+ auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
+ LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
+ "exceeding the constant trip count: "
+ << ClampedConstTripCount << "\n");
+ return ElementCount::getFixed(ClampedConstTripCount);
}
ElementCount MaxVF = MaxVectorElementCount;
@@ -5758,12 +5699,11 @@ bool LoopVectorizationCostModel::isMoreProfitable(
EstimatedWidthB *= VScale.getValue();
}
- // When set to preferred, for now assume vscale may be larger than 1 (or the
- // one being tuned for), so that scalable vectorization is slightly favorable
- // over fixed-width vectorization.
- if (Hints->isScalableVectorizationPreferred())
- if (A.Width.isScalable() && !B.Width.isScalable())
- return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
+ // Assume vscale may be larger than 1 (or the value being tuned for),
+ // so that scalable vectorization is slightly favorable over fixed-width
+ // vectorization.
+ if (A.Width.isScalable() && !B.Width.isScalable())
+ return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
// To avoid the need for FP division:
// (CostA / A.Width) < (CostB / B.Width)
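
For context on the trailing comment: isMoreProfitable() compares per-lane costs without floating-point division by cross-multiplying, and the scalable-vs-fixed case added in this hunk uses <= so that a cost tie favors the scalable VF. A tiny standalone illustration of the cross-multiplied comparison (a simplified stand-in that ignores the vscale estimation seen above):

#include <cassert>
#include <cstdint>

// "CostA / WidthA < CostB / WidthB" evaluated without FP division as
// "CostA * WidthB < CostB * WidthA" (all integers).
static bool cheaperPerLane(uint64_t CostA, uint64_t WidthA, uint64_t CostB,
                           uint64_t WidthB) {
  return CostA * WidthB < CostB * WidthA;
}

int main() {
  // VF=8 at cost 10 (1.25 per lane) beats VF=4 at cost 6 (1.5 per lane).
  assert(cheaperPerLane(/*CostA=*/10, /*WidthA=*/8, /*CostB=*/6, /*WidthB=*/4));
  // Equal per-lane cost is not "cheaper" under the strict comparison.
  assert(!cheaperPerLane(/*CostA=*/8, /*WidthA=*/4, /*CostB=*/4, /*WidthB=*/2));
  return 0;
}
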
@@ -6068,7 +6008,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
if (auto *PN = dyn_cast<PHINode>(&I)) {
if (!Legal->isReductionVariable(PN))
continue;
- const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN];
+ const RecurrenceDescriptor &RdxDesc =
+ Legal->getReductionVars().find(PN)->second;
if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
TTI.preferInLoopReduction(RdxDesc.getOpcode(),
RdxDesc.getRecurrenceType(),
@@ -7002,7 +6943,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
const RecurrenceDescriptor &RdxDesc =
- Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
+ Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
InstructionCost BaseCost = TTI.getArithmeticReductionCost(
RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
@@ -7079,22 +7020,41 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
if (match(Op0, m_ZExtOrSExt(m_Value())) &&
Op0->getOpcode() == Op1->getOpcode() &&
- Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
!TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
bool IsUnsigned = isa<ZExtInst>(Op0);
- auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
- // Matched reduce(mul(ext, ext))
- InstructionCost ExtCost =
- TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
- TTI::CastContextHint::None, CostKind, Op0);
+ Type *Op0Ty = Op0->getOperand(0)->getType();
+ Type *Op1Ty = Op1->getOperand(0)->getType();
+ Type *LargestOpTy =
+ Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
+ : Op0Ty;
+ auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
+
+ // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
+ // different sizes. We take the largest type as the ext to reduce, and add
+ // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
+ InstructionCost ExtCost0 = TTI.getCastInstrCost(
+ Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
+ TTI::CastContextHint::None, CostKind, Op0);
+ InstructionCost ExtCost1 = TTI.getCastInstrCost(
+ Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
+ TTI::CastContextHint::None, CostKind, Op1);
InstructionCost MulCost =
TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
InstructionCost RedCost = TTI.getExtendedAddReductionCost(
/*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
CostKind);
+ InstructionCost ExtraExtCost = 0;
+ if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
+ Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
+ ExtraExtCost = TTI.getCastInstrCost(
+ ExtraExtOp->getOpcode(), ExtType,
+ VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
+ TTI::CastContextHint::None, CostKind, ExtraExtOp);
+ }
- if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
+ if (RedCost.isValid() &&
+ (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
return I == RetI ? RedCost : 0;
} else if (!match(I, m_ZExtOrSExt(m_Value()))) {
// Matched reduce(mul())
@@ -7570,8 +7530,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Type *CondTy = SI->getCondition()->getType();
if (!ScalarCond)
CondTy = VectorType::get(CondTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
+
+ CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
+ Pred = Cmp->getPredicate();
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
+ CostKind, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
@@ -7581,7 +7545,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
VectorTy = ToVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
- CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
+ cast<CmpInst>(I)->getPredicate(), CostKind,
+ I);
}
case Instruction::Store:
case Instruction::Load: {
@@ -7762,14 +7727,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore type-promoting instructions we identified during reduction
// detection.
for (auto &Reduction : Legal->getReductionVars()) {
- RecurrenceDescriptor &RedDes = Reduction.second;
+ const RecurrenceDescriptor &RedDes = Reduction.second;
const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
// Ignore type-casting instructions we identified during induction
// detection.
for (auto &Induction : Legal->getInductionVars()) {
- InductionDescriptor &IndDes = Induction.second;
+ const InductionDescriptor &IndDes = Induction.second;
const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
@@ -7778,7 +7743,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
void LoopVectorizationCostModel::collectInLoopReductions() {
for (auto &Reduction : Legal->getReductionVars()) {
PHINode *Phi = Reduction.first;
- RecurrenceDescriptor &RdxDesc = Reduction.second;
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
// We don't collect reductions that are type promoted (yet).
if (RdxDesc.getRecurrenceType() != Phi->getType())
@@ -8064,18 +8029,6 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
return U == Ind || DeadInstructions.count(cast<Instruction>(U));
}))
DeadInstructions.insert(IndUpdate);
-
- // We record as "Dead" also the type-casting instructions we had identified
- // during induction analysis. We don't need any handling for them in the
- // vectorized loop because we have proven that, under a proper runtime
- // test guarding the vectorized loop, the value of the phi, and the casted
- // value of the phi, are the same. The last instruction in this casting chain
- // will get its scalar/vector/widened def from the scalar/vector/widened def
- // of the respective phi node. Any other casts in the induction def-use chain
- // have no other uses outside the phi update chain, and will be ignored.
- InductionDescriptor &IndDes = Induction.second;
- const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
- DeadInstructions.insert(Casts.begin(), Casts.end());
}
}
@@ -8461,7 +8414,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
assert(EdgeMask && "No Edge Mask found for condition");
if (BI->getSuccessor(0) != Dst)
- EdgeMask = Builder.createNot(EdgeMask);
+ EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
// The condition is 'SrcMask && EdgeMask', which is equivalent to
@@ -8470,7 +8423,8 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
// EdgeMask is poison. Using 'and' here introduces undefined behavior.
VPValue *False = Plan->getOrAddVPValue(
ConstantInt::getFalse(BI->getCondition()->getType()));
- EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
+ EdgeMask =
+ Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
}
return EdgeMaskCache[Edge] = EdgeMask;
@@ -8492,22 +8446,24 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
if (!CM.blockNeedsPredicationForAnyReason(BB))
return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
- // Create the block in mask as the first non-phi instruction in the block.
- VPBuilder::InsertPointGuard Guard(Builder);
- auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
- Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
-
// Introduce the early-exit compare IV <= BTC to form header block mask.
// This is used instead of IV < TC because TC may wrap, unlike BTC.
- // Start by constructing the desired canonical IV.
+ // Start by constructing the desired canonical IV in the header block.
VPValue *IV = nullptr;
if (Legal->getPrimaryInduction())
IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
else {
+ VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
auto *IVRecipe = new VPWidenCanonicalIVRecipe();
- Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
+ HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi());
IV = IVRecipe;
}
+
+ // Create the block in mask as the first non-phi instruction in the block.
+ VPBuilder::InsertPointGuard Guard(Builder);
+ auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
+ Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
+
VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
bool TailFolded = !CM.isScalarEpilogueAllowed();
@@ -8534,7 +8490,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
continue;
}
- BlockMask = Builder.createOr(BlockMask, EdgeMask);
+ BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
}
return BlockMaskCache[BB] = BlockMask;
@@ -8591,14 +8547,10 @@ VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
ArrayRef<VPValue *> Operands) const {
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
- InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
- if (II.getKind() == InductionDescriptor::IK_IntInduction ||
- II.getKind() == InductionDescriptor::IK_FpInduction) {
- assert(II.getStartValue() ==
+ if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) {
+ assert(II->getStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
- const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
- return new VPWidenIntOrFpInductionRecipe(
- Phi, Operands[0], Casts.empty() ? nullptr : Casts.front());
+ return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II);
}
return nullptr;
@@ -8624,11 +8576,10 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
if (LoopVectorizationPlanner::getDecisionAndClampRange(
isOptimizableIVTruncate(I), Range)) {
- InductionDescriptor II =
- Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
+ auto *Phi = cast<PHINode>(I->getOperand(0));
+ const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
- return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
- Start, nullptr, I);
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I);
}
return nullptr;
}
@@ -8844,13 +8795,17 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
return VPBB;
}
LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
- assert(VPBB->getSuccessors().empty() &&
- "VPBB has successors when handling predicated replication.");
+
+ VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
+ assert(SingleSucc && "VPBB must have a single successor when handling "
+ "predicated replication.");
+ VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
// Record predicated instructions for above packing optimizations.
VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
VPBlockUtils::insertBlockAfter(Region, VPBB);
auto *RegSucc = new VPBasicBlock();
VPBlockUtils::insertBlockAfter(RegSucc, Region);
+ VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
return RegSucc;
}
@@ -8910,7 +8865,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
VPValue *StartV = Operands[0];
if (Legal->isReductionVariable(Phi)) {
- RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
+ const RecurrenceDescriptor &RdxDesc =
+ Legal->getReductionVars().find(Phi)->second;
assert(RdxDesc.getRecurrenceStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
@@ -9031,7 +8987,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
for (auto &Reduction : CM.getInLoopReductionChains()) {
PHINode *Phi = Reduction.first;
- RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
+ RecurKind Kind =
+ Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
RecipeBuilder.recordRecipeOf(Phi);
@@ -9069,30 +9026,25 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// visit each basic block after having visited its predecessor basic blocks.
// ---------------------------------------------------------------------------
- auto Plan = std::make_unique<VPlan>();
+ // Create initial VPlan skeleton, with separate header and latch blocks.
+ VPBasicBlock *HeaderVPBB = new VPBasicBlock();
+ VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
+ VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
+ auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
+ auto Plan = std::make_unique<VPlan>(TopRegion);
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);
- VPBasicBlock *VPBB = nullptr;
- VPBasicBlock *HeaderVPBB = nullptr;
+ VPBasicBlock *VPBB = HeaderVPBB;
SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
unsigned VPBBsForBB = 0;
- auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
- if (VPBB)
- VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
- else {
- auto *TopRegion = new VPRegionBlock("vector loop");
- TopRegion->setEntry(FirstVPBBForBB);
- Plan->setEntry(TopRegion);
- HeaderVPBB = FirstVPBBForBB;
- }
- VPBB = FirstVPBBForBB;
+ VPBB->setName(BB->getName());
Builder.setInsertPoint(VPBB);
// Introduce each ingredient into VPlan.
@@ -9159,13 +9111,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
: "");
}
}
+
+ VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
+ VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
}
+ // Fold the last, empty block into its predecessor.
+ VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
+ assert(VPBB && "expected to fold last (empty) block");
+ // After here, VPBB should not be used.
+ VPBB = nullptr;
+
assert(isa<VPRegionBlock>(Plan->getEntry()) &&
!Plan->getEntry()->getEntryBasicBlock()->empty() &&
"entry block must be set to a VPRegionBlock having a non-empty entry "
"VPBasicBlock");
- cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB);
RecipeBuilder.fixHeaderPhis();
// ---------------------------------------------------------------------------
@@ -9231,18 +9191,19 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
- if (VPBB == SplitPred)
- VPBB = SplitBlock;
}
}
+ VPlanTransforms::removeRedundantInductionCasts(*Plan);
+
// Now that sink-after is done, move induction recipes for optimized truncates
// to the phi section of the header block.
for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
// Adjust the recipes for any inloop reductions.
- adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start);
+ adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan,
+ RecipeBuilder, Range.Start);
// Introduce a recipe to combine the incoming and previous values of a
// first-order recurrence.
@@ -9322,6 +9283,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
RSO.flush();
Plan->setName(PlanName);
+ // Fold Exit block into its predecessor if possible.
+ // TODO: Fold block earlier once all VPlan transforms properly maintain a
+ // VPBasicBlock as exit.
+ VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
+
assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
}
@@ -9355,9 +9321,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
}
SmallPtrSet<Instruction *, 1> DeadInstructions;
- VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
- Legal->getInductionVars(),
- DeadInstructions, *PSE.getSE());
+ VPlanTransforms::VPInstructionsToVPRecipes(
+ OrigLoop, Plan,
+ [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
+ DeadInstructions, *PSE.getSE());
return Plan;
}
@@ -9371,7 +9338,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
ElementCount MinVF) {
for (auto &Reduction : CM.getInLoopReductionChains()) {
PHINode *Phi = Reduction.first;
- RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
+ const RecurrenceDescriptor &RdxDesc =
+ Legal->getReductionVars().find(Phi)->second;
const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
@@ -9565,7 +9533,7 @@ void VPWidenRecipe::execute(VPTransformState &State) {
// exact, etc.). The control flow has been linearized and the
// instruction is no longer guarded by the predicate, which could make
// the flag properties to no longer hold.
- if (State.MayGeneratePoisonRecipes.count(this) > 0)
+ if (State.MayGeneratePoisonRecipes.contains(this))
VecOp->dropPoisonGeneratingFlags();
}
@@ -9714,9 +9682,9 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
- State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
- getTruncInst(), getVPValue(0),
- getCastValue(), State);
+ State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
+ getStartValue()->getLiveInIRValue(),
+ getTruncInst(), getVPValue(0), State);
}
void VPWidenPHIRecipe::execute(VPTransformState &State) {
@@ -10293,7 +10261,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
<< L->getHeader()->getParent()->getName() << "\" from "
<< DebugLocStr << "\n");
- LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
+ LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
LLVM_DEBUG(
dbgs() << "LV: Loop hints:"
@@ -10747,8 +10715,17 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
PA.preserve<LoopAnalysis>();
PA.preserve<DominatorTreeAnalysis>();
}
- if (!Result.MadeCFGChange)
+
+ if (Result.MadeCFGChange) {
+ // Making CFG changes likely means a loop got vectorized. Indicate that
+ // extra simplification passes should be run.
+ // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
+ // be run if runtime checks have been added.
+ AM.getResult<ShouldRunExtraVectorPasses>(F);
+ PA.preserve<ShouldRunExtraVectorPasses>();
+ } else {
PA.preserveSet<CFGAnalyses>();
+ }
return PA;
}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 95061e9053fa..37ae13666f7a 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -631,27 +631,26 @@ static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
/// after: 6 3 5 4 7 2 1 0
static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {
const unsigned Sz = Order.size();
- SmallBitVector UsedIndices(Sz);
- SmallVector<int> MaskedIndices;
+ SmallBitVector UnusedIndices(Sz, /*t=*/true);
+ SmallBitVector MaskedIndices(Sz);
for (unsigned I = 0; I < Sz; ++I) {
if (Order[I] < Sz)
- UsedIndices.set(Order[I]);
+ UnusedIndices.reset(Order[I]);
else
- MaskedIndices.push_back(I);
+ MaskedIndices.set(I);
}
- if (MaskedIndices.empty())
+ if (MaskedIndices.none())
return;
- SmallVector<int> AvailableIndices(MaskedIndices.size());
- unsigned Cnt = 0;
- int Idx = UsedIndices.find_first();
- do {
- AvailableIndices[Cnt] = Idx;
- Idx = UsedIndices.find_next(Idx);
- ++Cnt;
- } while (Idx > 0);
- assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices.");
- for (int I = 0, E = MaskedIndices.size(); I < E; ++I)
- Order[MaskedIndices[I]] = AvailableIndices[I];
+ assert(UnusedIndices.count() == MaskedIndices.count() &&
+ "Non-synced masked/available indices.");
+ int Idx = UnusedIndices.find_first();
+ int MIdx = MaskedIndices.find_first();
+ while (MIdx >= 0) {
+ assert(Idx >= 0 && "Indices must be synced.");
+ Order[MIdx] = Idx;
+ Idx = UnusedIndices.find_next(Idx);
+ MIdx = MaskedIndices.find_next(MIdx);
+ }
}
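
The rewritten fixupOrderingIndices above is easier to follow outside diff form. The transformation it performs, replacing every out-of-range entry with a still-unused in-range index in increasing order, can be sketched as a small standalone C++ program; the helper name and the use of std::vector instead of SmallBitVector are illustrative assumptions, not the LLVM API.

    // Hedged sketch: fill the "masked" (out-of-range) positions of Order with
    // the indices that are not used yet, handed out in increasing order.
    #include <cstdio>
    #include <vector>

    static void fixupOrderingIndicesSketch(std::vector<unsigned> &Order) {
      const unsigned Sz = static_cast<unsigned>(Order.size());
      std::vector<bool> Unused(Sz, true);
      std::vector<unsigned> MaskedPositions;
      for (unsigned I = 0; I < Sz; ++I) {
        if (Order[I] < Sz)
          Unused[Order[I]] = false;      // this index is already taken
        else
          MaskedPositions.push_back(I);  // needs a replacement index
      }
      unsigned Idx = 0;
      for (unsigned Pos : MaskedPositions) {
        while (!Unused[Idx])             // next free index, in increasing order
          ++Idx;
        Order[Pos] = Idx++;
      }
    }

    int main() {
      std::vector<unsigned> Order = {6, 9, 5, 4, 9, 2, 1, 0}; // 9 acts as "masked"
      fixupOrderingIndicesSketch(Order);
      for (unsigned V : Order)
        std::printf("%u ", V);           // prints: 6 3 5 4 7 2 1 0
      std::printf("\n");
      return 0;
    }

For this input the output matches the "after: 6 3 5 4 7 2 1 0" example in the doc comment above.
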
namespace llvm {
@@ -812,6 +811,13 @@ public:
/// ExtractElement, ExtractValue), which can be part of the graph.
Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
+ /// Gets reordering data for the given tree entry. If the entry is
+ /// vectorized, just return ReorderIndices; otherwise check if the scalars
+ /// can be reordered and return the most optimal order.
+ /// \param TopToBottom If true, include the order of vectorized stores and
+ /// insertelement nodes, otherwise skip them.
+ Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
+
/// Reorders the current graph to the most profitable order starting from the
/// root node to the leaf nodes. The best order is chosen only from the nodes
/// of the same size (vectorization factor). Smaller nodes are considered
@@ -1010,18 +1016,25 @@ public:
std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
}
- // The hard-coded scores listed here are not very important. When computing
- // the scores of matching one sub-tree with another, we are basically
- // counting the number of values that are matching. So even if all scores
- // are set to 1, we would still get a decent matching result.
+ // The hard-coded scores listed here are not very important, though better
+ // matches should get higher scores to improve the resulting cost. When
+ // computing the scores of matching one sub-tree with another, we are
+ // basically counting the number of values that are matching. So even if all
+ // scores are set to 1, we would still get a decent matching result.
// However, sometimes we have to break ties. For example we may have to
// choose between matching loads vs matching opcodes. This is what these
- // scores are helping us with: they provide the order of preference.
+ // scores are helping us with: they provide the order of preference. Also,
+ // this is important if the scalar is externally used or used in another
+ // tree entry node in a different lane.
/// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
- static const int ScoreConsecutiveLoads = 3;
+ static const int ScoreConsecutiveLoads = 4;
+ /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
+ static const int ScoreReversedLoads = 3;
/// ExtractElementInst from same vector and consecutive indexes.
- static const int ScoreConsecutiveExtracts = 3;
+ static const int ScoreConsecutiveExtracts = 4;
+ /// ExtractElementInst from same vector and reversed indices.
+ static const int ScoreReversedExtracts = 3;
/// Constants.
static const int ScoreConstants = 2;
/// Instructions with the same opcode.
@@ -1041,7 +1054,10 @@ public:
/// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
- ScalarEvolution &SE) {
+ ScalarEvolution &SE, int NumLanes) {
+ if (V1 == V2)
+ return VLOperands::ScoreSplat;
+
auto *LI1 = dyn_cast<LoadInst>(V1);
auto *LI2 = dyn_cast<LoadInst>(V2);
if (LI1 && LI2) {
@@ -1051,8 +1067,17 @@ public:
Optional<int> Dist = getPointersDiff(
LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
- return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads
- : VLOperands::ScoreFail;
+ if (!Dist)
+ return VLOperands::ScoreFail;
+ // The distance is too large - still may be profitable to use masked
+ // loads/gathers.
+ if (std::abs(*Dist) > NumLanes / 2)
+ return VLOperands::ScoreAltOpcodes;
+ // This will still detect consecutive loads, but we might have "holes"
+ // in some cases. It is ok for non-power-of-2 vectorization and may produce
+ // better results. It should not affect current vectorization.
+ return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads
+ : VLOperands::ScoreReversedLoads;
}
auto *C1 = dyn_cast<Constant>(V1);
@@ -1062,18 +1087,41 @@ public:
// Extracts from consecutive indexes of the same vector better score as
// the extracts could be optimized away.
- Value *EV;
- ConstantInt *Ex1Idx, *Ex2Idx;
- if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
- match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
- Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
- return VLOperands::ScoreConsecutiveExtracts;
+ Value *EV1;
+ ConstantInt *Ex1Idx;
+ if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
+ // Undefs are always profitable for extractelements.
+ if (isa<UndefValue>(V2))
+ return VLOperands::ScoreConsecutiveExtracts;
+ Value *EV2 = nullptr;
+ ConstantInt *Ex2Idx = nullptr;
+ if (match(V2,
+ m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
+ m_Undef())))) {
+ // Undefs are always profitable for extractelements.
+ if (!Ex2Idx)
+ return VLOperands::ScoreConsecutiveExtracts;
+ if (isUndefVector(EV2) && EV2->getType() == EV1->getType())
+ return VLOperands::ScoreConsecutiveExtracts;
+ if (EV2 == EV1) {
+ int Idx1 = Ex1Idx->getZExtValue();
+ int Idx2 = Ex2Idx->getZExtValue();
+ int Dist = Idx2 - Idx1;
+ // The distance is too large - still may be profitable to use
+ // shuffles.
+ if (std::abs(Dist) > NumLanes / 2)
+ return VLOperands::ScoreAltOpcodes;
+ return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts
+ : VLOperands::ScoreReversedExtracts;
+ }
+ }
+ }
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
if (I1 && I2) {
- if (I1 == I2)
- return VLOperands::ScoreSplat;
+ if (I1->getParent() != I2->getParent())
+ return VLOperands::ScoreFail;
InstructionsState S = getSameOpcode({I1, I2});
// Note: Only consider instructions with <= 2 operands to avoid
// complexity explosion.
@@ -1088,11 +1136,13 @@ public:
return VLOperands::ScoreFail;
}
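
The distance handling added to getShallowScore above can be summarized on its own. The sketch below is a hedged illustration, not the VLOperands API: the enum values and the helper name are placeholders, but the mapping follows the same rules as the diff: an unknown distance scores nothing; a distance larger than half the number of lanes falls back to a weak score; otherwise positive distances count as consecutive and negative ones as reversed.

    // Hedged sketch of the distance-to-score mapping in the look-ahead heuristic.
    #include <cstdlib>
    #include <optional>

    enum Score { Fail = 0, AltOpcodes = 1, Reversed = 3, Consecutive = 4 };

    static int scoreByDistance(std::optional<int> Dist, int NumLanes) {
      if (!Dist)
        return Fail;                      // distance unknown: no bonus
      if (std::abs(*Dist) > NumLanes / 2)
        return AltOpcodes;                // far apart: maybe masked load/gather
      return *Dist > 0 ? Consecutive      // e.g. load(A[i]), load(A[i+1])
                       : Reversed;        // e.g. load(A[i+1]), load(A[i])
    }

    int main() {
      // With 8 lanes, |distance| <= 4 still counts as near.
      return (scoreByDistance(1, 8) == Consecutive &&
              scoreByDistance(-2, 8) == Reversed &&
              scoreByDistance(5, 8) == AltOpcodes &&
              scoreByDistance(std::nullopt, 8) == Fail)
                 ? 0
                 : 1;
    }
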
- /// Holds the values and their lane that are taking part in the look-ahead
+ /// Holds the values and their lanes that are taking part in the look-ahead
/// score calculation. This is used in the external uses cost calculation.
- SmallDenseMap<Value *, int> InLookAheadValues;
+ /// We need to hold all the lanes, at least in the splat/broadcast case, to
+ /// correctly check for a use in a different lane.
+ SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues;
- /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are
+ /// \returns the additional cost due to uses of \p LHS and \p RHS that are
/// either external to the vectorized code, or require shuffling.
int getExternalUsesCost(const std::pair<Value *, int> &LHS,
const std::pair<Value *, int> &RHS) {
@@ -1116,22 +1166,30 @@ public:
for (User *U : V->users()) {
if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
// The user is in the VectorizableTree. Check if we need to insert.
- auto It = llvm::find(UserTE->Scalars, U);
- assert(It != UserTE->Scalars.end() && "U is in UserTE");
- int UserLn = std::distance(UserTE->Scalars.begin(), It);
+ int UserLn = UserTE->findLaneForValue(U);
assert(UserLn >= 0 && "Bad lane");
- if (UserLn != Ln)
+ // If the values are different, check just the lane of the current
+ // value. If the values are the same, add UserInDiffLaneCost only if
+ // UserLn matches neither of the two lane numbers.
+ if ((LHS.first != RHS.first && UserLn != Ln) ||
+ (LHS.first == RHS.first && UserLn != LHS.second &&
+ UserLn != RHS.second)) {
Cost += UserInDiffLaneCost;
+ break;
+ }
} else {
// Check if the user is in the look-ahead code.
auto It2 = InLookAheadValues.find(U);
if (It2 != InLookAheadValues.end()) {
// The user is in the look-ahead code. Check the lane.
- if (It2->second != Ln)
+ if (!It2->getSecond().contains(Ln)) {
Cost += UserInDiffLaneCost;
+ break;
+ }
} else {
// The user is neither in SLP tree nor in the look-ahead code.
Cost += ExternalUseCost;
+ break;
}
}
// Limit the number of visited uses to cap compilation time.
@@ -1170,32 +1228,36 @@ public:
Value *V1 = LHS.first;
Value *V2 = RHS.first;
// Get the shallow score of V1 and V2.
- int ShallowScoreAtThisLevel =
- std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
- getExternalUsesCost(LHS, RHS));
+ int ShallowScoreAtThisLevel = std::max(
+ (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) -
+ getExternalUsesCost(LHS, RHS));
int Lane1 = LHS.second;
int Lane2 = RHS.second;
// If reached MaxLevel,
// or if V1 and V2 are not instructions,
// or if they are SPLAT,
- // or if they are not consecutive, early return the current cost.
+ // or if they are not consecutive,
+ // or if it is profitable to vectorize the loads or extractelements, early
+ // return the current cost.
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
- (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
+ (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
+ (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
+ ShallowScoreAtThisLevel))
return ShallowScoreAtThisLevel;
assert(I1 && I2 && "Should have early exited.");
// Keep track of in-tree values for determining the external-use cost.
- InLookAheadValues[V1] = Lane1;
- InLookAheadValues[V2] = Lane2;
+ InLookAheadValues[V1].insert(Lane1);
+ InLookAheadValues[V2].insert(Lane2);
// Contains the I2 operand indexes that got matched with I1 operands.
SmallSet<unsigned, 4> Op2Used;
- // Recursion towards the operands of I1 and I2. We are trying all possbile
+ // Recursion towards the operands of I1 and I2. We are trying all possible
// operand pairs, and keeping track of the best score.
for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
OpIdx1 != NumOperands1; ++OpIdx1) {
@@ -1319,27 +1381,79 @@ public:
return None;
}
- /// Helper for reorderOperandVecs. \Returns the lane that we should start
- /// reordering from. This is the one which has the least number of operands
- /// that can freely move about.
+ /// Helper for reorderOperandVecs.
+ /// \returns the lane that we should start reordering from. This is the one
+ /// that has the fewest operands that can freely move about, or that is the
+ /// least profitable to change because it already has the most optimal set
+ /// of operands.
unsigned getBestLaneToStartReordering() const {
- unsigned BestLane = 0;
unsigned Min = UINT_MAX;
- for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
- ++Lane) {
- unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
- if (NumFreeOps < Min) {
- Min = NumFreeOps;
- BestLane = Lane;
+ unsigned SameOpNumber = 0;
+ // std::pair<unsigned, unsigned> is used to implement a simple voting
+ // algorithm and choose the lane with the fewest operands that can freely
+ // move about, or that is least profitable because it already has the most
+ // optimal set of operands. The first unsigned is a counter for
+ // voting, the second unsigned is the counter of lanes with instructions
+ // with same/alternate opcodes and same parent basic block.
+ MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
+ // Try to be closer to the original results, if we have multiple lanes
+ // with the same cost. If two lanes have the same cost, use the one with
+ // the lowest index.
+ for (int I = getNumLanes(); I > 0; --I) {
+ unsigned Lane = I - 1;
+ OperandsOrderData NumFreeOpsHash =
+ getMaxNumOperandsThatCanBeReordered(Lane);
+ // Compare the number of operands that can move and choose the one with
+ // the least number.
+ if (NumFreeOpsHash.NumOfAPOs < Min) {
+ Min = NumFreeOpsHash.NumOfAPOs;
+ SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
+ HashMap.clear();
+ HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
+ } else if (NumFreeOpsHash.NumOfAPOs == Min &&
+ NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
+ // Select the most optimal lane in terms of number of operands that
+ // should be moved around.
+ SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
+ HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
+ } else if (NumFreeOpsHash.NumOfAPOs == Min &&
+ NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
+ ++HashMap[NumFreeOpsHash.Hash].first;
+ }
+ }
+ // Select the lane with the minimum counter.
+ unsigned BestLane = 0;
+ unsigned CntMin = UINT_MAX;
+ for (const auto &Data : reverse(HashMap)) {
+ if (Data.second.first < CntMin) {
+ CntMin = Data.second.first;
+ BestLane = Data.second.second;
}
}
return BestLane;
}
- /// \Returns the maximum number of operands that are allowed to be reordered
- /// for \p Lane. This is used as a heuristic for selecting the first lane to
- /// start operand reordering.
- unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
+ /// Data structure that helps to reorder operands.
+ struct OperandsOrderData {
+ /// The best number of operands with the same APOs, which can be
+ /// reordered.
+ unsigned NumOfAPOs = UINT_MAX;
+ /// Number of operands with the same/alternate instruction opcode and
+ /// parent.
+ unsigned NumOpsWithSameOpcodeParent = 0;
+ /// Hash for the actual operands ordering.
+ /// It combines each operand's position id and opcode value, and is used
+ /// in the voting mechanism to find the lane with the fewest operands that
+ /// can freely move about, or that is least profitable because it already
+ /// has the most optimal set of operands. It could be replaced with a
+ /// SmallVector<unsigned>, but a hash code is faster
+ /// and requires less memory.
+ unsigned Hash = 0;
+ };
+ /// \returns the maximum number of operands that are allowed to be reordered
+ /// for \p Lane and the number of compatible instructions (with the same
+ /// parent/opcode). This is used as a heuristic for selecting the first lane
+ /// to start operand reordering.
+ OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
unsigned CntTrue = 0;
unsigned NumOperands = getNumOperands();
// Operands with the same APO can be reordered. We therefore need to count
@@ -1348,11 +1462,45 @@ public:
// a map. Instead we can simply count the number of operands that
// correspond to one of them (in this case the 'true' APO), and calculate
// the other by subtracting it from the total number of operands.
- for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
- if (getData(OpIdx, Lane).APO)
+ // Operands with the same instruction opcode and parent are more
+ // profitable since we don't need to move them in many cases, with a high
+ // probability such a lane can already be vectorized effectively.
+ bool AllUndefs = true;
+ unsigned NumOpsWithSameOpcodeParent = 0;
+ Instruction *OpcodeI = nullptr;
+ BasicBlock *Parent = nullptr;
+ unsigned Hash = 0;
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ const OperandData &OpData = getData(OpIdx, Lane);
+ if (OpData.APO)
++CntTrue;
- unsigned CntFalse = NumOperands - CntTrue;
- return std::max(CntTrue, CntFalse);
+ // Use Boyer-Moore majority voting for finding the majority opcode and
+ // the number of times it occurs.
+ if (auto *I = dyn_cast<Instruction>(OpData.V)) {
+ if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() ||
+ I->getParent() != Parent) {
+ if (NumOpsWithSameOpcodeParent == 0) {
+ NumOpsWithSameOpcodeParent = 1;
+ OpcodeI = I;
+ Parent = I->getParent();
+ } else {
+ --NumOpsWithSameOpcodeParent;
+ }
+ } else {
+ ++NumOpsWithSameOpcodeParent;
+ }
+ }
+ Hash = hash_combine(
+ Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
+ AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
+ }
+ if (AllUndefs)
+ return {};
+ OperandsOrderData Data;
+ Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
+ Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
+ Data.Hash = Hash;
+ return Data;
}
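
The per-lane loop above uses Boyer-Moore majority voting to track the dominant opcode/parent pair in a single pass without extra storage. A minimal, generic version of that vote is sketched below; the function name and the use of plain ints are assumptions for illustration, not the SLP code.

    // Hedged sketch of Boyer-Moore majority voting: keep one candidate and a
    // counter; a matching element increments, a mismatch decrements and, once
    // the counter hits zero, installs a new candidate. If a strict majority
    // exists, it is the surviving candidate.
    #include <cassert>
    #include <vector>

    static int majorityCandidate(const std::vector<int> &Vals) {
      int Candidate = 0;
      unsigned Count = 0;
      for (int V : Vals) {
        if (Count == 0) {
          Candidate = V;
          Count = 1;
        } else if (V == Candidate) {
          ++Count;
        } else {
          --Count;
        }
      }
      return Candidate; // only meaningful if some value has a strict majority
    }

    int main() {
      assert(majorityCandidate({7, 3, 7, 7, 2, 7, 7}) == 7);
      return 0;
    }

The diff applies the same idea to (opcode, parent block) pairs and additionally folds each operand's position and value kind into a hash for the later voting step.
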
/// Go through the instructions in VL and append their operands.
@@ -1500,11 +1648,37 @@ public:
ReorderingModes[OpIdx] = ReorderingMode::Failed;
}
+ // Check whether the operands are all the same. No need to reorder if the
+ // operands form just a perfect diamond or a shuffled diamond match. Do not
+ // skip reordering only for possible broadcasts or a non-power-of-2 number
+ // of scalars (just for now).
+ auto &&SkipReordering = [this]() {
+ SmallPtrSet<Value *, 4> UniqueValues;
+ ArrayRef<OperandData> Op0 = OpsVec.front();
+ for (const OperandData &Data : Op0)
+ UniqueValues.insert(Data.V);
+ for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
+ if (any_of(Op, [&UniqueValues](const OperandData &Data) {
+ return !UniqueValues.contains(Data.V);
+ }))
+ return false;
+ }
+ // TODO: Check if we can remove the check for a non-power-of-2 number of
+ // scalars after full support of non-power-of-2 vectorization.
+ return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
+ };
+
// If the initial strategy fails for any of the operand indexes, then we
// perform reordering again in a second pass. This helps avoid assigning
// high priority to the failed strategy, and should improve reordering for
// the non-failed operand indexes.
for (int Pass = 0; Pass != 2; ++Pass) {
+ // Check if there is no need to reorder the operands because they form a
+ // perfect or a shuffled diamond match.
+ // This is needed to avoid extra external use cost counting for
+ // shuffled matches, which may cause regressions.
+ if (SkipReordering())
+ break;
// Skip the second pass if the first pass did not fail.
bool StrategyFailed = false;
// Mark all operand data as free to use.
@@ -1792,9 +1966,10 @@ private:
if (Operands.size() < OpIdx + 1)
Operands.resize(OpIdx + 1);
assert(Operands[OpIdx].empty() && "Already resized?");
- Operands[OpIdx].resize(Scalars.size());
- for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
- Operands[OpIdx][Lane] = OpVL[Lane];
+ assert(OpVL.size() <= Scalars.size() &&
+ "Number of operands is greater than the number of scalars.");
+ Operands[OpIdx].resize(OpVL.size());
+ copy(OpVL, Operands[OpIdx].begin());
}
/// Set the operands of this bundle in their original order.
@@ -1944,7 +2119,7 @@ private:
if (ReuseShuffleIndices.empty())
dbgs() << "Empty";
else
- for (unsigned ReuseIdx : ReuseShuffleIndices)
+ for (int ReuseIdx : ReuseShuffleIndices)
dbgs() << ReuseIdx << ", ";
dbgs() << "\n";
dbgs() << "ReorderIndices: ";
@@ -2819,6 +2994,50 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
return None;
}
+Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
+ bool TopToBottom) {
+ // No need to reorder if we need to shuffle the reuses; the node still
+ // needs to be shuffled.
+ if (!TE.ReuseShuffleIndices.empty())
+ return None;
+ if (TE.State == TreeEntry::Vectorize &&
+ (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
+ (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
+ !TE.isAltShuffle())
+ return TE.ReorderIndices;
+ if (TE.State == TreeEntry::NeedToGather) {
+ // TODO: add analysis of other gather nodes with extractelement
+ // instructions and other values/instructions, not only undefs.
+ if (((TE.getOpcode() == Instruction::ExtractElement &&
+ !TE.isAltShuffle()) ||
+ (all_of(TE.Scalars,
+ [](Value *V) {
+ return isa<UndefValue, ExtractElementInst>(V);
+ }) &&
+ any_of(TE.Scalars,
+ [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
+ all_of(TE.Scalars,
+ [](Value *V) {
+ auto *EE = dyn_cast<ExtractElementInst>(V);
+ return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
+ }) &&
+ allSameType(TE.Scalars)) {
+ // Check that gather of extractelements can be represented as
+ // just a shuffle of a single vector.
+ OrdersType CurrentOrder;
+ bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
+ if (Reuse || !CurrentOrder.empty()) {
+ if (!CurrentOrder.empty())
+ fixupOrderingIndices(CurrentOrder);
+ return CurrentOrder;
+ }
+ }
+ if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
+ return CurrentOrder;
+ }
+ return None;
+}
+
void BoUpSLP::reorderTopToBottom() {
// Maps VF to the graph nodes.
DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries;
@@ -2826,42 +3045,15 @@ void BoUpSLP::reorderTopToBottom() {
// their ordering.
DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
// Find all reorderable nodes with the given VF.
- // Currently the are vectorized loads,extracts + some gathering of extracts.
+ // Currently these are vectorized stores, loads and extracts, plus some
+ // gathering of extracts.
for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders](
const std::unique_ptr<TreeEntry> &TE) {
- // No need to reorder if need to shuffle reuses, still need to shuffle the
- // node.
- if (!TE->ReuseShuffleIndices.empty())
- return;
- if (TE->State == TreeEntry::Vectorize &&
- isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
- InsertElementInst>(TE->getMainOp()) &&
- !TE->isAltShuffle()) {
+ if (Optional<OrdersType> CurrentOrder =
+ getReorderingData(*TE.get(), /*TopToBottom=*/true)) {
VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
- return;
- }
- if (TE->State == TreeEntry::NeedToGather) {
- if (TE->getOpcode() == Instruction::ExtractElement &&
- !TE->isAltShuffle() &&
- isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
- ->getVectorOperandType()) &&
- allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
- // Check that gather of extractelements can be represented as
- // just a shuffle of a single vector.
- OrdersType CurrentOrder;
- bool Reuse =
- canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
- if (Reuse || !CurrentOrder.empty()) {
- VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
- GathersToOrders.try_emplace(TE.get(), CurrentOrder);
- return;
- }
- }
- if (Optional<OrdersType> CurrentOrder =
- findReusedOrderedScalars(*TE.get())) {
- VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+ if (TE->State != TreeEntry::Vectorize)
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
- }
}
});
@@ -2993,44 +3185,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
const std::unique_ptr<TreeEntry> &TE) {
if (TE->State != TreeEntry::Vectorize)
NonVectorized.push_back(TE.get());
- // No need to reorder if need to shuffle reuses, still need to shuffle the
- // node.
- if (!TE->ReuseShuffleIndices.empty())
- return;
- if (TE->State == TreeEntry::Vectorize &&
- isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE->getMainOp()) &&
- !TE->isAltShuffle()) {
+ if (Optional<OrdersType> CurrentOrder =
+ getReorderingData(*TE.get(), /*TopToBottom=*/false)) {
OrderedEntries.insert(TE.get());
- return;
- }
- if (TE->State == TreeEntry::NeedToGather) {
- if (TE->getOpcode() == Instruction::ExtractElement &&
- !TE->isAltShuffle() &&
- isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
- ->getVectorOperandType()) &&
- allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
- // Check that gather of extractelements can be represented as
- // just a shuffle of a single vector with a single user only.
- OrdersType CurrentOrder;
- bool Reuse =
- canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
- if ((Reuse || !CurrentOrder.empty()) &&
- !any_of(VectorizableTree,
- [&TE](const std::unique_ptr<TreeEntry> &Entry) {
- return Entry->State == TreeEntry::NeedToGather &&
- Entry.get() != TE.get() &&
- Entry->isSame(TE->Scalars);
- })) {
- OrderedEntries.insert(TE.get());
- GathersToOrders.try_emplace(TE.get(), CurrentOrder);
- return;
- }
- }
- if (Optional<OrdersType> CurrentOrder =
- findReusedOrderedScalars(*TE.get())) {
- OrderedEntries.insert(TE.get());
+ if (TE->State != TreeEntry::Vectorize)
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
- }
}
});
@@ -3392,9 +3551,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Check that every instruction appears once in this bundle.
DenseMap<Value *, unsigned> UniquePositions;
for (Value *V : VL) {
+ if (isConstant(V)) {
+ ReuseShuffleIndicies.emplace_back(
+ isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size());
+ UniqueValues.emplace_back(V);
+ continue;
+ }
auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
- ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? -1
- : Res.first->second);
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
if (Res.second)
UniqueValues.emplace_back(V);
}
@@ -3404,6 +3568,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
} else {
LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
if (NumUniqueScalarValues <= 1 ||
+ (UniquePositions.size() == 1 && all_of(UniqueValues,
+ [](Value *V) {
+ return isa<UndefValue>(V) ||
+ !isConstant(V);
+ })) ||
!llvm::isPowerOf2_32(NumUniqueScalarValues)) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
@@ -3508,11 +3677,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
}
- // If any of the scalars is marked as a value that needs to stay scalar, then
- // we need to gather the scalars.
// The reduction nodes (stored in UserIgnoreList) also should stay scalar.
for (Value *V : VL) {
- if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
+ if (is_contained(UserIgnoreList, V)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
if (TryToFindDuplicates(S))
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
@@ -4219,10 +4386,17 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
SmallVectorImpl<unsigned> &CurrentOrder) const {
- Instruction *E0 = cast<Instruction>(OpValue);
- assert(E0->getOpcode() == Instruction::ExtractElement ||
- E0->getOpcode() == Instruction::ExtractValue);
- assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
+ const auto *It = find_if(VL, [](Value *V) {
+ return isa<ExtractElementInst, ExtractValueInst>(V);
+ });
+ assert(It != VL.end() && "Expected at least one extract instruction.");
+ auto *E0 = cast<Instruction>(*It);
+ assert(all_of(VL,
+ [](Value *V) {
+ return isa<UndefValue, ExtractElementInst, ExtractValueInst>(
+ V);
+ }) &&
+ "Invalid opcode");
// Check if all of the extracts come from the same vector and from the
// correct offset.
Value *Vec = E0->getOperand(0);
@@ -4255,23 +4429,28 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
// Also, later we can check that all the indices are used and we have a
// consecutive access in the extract instructions, by checking that no
// element of CurrentOrder still has value E + 1.
- CurrentOrder.assign(E, E + 1);
+ CurrentOrder.assign(E, E);
unsigned I = 0;
for (; I < E; ++I) {
- auto *Inst = cast<Instruction>(VL[I]);
+ auto *Inst = dyn_cast<Instruction>(VL[I]);
+ if (!Inst)
+ continue;
if (Inst->getOperand(0) != Vec)
break;
+ if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
+ if (isa<UndefValue>(EE->getIndexOperand()))
+ continue;
Optional<unsigned> Idx = getExtractIndex(Inst);
if (!Idx)
break;
const unsigned ExtIdx = *Idx;
if (ExtIdx != I) {
- if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
+ if (ExtIdx >= E || CurrentOrder[ExtIdx] != E)
break;
ShouldKeepOrder = false;
CurrentOrder[ExtIdx] = I;
} else {
- if (CurrentOrder[I] != E + 1)
+ if (CurrentOrder[I] != E)
break;
CurrentOrder[I] = I;
}
@@ -4287,8 +4466,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
bool BoUpSLP::areAllUsersVectorized(Instruction *I,
ArrayRef<Value *> VectorizedVals) const {
return (I->hasOneUse() && is_contained(VectorizedVals, I)) ||
- llvm::all_of(I->users(), [this](User *U) {
- return ScalarToTreeEntry.count(U) > 0;
+ all_of(I->users(), [this](User *U) {
+ return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U);
});
}
@@ -4348,6 +4527,10 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
for (auto *V : VL) {
++Idx;
+ // Need to exclude undefs from analysis.
+ if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
+ continue;
+
// Reached the start of a new vector registers.
if (Idx % EltsPerVector == 0) {
AllConsecutive = true;
@@ -4357,9 +4540,11 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
// Check all extracts for a vector register on the target directly
// extract values in order.
unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
- unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
- AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
- CurrentIdx % EltsPerVector == Idx % EltsPerVector;
+ if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) {
+ unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
+ AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
+ CurrentIdx % EltsPerVector == Idx % EltsPerVector;
+ }
if (AllConsecutive)
continue;
@@ -4442,9 +4627,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// FIXME: it tries to fix a problem with MSVC buildbots.
TargetTransformInfo &TTIRef = *TTI;
auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
- VectorizedVals](InstructionCost &Cost,
- bool IsGather) {
+ VectorizedVals, E](InstructionCost &Cost) {
DenseMap<Value *, int> ExtractVectorsTys;
+ SmallPtrSet<Value *, 4> CheckedExtracts;
for (auto *V : VL) {
if (isa<UndefValue>(V))
continue;
@@ -4452,7 +4637,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// instruction itself is not going to be vectorized, consider this
// instruction as dead and remove its cost from the final cost of the
// vectorized tree.
- if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals))
+ // Also, avoid adjusting the cost for extractelements with multiple uses
+ // in different graph entries.
+ const TreeEntry *VE = getTreeEntry(V);
+ if (!CheckedExtracts.insert(V).second ||
+ !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
+ (VE && VE != E))
continue;
auto *EE = cast<ExtractElementInst>(V);
Optional<unsigned> EEIdx = getExtractIndex(EE);
@@ -4549,11 +4739,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
}
return GatherCost;
}
- if (isSplat(VL)) {
- // Found the broadcasting of the single scalar, calculate the cost as the
- // broadcast.
- return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
- }
if ((E->getOpcode() == Instruction::ExtractElement ||
all_of(E->Scalars,
[](Value *V) {
@@ -4571,13 +4756,20 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// single input vector or of 2 input vectors.
InstructionCost Cost =
computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
- AdjustExtractsCost(Cost, /*IsGather=*/true);
+ AdjustExtractsCost(Cost);
if (NeedToShuffleReuses)
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
FinalVecTy, E->ReuseShuffleIndices);
return Cost;
}
}
+ if (isSplat(VL)) {
+ // Found the broadcasting of the single scalar, calculate the cost as the
+ // broadcast.
+ assert(VecTy == FinalVecTy &&
+ "No reused scalars expected for broadcast.");
+ return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
+ }
InstructionCost ReuseShuffleCost = 0;
if (NeedToShuffleReuses)
ReuseShuffleCost = TTI->getShuffleCost(
@@ -4755,7 +4947,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
}
} else {
- AdjustExtractsCost(CommonCost, /*IsGather=*/false);
+ AdjustExtractsCost(CommonCost);
}
return CommonCost;
}
@@ -5211,15 +5403,15 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
FoundOr = true;
}
// Check if the input is an extended load of the required or/shift expression.
- Value *LoadPtr;
+ Value *Load;
if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
- !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+ !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
return false;
// Require that the total load bit width is a legal integer type.
// For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
// But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
- Type *SrcTy = LoadPtr->getType()->getPointerElementType();
+ Type *SrcTy = Load->getType();
unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
return false;
@@ -9061,8 +9253,7 @@ private:
"A call to the llvm.fmuladd intrinsic is not handled yet");
++NumVectorInstructions;
- return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind,
- ReductionOps.back());
+ return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind);
}
};
@@ -9473,6 +9664,59 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
return Changed;
}
+/// Compare two cmp instructions. If IsCompatibility is true, the function
+/// returns true if the two cmps have the same/swapped predicates and the most
+/// compatible corresponding operands. If IsCompatibility is false, the
+/// function implements a strict weak
+/// ordering relation between two cmp instructions, returning true if the first
+/// instruction is "less" than the second, i.e. its predicate is less than the
+/// predicate of the second, or its operand IDs are less than the operand IDs
+/// of the second cmp instruction.
+template <bool IsCompatibility>
+static bool compareCmp(Value *V, Value *V2,
+ function_ref<bool(Instruction *)> IsDeleted) {
+ auto *CI1 = cast<CmpInst>(V);
+ auto *CI2 = cast<CmpInst>(V2);
+ if (IsDeleted(CI2) || !isValidElementType(CI2->getType()))
+ return false;
+ if (CI1->getOperand(0)->getType()->getTypeID() <
+ CI2->getOperand(0)->getType()->getTypeID())
+ return !IsCompatibility;
+ if (CI1->getOperand(0)->getType()->getTypeID() >
+ CI2->getOperand(0)->getType()->getTypeID())
+ return false;
+ CmpInst::Predicate Pred1 = CI1->getPredicate();
+ CmpInst::Predicate Pred2 = CI2->getPredicate();
+ CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
+ CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
+ CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
+ CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
+ if (BasePred1 < BasePred2)
+ return !IsCompatibility;
+ if (BasePred1 > BasePred2)
+ return false;
+ // Compare operands.
+ bool LEPreds = Pred1 <= Pred2;
+ bool GEPreds = Pred1 >= Pred2;
+ for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
+ auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1);
+ auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1);
+ if (Op1->getValueID() < Op2->getValueID())
+ return !IsCompatibility;
+ if (Op1->getValueID() > Op2->getValueID())
+ return false;
+ if (auto *I1 = dyn_cast<Instruction>(Op1))
+ if (auto *I2 = dyn_cast<Instruction>(Op2)) {
+ if (I1->getParent() != I2->getParent())
+ return false;
+ InstructionsState S = getSameOpcode({I1, I2});
+ if (S.getOpcode())
+ continue;
+ return false;
+ }
+ }
+ return IsCompatibility;
+}
+
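
Below, compareCmp<false> is plugged in as the sort predicate and compareCmp<true> as the compatibility test. The standalone sketch that follows shows the same one-template-two-modes pattern on a hypothetical Item type; it illustrates the idiom under stated assumptions and is not the SLP comparator itself.

    // Hedged sketch: IsCompatibility == false yields a strict weak ordering
    // for sorting, IsCompatibility == true answers "can these be grouped?".
    #include <algorithm>
    #include <string>
    #include <vector>

    struct Item {
      int TypeID;
      std::string Name;
    };

    template <bool IsCompatibility>
    static bool compareItems(const Item &A, const Item &B) {
      if (A.TypeID < B.TypeID)
        return !IsCompatibility; // "less" when sorting, "incompatible" otherwise
      if (A.TypeID > B.TypeID)
        return false;
      if (IsCompatibility)
        return true;             // same TypeID: the two items are compatible
      return A.Name < B.Name;    // refine the ordering so groups end up adjacent
    }

    int main() {
      std::vector<Item> Items = {{2, "b"}, {1, "a"}, {2, "a"}};
      std::stable_sort(Items.begin(), Items.end(), compareItems<false>);
      // Compatible items (same TypeID) are now adjacent; a second pass with
      // compareItems<true> can collect each run into one candidate list.
      return 0;
    }
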
bool SLPVectorizerPass::vectorizeSimpleInstructions(
SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R,
bool AtTerminator) {
@@ -9504,37 +9748,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
}
// Try to vectorize list of compares.
// Sort by type, compare predicate, etc.
- // TODO: Add analysis on the operand opcodes (profitable to vectorize
- // instructions with same/alternate opcodes/const values).
auto &&CompareSorter = [&R](Value *V, Value *V2) {
- auto *CI1 = cast<CmpInst>(V);
- auto *CI2 = cast<CmpInst>(V2);
- if (R.isDeleted(CI2) || !isValidElementType(CI2->getType()))
- return false;
- if (CI1->getOperand(0)->getType()->getTypeID() <
- CI2->getOperand(0)->getType()->getTypeID())
- return true;
- if (CI1->getOperand(0)->getType()->getTypeID() >
- CI2->getOperand(0)->getType()->getTypeID())
- return false;
- return CI1->getPredicate() < CI2->getPredicate() ||
- (CI1->getPredicate() > CI2->getPredicate() &&
- CI1->getPredicate() <
- CmpInst::getSwappedPredicate(CI2->getPredicate()));
+ return compareCmp<false>(V, V2,
+ [&R](Instruction *I) { return R.isDeleted(I); });
};
auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) {
if (V1 == V2)
return true;
- auto *CI1 = cast<CmpInst>(V1);
- auto *CI2 = cast<CmpInst>(V2);
- if (R.isDeleted(CI2) || !isValidElementType(CI2->getType()))
- return false;
- if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType())
- return false;
- return CI1->getPredicate() == CI2->getPredicate() ||
- CI1->getPredicate() ==
- CmpInst::getSwappedPredicate(CI2->getPredicate());
+ return compareCmp<true>(V1, V2,
+ [&R](Instruction *I) { return R.isDeleted(I); });
};
auto Limit = [&R](Value *V) {
unsigned EltSize = R.getVectorElementSize(V);
@@ -9592,10 +9815,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
return true;
if (Opcodes1.size() > Opcodes2.size())
return false;
+ Optional<bool> ConstOrder;
for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
// Undefs are compatible with any other value.
- if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+ if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) {
+ if (!ConstOrder)
+ ConstOrder =
+ !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]);
continue;
+ }
if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
@@ -9614,14 +9842,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
return I1->getOpcode() < I2->getOpcode();
}
- if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+ if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) {
+ if (!ConstOrder)
+ ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID();
continue;
+ }
if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
return true;
if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
return false;
}
- return false;
+ return ConstOrder && *ConstOrder;
};
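
The Optional<bool> ConstOrder added to the comparator above is a deferred tie-break: differences between undefs and constants are only remembered, and they decide the order only when no hard difference (dominance, opcode, value ID) has already done so. A generic sketch of that pattern, using std::optional and a hypothetical Key type rather than the LLVM classes:

    // Hedged sketch of a deferred tie-break comparator: hard keys decide
    // immediately, the first soft difference is remembered and consulted only
    // if every hard key compares equal.
    #include <algorithm>
    #include <cstddef>
    #include <optional>
    #include <vector>

    struct Key {
      int Hard;
      int Soft;
    };

    static bool lessWithDeferredTieBreak(const std::vector<Key> &A,
                                         const std::vector<Key> &B) {
      std::optional<bool> SoftOrder;
      for (std::size_t I = 0, E = std::min(A.size(), B.size()); I != E; ++I) {
        if (A[I].Hard != B[I].Hard)
          return A[I].Hard < B[I].Hard;      // hard difference decides now
        if (!SoftOrder && A[I].Soft != B[I].Soft)
          SoftOrder = A[I].Soft < B[I].Soft; // remember the first soft one
      }
      return SoftOrder.value_or(false);      // otherwise fall back to it
    }

    int main() {
      std::vector<Key> A = {{1, 2}, {3, 4}};
      std::vector<Key> B = {{1, 5}, {3, 4}};
      return lessWithDeferredTieBreak(A, B) ? 0 : 1; // soft 2 < 5 decides
    }
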
auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
if (V1 == V2)
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 44b5e1df0839..1d9e71663cd2 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -374,8 +374,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
assert((SplitAt == end() || SplitAt->getParent() == this) &&
"can only split at a position in the same block");
- SmallVector<VPBlockBase *, 2> Succs(getSuccessors().begin(),
- getSuccessors().end());
+ SmallVector<VPBlockBase *, 2> Succs(successors());
// First, disconnect the current block from its successors.
for (VPBlockBase *Succ : Succs)
VPBlockUtils::disconnectBlocks(this, Succ);
@@ -642,6 +641,7 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
void VPInstruction::generateInstruction(VPTransformState &State,
unsigned Part) {
IRBuilder<> &Builder = State.Builder;
+ Builder.SetCurrentDebugLocation(DL);
if (Instruction::isBinaryOp(getOpcode())) {
Value *A = State.get(getOperand(0), Part);
@@ -768,6 +768,11 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
O << " ";
Operand->printAsOperand(O, SlotTracker);
}
+
+ if (DL) {
+ O << ", !dbg ";
+ DL.print(O);
+ }
}
#endif
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
index 810dd5030f95..f4a1883e35d5 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -39,6 +39,7 @@
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/ilist_node.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/InstructionCost.h"
#include <algorithm>
@@ -51,6 +52,7 @@ namespace llvm {
class BasicBlock;
class DominatorTree;
+class InductionDescriptor;
class InnerLoopVectorizer;
class LoopInfo;
class raw_ostream;
@@ -500,6 +502,8 @@ public:
const VPBlocksTy &getSuccessors() const { return Successors; }
VPBlocksTy &getSuccessors() { return Successors; }
+ iterator_range<VPBlockBase **> successors() { return Successors; }
+
const VPBlocksTy &getPredecessors() const { return Predecessors; }
VPBlocksTy &getPredecessors() { return Predecessors; }
@@ -795,6 +799,7 @@ private:
typedef unsigned char OpcodeTy;
OpcodeTy Opcode;
FastMathFlags FMF;
+ DebugLoc DL;
/// Utility method serving execute(): generates a single instance of the
/// modeled instruction.
@@ -804,12 +809,14 @@ protected:
void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
public:
- VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
+ VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL)
: VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands),
- VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {}
+ VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode),
+ DL(DL) {}
- VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
- : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {}
+ VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
+ DebugLoc DL = {})
+ : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL) {}
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPValue *V) {
@@ -818,7 +825,7 @@ public:
VPInstruction *clone() const {
SmallVector<VPValue *, 2> Operands(operands());
- return new VPInstruction(Opcode, Operands);
+ return new VPInstruction(Opcode, Operands, DL);
}
/// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -1003,21 +1010,22 @@ public:
/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their vector and scalar values.
-class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
+class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue {
PHINode *IV;
+ const InductionDescriptor &IndDesc;
public:
- VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, Instruction *Cast,
- TruncInst *Trunc = nullptr)
- : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV) {
- if (Trunc)
- new VPValue(Trunc, this);
- else
- new VPValue(IV, this);
+ VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
+ const InductionDescriptor &IndDesc)
+ : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this),
+ IV(IV), IndDesc(IndDesc) {}
+
+ VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
+ const InductionDescriptor &IndDesc,
+ TruncInst *Trunc)
+ : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this),
+ IV(IV), IndDesc(IndDesc) {}
- if (Cast)
- new VPValue(Cast, this);
- }
~VPWidenIntOrFpInductionRecipe() override = default;
/// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -1038,13 +1046,6 @@ public:
/// Returns the start value of the induction.
VPValue *getStartValue() { return getOperand(0); }
- /// Returns the cast VPValue, if one is attached, or nullptr otherwise.
- VPValue *getCastValue() {
- if (getNumDefinedValues() != 2)
- return nullptr;
- return getVPValue(1);
- }
-
/// Returns the first defined value as TruncInst, if it is one or nullptr
/// otherwise.
TruncInst *getTruncInst() {
@@ -1053,6 +1054,9 @@ public:
const TruncInst *getTruncInst() const {
return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue());
}
+
+ /// Returns the induction descriptor for the recipe.
+ const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
};
/// A recipe for handling first order recurrences and pointer inductions. For
@@ -1169,7 +1173,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe {
/// operand.
class VPReductionPHIRecipe : public VPWidenPHIRecipe {
/// Descriptor for the reduction.
- RecurrenceDescriptor &RdxDesc;
+ const RecurrenceDescriptor &RdxDesc;
/// The phi is part of an in-loop reduction.
bool IsInLoop;
@@ -1180,7 +1184,7 @@ class VPReductionPHIRecipe : public VPWidenPHIRecipe {
public:
/// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
/// RdxDesc.
- VPReductionPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc,
+ VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
VPValue &Start, bool IsInLoop = false,
bool IsOrdered = false)
: VPWidenPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start),
@@ -1210,7 +1214,9 @@ public:
VPSlotTracker &SlotTracker) const override;
#endif
- RecurrenceDescriptor &getRecurrenceDescriptor() { return RdxDesc; }
+ const RecurrenceDescriptor &getRecurrenceDescriptor() const {
+ return RdxDesc;
+ }
/// Returns true, if the phi is part of an ordered reduction.
bool isOrdered() const { return IsOrdered; }
@@ -1340,13 +1346,13 @@ public:
/// The Operands are {ChainOp, VecOp, [Condition]}.
class VPReductionRecipe : public VPRecipeBase, public VPValue {
/// The recurrence decriptor for the reduction in question.
- RecurrenceDescriptor *RdxDesc;
+ const RecurrenceDescriptor *RdxDesc;
/// Pointer to the TTI, needed to create the target reduction
const TargetTransformInfo *TTI;
public:
- VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp,
- VPValue *VecOp, VPValue *CondOp,
+ VPReductionRecipe(const RecurrenceDescriptor *R, Instruction *I,
+ VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
const TargetTransformInfo *TTI)
: VPRecipeBase(VPRecipeBase::VPReductionSC, {ChainOp, VecOp}),
VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), TTI(TTI) {
@@ -2252,6 +2258,12 @@ public:
return map_range(Operands, Fn);
}
+ /// Returns true if \p VPV is uniform after vectorization.
+ bool isUniformAfterVectorization(VPValue *VPV) const {
+ auto RepR = dyn_cast_or_null<VPReplicateRecipe>(VPV->getDef());
+ return !VPV->getDef() || (RepR && RepR->isUniform());
+ }
+
private:
/// Add to the given dominator tree the header block and every new basic block
/// that was created between it and the latch block, inclusive.
@@ -2340,18 +2352,23 @@ public:
/// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
/// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
- /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. If \p BlockPtr
- /// has more than one successor, its conditional bit is propagated to \p
- /// NewBlock. \p NewBlock must have neither successors nor predecessors.
+ /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's
+ /// successors are moved from \p BlockPtr to \p NewBlock and \p BlockPtr's
+ /// conditional bit is propagated to \p NewBlock. \p NewBlock must have
+ /// neither successors nor predecessors.
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
assert(NewBlock->getSuccessors().empty() &&
- "Can't insert new block with successors.");
- // TODO: move successors from BlockPtr to NewBlock when this functionality
- // is necessary. For now, setBlockSingleSuccessor will assert if BlockPtr
- // already has successors.
- BlockPtr->setOneSuccessor(NewBlock);
- NewBlock->setPredecessors({BlockPtr});
+ NewBlock->getPredecessors().empty() &&
+ "Can't insert new block with predecessors or successors.");
NewBlock->setParent(BlockPtr->getParent());
+ SmallVector<VPBlockBase *> Succs(BlockPtr->successors());
+ for (VPBlockBase *Succ : Succs) {
+ disconnectBlocks(BlockPtr, Succ);
+ connectBlocks(NewBlock, Succ);
+ }
+ NewBlock->setCondBit(BlockPtr->getCondBit());
+ BlockPtr->setCondBit(nullptr);
+ connectBlocks(BlockPtr, NewBlock);
}
/// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
@@ -2394,6 +2411,31 @@ public:
To->removePredecessor(From);
}
+ /// Try to merge \p Block into its single predecessor, if \p Block is a
+ /// VPBasicBlock and its predecessor has a single successor. Returns a pointer
+ /// to the predecessor \p Block was merged into or nullptr otherwise.
+ static VPBasicBlock *tryToMergeBlockIntoPredecessor(VPBlockBase *Block) {
+ auto *VPBB = dyn_cast<VPBasicBlock>(Block);
+ auto *PredVPBB =
+ dyn_cast_or_null<VPBasicBlock>(Block->getSinglePredecessor());
+ if (!VPBB || !PredVPBB || PredVPBB->getNumSuccessors() != 1)
+ return nullptr;
+
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB))
+ R.moveBefore(*PredVPBB, PredVPBB->end());
+ VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
+ auto *ParentRegion = cast<VPRegionBlock>(Block->getParent());
+ if (ParentRegion->getExit() == Block)
+ ParentRegion->setExit(PredVPBB);
+ SmallVector<VPBlockBase *> Successors(Block->successors());
+ for (auto *Succ : Successors) {
+ VPBlockUtils::disconnectBlocks(Block, Succ);
+ VPBlockUtils::connectBlocks(PredVPBB, Succ);
+ }
+ delete Block;
+ return PredVPBB;
+ }
+
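
tryToMergeBlockIntoPredecessor above is the VPlan flavour of the classic merge-block-into-single-predecessor CFG simplification. A stripped-down version over a toy block type makes the preconditions and pointer updates easier to see; the types and names below are illustrative only, and the VPlan version additionally updates the enclosing region's exit block.

    // Hedged sketch: merge Block into its single predecessor when that
    // predecessor has Block as its only successor; splice the instructions,
    // rewire successor edges, then delete the now-empty block.
    #include <algorithm>
    #include <list>
    #include <string>
    #include <vector>

    struct ToyBlock {
      std::list<std::string> Insts;
      std::vector<ToyBlock *> Preds, Succs;
    };

    static ToyBlock *mergeIntoSinglePredecessor(ToyBlock *Block) {
      if (Block->Preds.size() != 1)
        return nullptr;
      ToyBlock *Pred = Block->Preds.front();
      if (Pred->Succs.size() != 1)
        return nullptr;
      // Move all instructions to the end of the predecessor.
      Pred->Insts.splice(Pred->Insts.end(), Block->Insts);
      // Block's successors become Pred's successors, and they now see Pred
      // as their predecessor instead of Block.
      Pred->Succs = Block->Succs;
      for (ToyBlock *Succ : Block->Succs)
        std::replace(Succ->Preds.begin(), Succ->Preds.end(), Block, Pred);
      delete Block; // assumes heap-allocated blocks, as in this sketch's main()
      return Pred;
    }

    int main() {
      ToyBlock *Pred = new ToyBlock{{"a"}, {}, {}};
      ToyBlock *Block = new ToyBlock{{"b"}, {Pred}, {}};
      Pred->Succs.push_back(Block);
      bool OK = mergeIntoSinglePredecessor(Block) == Pred &&
                Pred->Insts.size() == 2 && Pred->Succs.empty();
      delete Pred;
      return OK ? 0 : 1;
    }
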
/// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge.
static bool isBackEdge(const VPBlockBase *FromBlock,
const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) {
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index ac3b3505dc34..86ecd6817873 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -50,14 +50,14 @@ VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB,
case EdgeType::FALSE_EDGE:
// CurrBB is the False successor of PredBB - compute not of CBV.
- IntermediateVal = Builder.createNot(CBV);
+ IntermediateVal = Builder.createNot(CBV, {});
break;
}
// Now AND intermediate value with PredBB's block predicate if it has one.
VPValue *BP = PredBB->getPredicate();
if (BP)
- return Builder.createAnd(BP, IntermediateVal);
+ return Builder.createAnd(BP, IntermediateVal, {});
else
return IntermediateVal;
}
@@ -96,7 +96,7 @@ VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) {
Worklist.pop_front();
// Create an OR of these values.
- VPValue *Or = Builder.createOr(LHS, RHS);
+ VPValue *Or = Builder.createOr(LHS, RHS, {});
// Push OR to the back of the worklist.
Worklist.push_back(Or);
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
index c52c8a2229e8..9e19e172dea5 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -467,8 +467,9 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
return markFailed();
assert(CombinedOperands.size() > 0 && "Need more some operands");
- auto *VPI = new VPInstruction(Opcode, CombinedOperands);
- VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
+ auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr();
+ auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc());
+ VPI->setUnderlyingInstr(Inst);
LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
<< *cast<VPInstruction>(Values[0]) << "\n");
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ded5bc04beb5..d2daf558c2c5 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -18,7 +18,8 @@ using namespace llvm;
void VPlanTransforms::VPInstructionsToVPRecipes(
Loop *OrigLoop, VPlanPtr &Plan,
- LoopVectorizationLegality::InductionList &Inductions,
+ function_ref<const InductionDescriptor *(PHINode *)>
+ GetIntOrFpInductionDescriptor,
SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE) {
auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry());
@@ -44,11 +45,9 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
VPRecipeBase *NewRecipe = nullptr;
if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
- InductionDescriptor II = Inductions.lookup(Phi);
- if (II.getKind() == InductionDescriptor::IK_IntInduction ||
- II.getKind() == InductionDescriptor::IK_FpInduction) {
- VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
- NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, nullptr);
+ if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) {
+ VPValue *Start = Plan->getOrAddVPValue(II->getStartValue());
+ NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, *II);
} else {
Plan->addVPValue(Phi, VPPhi);
continue;
@@ -158,8 +157,7 @@ bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) {
// TODO: add ".cloned" suffix to name of Clone's VPValue.
Clone->insertBefore(SinkCandidate);
- SmallVector<VPUser *, 4> Users(SinkCandidate->user_begin(),
- SinkCandidate->user_end());
+ SmallVector<VPUser *, 4> Users(SinkCandidate->users());
for (auto *U : Users) {
auto *UI = cast<VPRecipeBase>(U);
if (UI->getParent() == SinkTo)
@@ -266,8 +264,7 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) {
VPValue *PredInst1 =
cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
- SmallVector<VPUser *> Users(Phi1ToMoveV->user_begin(),
- Phi1ToMoveV->user_end());
+ SmallVector<VPUser *> Users(Phi1ToMoveV->users());
for (VPUser *U : Users) {
auto *UI = dyn_cast<VPRecipeBase>(U);
if (!UI || UI->getParent() != Then2)
@@ -295,3 +292,35 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) {
delete ToDelete;
return Changed;
}
+
+void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) {
+ SmallVector<std::pair<VPRecipeBase *, VPValue *>> CastsToRemove;
+ for (auto &Phi : Plan.getEntry()->getEntryBasicBlock()->phis()) {
+ auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
+ if (!IV || IV->getTruncInst())
+ continue;
+
+ // Visit all casts that are connected to IV and listed in Casts, and
+ // remember them for removal.
+ auto &Casts = IV->getInductionDescriptor().getCastInsts();
+ VPValue *FindMyCast = IV;
+ for (Instruction *IRCast : reverse(Casts)) {
+ VPRecipeBase *FoundUserCast = nullptr;
+ for (auto *U : FindMyCast->users()) {
+ auto *UserCast = cast<VPRecipeBase>(U);
+ if (UserCast->getNumDefinedValues() == 1 &&
+ UserCast->getVPSingleValue()->getUnderlyingValue() == IRCast) {
+ FoundUserCast = UserCast;
+ break;
+ }
+ }
+ assert(FoundUserCast && "Missing a cast to remove");
+ CastsToRemove.emplace_back(FoundUserCast, IV);
+ FindMyCast = FoundUserCast->getVPSingleValue();
+ }
+ }
+ for (auto &E : CastsToRemove) {
+ E.first->getVPSingleValue()->replaceAllUsesWith(E.second);
+ E.first->eraseFromParent();
+ }
+}
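The new removeRedundantInductionCasts walks, for each widened int/FP induction without a truncate, the chain of recipes that model the induction's IR casts, then replaces every cast recipe's result with the induction itself and erases the recipes. The following is a hedged, self-contained sketch of the chain walk only; Recipe, ValueNode and collectCastChain are simplified stand-ins defined here, not the VPlan API.

#include <cassert>
#include <utility>
#include <vector>

// Simplified stand-ins for VPValue / VPRecipeBase (illustration only).
struct Recipe;
struct ValueNode {
  std::vector<Recipe *> Users;
};
struct Recipe {
  ValueNode Result;                   // the recipe's single defined value
  const void *UnderlyingIR = nullptr; // the IR cast this recipe models
};

// Walk the chain IV -> cast1 -> cast2 -> ..., matching each IR cast in turn,
// and pair each recipe found with the value that should replace it (the IV).
std::vector<std::pair<Recipe *, ValueNode *>>
collectCastChain(ValueNode *IV, const std::vector<const void *> &IRCasts) {
  std::vector<std::pair<Recipe *, ValueNode *>> ToRemove;
  ValueNode *Cur = IV;
  for (const void *IRCast : IRCasts) {
    Recipe *Found = nullptr;
    for (Recipe *U : Cur->Users)
      if (U->UnderlyingIR == IRCast) {
        Found = U;
        break;
      }
    assert(Found && "missing a cast to remove");
    ToRemove.emplace_back(Found, IV); // every cast gets replaced by the IV
    Cur = &Found->Result;             // continue from the cast's result
  }
  return ToRemove;
}

The second phase in the real function then performs the replace-all-uses and erase over the collected pairs, which is why collection and removal are kept as separate steps.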
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index c740f2c022da..a82a562d5e35 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -14,24 +14,37 @@
#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
#include "VPlan.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
namespace llvm {
+class InductionDescriptor;
class Instruction;
+class PHINode;
class ScalarEvolution;
struct VPlanTransforms {
/// Replaces the VPInstructions in \p Plan with corresponding
/// widen recipes.
- static void VPInstructionsToVPRecipes(
- Loop *OrigLoop, VPlanPtr &Plan,
- LoopVectorizationLegality::InductionList &Inductions,
- SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE);
+ static void
+ VPInstructionsToVPRecipes(Loop *OrigLoop, VPlanPtr &Plan,
+ function_ref<const InductionDescriptor *(PHINode *)>
+ GetIntOrFpInductionDescriptor,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions,
+ ScalarEvolution &SE);
static bool sinkScalarOperands(VPlan &Plan);
static bool mergeReplicateRegions(VPlan &Plan);
+
+ /// Remove redundant casts of inductions.
+ ///
+  /// Such redundant casts are casts of induction variables that can be ignored,
+  /// because we already proved that the cast phi is equal to the uncast phi
+  /// in the vectorized loop. There is no need to vectorize the cast: the same
+  /// value can be used for both the phi and the casts in the vector loop.
+ static void removeRedundantInductionCasts(VPlan &Plan);
};
} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 6d6ea4eb30f1..7732d9367985 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -156,5 +156,31 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) {
RecipeI++;
}
}
+
+ const VPRegionBlock *TopRegion = cast<VPRegionBlock>(Plan.getEntry());
+ const VPBasicBlock *Entry = dyn_cast<VPBasicBlock>(TopRegion->getEntry());
+ if (!Entry) {
+ errs() << "VPlan entry block is not a VPBasicBlock\n";
+ return false;
+ }
+ const VPBasicBlock *Exit = dyn_cast<VPBasicBlock>(TopRegion->getExit());
+ if (!Exit) {
+ errs() << "VPlan exit block is not a VPBasicBlock\n";
+ return false;
+ }
+
+ for (const VPRegionBlock *Region :
+ VPBlockUtils::blocksOnly<const VPRegionBlock>(
+ depth_first(VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(
+ Plan.getEntry())))) {
+ if (Region->getEntry()->getNumPredecessors() != 0) {
+ errs() << "region entry block has predecessors\n";
+ return false;
+ }
+ if (Region->getExit()->getNumSuccessors() != 0) {
+ errs() << "region exit block has successors\n";
+ return false;
+ }
+ }
return true;
}
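The verifier additions above check two structural invariants: the top region's entry and exit must be VPBasicBlocks, and no nested region's entry block may have predecessors nor its exit block successors. Below is a hedged, standalone sketch of that kind of recursive structural check; Block and verifyRegions are simplified stand-ins, not the VPlan block classes or the VPBlockUtils traversal.

#include <cstdio>
#include <vector>

// Simplified block hierarchy (illustration only): a Block with a non-null
// Entry/Exit models a region, otherwise it is a plain basic block.
struct Block {
  std::vector<Block *> Preds, Succs;
  Block *Entry = nullptr, *Exit = nullptr;
  std::vector<Block *> Children; // nested blocks, visited recursively
};

// Check the structural invariants the verifier now enforces: a region's entry
// block has no predecessors and its exit block has no successors.
bool verifyRegions(const Block *B) {
  if (B->Entry) {
    if (!B->Entry->Preds.empty()) {
      std::fprintf(stderr, "region entry block has predecessors\n");
      return false;
    }
    if (!B->Exit->Succs.empty()) {
      std::fprintf(stderr, "region exit block has successors\n");
      return false;
    }
  }
  for (const Block *Child : B->Children)
    if (!verifyRegions(Child))
      return false;
  return true;
}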
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 57b11e9414ba..c0aedab2fed0 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -989,9 +989,9 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
if (!FixedVT)
return false;
- InstructionCost OriginalCost = TTI.getMemoryOpCost(
- Instruction::Load, LI->getType(), Align(LI->getAlignment()),
- LI->getPointerAddressSpace());
+ InstructionCost OriginalCost =
+ TTI.getMemoryOpCost(Instruction::Load, LI->getType(), LI->getAlign(),
+ LI->getPointerAddressSpace());
InstructionCost ScalarizedCost = 0;
Instruction *LastCheckedInst = LI;
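The VectorCombine hunk swaps the deprecated, unsigned-returning LoadInst::getAlignment() (previously re-wrapped in Align(...)) for getAlign(), which yields llvm::Align directly. The benefit of the stronger type is that an alignment can no longer travel as a raw, possibly-zero integer. A hedged sketch of that idea using a simplified AlignModel class written here for illustration; it mimics the power-of-two invariant but is not the real llvm::Align.

#include <cassert>
#include <cstdint>

// Simplified model of a power-of-two byte alignment, so "no alignment"
// cannot be smuggled through as the magic value 0.
class AlignModel {
  uint64_t Value;
public:
  explicit AlignModel(uint64_t V) : Value(V) {
    assert(V > 0 && (V & (V - 1)) == 0 && "alignment must be a power of two");
  }
  uint64_t value() const { return Value; }
};

// Cost queries that take AlignModel instead of unsigned cannot be called with
// an unchecked integer by accident.
uint64_t memoryOpCost(AlignModel A) { return A.value() >= 16 ? 1 : 2; }

int main() {
  return static_cast<int>(memoryOpCost(AlignModel(8)));
}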