author     Dimitry Andric <dim@FreeBSD.org>    2022-02-05 20:07:43 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2022-05-14 11:44:47 +0000
commit     1fd87a682ad7442327078e1eeb63edc4258f9815 (patch)
tree       83b42223e987ef7df2e1036937bc1bb627fa2779 /contrib/llvm-project/llvm/lib/Transforms/Vectorize
parent     04eeddc0aa8e0a417a16eaf9d7d095207f4a8623 (diff)
parent     ecbca9f5fb7d7613d2b94982c4825eb0d33d6842 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp    161
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp    198
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h      3
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp              6
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h               89
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp   17
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp          1
7 files changed, 354 insertions, 121 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d11f4146b590..3290439ecd07 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -632,13 +632,6 @@ protected:
Instruction *EntryVal, VPValue *Def,
VPTransformState &State);
- /// Returns true if an instruction \p I should be scalarized instead of
- /// vectorized for the chosen vectorization factor.
- bool shouldScalarizeInstruction(Instruction *I) const;
-
- /// Returns true if we should generate a scalar version of \p IV.
- bool needsScalarInduction(Instruction *IV) const;
-
/// Returns (and creates if needed) the original loop trip count.
Value *getOrCreateTripCount(Loop *NewLoop);
@@ -2479,21 +2472,6 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
VecInd->addIncoming(LastInduction, LoopVectorLatch);
}
-bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
- return Cost->isScalarAfterVectorization(I, VF) ||
- Cost->isProfitableToScalarize(I, VF);
-}
-
-bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
- if (shouldScalarizeInstruction(IV))
- return true;
- auto isScalarInst = [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
- };
- return llvm::any_of(IV->users(), isScalarInst);
-}
-
void InnerLoopVectorizer::widenIntOrFpInduction(
PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State,
Value *CanonicalIV) {
@@ -2549,27 +2527,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(
return ScalarIV;
};
- // Create the vector values from the scalar IV, in the absence of creating a
- // vector IV.
- auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
- Value *Broadcasted = getBroadcastInstrs(ScalarIV);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *StartIdx;
- if (Step->getType()->isFloatingPointTy())
- StartIdx =
- getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part);
- else
- StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part);
-
- Value *EntryPart =
- getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(),
- State.VF, State.Builder);
- State.set(Def, EntryPart, Part);
- if (Trunc)
- addMetadata(EntryPart, Trunc);
- }
- };
-
// Fast-math-flags propagate from the original induction instruction.
IRBuilder<>::FastMathFlagGuard FMFG(Builder);
if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
@@ -2605,36 +2562,18 @@ void InnerLoopVectorizer::widenIntOrFpInduction(
return;
}
- // Determine if we want a scalar version of the induction variable. This is
- // true if the induction variable itself is not widened, or if it has at
- // least one user in the loop that is not widened.
- auto NeedsScalarIV = needsScalarInduction(EntryVal);
- if (!NeedsScalarIV) {
+ // Create a new independent vector induction variable, if one is needed.
+ if (Def->needsVectorIV())
createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
- return;
- }
- // Try to create a new independent vector induction variable. If we can't
- // create the phi node, we will splat the scalar induction variable in each
- // loop iteration.
- if (!shouldScalarizeInstruction(EntryVal)) {
- createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
- Value *ScalarIV = CreateScalarIV(Step);
+ if (Def->needsScalarIV()) {
// Create scalar steps that can be used by instructions we will later
// scalarize. Note that the addition of the scalar steps will not increase
// the number of instructions in the loop in the common case prior to
// InstCombine. We will be trading one vector extract for each scalar step.
+ Value *ScalarIV = CreateScalarIV(Step);
buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
- return;
}
-
- // All IV users are scalar instructions, so only emit a scalar IV, not a
- // vectorised IV. Except when we tail-fold, then the splat IV feeds the
- // predicate used by the masked loads/stores.
- Value *ScalarIV = CreateScalarIV(Step);
- if (!Cost->isScalarEpilogueAllowed())
- CreateSplatIV(ScalarIV, Step);
- buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
}
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
@@ -2663,17 +2602,15 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
}
// Determine the number of scalars we need to generate for each unroll
- // iteration. If EntryVal is uniform, we only need to generate the first
- // lane. Otherwise, we generate all VF values.
- bool IsUniform =
- Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF);
- unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
+ // iteration.
+ bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
+ unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
// Compute the scalar steps and save the results in State.
Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
ScalarIVTy->getScalarSizeInBits());
Type *VecIVTy = nullptr;
Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
- if (!IsUniform && State.VF.isScalable()) {
+ if (!FirstLaneOnly && State.VF.isScalable()) {
VecIVTy = VectorType::get(ScalarIVTy, State.VF);
UnitStepVec =
Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
@@ -2684,7 +2621,7 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
- if (!IsUniform && State.VF.isScalable()) {
+ if (!FirstLaneOnly && State.VF.isScalable()) {
auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
if (ScalarIVTy->isFloatingPointTy())
@@ -4565,7 +4502,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
- bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
+ bool IsUniform = vputils::onlyFirstLaneUsed(PhiR);
assert((IsUniform || !State.VF.isScalable()) &&
"Cannot scalarize a scalable VF");
unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
@@ -5889,7 +5826,9 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
// consider interleaving beneficial (e.g. MVE).
if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
return false;
- if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
+ // FIXME: We should consider changing the threshold for scalable
+ // vectors to take VScaleForTuning into account.
+ if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
return true;
return false;
}
@@ -5940,29 +5879,21 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
return Result;
}
- auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
- if (MainLoopVF.isScalable())
- LLVM_DEBUG(
- dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
- "yet supported. Converting to fixed-width (VF="
- << FixedMainLoopVF << ") instead\n");
-
- if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
+ if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
"this loop\n");
return Result;
}
for (auto &NextVF : ProfitableVFs)
- if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
- (Result.Width.getFixedValue() == 1 ||
- isMoreProfitable(NextVF, Result)) &&
+ if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
+ (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
LVP.hasPlanWithVF(NextVF.Width))
Result = NextVF;
if (Result != VectorizationFactor::Disabled())
LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
- << Result.Width.getFixedValue() << "\n";);
+ << Result.Width << "\n";);
return Result;
}
@@ -8546,16 +8477,54 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
Mask, Consecutive, Reverse);
}
-VPWidenIntOrFpInductionRecipe *
-VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
- ArrayRef<VPValue *> Operands) const {
+static VPWidenIntOrFpInductionRecipe *
+createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
+ VPValue *Start, const InductionDescriptor &IndDesc,
+ LoopVectorizationCostModel &CM, Loop &OrigLoop,
+ VFRange &Range) {
+ // Returns true if an instruction \p I should be scalarized instead of
+ // vectorized for the chosen vectorization factor.
+ auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
+ return CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF);
+ };
+
+ bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) {
+ // Returns true if we should generate a scalar version of \p IV.
+ if (ShouldScalarizeInstruction(PhiOrTrunc, VF))
+ return true;
+ auto isScalarInst = [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF);
+ };
+ return any_of(PhiOrTrunc->users(), isScalarInst);
+ },
+ Range);
+ bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) {
+ return ShouldScalarizeInstruction(PhiOrTrunc, VF);
+ },
+ Range);
+ assert(IndDesc.getStartValue() ==
+ Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
+ if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI,
+ NeedsScalarIV, !NeedsScalarIVOnly);
+ }
+ assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV,
+ !NeedsScalarIVOnly);
+}
+
+VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
+ PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const {
+
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
- if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) {
- assert(II->getStartValue() ==
- Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
- return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II);
- }
+ if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
+ return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop,
+ Range);
return nullptr;
}
@@ -8583,7 +8552,7 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
auto *Phi = cast<PHINode>(I->getOperand(0));
const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
- return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I);
+ return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range);
}
return nullptr;
}
@@ -8865,7 +8834,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
if (auto Phi = dyn_cast<PHINode>(Instr)) {
if (Phi->getParent() != OrigLoop->getHeader())
return tryToBlend(Phi, Operands, Plan);
- if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
+ if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
return toVPRecipeResult(Recipe);
VPHeaderPHIRecipe *PhiRecipe = nullptr;
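The LoopVectorize.cpp hunks above move the scalar/vector induction decision out of IR emission (the deleted shouldScalarizeInstruction/needsScalarInduction helpers) and into recipe construction, where LoopVectorizationPlanner::getDecisionAndClampRange evaluates it per candidate VF and caches the result as the NeedsScalarIV/NeedsVectorIV flags on the recipe. A minimal standalone sketch of that clamping pattern, with a simplified VFRange and illustrative names rather than LLVM's real API:

    // The planner asks a per-VF question once, clamps the VF range so the
    // answer is uniform across it, and records the answer as a flag.
    #include <cassert>
    #include <functional>
    #include <iostream>

    struct VFRange {
      unsigned Start; // inclusive, a power of two
      unsigned End;   // exclusive, a power of two
    };

    // Returns the predicate's value at Range.Start and shrinks Range.End to
    // the first VF where the answer would change.
    static bool getDecisionAndClampRange(const std::function<bool(unsigned)> &P,
                                         VFRange &Range) {
      assert(Range.Start < Range.End && "empty VF range");
      bool Decision = P(Range.Start);
      for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
        if (P(VF) != Decision) {
          Range.End = VF; // the plan now covers only VFs with a uniform answer
          break;
        }
      return Decision;
    }

    int main() {
      VFRange R{2, 16};
      // Pretend the cost model scalarizes the induction only for VF >= 8.
      bool NeedsScalarIV =
          getDecisionAndClampRange([](unsigned VF) { return VF >= 8; }, R);
      std::cout << "NeedsScalarIV=" << NeedsScalarIV << " clamped range=["
                << R.Start << "," << R.End << ")\n";
    }

For this toy range the answer is NeedsScalarIV=0 with the range clamped to [2,8), so a separate plan would be built to cover VF >= 8.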
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 99c265fc5101..15b349f53fd9 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -471,17 +471,36 @@ static bool isValidForAlternation(unsigned Opcode) {
return true;
}
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+ unsigned BaseIndex = 0);
+
+/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
+/// compatible instructions or constants, or just some other regular values.
+static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
+ Value *Op1) {
+ return (isConstant(BaseOp0) && isConstant(Op0)) ||
+ (isConstant(BaseOp1) && isConstant(Op1)) ||
+ (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
+ !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
+ getSameOpcode({BaseOp0, Op0}).getOpcode() ||
+ getSameOpcode({BaseOp1, Op1}).getOpcode();
+}
+
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState: the opcode with which we suppose the whole list
/// could be vectorized, even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
- unsigned BaseIndex = 0) {
+ unsigned BaseIndex) {
// Make sure these are all Instructions.
if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
+ bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
+ CmpInst::Predicate BasePred =
+ IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
+ : CmpInst::BAD_ICMP_PREDICATE;
unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
unsigned AltOpcode = Opcode;
unsigned AltIndex = BaseIndex;
@@ -514,6 +533,57 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
continue;
}
}
+ } else if (IsCmpOp && isa<CmpInst>(VL[Cnt])) {
+ auto *BaseInst = cast<Instruction>(VL[BaseIndex]);
+ auto *Inst = cast<Instruction>(VL[Cnt]);
+ Type *Ty0 = BaseInst->getOperand(0)->getType();
+ Type *Ty1 = Inst->getOperand(0)->getType();
+ if (Ty0 == Ty1) {
+ Value *BaseOp0 = BaseInst->getOperand(0);
+ Value *BaseOp1 = BaseInst->getOperand(1);
+ Value *Op0 = Inst->getOperand(0);
+ Value *Op1 = Inst->getOperand(1);
+ CmpInst::Predicate CurrentPred =
+ cast<CmpInst>(VL[Cnt])->getPredicate();
+ CmpInst::Predicate SwappedCurrentPred =
+ CmpInst::getSwappedPredicate(CurrentPred);
+ // Check for compatible operands. If the corresponding operands are not
+ // compatible, we need to perform alternate vectorization.
+ if (InstOpcode == Opcode) {
+ if (BasePred == CurrentPred &&
+ areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1))
+ continue;
+ if (BasePred == SwappedCurrentPred &&
+ areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0))
+ continue;
+ if (E == 2 &&
+ (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
+ continue;
+ auto *AltInst = cast<CmpInst>(VL[AltIndex]);
+ CmpInst::Predicate AltPred = AltInst->getPredicate();
+ Value *AltOp0 = AltInst->getOperand(0);
+ Value *AltOp1 = AltInst->getOperand(1);
+ // Check if operands are compatible with alternate operands.
+ if (AltPred == CurrentPred &&
+ areCompatibleCmpOps(AltOp0, AltOp1, Op0, Op1))
+ continue;
+ if (AltPred == SwappedCurrentPred &&
+ areCompatibleCmpOps(AltOp0, AltOp1, Op1, Op0))
+ continue;
+ }
+ if (BaseIndex == AltIndex) {
+ assert(isValidForAlternation(Opcode) &&
+ isValidForAlternation(InstOpcode) &&
+ "Cast isn't safe for alternation, logic needs to be updated!");
+ AltIndex = Cnt;
+ continue;
+ }
+ auto *AltInst = cast<CmpInst>(VL[AltIndex]);
+ CmpInst::Predicate AltPred = AltInst->getPredicate();
+ if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
+ AltPred == CurrentPred || AltPred == SwappedCurrentPred)
+ continue;
+ }
} else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
@@ -3307,9 +3377,14 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
MapVector<OrdersType, unsigned,
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
OrdersUses;
+ // Do the analysis for each tree entry only once, otherwise the order of
+ // the same node may be considered several times, even though it might not
+ // be profitable.
SmallPtrSet<const TreeEntry *, 4> VisitedOps;
for (const auto &Op : Data.second) {
TreeEntry *OpTE = Op.second;
+ if (!VisitedOps.insert(OpTE).second)
+ continue;
if (!OpTE->ReuseShuffleIndices.empty() ||
(IgnoreReorder && OpTE == VectorizableTree.front().get()))
continue;
@@ -3333,9 +3408,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
} else {
++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
}
- if (VisitedOps.insert(OpTE).second)
- OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
- OpTE->UserTreeIndices.size();
+ OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
+ OpTE->UserTreeIndices.size();
assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0.");
--OrdersUses[{}];
}
@@ -4350,9 +4424,41 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
- if (isa<BinaryOperator>(VL0)) {
+ auto *CI = dyn_cast<CmpInst>(VL0);
+ if (isa<BinaryOperator>(VL0) || CI) {
ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+ if (!CI || all_of(VL, [](Value *V) {
+ return cast<CmpInst>(V)->isCommutative();
+ })) {
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+ } else {
+ CmpInst::Predicate P0 = CI->getPredicate();
+ CmpInst::Predicate AltP0 = cast<CmpInst>(S.AltOp)->getPredicate();
+ CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0);
+ Value *BaseOp0 = VL0->getOperand(0);
+ Value *BaseOp1 = VL0->getOperand(1);
+ // Collect operands - commute if it uses the swapped predicate or
+ // alternate operation.
+ for (Value *V : VL) {
+ auto *Cmp = cast<CmpInst>(V);
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
+ CmpInst::Predicate CurrentPred = CI->getPredicate();
+ CmpInst::Predicate CurrentPredSwapped =
+ CmpInst::getSwappedPredicate(CurrentPred);
+ if (P0 == AltP0 || P0 == AltP0Swapped) {
+ if ((P0 == CurrentPred &&
+ !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) ||
+ (P0 == CurrentPredSwapped &&
+ !areCompatibleCmpOps(BaseOp0, BaseOp1, RHS, LHS)))
+ std::swap(LHS, RHS);
+ } else if (!areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) {
+ std::swap(LHS, RHS);
+ }
+ Left.push_back(LHS);
+ Right.push_back(RHS);
+ }
+ }
TE->setOperand(0, Left);
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -5284,7 +5390,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
((Instruction::isBinaryOp(E->getOpcode()) &&
Instruction::isBinaryOp(E->getAltOpcode())) ||
(Instruction::isCast(E->getOpcode()) &&
- Instruction::isCast(E->getAltOpcode()))) &&
+ Instruction::isCast(E->getAltOpcode())) ||
+ (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
"Invalid Shuffle Vector Operand");
InstructionCost ScalarCost = 0;
if (NeedToShuffleReuses) {
@@ -5332,6 +5439,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
+ } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+ VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
+ Builder.getInt1Ty(),
+ CI0->getPredicate(), CostKind, VL0);
+ VecCost += TTI->getCmpSelInstrCost(
+ E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+ cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
+ E->getAltOp());
} else {
Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
@@ -5348,6 +5463,29 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
[E](Instruction *I) {
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ if (auto *CI0 = dyn_cast<CmpInst>(E->getMainOp())) {
+ auto *AltCI0 = cast<CmpInst>(E->getAltOp());
+ auto *CI = cast<CmpInst>(I);
+ CmpInst::Predicate P0 = CI0->getPredicate();
+ CmpInst::Predicate AltP0 = AltCI0->getPredicate();
+ CmpInst::Predicate AltP0Swapped =
+ CmpInst::getSwappedPredicate(AltP0);
+ CmpInst::Predicate CurrentPred = CI->getPredicate();
+ CmpInst::Predicate CurrentPredSwapped =
+ CmpInst::getSwappedPredicate(CurrentPred);
+ if (P0 == AltP0 || P0 == AltP0Swapped) {
+ // Alternate cmps have the same or swapped predicate as main cmps but a
+ // different order of compatible operands.
+ return !(
+ (P0 == CurrentPred &&
+ areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1),
+ I->getOperand(0), I->getOperand(1))) ||
+ (P0 == CurrentPredSwapped &&
+ areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1),
+ I->getOperand(1), I->getOperand(0))));
+ }
+ return CurrentPred != P0 && CurrentPredSwapped != P0;
+ }
return I->getOpcode() == E->getAltOpcode();
},
Mask);
@@ -6830,11 +6968,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
((Instruction::isBinaryOp(E->getOpcode()) &&
Instruction::isBinaryOp(E->getAltOpcode())) ||
(Instruction::isCast(E->getOpcode()) &&
- Instruction::isCast(E->getAltOpcode()))) &&
+ Instruction::isCast(E->getAltOpcode())) ||
+ (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
"Invalid Shuffle Vector Operand");
Value *LHS = nullptr, *RHS = nullptr;
- if (Instruction::isBinaryOp(E->getOpcode())) {
+ if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
setInsertPointAfterBundle(E);
LHS = vectorizeTree(E->getOperand(0));
RHS = vectorizeTree(E->getOperand(1));
@@ -6854,6 +6993,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
V1 = Builder.CreateBinOp(
static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
+ } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+ V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
+ auto *AltCI = cast<CmpInst>(E->getAltOp());
+ CmpInst::Predicate AltPred = AltCI->getPredicate();
+ unsigned AltIdx =
+ std::distance(E->Scalars.begin(), find(E->Scalars, AltCI));
+ if (AltCI->getOperand(0) != E->getOperand(0)[AltIdx])
+ AltPred = CmpInst::getSwappedPredicate(AltPred);
+ V1 = Builder.CreateCmp(AltPred, LHS, RHS);
} else {
V0 = Builder.CreateCast(
static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
@@ -6878,6 +7026,29 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
[E](Instruction *I) {
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ if (auto *CI0 = dyn_cast<CmpInst>(E->getMainOp())) {
+ auto *AltCI0 = cast<CmpInst>(E->getAltOp());
+ auto *CI = cast<CmpInst>(I);
+ CmpInst::Predicate P0 = CI0->getPredicate();
+ CmpInst::Predicate AltP0 = AltCI0->getPredicate();
+ CmpInst::Predicate AltP0Swapped =
+ CmpInst::getSwappedPredicate(AltP0);
+ CmpInst::Predicate CurrentPred = CI->getPredicate();
+ CmpInst::Predicate CurrentPredSwapped =
+ CmpInst::getSwappedPredicate(CurrentPred);
+ if (P0 == AltP0 || P0 == AltP0Swapped) {
+ // Alternate cmps have the same or swapped predicate as main cmps but a
+ // different order of compatible operands.
+ return !(
+ (P0 == CurrentPred &&
+ areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1),
+ I->getOperand(0), I->getOperand(1))) ||
+ (P0 == CurrentPredSwapped &&
+ areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1),
+ I->getOperand(1), I->getOperand(0))));
+ }
+ return CurrentPred != P0 && CurrentPredSwapped != P0;
+ }
return I->getOpcode() == E->getAltOpcode();
},
Mask, &OpScalars, &AltScalars);
@@ -7676,11 +7847,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
for (ScheduleData *BundleMember = picked; BundleMember;
BundleMember = BundleMember->NextInBundle) {
Instruction *pickedInst = BundleMember->Inst;
- if (pickedInst->getNextNode() != LastScheduledInst) {
- BS->BB->getInstList().remove(pickedInst);
- BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
- pickedInst);
- }
+ if (pickedInst->getNextNode() != LastScheduledInst)
+ pickedInst->moveBefore(LastScheduledInst);
LastScheduledInst = pickedInst;
}
@@ -8444,7 +8612,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (R.isTreeTinyAndNotFullyVectorizable())
continue;
R.reorderTopToBottom();
- R.reorderBottomToTop();
+ R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
R.buildExternalUses();
R.computeMinimumValueSizes();
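For context on the SLPVectorizer.cpp changes above: getSameOpcode and the new areCompatibleCmpOps helper let SLP treat compares whose predicates are equal or swapped as compatible, reordering operands where needed and, for genuinely mixed predicates, emitting a main/alternate pair of vector compares combined by a shuffle. A hypothetical source pattern of the kind this targets (illustrative only, not a test case from this commit):

    // Adjacent compares expressing one relation in swapped forms:
    // a[0] < b[0] is 'slt', while b[1] > a[1] is 'sgt' with the operands
    // swapped, i.e. the same relation. With this change SLP can recognize
    // the pair as compatible, swap operands where needed, and vectorize it
    // instead of leaving both compares scalar.
    void cmp_pair(const int *a, const int *b, bool *out) {
      out[0] = a[0] < b[0];
      out[1] = b[1] > a[1];
    }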
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index e5dded3c0f1e..8822c0004eb2 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -75,7 +75,8 @@ class VPRecipeBuilder {
/// Check if an induction recipe should be constructed for \p Phi. If so build and
/// return it. If not, return null.
VPWidenIntOrFpInductionRecipe *
- tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef<VPValue *> Operands) const;
+ tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef<VPValue *> Operands,
+ VFRange &Range) const;
/// Optimize the special case where the operand of \p I is a constant integer
/// induction variable.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
index a96c122db2a9..342d4a074e10 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1649,3 +1649,9 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) {
for (VPValue *Def : Recipe.definedValues())
assignSlot(Def);
}
+
+bool vputils::onlyFirstLaneUsed(VPValue *Def) {
+ return all_of(Def->users(), [Def](VPUser *U) {
+ return cast<VPRecipeBase>(U)->onlyFirstLaneUsed(Def);
+ });
+}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
index 824440f98a8b..bcaabca692cc 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -759,6 +759,14 @@ public:
bool mayReadOrWriteMemory() const {
return mayReadFromMemory() || mayWriteToMemory();
}
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ /// Conservatively returns false.
+ virtual bool onlyFirstLaneUsed(const VPValue *Op) const {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return false;
+ }
};
inline bool VPUser::classof(const VPDef *Def) {
@@ -893,6 +901,24 @@ public:
/// Set the fast-math flags.
void setFastMathFlags(FastMathFlags FMFNew);
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ if (getOperand(0) != Op)
+ return false;
+ switch (getOpcode()) {
+ default:
+ return false;
+ case VPInstruction::ActiveLaneMask:
+ case VPInstruction::CanonicalIVIncrement:
+ case VPInstruction::CanonicalIVIncrementNUW:
+ case VPInstruction::BranchOnCount:
+ return true;
+ };
+ llvm_unreachable("switch should return");
+ }
};
/// VPWidenRecipe is a recipe for producing a copy of vector type its
@@ -1027,18 +1053,24 @@ public:
class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue {
PHINode *IV;
const InductionDescriptor &IndDesc;
+ bool NeedsScalarIV;
+ bool NeedsVectorIV;
public:
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
- const InductionDescriptor &IndDesc)
+ const InductionDescriptor &IndDesc,
+ bool NeedsScalarIV, bool NeedsVectorIV)
: VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this),
- IV(IV), IndDesc(IndDesc) {}
+ IV(IV), IndDesc(IndDesc), NeedsScalarIV(NeedsScalarIV),
+ NeedsVectorIV(NeedsVectorIV) {}
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
const InductionDescriptor &IndDesc,
- TruncInst *Trunc)
+ TruncInst *Trunc, bool NeedsScalarIV,
+ bool NeedsVectorIV)
: VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this),
- IV(IV), IndDesc(IndDesc) {}
+ IV(IV), IndDesc(IndDesc), NeedsScalarIV(NeedsScalarIV),
+ NeedsVectorIV(NeedsVectorIV) {}
~VPWidenIntOrFpInductionRecipe() override = default;
@@ -1082,6 +1114,12 @@ public:
const TruncInst *TruncI = getTruncInst();
return TruncI ? TruncI->getType() : IV->getType();
}
+
+ /// Returns true if a scalar phi needs to be created for the induction.
+ bool needsScalarIV() const { return NeedsScalarIV; }
+
+ /// Returns true if a vector phi needs to be created for the induction.
+ bool needsVectorIV() const { return NeedsVectorIV; }
};
/// A pure virtual base class for all recipes modeling header phis, including
@@ -1318,6 +1356,17 @@ public:
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ // Recursing through Blend recipes only; must terminate at header phis at
+ // the latest.
+ return all_of(users(), [this](VPUser *U) {
+ return cast<VPRecipeBase>(U)->onlyFirstLaneUsed(this);
+ });
+ }
};
/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
@@ -1495,6 +1544,13 @@ public:
bool isPacked() const { return AlsoPack; }
bool isPredicated() const { return IsPredicated; }
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return isUniform();
+ }
};
/// A recipe for generating conditional branches on the bits of a mask.
@@ -1651,6 +1707,16 @@ public:
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+
+ // Widened, consecutive memory operations only demand the first lane of
+ // their address.
+ return Op == getAddr() && isConsecutive();
+ }
};
/// Canonical scalar induction phi of the vector loop. Starting at the specified
@@ -1686,6 +1752,13 @@ public:
const Type *getScalarType() const {
return getOperand(0)->getLiveInIRValue()->getType();
}
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
};
/// A Recipe for widening the canonical induction variable of the vector loop.
@@ -2766,6 +2839,14 @@ public:
/// Return true if all visited instruction can be combined.
bool isCompletelySLP() const { return CompletelySLP; }
};
+
+namespace vputils {
+
+/// Returns true if only the first lane of \p Def is used.
+bool onlyFirstLaneUsed(VPValue *Def);
+
+} // end namespace vputils
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
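The VPlan.h additions above implement one query in two layers: a conservative virtual default on the recipe base class, plus overrides for recipes that provably read only lane 0 of an operand (the address of a consecutive widened memory op, the canonical IV phi, uniform replicates), aggregated across all users by vputils::onlyFirstLaneUsed. A condensed standalone model of that dispatch, using stand-in class names rather than the real VPlan types:

    #include <algorithm>
    #include <vector>

    struct ValueM; // stand-ins for VPValue / VPRecipeBase

    struct RecipeM {
      std::vector<ValueM *> Operands;
      virtual ~RecipeM() = default;
      // Conservative default: assume the recipe may use every lane of Op.
      virtual bool onlyFirstLaneUsed(const ValueM *Op) const { return false; }
    };

    struct ValueM {
      std::vector<RecipeM *> Users;
    };

    // Like the widened memory recipe: a consecutive access demands only
    // lane 0 of its address operand; the other lanes are contiguous offsets.
    struct WidenMemoryM : RecipeM {
      bool Consecutive = true;
      bool onlyFirstLaneUsed(const ValueM *Op) const override {
        return Consecutive && !Operands.empty() && Op == Operands[0];
      }
    };

    // Mirrors vputils::onlyFirstLaneUsed: a def needs only its first lane
    // materialized iff *every* user demands no more than that.
    bool onlyFirstLaneUsed(const ValueM *Def) {
      return std::all_of(Def->Users.begin(), Def->Users.end(),
                         [Def](const RecipeM *U) {
                           return U->onlyFirstLaneUsed(Def);
                         });
    }

    int main() {
      WidenMemoryM Load;
      ValueM Addr;
      Load.Operands.push_back(&Addr);
      Addr.Users.push_back(&Load);
      return onlyFirstLaneUsed(&Addr) ? 0 : 1; // 0: only lane 0 demanded
    }

removeRedundantCanonicalIVs in VPlanTransforms.cpp (below) uses exactly this aggregate form to decide whether the original widened IV can stand in for a freshly created one.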
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index fb5f3d428189..70ce773a8a85 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -47,7 +47,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) {
VPValue *Start = Plan->getOrAddVPValue(II->getStartValue());
- NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, *II);
+ NewRecipe =
+ new VPWidenIntOrFpInductionRecipe(Phi, Start, *II, false, true);
} else {
Plan->addVPValue(Phi, VPPhi);
continue;
@@ -341,10 +342,16 @@ void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) {
for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
- // If the induction recipe is canonical and the types match, use it
- // directly.
- if (WidenOriginalIV && WidenOriginalIV->isCanonical() &&
- WidenOriginalIV->getScalarType() == WidenNewIV->getScalarType()) {
+ if (!WidenOriginalIV || !WidenOriginalIV->isCanonical() ||
+ WidenOriginalIV->getScalarType() != WidenNewIV->getScalarType())
+ continue;
+
+ // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
+ // everything WidenNewIV's users need. That is, WidenOriginalIV will
+ // generate a vector phi or all users of WidenNewIV demand the first lane
+ // only.
+ if (WidenOriginalIV->needsVectorIV() ||
+ vputils::onlyFirstLaneUsed(WidenNewIV)) {
WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
WidenNewIV->eraseFromParent();
return;
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp
index 0296a995ad29..010ca28fc237 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/Passes.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/InitializePasses.h"
+#include "llvm/PassRegistry.h"
using namespace llvm;