path: root/contrib/llvm-project/llvm/lib/Transforms/Vectorize
author     Dimitry Andric <dim@FreeBSD.org>  2022-07-14 18:58:48 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2023-02-08 19:03:59 +0000
commit     753f127f3ace09432b2baeffd71a308760641a62 (patch)
tree       97694ab339c0ca6145ebb429c7505019565b9a60 /contrib/llvm-project/llvm/lib/Transforms/Vectorize
parent     81ad626541db97eb356e2c1d4a20eb2a26a766ab (diff)
parent     1f917f69ff07f09b6dbb670971f57f8efe718b84 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp |  30
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h    |  51
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp             | 733
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp             |  18
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h             |   3
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp                     |  73
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h                       |  62
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp              | 365
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h                  |   2
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp             |  69
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp             | 174
11 files changed, 1006 insertions, 574 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 6242d9a93fc1..183ba86abcb4 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -386,20 +386,6 @@ static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
return true;
}
-/// Check whether it is safe to if-convert this phi node.
-///
-/// Phi nodes with constant expressions that can trap are not safe to if
-/// convert.
-static bool canIfConvertPHINodes(BasicBlock *BB) {
- for (PHINode &Phi : BB->phis()) {
- for (Value *V : Phi.incoming_values())
- if (auto *C = dyn_cast<Constant>(V))
- if (C->canTrap())
- return false;
- }
- return true;
-}
-
static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
if (Ty->isPointerTy())
return DL.getIntPtrType(Ty);
@@ -993,7 +979,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
}
}
- Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
PSE.addPredicate(LAI->getPSE().getPredicate());
return true;
}
@@ -1098,13 +1083,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(
SmallPtrSetImpl<const Instruction *> &MaskedOp,
SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const {
for (Instruction &I : *BB) {
- // Check that we don't have a constant expression that can trap as operand.
- for (Value *Operand : I.operands()) {
- if (auto *C = dyn_cast<Constant>(Operand))
- if (C->canTrap())
- return false;
- }
-
// We can predicate blocks with calls to assume, as long as we drop them in
// case we flatten the CFG via predication.
if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
@@ -1190,7 +1168,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
}
// Collect the blocks that need predication.
- BasicBlock *Header = TheLoop->getHeader();
for (BasicBlock *BB : TheLoop->blocks()) {
// We don't support switch statements inside loops.
if (!isa<BranchInst>(BB->getTerminator())) {
@@ -1212,13 +1189,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
BB->getTerminator());
return false;
}
- } else if (BB != Header && !canIfConvertPHINodes(BB)) {
- reportVectorizationFailure(
- "Control flow cannot be substituted for a select",
- "control flow cannot be substituted for a select",
- "NoCFGForSelect", ORE, TheLoop,
- BB->getTerminator());
- return false;
}
}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 0cb2032fa45a..2e9a9fe0640e 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -33,7 +33,6 @@ class LoopInfo;
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class PredicatedScalarEvolution;
-class LoopVectorizationRequirements;
class LoopVectorizeHints;
class OptimizationRemarkEmitter;
class TargetTransformInfo;
@@ -46,8 +45,9 @@ class VPBuilder {
VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
VPInstruction *createInstruction(unsigned Opcode,
- ArrayRef<VPValue *> Operands, DebugLoc DL) {
- VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL);
+ ArrayRef<VPValue *> Operands, DebugLoc DL,
+ const Twine &Name = "") {
+ VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL, Name);
if (BB)
BB->insert(Instr, InsertPt);
return Instr;
@@ -55,8 +55,8 @@ class VPBuilder {
VPInstruction *createInstruction(unsigned Opcode,
std::initializer_list<VPValue *> Operands,
- DebugLoc DL) {
- return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL);
+ DebugLoc DL, const Twine &Name = "") {
+ return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name);
}
public:
@@ -124,34 +124,37 @@ public:
/// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
/// its underlying Instruction.
VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
- Instruction *Inst = nullptr) {
+ Instruction *Inst = nullptr, const Twine &Name = "") {
DebugLoc DL;
if (Inst)
DL = Inst->getDebugLoc();
- VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL);
+ VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL, Name);
NewVPInst->setUnderlyingValue(Inst);
return NewVPInst;
}
VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
- DebugLoc DL) {
- return createInstruction(Opcode, Operands, DL);
+ DebugLoc DL, const Twine &Name = "") {
+ return createInstruction(Opcode, Operands, DL, Name);
}
- VPValue *createNot(VPValue *Operand, DebugLoc DL) {
- return createInstruction(VPInstruction::Not, {Operand}, DL);
+ VPValue *createNot(VPValue *Operand, DebugLoc DL, const Twine &Name = "") {
+ return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
}
- VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL) {
- return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL);
+ VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL,
+ const Twine &Name = "") {
+ return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL, Name);
}
- VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL) {
- return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL);
+ VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL,
+ const Twine &Name = "") {
+ return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL, Name);
}
VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal,
- DebugLoc DL) {
- return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL);
+ DebugLoc DL, const Twine &Name = "") {
+ return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL,
+ Name);
}
//===--------------------------------------------------------------------===//
@@ -191,6 +194,10 @@ struct VectorizationFactor {
/// Cost of the scalar loop.
InstructionCost ScalarCost;
+ /// The minimum trip count required to make vectorization profitable, e.g. due
+ /// to runtime checks.
+ ElementCount MinProfitableTripCount;
+
VectorizationFactor(ElementCount Width, InstructionCost Cost,
InstructionCost ScalarCost)
: Width(Width), Cost(Cost), ScalarCost(ScalarCost) {}
@@ -268,8 +275,6 @@ class LoopVectorizationPlanner {
const LoopVectorizeHints &Hints;
- LoopVectorizationRequirements &Requirements;
-
OptimizationRemarkEmitter *ORE;
SmallVector<VPlanPtr, 4> VPlans;
@@ -285,10 +290,9 @@ public:
InterleavedAccessInfo &IAI,
PredicatedScalarEvolution &PSE,
const LoopVectorizeHints &Hints,
- LoopVectorizationRequirements &Requirements,
OptimizationRemarkEmitter *ORE)
: OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
- PSE(PSE), Hints(Hints), Requirements(Requirements), ORE(ORE) {}
+ PSE(PSE), Hints(Hints), ORE(ORE) {}
/// Plan how to best vectorize, return the best VF and its cost, or None if
/// vectorization and interleaving should be avoided up front.
@@ -332,11 +336,6 @@ public:
bool requiresTooManyRuntimeChecks() const;
protected:
- /// Collect the instructions from the original loop that would be trivially
- /// dead in the vectorized loop if generated.
- void collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions);
-
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
/// legal to vectorize the loop.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b637b2d5ddae..0777a1385916 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -196,10 +196,9 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
"value are vectorized only if no scalar iteration overheads "
"are incurred."));
-static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
- "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
- cl::desc("The maximum allowed number of runtime memory checks with a "
- "vectorize(enable) pragma."));
+static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
+ "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum allowed number of runtime memory checks"));
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
@@ -442,6 +441,7 @@ public:
const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
+ ElementCount MinProfitableTripCount,
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
@@ -453,6 +453,11 @@ public:
// of the original loop header may change as the transformation happens.
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
+
+ if (MinProfitableTripCount.isZero())
+ this->MinProfitableTripCount = VecWidth;
+ else
+ this->MinProfitableTripCount = MinProfitableTripCount;
}
virtual ~InnerLoopVectorizer() = default;
@@ -656,6 +661,8 @@ protected:
/// vector elements.
ElementCount VF;
+ ElementCount MinProfitableTripCount;
+
/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
unsigned UF;
@@ -735,6 +742,7 @@ public:
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+ ElementCount::getFixed(1),
ElementCount::getFixed(1), UnrollFactor, LVL, CM,
BFI, PSI, Check) {}
@@ -783,8 +791,8 @@ public:
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
GeneratedRTChecks &Checks)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
- Checks),
+ EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
+ CM, BFI, PSI, Checks),
EPI(EPI) {}
// Override this function to handle the more complex control flow around the
@@ -1018,7 +1026,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
isa<VPInterleaveRecipe>(CurRec) ||
isa<VPScalarIVStepsRecipe>(CurRec) ||
- isa<VPCanonicalIVPHIRecipe>(CurRec))
+ isa<VPCanonicalIVPHIRecipe>(CurRec) ||
+ isa<VPActiveLaneMaskPHIRecipe>(CurRec))
continue;
// This recipe contributes to the address computation of a widen
@@ -1503,6 +1512,13 @@ public:
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const { return FoldTailByMasking; }
+ /// Returns true if we're tail-folding and want to use the active lane mask
+ /// for vector loop control flow.
+ bool useActiveLaneMaskForControlFlow() const {
+ return FoldTailByMasking &&
+ TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
+ }
+
/// Returns true if the instructions in this block requires predication
/// for any reason, e.g. because tail folding now requires a predicate
/// or because the block in the original loop was predicated.
@@ -1551,14 +1567,14 @@ public:
Scalars.clear();
}
-private:
- unsigned NumPredStores = 0;
-
/// Convenience function that returns the value of vscale_range iff
/// vscale_range.min == vscale_range.max or otherwise returns the value
/// returned by the corresponding TLI method.
Optional<unsigned> getVScaleForTuning() const;
+private:
+ unsigned NumPredStores = 0;
+
/// \return An upper bound for the vectorization factors for both
/// fixed and scalable vectorization, where the minimum-known number of
/// elements is a power-of-2 larger than zero. If scalable vectorization is
@@ -1661,7 +1677,8 @@ private:
/// A set containing all BasicBlocks that are known to present after
/// vectorization as a predicated block.
- SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
+ DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
+ PredicatedBBsAfterVectorization;
/// Records whether it is allowed to have the original scalar loop execute at
/// least once. This may be needed as a fallback loop in case runtime
@@ -1849,14 +1866,17 @@ class GeneratedRTChecks {
DominatorTree *DT;
LoopInfo *LI;
+ TargetTransformInfo *TTI;
SCEVExpander SCEVExp;
SCEVExpander MemCheckExp;
+ bool CostTooHigh = false;
+
public:
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
- const DataLayout &DL)
- : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
+ TargetTransformInfo *TTI, const DataLayout &DL)
+ : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
MemCheckExp(SE, DL, "scev.check") {}
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
@@ -1867,6 +1887,15 @@ public:
void Create(Loop *L, const LoopAccessInfo &LAI,
const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
+ // Hard cutoff to limit compile-time increase in case a very large number of
+ // runtime checks needs to be generated.
+ // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
+ // profile info.
+ CostTooHigh =
+ LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
+ if (CostTooHigh)
+ return;
+
BasicBlock *LoopHeader = L->getHeader();
BasicBlock *Preheader = L->getLoopPreheader();
@@ -1938,6 +1967,44 @@ public:
}
}
+ InstructionCost getCost() {
+ if (SCEVCheckBlock || MemCheckBlock)
+ LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
+
+ if (CostTooHigh) {
+ InstructionCost Cost;
+ Cost.setInvalid();
+ LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
+ return Cost;
+ }
+
+ InstructionCost RTCheckCost = 0;
+ if (SCEVCheckBlock)
+ for (Instruction &I : *SCEVCheckBlock) {
+ if (SCEVCheckBlock->getTerminator() == &I)
+ continue;
+ InstructionCost C =
+ TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+ LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
+ RTCheckCost += C;
+ }
+ if (MemCheckBlock)
+ for (Instruction &I : *MemCheckBlock) {
+ if (MemCheckBlock->getTerminator() == &I)
+ continue;
+ InstructionCost C =
+ TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+ LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
+ RTCheckCost += C;
+ }
+
+ if (SCEVCheckBlock || MemCheckBlock)
+ LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
+ << "\n");
+
+ return RTCheckCost;
+ }
+
/// Remove the created SCEV & memory runtime check blocks & instructions, if
/// unused.
~GeneratedRTChecks() {
@@ -2880,9 +2947,16 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
// If tail is to be folded, vector loop takes care of all iterations.
Type *CountTy = Count->getType();
Value *CheckMinIters = Builder.getFalse();
- Value *Step = createStepForVF(Builder, CountTy, VF, UF);
+ auto CreateStep = [&]() {
+ // Create step with max(MinProfitableTripCount, UF * VF).
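+ // (Illustrative sketch only, all values assumed: with a fixed VF = 4, UF = 2
+ // and MinProfitableTripCount = 16, UF * VF = 8 < 16, so the
+ // minimum-iterations check below compares the trip count against 16 rather
+ // than 8.)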
+ if (UF * VF.getKnownMinValue() < MinProfitableTripCount.getKnownMinValue())
+ return createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
+ return createStepForVF(Builder, CountTy, VF, UF);
+ };
+
if (!Cost->foldTailByMasking())
- CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
+ CheckMinIters =
+ Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
else if (VF.isScalable()) {
// vscale is not necessarily a power-of-2, which means we cannot guarantee
// an overflow to zero when updating induction variables and so an
@@ -2894,8 +2968,9 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
// Don't execute the vector loop if (UMax - n) < (VF * UF).
- CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step);
+ CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
}
+
// Create new preheader for vector loop.
LoopVectorPreHeader =
SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
@@ -2920,7 +2995,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
}
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
-
BasicBlock *const SCEVCheckBlock =
RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
if (!SCEVCheckBlock)
@@ -4792,7 +4866,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
MaxVScale =
TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
MaxScalableVF = ElementCount::getScalable(
- MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
+ MaxVScale ? (MaxSafeElements / MaxVScale.value()) : 0);
if (!MaxScalableVF)
reportVectorizationInfo(
"Max legal vector width too small, scalable vectorization "
@@ -5187,9 +5261,9 @@ bool LoopVectorizationCostModel::isMoreProfitable(
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
if (Optional<unsigned> VScale = getVScaleForTuning()) {
if (A.Width.isScalable())
- EstimatedWidthA *= VScale.getValue();
+ EstimatedWidthA *= VScale.value();
if (B.Width.isScalable())
- EstimatedWidthB *= VScale.getValue();
+ EstimatedWidthB *= VScale.value();
}
// Assume vscale may be larger than 1 (or the value being tuned for),
@@ -5872,10 +5946,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
- auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
+ const auto &TTICapture = TTI;
+ auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
return 0;
- return TTI.getRegUsageForType(VectorType::get(Ty, VF));
+ return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
};
for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
@@ -6014,6 +6089,8 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
// map will indicate that we've analyzed it already.
ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
+ PredicatedBBsAfterVectorization[VF].clear();
+
// Find all the instructions that are scalar with predication in the loop and
// determine if it would be better to not if-convert the blocks they are in.
// If so, we also record the instructions to scalarize.
@@ -6031,7 +6108,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
// Remember that BB will remain after vectorization.
- PredicatedBBsAfterVectorization.insert(BB);
+ PredicatedBBsAfterVectorization[VF].insert(BB);
}
}
}
@@ -6896,8 +6973,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
bool ScalarPredicatedBB = false;
BranchInst *BI = cast<BranchInst>(I);
if (VF.isVector() && BI->isConditional() &&
- (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
- PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
+ (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
+ PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
ScalarPredicatedBB = true;
if (ScalarPredicatedBB) {
@@ -7363,14 +7440,6 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
return VectorizationFactor::Disabled();
}
-bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const {
- unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
- return (NumRuntimePointerChecks >
- VectorizerParams::RuntimeMemoryCheckThreshold &&
- !Hints.allowReordering()) ||
- NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
-}
-
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -7439,7 +7508,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
return VectorizationFactor::Disabled();
// Select the optimal vectorization factor.
- return CM.selectVectorizationFactor(VFCandidates);
+ VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates);
+ assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
+        "when vectorizing, the scalar cost must be non-zero.");
+ return VF;
}
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
@@ -7554,7 +7625,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
if (VectorizedLoopID)
- L->setLoopID(VectorizedLoopID.getValue());
+ L->setLoopID(VectorizedLoopID.value());
else {
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
@@ -7585,51 +7656,6 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
}
#endif
-void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions) {
-
- // We create new control-flow for the vectorized loop, so the original exit
- // conditions will be dead after vectorization if it's only used by the
- // terminator
- SmallVector<BasicBlock*> ExitingBlocks;
- OrigLoop->getExitingBlocks(ExitingBlocks);
- for (auto *BB : ExitingBlocks) {
- auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
- if (!Cmp || !Cmp->hasOneUse())
- continue;
-
- // TODO: we should introduce a getUniqueExitingBlocks on Loop
- if (!DeadInstructions.insert(Cmp).second)
- continue;
-
- // The operands of the icmp is often a dead trunc, used by IndUpdate.
- // TODO: can recurse through operands in general
- for (Value *Op : Cmp->operands()) {
- if (isa<TruncInst>(Op) && Op->hasOneUse())
- DeadInstructions.insert(cast<Instruction>(Op));
- }
- }
-
- // We create new "steps" for induction variable updates to which the original
- // induction variables map. An original update instruction will be dead if
- // all its users except the induction variable are dead.
- auto *Latch = OrigLoop->getLoopLatch();
- for (auto &Induction : Legal->getInductionVars()) {
- PHINode *Ind = Induction.first;
- auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
-
- // If the tail is to be folded by masking, the primary induction variable,
- // if exists, isn't dead: it will be used for masking. Don't kill it.
- if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
- continue;
-
- if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
- return U == Ind || DeadInstructions.count(cast<Instruction>(U));
- }))
- DeadInstructions.insert(IndUpdate);
- }
-}
-
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
//===--------------------------------------------------------------------===//
@@ -8001,11 +8027,19 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
if (!CM.blockNeedsPredicationForAnyReason(BB))
return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
+ assert(CM.foldTailByMasking() && "must fold the tail");
+
+ // If we're using the active lane mask for control flow, then we get the
+ // mask from the active lane mask PHI that is cached in the VPlan.
+ PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask();
+ if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow)
+ return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi();
+
// Introduce the early-exit compare IV <= BTC to form header block mask.
// This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
// constructing the desired canonical IV in the header block as its first
// non-phi instructions.
- assert(CM.foldTailByMasking() && "must fold the tail");
+
VPBasicBlock *HeaderVPBB =
Plan->getVectorLoopRegion()->getEntryBasicBlock();
auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
@@ -8014,9 +8048,10 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
VPBuilder::InsertPointGuard Guard(Builder);
Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
- if (CM.TTI.emitGetActiveLaneMask()) {
+ if (EmitGetActiveLaneMask != PredicationStyle::None) {
VPValue *TC = Plan->getOrCreateTripCount();
- BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
+ BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
+ nullptr, "active.lane.mask");
} else {
VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
@@ -8409,9 +8444,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
return RegSucc;
}
-VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
- VPRecipeBase *PredRecipe,
- VPlanPtr &Plan) {
+VPRegionBlock *VPRecipeBuilder::createReplicateRegion(
+ Instruction *Instr, VPReplicateRecipe *PredRecipe, VPlanPtr &Plan) {
// Instructions marked for predication are replicated and placed under an
// if-then construct to prevent side-effects.
@@ -8425,7 +8459,7 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
auto *PHIRecipe = Instr->getType()->isVoidTy()
? nullptr
- : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
+ : new VPPredInstPHIRecipe(PredRecipe);
if (PHIRecipe) {
Plan->removeVPValueFor(Instr);
Plan->addVPValue(Instr, PHIRecipe);
@@ -8517,19 +8551,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
- // Collect instructions from the original loop that will become trivially dead
- // in the vectorized loop. We don't need to vectorize these instructions. For
- // example, original induction update instructions can become dead because we
- // separately emit induction "steps" when generating code for the new loop.
- // Similarly, we create a new latch condition when setting up the structure
- // of the new loop, so the old one can become dead.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
- collectTriviallyDeadInstructions(DeadInstructions);
-
// Add assume instructions we need to drop to DeadInstructions, to prevent
// them from being added to the VPlan.
// TODO: We only need to drop assumes in blocks that get flattend. If the
// control flow is preserved, we should keep them.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
auto &ConditionalAssumes = Legal->getConditionalAssumes();
DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
@@ -8565,32 +8591,84 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
}
}
-// Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
-// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
-// BranchOnCount VPInstruction to the latch.
+// Add the canonical IV and branch recipes required to control the loop.
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
- bool HasNUW) {
+ bool HasNUW,
+ bool UseLaneMaskForLoopControlFlow) {
Value *StartIdx = ConstantInt::get(IdxTy, 0);
auto *StartV = Plan.getOrAddVPValue(StartIdx);
+ // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
Header->insert(CanonicalIVPHI, Header->begin());
+ // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
+ // IV by VF * UF.
auto *CanonicalIVIncrement =
new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
: VPInstruction::CanonicalIVIncrement,
- {CanonicalIVPHI}, DL);
+ {CanonicalIVPHI}, DL, "index.next");
CanonicalIVPHI->addOperand(CanonicalIVIncrement);
VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
EB->appendRecipe(CanonicalIVIncrement);
- auto *BranchOnCount =
- new VPInstruction(VPInstruction::BranchOnCount,
- {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
- EB->appendRecipe(BranchOnCount);
+ if (UseLaneMaskForLoopControlFlow) {
+ // Create the active lane mask instruction in the vplan preheader.
+ VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
+
+ // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
+ // we have to take unrolling into account. Each part needs to start at
+ // Part * VF.
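+ // (Purely illustrative: with VF = 4 and UF = 2 the per-part entry masks
+ // conceptually become get.active.lane.mask(0, TC) and
+ // get.active.lane.mask(4, TC).)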
+ auto *CanonicalIVIncrementParts =
+ new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
+ : VPInstruction::CanonicalIVIncrementForPart,
+ {StartV}, DL, "index.part.next");
+ Preheader->appendRecipe(CanonicalIVIncrementParts);
+
+ // Create the ActiveLaneMask instruction using the correct start values.
+ VPValue *TC = Plan.getOrCreateTripCount();
+ auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
+ {CanonicalIVIncrementParts, TC}, DL,
+ "active.lane.mask.entry");
+ Preheader->appendRecipe(EntryALM);
+
+ // Now create the ActiveLaneMaskPhi recipe in the main loop using the
+ // preheader ActiveLaneMask instruction.
+ auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+ Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
+
+ // Create the active lane mask for the next iteration of the loop.
+ CanonicalIVIncrementParts =
+ new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
+ : VPInstruction::CanonicalIVIncrementForPart,
+ {CanonicalIVIncrement}, DL);
+ EB->appendRecipe(CanonicalIVIncrementParts);
+
+ auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
+ {CanonicalIVIncrementParts, TC}, DL,
+ "active.lane.mask.next");
+ EB->appendRecipe(ALM);
+ LaneMaskPhi->addOperand(ALM);
+
+ // We have to invert the mask here because a true condition means jumping
+ // to the exit block.
+ auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
+ EB->appendRecipe(NotMask);
+
+ VPInstruction *BranchBack =
+ new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
+ EB->appendRecipe(BranchBack);
+ } else {
+ // Add the BranchOnCount VPInstruction to the latch.
+ VPInstruction *BranchBack = new VPInstruction(
+ VPInstruction::BranchOnCount,
+ {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
+ EB->appendRecipe(BranchBack);
+ }
}
// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
@@ -8691,7 +8769,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
DLInst ? DLInst->getDebugLoc() : DebugLoc(),
- !CM.foldTailByMasking());
+ !CM.foldTailByMasking(),
+ CM.useActiveLaneMaskForControlFlow());
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
@@ -8961,8 +9040,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
VPlanTransforms::sinkScalarOperands(*Plan);
- VPlanTransforms::mergeReplicateRegions(*Plan);
VPlanTransforms::removeDeadRecipes(*Plan);
+ VPlanTransforms::mergeReplicateRegions(*Plan);
VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
// Fold Exit block into its predecessor if possible.
@@ -9006,7 +9085,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
Term->eraseFromParent();
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
- true);
+ true, CM.useActiveLaneMaskForControlFlow());
return Plan;
}
@@ -9078,7 +9157,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
Plan->removeVPValueFor(R);
Plan->addVPValue(R, RedRecipe);
- WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
+ // Append the recipe to the end of the VPBasicBlock because we need to
+ // ensure that it comes after all of its inputs, including CondOp.
+ WidenRecipe->getParent()->appendRecipe(RedRecipe);
WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
WidenRecipe->eraseFromParent();
@@ -9151,229 +9232,6 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
*this, State);
}
-void VPWidenSelectRecipe::execute(VPTransformState &State) {
- auto &I = *cast<SelectInst>(getUnderlyingInstr());
- State.setDebugLocFromInst(&I);
-
- // The condition can be loop invariant but still defined inside the
- // loop. This means that we can't just use the original 'cond' value.
- // We have to take the 'vectorized' value and pick the first lane.
- // Instcombine will make this a no-op.
- auto *InvarCond =
- InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
-
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
- Value *Op0 = State.get(getOperand(1), Part);
- Value *Op1 = State.get(getOperand(2), Part);
- Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
- State.set(this, Sel, Part);
- State.addMetadata(Sel, &I);
- }
-}
-
-void VPWidenRecipe::execute(VPTransformState &State) {
- auto &I = *cast<Instruction>(getUnderlyingValue());
- auto &Builder = State.Builder;
- switch (I.getOpcode()) {
- case Instruction::Call:
- case Instruction::Br:
- case Instruction::PHI:
- case Instruction::GetElementPtr:
- case Instruction::Select:
- llvm_unreachable("This instruction is handled by a different recipe.");
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::SRem:
- case Instruction::URem:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::FNeg:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // Just widen unops and binops.
- State.setDebugLocFromInst(&I);
-
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- SmallVector<Value *, 2> Ops;
- for (VPValue *VPOp : operands())
- Ops.push_back(State.get(VPOp, Part));
-
- Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
-
- if (auto *VecOp = dyn_cast<Instruction>(V)) {
- VecOp->copyIRFlags(&I);
-
- // If the instruction is vectorized and was in a basic block that needed
- // predication, we can't propagate poison-generating flags (nuw/nsw,
- // exact, etc.). The control flow has been linearized and the
- // instruction is no longer guarded by the predicate, which could make
- // the flag properties to no longer hold.
- if (State.MayGeneratePoisonRecipes.contains(this))
- VecOp->dropPoisonGeneratingFlags();
- }
-
- // Use this vector value for all users of the original instruction.
- State.set(this, V, Part);
- State.addMetadata(V, &I);
- }
-
- break;
- }
- case Instruction::Freeze: {
- State.setDebugLocFromInst(&I);
-
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *Op = State.get(getOperand(0), Part);
-
- Value *Freeze = Builder.CreateFreeze(Op);
- State.set(this, Freeze, Part);
- }
- break;
- }
- case Instruction::ICmp:
- case Instruction::FCmp: {
- // Widen compares. Generate vector compares.
- bool FCmp = (I.getOpcode() == Instruction::FCmp);
- auto *Cmp = cast<CmpInst>(&I);
- State.setDebugLocFromInst(Cmp);
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *A = State.get(getOperand(0), Part);
- Value *B = State.get(getOperand(1), Part);
- Value *C = nullptr;
- if (FCmp) {
- // Propagate fast math flags.
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- Builder.setFastMathFlags(Cmp->getFastMathFlags());
- C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
- } else {
- C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
- }
- State.set(this, C, Part);
- State.addMetadata(C, &I);
- }
-
- break;
- }
-
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- auto *CI = cast<CastInst>(&I);
- State.setDebugLocFromInst(CI);
-
- /// Vectorize casts.
- Type *DestTy = (State.VF.isScalar())
- ? CI->getType()
- : VectorType::get(CI->getType(), State.VF);
-
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *A = State.get(getOperand(0), Part);
- Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
- State.set(this, Cast, Part);
- State.addMetadata(Cast, &I);
- }
- break;
- }
- default:
- // This instruction is not vectorized by simple widening.
- LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
- llvm_unreachable("Unhandled instruction!");
- } // end of switch.
-}
-
-void VPWidenGEPRecipe::execute(VPTransformState &State) {
- auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
- // Construct a vector GEP by widening the operands of the scalar GEP as
- // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
- // results in a vector of pointers when at least one operand of the GEP
- // is vector-typed. Thus, to keep the representation compact, we only use
- // vector-typed operands for loop-varying values.
-
- if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
- // If we are vectorizing, but the GEP has only loop-invariant operands,
- // the GEP we build (by only using vector-typed operands for
- // loop-varying values) would be a scalar pointer. Thus, to ensure we
- // produce a vector of pointers, we need to either arbitrarily pick an
- // operand to broadcast, or broadcast a clone of the original GEP.
- // Here, we broadcast a clone of the original.
- //
- // TODO: If at some point we decide to scalarize instructions having
- // loop-invariant operands, this special case will no longer be
- // required. We would add the scalarization decision to
- // collectLoopScalars() and teach getVectorValue() to broadcast
- // the lane-zero scalar value.
- auto *Clone = State.Builder.Insert(GEP->clone());
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
- State.set(this, EntryPart, Part);
- State.addMetadata(EntryPart, GEP);
- }
- } else {
- // If the GEP has at least one loop-varying operand, we are sure to
- // produce a vector of pointers. But if we are only unrolling, we want
- // to produce a scalar GEP for each unroll part. Thus, the GEP we
- // produce with the code below will be scalar (if VF == 1) or vector
- // (otherwise). Note that for the unroll-only case, we still maintain
- // values in the vector mapping with initVector, as we do for other
- // instructions.
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- // The pointer operand of the new GEP. If it's loop-invariant, we
- // won't broadcast it.
- auto *Ptr = IsPtrLoopInvariant
- ? State.get(getOperand(0), VPIteration(0, 0))
- : State.get(getOperand(0), Part);
-
- // Collect all the indices for the new GEP. If any index is
- // loop-invariant, we won't broadcast it.
- SmallVector<Value *, 4> Indices;
- for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
- VPValue *Operand = getOperand(I);
- if (IsIndexLoopInvariant[I - 1])
- Indices.push_back(State.get(Operand, VPIteration(0, 0)));
- else
- Indices.push_back(State.get(Operand, Part));
- }
-
- // If the GEP instruction is vectorized and was in a basic block that
- // needed predication, we can't propagate the poison-generating 'inbounds'
- // flag. The control flow has been linearized and the GEP is no longer
- // guarded by the predicate, which could make the 'inbounds' properties to
- // no longer hold.
- bool IsInBounds =
- GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
-
- // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
- // but it should be a vector, otherwise.
- auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
- Indices, "", IsInBounds);
- assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
- "NewGEP is not a pointer vector");
- State.set(this, NewGEP, Part);
- State.addMetadata(NewGEP, GEP);
- }
- }
-}
-
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
@@ -9632,45 +9490,6 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
}
}
-void VPBlendRecipe::execute(VPTransformState &State) {
- State.setDebugLocFromInst(Phi);
- // We know that all PHIs in non-header blocks are converted into
- // selects, so we don't have to worry about the insertion order and we
- // can just use the builder.
- // At this point we generate the predication tree. There may be
- // duplications since this is a simple recursive scan, but future
- // optimizations will clean it up.
-
- unsigned NumIncoming = getNumIncomingValues();
-
- // Generate a sequence of selects of the form:
- // SELECT(Mask3, In3,
- // SELECT(Mask2, In2,
- // SELECT(Mask1, In1,
- // In0)))
- // Note that Mask0 is never used: lanes for which no path reaches this phi and
- // are essentially undef are taken from In0.
- InnerLoopVectorizer::VectorParts Entry(State.UF);
- for (unsigned In = 0; In < NumIncoming; ++In) {
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- // We might have single edge PHIs (blocks) - use an identity
- // 'select' for the first PHI operand.
- Value *In0 = State.get(getIncomingValue(In), Part);
- if (In == 0)
- Entry[Part] = In0; // Initialize with the first incoming value.
- else {
- // Select between the current value and the previous incoming edge
- // based on the incoming mask.
- Value *Cond = State.get(getMask(In), Part);
- Entry[Part] =
- State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
- }
- }
- }
- for (unsigned Part = 0; Part < State.UF; ++Part)
- State.set(this, Entry[Part], Part);
-}
-
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Interleave group being replicated.");
State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
@@ -9758,32 +9577,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
State);
}
-void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
- assert(State.Instance && "Branch on Mask works only on single instance.");
-
- unsigned Part = State.Instance->Part;
- unsigned Lane = State.Instance->Lane.getKnownLane();
-
- Value *ConditionBit = nullptr;
- VPValue *BlockInMask = getMask();
- if (BlockInMask) {
- ConditionBit = State.get(BlockInMask, Part);
- if (ConditionBit->getType()->isVectorTy())
- ConditionBit = State.Builder.CreateExtractElement(
- ConditionBit, State.Builder.getInt32(Lane));
- } else // Block in mask is all-one.
- ConditionBit = State.Builder.getTrue();
-
- // Replace the temporary unreachable terminator with a new conditional branch,
- // whose two destinations will be set later when they are created.
- auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
- assert(isa<UnreachableInst>(CurrentTerminator) &&
- "Expected to replace unreachable terminator with conditional branch.");
- auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
- CondBr->setSuccessor(0, nullptr);
- ReplaceInstWithInst(CurrentTerminator, CondBr);
-}
-
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
assert(State.Instance && "Predicated instruction PHI works per instance.");
Instruction *ScalarPredInst =
@@ -10103,8 +9896,7 @@ static bool processLoopInVPlanNativePath(
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
- Requirements, ORE);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
// Get user vectorization factor.
ElementCount UserVF = Hints.getWidth();
@@ -10123,10 +9915,10 @@ static bool processLoopInVPlanNativePath(
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
{
- GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
+ GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
F->getParent()->getDataLayout());
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
- &CM, BFI, PSI, Checks);
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
+ VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
@@ -10183,6 +9975,105 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
}
}
+static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
+ VectorizationFactor &VF,
+ Optional<unsigned> VScale, Loop *L,
+ ScalarEvolution &SE) {
+ InstructionCost CheckCost = Checks.getCost();
+ if (!CheckCost.isValid())
+ return false;
+
+ // When interleaving only, scalar and vector cost will be equal, which in turn
+ // would lead to a divide by 0. Fall back to hard threshold.
+ if (VF.Width.isScalar()) {
+ if (CheckCost > VectorizeMemoryCheckThreshold) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Interleaving only is not profitable due to runtime checks\n");
+ return false;
+ }
+ return true;
+ }
+
+ // The scalar cost should only be 0 when vectorizing with a user specified
+ // VF/IC. In those cases, runtime checks should always be generated.
+ double ScalarC = *VF.ScalarCost.getValue();
+ if (ScalarC == 0)
+ return true;
+
+ // First, compute the minimum iteration count required so that the vector
+ // loop outperforms the scalar loop.
+ // The total cost of the scalar loop is
+ // ScalarC * TC
+ // where
+ // * TC is the actual trip count of the loop.
+ // * ScalarC is the cost of a single scalar iteration.
+ //
+ // The total cost of the vector loop is
+ // RtC + VecC * (TC / VF) + EpiC
+ // where
+ // * RtC is the cost of the generated runtime checks
+ // * VecC is the cost of a single vector iteration.
+ // * TC is the actual trip count of the loop
+ // * VF is the vectorization factor
+ // * EpiCost is the cost of the generated epilogue, including the cost
+ // of the remaining scalar operations.
+ //
+ // Vectorization is profitable once the total vector cost is less than the
+ // total scalar cost:
+ // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
+ //
+ // Now we can compute the minimum required trip count TC as
+ // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
+ //
+ // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
+ // the computations are performed on doubles, not integers and the result
+ // is rounded up, hence we get an upper estimate of the TC.
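+ //
+ // As a purely illustrative example (all numbers assumed, not taken from any
+ // real cost model): with RtC = 20, ScalarC = 4, VecC = 8 and VF = 4, the
+ // per-iteration vector cost is VecC / VF = 2, so the bound above gives
+ // 20 / (4 - 2) = 10, i.e. the runtime checks only pay off once TC > 10.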
+ unsigned IntVF = VF.Width.getKnownMinValue();
+ if (VF.Width.isScalable()) {
+ unsigned AssumedMinimumVscale = 1;
+ if (VScale)
+ AssumedMinimumVscale = *VScale;
+ IntVF *= AssumedMinimumVscale;
+ }
+ double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
+ double RtC = *CheckCost.getValue();
+ double MinTC1 = RtC / (ScalarC - VecCOverVF);
+
+ // Second, compute a minimum iteration count so that the cost of the
+ // runtime checks is only a fraction of the total scalar loop cost. This
+ // adds a loop-dependent bound on the overhead incurred if the runtime
+ // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
+ // * TC. To bound the runtime check to be a fraction 1/X of the scalar
+ // cost, compute
+ // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
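+ // (Continuing the illustrative numbers from above with X = 10: RtC = 20 and
+ // ScalarC = 4 would require TC > 50, i.e. a failing check then costs at most
+ // roughly 10% of the scalar loop.)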
+ double MinTC2 = RtC * 10 / ScalarC;
+
+ // Now pick the larger minimum. If it is not a multiple of VF, choose the
+ // next closest multiple of VF. This should partly compensate for ignoring
+ // the epilogue cost.
+ uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
+ VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
+ << VF.MinProfitableTripCount << "\n");
+
+ // Skip vectorization if the expected trip count is less than the minimum
+ // required trip count.
+ if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
+ if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
+ VF.MinProfitableTripCount)) {
+ LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
+ "trip count < minimum profitable VF ("
+ << *ExpectedTC << " < " << VF.MinProfitableTripCount
+ << ")\n");
+
+ return false;
+ }
+ }
+ return true;
+}
+
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
: InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
!EnableLoopInterleaving),
@@ -10340,8 +10231,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
CM.collectElementTypesForWidening();
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
- Requirements, ORE);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
// Get user vectorization factor and interleave count.
ElementCount UserVF = Hints.getWidth();
@@ -10353,10 +10243,25 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
- GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
+ GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
F->getParent()->getDataLayout());
if (MaybeVF) {
- if (LVP.requiresTooManyRuntimeChecks()) {
+ VF = *MaybeVF;
+ // Select the interleave count.
+ IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
+
+ unsigned SelectedIC = std::max(IC, UserIC);
+ // Optimistically generate runtime checks if they are needed. Drop them if
+ // they turn out to not be profitable.
+ if (VF.Width.isVector() || SelectedIC > 1)
+ Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+
+ // Check if it is profitable to vectorize with runtime checks.
+ bool ForceVectorization =
+ Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+ if (!ForceVectorization &&
+ !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
+ *PSE.getSE())) {
ORE->emit([&]() {
return OptimizationRemarkAnalysisAliasing(
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
@@ -10368,15 +10273,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Hints.emitRemarkWithHints();
return false;
}
- VF = *MaybeVF;
- // Select the interleave count.
- IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
-
- unsigned SelectedIC = std::max(IC, UserIC);
- // Optimistically generate runtime checks if they are needed. Drop them if
- // they turn out to not be profitable.
- if (VF.Width.isVector() || SelectedIC > 1)
- Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
}
// Identify the diagnostic messages that should be produced.
@@ -10533,8 +10429,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (!MainILV.areSafetyChecksAdded())
DisableRuntimeUnroll = true;
} else {
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
- &LVL, &CM, BFI, PSI, Checks);
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
+ VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
+ PSI, Checks);
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
@@ -10564,7 +10461,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupEpilogue});
if (RemainderLoopID) {
- L->setLoopID(RemainderLoopID.getValue());
+ L->setLoopID(RemainderLoopID.value());
} else {
if (DisableRuntimeUnroll)
AddRuntimeUnrollDisableMetaData(L);
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 019a09665a67..e136cd9aedac 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2637,7 +2637,7 @@ private:
AliasCacheKey key = std::make_pair(Inst1, Inst2);
Optional<bool> &result = AliasCache[key];
if (result) {
- return result.getValue();
+ return result.value();
}
bool aliased = true;
if (Loc1.Ptr && isSimple(Inst1))
@@ -4592,7 +4592,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
};
InstructionsState S = getSameOpcode(VL);
- if (Depth == RecursionMaxDepth) {
+
+ // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
+ // a load), in which case peek through to include it in the tree, without
+ // ballooning over-budget.
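+ // (Roughly speaking, a bundle of four or more loads with matching opcodes,
+ // or of one-use zext/sext values feeding from one-use loads, is still allowed
+ // to extend the tree here; an illustrative reading of the condition below,
+ // not an exhaustive one.)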
+ if (Depth >= RecursionMaxDepth &&
+ !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
+ VL.size() >= 4 &&
+ (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
+ return match(I,
+ m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
+ cast<Instruction>(I)->getOpcode() ==
+ cast<Instruction>(S.MainOp)->getOpcode();
+ })))) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
if (TryToFindDuplicates(S))
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
@@ -11217,7 +11229,7 @@ public:
return OptimizationRemarkMissed(
SV_NAME, "HorSLPNotBeneficial",
ReducedValsToOps.find(VL[0])->second.front())
- << "Vectorizing horizontal reduction is possible"
+ << "Vectorizing horizontal reduction is possible "
<< "but not beneficial with cost " << ore::NV("Cost", Cost)
<< " and threshold "
<< ore::NV("Threshold", -SLPCostThreshold);
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 97f2b1a93815..c7949c42c03e 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -159,7 +159,8 @@ public:
/// Create a replicating region for instruction \p I that requires
/// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
- VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
+ VPRegionBlock *createReplicateRegion(Instruction *I,
+ VPReplicateRecipe *PredRecipe,
VPlanPtr &Plan);
/// Build a VPReplicationRecipe for \p I and enclose it within a Region if it
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 4d709097c306..30032dda7f60 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -248,25 +248,27 @@ void VPTransformState::addMetadata(ArrayRef<Value *> To, Instruction *From) {
}
void VPTransformState::setDebugLocFromInst(const Value *V) {
- if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
- const DILocation *DIL = Inst->getDebugLoc();
-
- // When a FSDiscriminator is enabled, we don't need to add the multiply
- // factors to the discriminators.
- if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
- !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
- // FIXME: For scalable vectors, assume vscale=1.
- auto NewDIL =
- DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
- if (NewDIL)
- Builder.SetCurrentDebugLocation(*NewDIL);
- else
- LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
- << DIL->getFilename() << " Line: " << DIL->getLine());
- } else
- Builder.SetCurrentDebugLocation(DIL);
- } else
+ const Instruction *Inst = dyn_cast<Instruction>(V);
+ if (!Inst) {
Builder.SetCurrentDebugLocation(DebugLoc());
+ return;
+ }
+
+ const DILocation *DIL = Inst->getDebugLoc();
+ // When a FSDiscriminator is enabled, we don't need to add the multiply
+ // factors to the discriminators.
+ if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
+ !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
+ // FIXME: For scalable vectors, assume vscale=1.
+ auto NewDIL =
+ DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
+ if (NewDIL)
+ Builder.SetCurrentDebugLocation(*NewDIL);
+ else
+ LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
+ << DIL->getFilename() << " Line: " << DIL->getLine());
+ } else
+ Builder.SetCurrentDebugLocation(DIL);
}
BasicBlock *
@@ -566,6 +568,24 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
}
#endif
+VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() {
+ VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
+ for (VPRecipeBase &R : Header->phis()) {
+ if (isa<VPActiveLaneMaskPHIRecipe>(&R))
+ return cast<VPActiveLaneMaskPHIRecipe>(&R);
+ }
+ return nullptr;
+}
+
+static bool canSimplifyBranchOnCond(VPInstruction *Term) {
+ VPInstruction *Not = dyn_cast<VPInstruction>(Term->getOperand(0));
+ if (!Not || Not->getOpcode() != VPInstruction::Not)
+ return false;
+
+ VPInstruction *ALM = dyn_cast<VPInstruction>(Not->getOperand(0));
+ return ALM && ALM->getOpcode() == VPInstruction::ActiveLaneMask;
+}
+
void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
Value *CanonicalIVStartValue,
VPTransformState &State,
@@ -573,11 +593,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock();
auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back());
- // Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when
- // preparing to execute the plan for the main vector loop.
- if (!IsEpilogueVectorization && Term &&
- Term->getOpcode() == VPInstruction::BranchOnCount &&
- isa<ConstantInt>(TripCountV)) {
+ // Try to simplify the branch condition if TC <= VF * UF when preparing to
+ // execute the plan for the main vector loop. We only do this if the
+ // terminator is:
+ // 1. BranchOnCount, or
+ // 2. BranchOnCond where the input is Not(ActiveLaneMask).
+ if (!IsEpilogueVectorization && Term && isa<ConstantInt>(TripCountV) &&
+ (Term->getOpcode() == VPInstruction::BranchOnCount ||
+ (Term->getOpcode() == VPInstruction::BranchOnCond &&
+ canSimplifyBranchOnCond(Term)))) {
ConstantInt *C = cast<ConstantInt>(TripCountV);
uint64_t TCVal = C->getZExtValue();
if (TCVal && TCVal <= State.VF.getKnownMinValue() * State.UF) {
@@ -697,7 +721,8 @@ void VPlan::execute(VPTransformState *State) {
// generated.
bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
- cast<VPReductionPHIRecipe>(PhiR)->isOrdered();
+ (isa<VPReductionPHIRecipe>(PhiR) &&
+ cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
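The prepareToExecute change above now also folds the exit branch when tail folding is in use: besides BranchOnCount, a BranchOnCond whose condition is Not(ActiveLaneMask) qualifies, via the new canSimplifyBranchOnCond helper. The fold itself only triggers when a constant trip count guarantees a single vector iteration; a standalone sketch of that guard (hypothetical helper, not LLVM API):

#include <cstdint>

// A constant trip count TC needs only one vector iteration when it does not
// exceed VF.getKnownMinValue() * UF lanes; the terminator can then become an
// unconditional 'BranchOnCond true'. Example: TC = 8, VF = 4, UF = 2.
static bool runsSingleVectorIteration(uint64_t TC, unsigned VFKnownMin,
                                      unsigned UF) {
  return TC != 0 && TC <= uint64_t(VFKnownMin) * UF;
}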
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
index 09da4a545d0d..f009a7ee6b4b 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -784,6 +784,10 @@ public:
ActiveLaneMask,
CanonicalIVIncrement,
CanonicalIVIncrementNUW,
+ // The next two are similar to the above, but instead increment the
+ // canonical IV separately for each unrolled part.
+ CanonicalIVIncrementForPart,
+ CanonicalIVIncrementForPartNUW,
BranchOnCount,
BranchOnCond
};
@@ -794,6 +798,9 @@ private:
FastMathFlags FMF;
DebugLoc DL;
+ /// An optional name that can be used for the generated IR instruction.
+ const std::string Name;
+
/// Utility method serving execute(): generates a single instance of the
/// modeled instruction.
void generateInstruction(VPTransformState &State, unsigned Part);
@@ -802,14 +809,15 @@ protected:
void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
public:
- VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL)
+ VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
+ const Twine &Name = "")
: VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands),
VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode),
- DL(DL) {}
+ DL(DL), Name(Name.str()) {}
VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
- DebugLoc DL = {})
- : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL) {}
+ DebugLoc DL = {}, const Twine &Name = "")
+ : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {}
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPValue *V) {
@@ -818,7 +826,7 @@ public:
VPInstruction *clone() const {
SmallVector<VPValue *, 2> Operands(operands());
- return new VPInstruction(Opcode, Operands, DL);
+ return new VPInstruction(Opcode, Operands, DL, Name);
}
/// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -897,6 +905,8 @@ public:
case VPInstruction::ActiveLaneMask:
case VPInstruction::CanonicalIVIncrement:
case VPInstruction::CanonicalIVIncrementNUW:
+ case VPInstruction::CanonicalIVIncrementForPart:
+ case VPInstruction::CanonicalIVIncrementForPartNUW:
case VPInstruction::BranchOnCount:
return true;
};
@@ -1125,6 +1135,7 @@ public:
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPRecipeBase *B) {
return B->getVPDefID() == VPRecipeBase::VPCanonicalIVPHISC ||
+ B->getVPDefID() == VPRecipeBase::VPActiveLaneMaskPHISC ||
B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC ||
B->getVPDefID() == VPRecipeBase::VPReductionPHISC ||
B->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC ||
@@ -1132,6 +1143,7 @@ public:
}
static inline bool classof(const VPValue *V) {
return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC ||
+ V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC ||
V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC ||
V->getVPValueID() == VPValue::VPVReductionPHISC ||
V->getVPValueID() == VPValue::VPVWidenIntOrFpInductionSC ||
@@ -1861,6 +1873,42 @@ public:
}
};
+/// A recipe for generating the active lane mask for the vector loop that is
+/// used to predicate the vector operations.
+/// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
+/// remove VPActiveLaneMaskPHIRecipe.
+class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
+ DebugLoc DL;
+
+public:
+ VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
+ : VPHeaderPHIRecipe(VPValue::VPVActiveLaneMaskPHISC,
+ VPActiveLaneMaskPHISC, nullptr, StartMask),
+ DL(DL) {}
+
+ ~VPActiveLaneMaskPHIRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPActiveLaneMaskPHISC;
+ }
+ static inline bool classof(const VPHeaderPHIRecipe *D) {
+ return D->getVPDefID() == VPActiveLaneMaskPHISC;
+ }
+ static inline bool classof(const VPValue *V) {
+ return V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC;
+ }
+
+ /// Generate the active lane mask phi of the vector loop.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A Recipe for widening the canonical induction variable of the vector loop.
class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue {
public:
@@ -2656,6 +2704,10 @@ public:
return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
}
+  /// Find and return the VPActiveLaneMaskPHIRecipe from the header - there
+  /// should be at most one. If there isn't one, then return nullptr.
+ VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi();
+
void addLiveOut(PHINode *PN, VPValue *V);
void clearLiveOuts() {
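getActiveLaneMaskPhi, declared above and defined in VPlan.cpp, returns the lone active-lane-mask phi of the header, or nullptr when the plan is not tail-folded by masking. A hedged usage sketch; the helper, its caller, and the include path are assumptions:

#include "VPlan.h" // VPlan internals live alongside this header

// Rewire the start value of the active-lane-mask phi, if the plan has one.
// The recipe's single operand is its start mask.
static void setLaneMaskStart(llvm::VPlan &Plan, llvm::VPValue *StartMask) {
  if (llvm::VPActiveLaneMaskPHIRecipe *MaskPhi = Plan.getActiveLaneMaskPhi())
    MaskPhi->setOperand(0, StartMask);
}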
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 92422b17457c..fdd901a4a70d 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -26,13 +26,19 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>
using namespace llvm;
+using VectorParts = SmallVector<Value *, 2>;
+
extern cl::opt<bool> EnableVPlanNativePath;
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
bool VPRecipeBase::mayWriteToMemory() const {
switch (getVPDefID()) {
case VPWidenMemoryInstructionSC: {
@@ -186,7 +192,8 @@ void VPInstruction::generateInstruction(VPTransformState &State,
if (Instruction::isBinaryOp(getOpcode())) {
Value *A = State.get(getOperand(0), Part);
Value *B = State.get(getOperand(1), Part);
- Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B);
+ Value *V =
+ Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
State.set(this, V, Part);
return;
}
@@ -194,14 +201,14 @@ void VPInstruction::generateInstruction(VPTransformState &State,
switch (getOpcode()) {
case VPInstruction::Not: {
Value *A = State.get(getOperand(0), Part);
- Value *V = Builder.CreateNot(A);
+ Value *V = Builder.CreateNot(A, Name);
State.set(this, V, Part);
break;
}
case VPInstruction::ICmpULE: {
Value *IV = State.get(getOperand(0), Part);
Value *TC = State.get(getOperand(1), Part);
- Value *V = Builder.CreateICmpULE(IV, TC);
+ Value *V = Builder.CreateICmpULE(IV, TC, Name);
State.set(this, V, Part);
break;
}
@@ -209,7 +216,7 @@ void VPInstruction::generateInstruction(VPTransformState &State,
Value *Cond = State.get(getOperand(0), Part);
Value *Op1 = State.get(getOperand(1), Part);
Value *Op2 = State.get(getOperand(2), Part);
- Value *V = Builder.CreateSelect(Cond, Op1, Op2);
+ Value *V = Builder.CreateSelect(Cond, Op1, Op2, Name);
State.set(this, V, Part);
break;
}
@@ -223,7 +230,7 @@ void VPInstruction::generateInstruction(VPTransformState &State,
auto *PredTy = VectorType::get(Int1Ty, State.VF);
Instruction *Call = Builder.CreateIntrinsic(
Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
- {VIVElem0, ScalarTC}, nullptr, "active.lane.mask");
+ {VIVElem0, ScalarTC}, nullptr, Name);
State.set(this, Call, Part);
break;
}
@@ -247,7 +254,8 @@ void VPInstruction::generateInstruction(VPTransformState &State,
State.set(this, PartMinus1, Part);
} else {
Value *V2 = State.get(getOperand(1), Part);
- State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1), Part);
+ State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1, Name),
+ Part);
}
break;
}
@@ -261,7 +269,7 @@ void VPInstruction::generateInstruction(VPTransformState &State,
// elements) times the unroll factor (num of SIMD instructions).
Value *Step =
createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
- Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false);
+ Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false);
} else {
Next = State.get(this, 0);
}
@@ -269,6 +277,23 @@ void VPInstruction::generateInstruction(VPTransformState &State,
State.set(this, Next, Part);
break;
}
+
+ case VPInstruction::CanonicalIVIncrementForPart:
+ case VPInstruction::CanonicalIVIncrementForPartNUW: {
+ bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW;
+ auto *IV = State.get(getOperand(0), VPIteration(0, 0));
+ if (Part == 0) {
+ State.set(this, IV, Part);
+ break;
+ }
+
+ // The canonical IV is incremented by the vectorization factor (num of SIMD
+ // elements) times the unroll part.
+ Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
+ Value *Next = Builder.CreateAdd(IV, Step, Name, IsNUW, false);
+ State.set(this, Next, Part);
+ break;
+ }
case VPInstruction::BranchOnCond: {
if (Part != 0)
break;
@@ -370,6 +395,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::BranchOnCond:
O << "branch-on-cond";
break;
+ case VPInstruction::CanonicalIVIncrementForPart:
+ O << "VF * Part + ";
+ break;
+ case VPInstruction::CanonicalIVIncrementForPartNUW:
+ O << "VF * Part +(nuw) ";
+ break;
case VPInstruction::BranchOnCount:
O << "branch-on-count ";
break;
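The CanonicalIVIncrementForPart opcodes printed above ("VF * Part +") compute a per-part induction value: part 0 reuses the scalar IV unchanged and part N adds VF * N (scaled by vscale for scalable VFs). A small self-contained sketch of the fixed-width arithmetic, with illustrative values:

#include <cassert>
#include <cstdint>

// Value produced for unroll part 'Part' from the canonical IV, assuming a
// fixed VF (for scalable VFs the step is additionally scaled by vscale).
static uint64_t ivForPart(uint64_t IV, unsigned VFKnownMin, unsigned Part) {
  return Part == 0 ? IV : IV + uint64_t(VFKnownMin) * Part;
}

int main() {
  // With VF = 4 and UF = 3 the parts see IV, IV + 4 and IV + 8.
  assert(ivForPart(16, 4, 0) == 16);
  assert(ivForPart(16, 4, 1) == 20);
  assert(ivForPart(16, 4, 2) == 24);
  return 0;
}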
@@ -431,7 +462,158 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
getOperand(2)->printAsOperand(O, SlotTracker);
O << (InvariantCond ? " (condition is loop invariant)" : "");
}
+#endif
+
+void VPWidenSelectRecipe::execute(VPTransformState &State) {
+ auto &I = *cast<SelectInst>(getUnderlyingInstr());
+ State.setDebugLocFromInst(&I);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+ auto *InvarCond =
+ InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
+ Value *Op0 = State.get(getOperand(1), Part);
+ Value *Op1 = State.get(getOperand(2), Part);
+ Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
+ State.set(this, Sel, Part);
+ State.addMetadata(Sel, &I);
+ }
+}
+
+void VPWidenRecipe::execute(VPTransformState &State) {
+ auto &I = *cast<Instruction>(getUnderlyingValue());
+ auto &Builder = State.Builder;
+ switch (I.getOpcode()) {
+ case Instruction::Call:
+ case Instruction::Br:
+ case Instruction::PHI:
+ case Instruction::GetElementPtr:
+ case Instruction::Select:
+ llvm_unreachable("This instruction is handled by a different recipe.");
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::FNeg:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Just widen unops and binops.
+ State.setDebugLocFromInst(&I);
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ SmallVector<Value *, 2> Ops;
+ for (VPValue *VPOp : operands())
+ Ops.push_back(State.get(VPOp, Part));
+
+ Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
+
+ if (auto *VecOp = dyn_cast<Instruction>(V)) {
+ VecOp->copyIRFlags(&I);
+
+ // If the instruction is vectorized and was in a basic block that needed
+ // predication, we can't propagate poison-generating flags (nuw/nsw,
+ // exact, etc.). The control flow has been linearized and the
+        // instruction is no longer guarded by the predicate, so the flag
+        // properties may no longer hold.
+ if (State.MayGeneratePoisonRecipes.contains(this))
+ VecOp->dropPoisonGeneratingFlags();
+ }
+
+ // Use this vector value for all users of the original instruction.
+ State.set(this, V, Part);
+ State.addMetadata(V, &I);
+ }
+
+ break;
+ }
+ case Instruction::Freeze: {
+ State.setDebugLocFromInst(&I);
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *Op = State.get(getOperand(0), Part);
+
+ Value *Freeze = Builder.CreateFreeze(Op);
+ State.set(this, Freeze, Part);
+ }
+ break;
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Widen compares. Generate vector compares.
+ bool FCmp = (I.getOpcode() == Instruction::FCmp);
+ auto *Cmp = cast<CmpInst>(&I);
+ State.setDebugLocFromInst(Cmp);
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *A = State.get(getOperand(0), Part);
+ Value *B = State.get(getOperand(1), Part);
+ Value *C = nullptr;
+ if (FCmp) {
+ // Propagate fast math flags.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(Cmp->getFastMathFlags());
+ C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+ } else {
+ C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+ }
+ State.set(this, C, Part);
+ State.addMetadata(C, &I);
+ }
+ break;
+ }
+
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ auto *CI = cast<CastInst>(&I);
+ State.setDebugLocFromInst(CI);
+
+    // Vectorize casts.
+ Type *DestTy = (State.VF.isScalar())
+ ? CI->getType()
+ : VectorType::get(CI->getType(), State.VF);
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *A = State.get(getOperand(0), Part);
+ Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+ State.set(this, Cast, Part);
+ State.addMetadata(Cast, &I);
+ }
+ break;
+ }
+ default:
+ // This instruction is not vectorized by simple widening.
+ LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+ llvm_unreachable("Unhandled instruction!");
+ } // end of switch.
+}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN ";
@@ -487,7 +669,82 @@ void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
O << Indent << "= SCALAR-STEPS ";
printOperands(O, SlotTracker);
}
+#endif
+
+void VPWidenGEPRecipe::execute(VPTransformState &State) {
+ auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
+ // Construct a vector GEP by widening the operands of the scalar GEP as
+ // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+ // results in a vector of pointers when at least one operand of the GEP
+ // is vector-typed. Thus, to keep the representation compact, we only use
+ // vector-typed operands for loop-varying values.
+
+ if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
+ // If we are vectorizing, but the GEP has only loop-invariant operands,
+ // the GEP we build (by only using vector-typed operands for
+ // loop-varying values) would be a scalar pointer. Thus, to ensure we
+ // produce a vector of pointers, we need to either arbitrarily pick an
+ // operand to broadcast, or broadcast a clone of the original GEP.
+ // Here, we broadcast a clone of the original.
+ //
+ // TODO: If at some point we decide to scalarize instructions having
+ // loop-invariant operands, this special case will no longer be
+ // required. We would add the scalarization decision to
+ // collectLoopScalars() and teach getVectorValue() to broadcast
+ // the lane-zero scalar value.
+ auto *Clone = State.Builder.Insert(GEP->clone());
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
+ State.set(this, EntryPart, Part);
+ State.addMetadata(EntryPart, GEP);
+ }
+ } else {
+ // If the GEP has at least one loop-varying operand, we are sure to
+ // produce a vector of pointers. But if we are only unrolling, we want
+ // to produce a scalar GEP for each unroll part. Thus, the GEP we
+ // produce with the code below will be scalar (if VF == 1) or vector
+ // (otherwise). Note that for the unroll-only case, we still maintain
+ // values in the vector mapping with initVector, as we do for other
+ // instructions.
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ // The pointer operand of the new GEP. If it's loop-invariant, we
+ // won't broadcast it.
+ auto *Ptr = IsPtrLoopInvariant
+ ? State.get(getOperand(0), VPIteration(0, 0))
+ : State.get(getOperand(0), Part);
+
+ // Collect all the indices for the new GEP. If any index is
+ // loop-invariant, we won't broadcast it.
+ SmallVector<Value *, 4> Indices;
+ for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
+ VPValue *Operand = getOperand(I);
+ if (IsIndexLoopInvariant[I - 1])
+ Indices.push_back(State.get(Operand, VPIteration(0, 0)));
+ else
+ Indices.push_back(State.get(Operand, Part));
+ }
+
+ // If the GEP instruction is vectorized and was in a basic block that
+ // needed predication, we can't propagate the poison-generating 'inbounds'
+ // flag. The control flow has been linearized and the GEP is no longer
+      // guarded by the predicate, so the 'inbounds' property may no longer
+      // hold.
+ bool IsInBounds =
+ GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
+
+ // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+ // but it should be a vector, otherwise.
+ auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
+ Indices, "", IsInBounds);
+ assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
+ "NewGEP is not a pointer vector");
+ State.set(this, NewGEP, Part);
+ State.addMetadata(NewGEP, GEP);
+ }
+ }
+}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-GEP ";
@@ -501,7 +758,48 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
O << " = getelementptr ";
printOperands(O, SlotTracker);
}
+#endif
+void VPBlendRecipe::execute(VPTransformState &State) {
+ State.setDebugLocFromInst(Phi);
+ // We know that all PHIs in non-header blocks are converted into
+ // selects, so we don't have to worry about the insertion order and we
+ // can just use the builder.
+ // At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ unsigned NumIncoming = getNumIncomingValues();
+
+ // Generate a sequence of selects of the form:
+ // SELECT(Mask3, In3,
+ // SELECT(Mask2, In2,
+ // SELECT(Mask1, In1,
+ // In0)))
+  // Note that Mask0 is never used: lanes for which no path reaches this phi
+  // (and are therefore essentially undef) take their value from In0.
+ VectorParts Entry(State.UF);
+ for (unsigned In = 0; In < NumIncoming; ++In) {
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ // We might have single edge PHIs (blocks) - use an identity
+ // 'select' for the first PHI operand.
+ Value *In0 = State.get(getIncomingValue(In), Part);
+ if (In == 0)
+ Entry[Part] = In0; // Initialize with the first incoming value.
+ else {
+ // Select between the current value and the previous incoming edge
+ // based on the incoming mask.
+ Value *Cond = State.get(getMask(In), Part);
+ Entry[Part] =
+ State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
+ }
+ }
+ }
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ State.set(this, Entry[Part], Part);
+}
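For a phi with three incoming values, the loop above produces per part the nested chain described in the comment, select(Mask2, In2, select(Mask1, In1, In0)). A hedged straight-line equivalent using IRBuilder (the helper and its operand names are assumptions):

#include "llvm/IR/IRBuilder.h"

// Unrolled form of the blend for NumIncoming == 3 and a single part. Mask0 is
// never consulted; In0 seeds the chain and each later incoming value is
// selected in on top of it.
static llvm::Value *blendOfThree(llvm::IRBuilder<> &B, llvm::Value *In0,
                                 llvm::Value *In1, llvm::Value *Mask1,
                                 llvm::Value *In2, llvm::Value *Mask2) {
  llvm::Value *Blend = In0;
  Blend = B.CreateSelect(Mask1, In1, Blend, "predphi");
  return B.CreateSelect(Mask2, In2, Blend, "predphi");
}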
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "BLEND ";
@@ -566,7 +864,35 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
if (AlsoPack)
O << " (S->V)";
}
+#endif
+void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
+ assert(State.Instance && "Branch on Mask works only on single instance.");
+
+ unsigned Part = State.Instance->Part;
+ unsigned Lane = State.Instance->Lane.getKnownLane();
+
+ Value *ConditionBit = nullptr;
+ VPValue *BlockInMask = getMask();
+ if (BlockInMask) {
+ ConditionBit = State.get(BlockInMask, Part);
+ if (ConditionBit->getType()->isVectorTy())
+ ConditionBit = State.Builder.CreateExtractElement(
+ ConditionBit, State.Builder.getInt32(Lane));
+ } else // Block in mask is all-one.
+ ConditionBit = State.Builder.getTrue();
+
+ // Replace the temporary unreachable terminator with a new conditional branch,
+ // whose two destinations will be set later when they are created.
+ auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
+ assert(isa<UnreachableInst>(CurrentTerminator) &&
+ "Expected to replace unreachable terminator with conditional branch.");
+ auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
+ CondBr->setSuccessor(0, nullptr);
+ ReplaceInstWithInst(CurrentTerminator, CondBr);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "PHI-PREDICATED-INSTRUCTION ";
@@ -838,3 +1164,28 @@ void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
printOperands(O, SlotTracker);
}
#endif
+
+// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
+// remove VPActiveLaneMaskPHIRecipe.
+void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
+ BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+ Value *StartMask = State.get(getOperand(0), Part);
+ PHINode *EntryPart =
+ State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
+ EntryPart->addIncoming(StartMask, VectorPH);
+ EntryPart->setDebugLoc(DL);
+ State.set(this, EntryPart, Part);
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "ACTIVE-LANE-MASK-PHI ";
+
+ printAsOperand(O, SlotTracker);
+ O << " = phi ";
+ printOperands(O, SlotTracker);
+}
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 5fc676834331..c99fae1b2ab4 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -103,6 +103,7 @@ public:
// Phi-like VPValues. Need to be kept together.
VPVBlendSC,
VPVCanonicalIVPHISC,
+ VPVActiveLaneMaskPHISC,
VPVFirstOrderRecurrencePHISC,
VPVWidenPHISC,
VPVWidenIntOrFpInductionSC,
@@ -358,6 +359,7 @@ public:
// Phi-like recipes. Need to be kept together.
VPBlendSC,
VPCanonicalIVPHISC,
+ VPActiveLaneMaskPHISC,
VPFirstOrderRecurrencePHISC,
VPWidenPHISC,
VPWidenIntOrFpInductionSC,
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index f917883145c0..3501de6ab38e 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -133,32 +133,48 @@ void VPlanVerifier::verifyHierarchicalCFG(
verifyRegionRec(TopRegion);
}
-bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) {
- auto Iter = depth_first(
- VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
- for (const VPBasicBlock *VPBB :
- VPBlockUtils::blocksOnly<const VPBasicBlock>(Iter)) {
- // Verify that phi-like recipes are at the beginning of the block, with no
- // other recipes in between.
- auto RecipeI = VPBB->begin();
- auto End = VPBB->end();
- while (RecipeI != End && RecipeI->isPhi())
- RecipeI++;
+static bool verifyVPBasicBlock(const VPBasicBlock *VPBB) {
+ // Verify that phi-like recipes are at the beginning of the block, with no
+ // other recipes in between.
+ auto RecipeI = VPBB->begin();
+ auto End = VPBB->end();
+ unsigned NumActiveLaneMaskPhiRecipes = 0;
+ while (RecipeI != End && RecipeI->isPhi()) {
+ if (isa<VPActiveLaneMaskPHIRecipe>(RecipeI))
+ NumActiveLaneMaskPhiRecipes++;
+ RecipeI++;
+ }
- while (RecipeI != End) {
- if (RecipeI->isPhi() && !isa<VPBlendRecipe>(&*RecipeI)) {
- errs() << "Found phi-like recipe after non-phi recipe";
+ if (NumActiveLaneMaskPhiRecipes > 1) {
+ errs() << "There should be no more than one VPActiveLaneMaskPHIRecipe";
+ return false;
+ }
+
+ while (RecipeI != End) {
+ if (RecipeI->isPhi() && !isa<VPBlendRecipe>(&*RecipeI)) {
+ errs() << "Found phi-like recipe after non-phi recipe";
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- errs() << ": ";
- RecipeI->dump();
- errs() << "after\n";
- std::prev(RecipeI)->dump();
+ errs() << ": ";
+ RecipeI->dump();
+ errs() << "after\n";
+ std::prev(RecipeI)->dump();
#endif
- return false;
- }
- RecipeI++;
+ return false;
}
+ RecipeI++;
+ }
+
+ return true;
+}
+
+bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) {
+ auto Iter = depth_first(
+ VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
+ for (const VPBasicBlock *VPBB :
+ VPBlockUtils::blocksOnly<const VPBasicBlock>(Iter)) {
+ if (!verifyVPBasicBlock(VPBB))
+ return false;
}
const VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
@@ -181,15 +197,16 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) {
}
if (Exiting->empty()) {
- errs() << "VPlan vector loop exiting block must end with BranchOnCount "
- "VPInstruction but is empty\n";
+ errs() << "VPlan vector loop exiting block must end with BranchOnCount or "
+ "BranchOnCond VPInstruction but is empty\n";
return false;
}
auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exiting->end()));
- if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) {
- errs() << "VPlan vector loop exit must end with BranchOnCount "
- "VPInstruction\n";
+ if (!LastInst || (LastInst->getOpcode() != VPInstruction::BranchOnCount &&
+ LastInst->getOpcode() != VPInstruction::BranchOnCond)) {
+ errs() << "VPlan vector loop exit must end with BranchOnCount or "
+ "BranchOnCond VPInstruction\n";
return false;
}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 90598937affc..d12624ffb824 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -414,6 +414,10 @@ static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
unsigned NewIndex,
IRBuilder<> &Builder) {
+ // Shufflevectors can only be created for fixed-width vectors.
+ if (!isa<FixedVectorType>(ExtElt->getOperand(0)->getType()))
+ return nullptr;
+
// If the extract can be constant-folded, this code is unsimplified. Defer
// to other passes to handle that.
Value *X = ExtElt->getVectorOperand();
@@ -1249,14 +1253,20 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
VT != Op0->getType())
return false;
- auto *SVI0A = dyn_cast<ShuffleVectorInst>(Op0->getOperand(0));
- auto *SVI0B = dyn_cast<ShuffleVectorInst>(Op0->getOperand(1));
- auto *SVI1A = dyn_cast<ShuffleVectorInst>(Op1->getOperand(0));
- auto *SVI1B = dyn_cast<ShuffleVectorInst>(Op1->getOperand(1));
+ auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
+ auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
+ auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
+ auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
+ SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
auto checkSVNonOpUses = [&](Instruction *I) {
if (!I || I->getOperand(0)->getType() != VT)
return true;
- return any_of(I->users(), [&](User *U) { return U != Op0 && U != Op1; });
+ return any_of(I->users(), [&](User *U) {
+ return U != Op0 && U != Op1 &&
+ !(isa<ShuffleVectorInst>(U) &&
+ (InputShuffles.contains(cast<Instruction>(U)) ||
+ isInstructionTriviallyDead(cast<Instruction>(U))));
+ });
};
if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
@@ -1271,6 +1281,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
auto *SV = dyn_cast<ShuffleVectorInst>(U);
if (!SV || SV->getType() != VT)
return false;
+ if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
+ (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
+ return false;
if (!llvm::is_contained(Shuffles, SV))
Shuffles.push_back(SV);
}
@@ -1283,13 +1296,25 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
if (FromReduction && Shuffles.size() > 1)
return false;
+ // Add any shuffle uses for the shuffles we have found, to include them in our
+ // cost calculations.
+ if (!FromReduction) {
+ for (ShuffleVectorInst *SV : Shuffles) {
+ for (auto U : SV->users()) {
+ ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
+ if (SSV && isa<UndefValue>(SSV->getOperand(1)))
+ Shuffles.push_back(SSV);
+ }
+ }
+ }
+
// For each of the output shuffles, we try to sort all the first vector
// elements to the beginning, followed by the second array elements at the
// end. If the binops are legalized to smaller vectors, this may reduce total
// number of binops. We compute the ReconstructMask mask needed to convert
// back to the original lane order.
- SmallVector<int> V1, V2;
- SmallVector<SmallVector<int>> ReconstructMasks;
+ SmallVector<std::pair<int, int>> V1, V2;
+ SmallVector<SmallVector<int>> OrigReconstructMasks;
int MaxV1Elt = 0, MaxV2Elt = 0;
unsigned NumElts = VT->getNumElements();
for (ShuffleVectorInst *SVN : Shuffles) {
@@ -1300,6 +1325,16 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
// case we need to commute the mask).
Value *SVOp0 = SVN->getOperand(0);
Value *SVOp1 = SVN->getOperand(1);
+ if (isa<UndefValue>(SVOp1)) {
+ auto *SSV = cast<ShuffleVectorInst>(SVOp0);
+ SVOp0 = SSV->getOperand(0);
+ SVOp1 = SSV->getOperand(1);
+ for (unsigned I = 0, E = Mask.size(); I != E; I++) {
+ if (Mask[I] >= static_cast<int>(SSV->getShuffleMask().size()))
+ return false;
+ Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Mask[I]);
+ }
+ }
if (SVOp0 == Op1 && SVOp1 == Op0) {
std::swap(SVOp0, SVOp1);
ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
@@ -1316,21 +1351,25 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
ReconstructMask.push_back(-1);
} else if (Mask[I] < static_cast<int>(NumElts)) {
MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
- auto It = find(V1, Mask[I]);
+ auto It = find_if(V1, [&](const std::pair<int, int> &A) {
+ return Mask[I] == A.first;
+ });
if (It != V1.end())
ReconstructMask.push_back(It - V1.begin());
else {
ReconstructMask.push_back(V1.size());
- V1.push_back(Mask[I]);
+ V1.emplace_back(Mask[I], V1.size());
}
} else {
MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
- auto It = find(V2, Mask[I] - NumElts);
+ auto It = find_if(V2, [&](const std::pair<int, int> &A) {
+ return Mask[I] - static_cast<int>(NumElts) == A.first;
+ });
if (It != V2.end())
ReconstructMask.push_back(NumElts + It - V2.begin());
else {
ReconstructMask.push_back(NumElts + V2.size());
- V2.push_back(Mask[I] - NumElts);
+ V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
}
}
}
@@ -1339,7 +1378,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
// result. In-order can help simplify the shuffle away.
if (FromReduction)
sort(ReconstructMask);
- ReconstructMasks.push_back(ReconstructMask);
+ OrigReconstructMasks.push_back(std::move(ReconstructMask));
}
// If the Maximum element used from V1 and V2 are not larger than the new
@@ -1351,16 +1390,68 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
MaxV2Elt == static_cast<int>(V2.size()) - 1))
return false;
+ // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
+  // shuffle of another shuffle, or not a shuffle (which is treated like an
+  // identity shuffle).
+ auto GetBaseMaskValue = [&](Instruction *I, int M) {
+ auto *SV = dyn_cast<ShuffleVectorInst>(I);
+ if (!SV)
+ return M;
+ if (isa<UndefValue>(SV->getOperand(1)))
+ if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
+ if (InputShuffles.contains(SSV))
+ return SSV->getMaskValue(SV->getMaskValue(M));
+ return SV->getMaskValue(M);
+ };
+
+  // Attempt to sort the inputs by ascending mask values to make simpler input
+ // shuffles and push complex shuffles down to the uses. We sort on the first
+ // of the two input shuffle orders, to try and get at least one input into a
+ // nice order.
+ auto SortBase = [&](Instruction *A, std::pair<int, int> X,
+ std::pair<int, int> Y) {
+ int MXA = GetBaseMaskValue(A, X.first);
+ int MYA = GetBaseMaskValue(A, Y.first);
+ return MXA < MYA;
+ };
+ stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
+ return SortBase(SVI0A, A, B);
+ });
+ stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
+ return SortBase(SVI1A, A, B);
+ });
+ // Calculate our ReconstructMasks from the OrigReconstructMasks and the
+ // modified order of the input shuffles.
+ SmallVector<SmallVector<int>> ReconstructMasks;
+ for (auto Mask : OrigReconstructMasks) {
+ SmallVector<int> ReconstructMask;
+ for (int M : Mask) {
+ auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
+ auto It = find_if(V, [M](auto A) { return A.second == M; });
+ assert(It != V.end() && "Expected all entries in Mask");
+ return std::distance(V.begin(), It);
+ };
+ if (M < 0)
+ ReconstructMask.push_back(-1);
+ else if (M < static_cast<int>(NumElts)) {
+ ReconstructMask.push_back(FindIndex(V1, M));
+ } else {
+ ReconstructMask.push_back(NumElts + FindIndex(V2, M));
+ }
+ }
+ ReconstructMasks.push_back(std::move(ReconstructMask));
+ }
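The FindIndex remapping above rewrites each original reconstruct-mask entry to point at the slot's position after the stable sort; V1 and V2 keep (base mask value, original slot) pairs precisely so the old slot can still be found. A toy, self-contained version of that remap (plain C++, not LLVM code):

#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

int main() {
  // (base mask value, original slot) pairs, as V1/V2 are built above.
  std::vector<std::pair<int, int>> V = {{3, 0}, {1, 1}, {2, 2}};
  std::stable_sort(V.begin(), V.end()); // orders entries by base mask value
  auto FindIndex = [&V](int Slot) {
    auto It = std::find_if(V.begin(), V.end(),
                           [Slot](const auto &A) { return A.second == Slot; });
    return int(It - V.begin());
  };
  // Old slots 0, 1, 2 now live at positions 2, 0, 1 of the sorted inputs.
  assert(FindIndex(0) == 2 && FindIndex(1) == 0 && FindIndex(2) == 1);
  return 0;
}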
+
// Calculate the masks needed for the new input shuffles, which get padded
// with undef
SmallVector<int> V1A, V1B, V2A, V2B;
for (unsigned I = 0; I < V1.size(); I++) {
- V1A.push_back(SVI0A->getMaskValue(V1[I]));
- V1B.push_back(SVI0B->getMaskValue(V1[I]));
+ V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
+ V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
}
for (unsigned I = 0; I < V2.size(); I++) {
- V2A.push_back(SVI1A->getMaskValue(V2[I]));
- V2B.push_back(SVI1B->getMaskValue(V2[I]));
+ V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
+ V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
}
while (V1A.size() < NumElts) {
V1A.push_back(UndefMaskElem);
@@ -1371,9 +1462,14 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
V2B.push_back(UndefMaskElem);
}
- auto AddShuffleCost = [&](InstructionCost C, ShuffleVectorInst *SV) {
- return C +
- TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, SV->getShuffleMask());
+ auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
+ auto *SV = dyn_cast<ShuffleVectorInst>(I);
+ if (!SV)
+ return C;
+ return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
+ ? TTI::SK_PermuteSingleSrc
+ : TTI::SK_PermuteTwoSrc,
+ VT, SV->getShuffleMask());
};
auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask);
@@ -1386,9 +1482,6 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
TTI.getArithmeticInstrCost(Op1->getOpcode(), VT);
CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
InstructionCost(0), AddShuffleCost);
- // This set helps us only cost each unique shuffle once.
- SmallPtrSet<ShuffleVectorInst *, 4> InputShuffles(
- {SVI0A, SVI0B, SVI1A, SVI1B});
CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
InstructionCost(0), AddShuffleCost);
@@ -1408,22 +1501,35 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
InstructionCost(0), AddShuffleMaskCost);
+ LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
+ LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
+ << " vs CostAfter: " << CostAfter << "\n");
if (CostBefore <= CostAfter)
return false;
// The cost model has passed, create the new instructions.
- Builder.SetInsertPoint(SVI0A);
- Value *NSV0A = Builder.CreateShuffleVector(SVI0A->getOperand(0),
- SVI0A->getOperand(1), V1A);
- Builder.SetInsertPoint(SVI0B);
- Value *NSV0B = Builder.CreateShuffleVector(SVI0B->getOperand(0),
- SVI0B->getOperand(1), V1B);
- Builder.SetInsertPoint(SVI1A);
- Value *NSV1A = Builder.CreateShuffleVector(SVI1A->getOperand(0),
- SVI1A->getOperand(1), V2A);
- Builder.SetInsertPoint(SVI1B);
- Value *NSV1B = Builder.CreateShuffleVector(SVI1B->getOperand(0),
- SVI1B->getOperand(1), V2B);
+ auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
+ auto *SV = dyn_cast<ShuffleVectorInst>(I);
+ if (!SV)
+ return I;
+ if (isa<UndefValue>(SV->getOperand(1)))
+ if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
+ if (InputShuffles.contains(SSV))
+ return SSV->getOperand(Op);
+ return SV->getOperand(Op);
+ };
+ Builder.SetInsertPoint(SVI0A->getNextNode());
+ Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
+ GetShuffleOperand(SVI0A, 1), V1A);
+ Builder.SetInsertPoint(SVI0B->getNextNode());
+ Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
+ GetShuffleOperand(SVI0B, 1), V1B);
+ Builder.SetInsertPoint(SVI1A->getNextNode());
+ Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
+ GetShuffleOperand(SVI1A, 1), V2A);
+ Builder.SetInsertPoint(SVI1B->getNextNode());
+ Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
+ GetShuffleOperand(SVI1B, 1), V2B);
Builder.SetInsertPoint(Op0);
Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
NSV0A, NSV0B);