summaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms/Vectorize
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
-rw-r--r--llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp18
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp104
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h48
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp3585
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp2218
-rw-r--r--llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h10
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp302
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h606
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp4
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanSLP.cpp25
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp10
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanValue.h212
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp8
-rw-r--r--llvm/lib/Transforms/Vectorize/VectorCombine.cpp207
14 files changed, 4875 insertions, 2482 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 9b81afbb4b6c..6ec5590d76ba 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -666,6 +666,10 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
cast<IntrinsicInst>(&I)->getIntrinsicID() ==
Intrinsic::sideeffect) {
// Ignore llvm.sideeffect calls.
+ } else if (isa<IntrinsicInst>(&I) &&
+ cast<IntrinsicInst>(&I)->getIntrinsicID() ==
+ Intrinsic::pseudoprobe) {
+ // Ignore llvm.pseudoprobe calls.
} else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
<< '\n');
@@ -762,8 +766,8 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
return Chain.slice(0, ChainIdx);
}
-static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
- const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
+static ChainID getChainID(const Value *Ptr) {
+ const Value *ObjPtr = getUnderlyingObject(Ptr);
if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
// The select's themselves are distinct instructions even if they share the
// same condition and evaluate to consecutive pointers for true and false
@@ -830,7 +834,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
continue;
// Save the load locations.
- const ChainID ID = getChainID(Ptr, DL);
+ const ChainID ID = getChainID(Ptr);
LoadRefs[ID].push_back(LI);
} else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
if (!SI->isSimple())
@@ -876,7 +880,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
continue;
// Save store location.
- const ChainID ID = getChainID(Ptr, DL);
+ const ChainID ID = getChainID(Ptr);
StoreRefs[ID].push_back(SI);
}
}
@@ -1027,8 +1031,8 @@ bool Vectorizer::vectorizeStoreChain(
unsigned EltSzInBytes = Sz / 8;
unsigned SzInBytes = EltSzInBytes * ChainSize;
- VectorType *VecTy;
- VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
+ FixedVectorType *VecTy;
+ auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);
if (VecStoreTy)
VecTy = FixedVectorType::get(StoreTy->getScalarType(),
Chain.size() * VecStoreTy->getNumElements());
@@ -1180,7 +1184,7 @@ bool Vectorizer::vectorizeLoadChain(
unsigned EltSzInBytes = Sz / 8;
unsigned SzInBytes = EltSzInBytes * ChainSize;
VectorType *VecTy;
- VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
+ auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);
if (VecLoadTy)
VecTy = FixedVectorType::get(LoadTy->getScalarType(),
Chain.size() * VecLoadTy->getNumElements());
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 23613775d896..2ab0848193f6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -13,13 +13,16 @@
// pass. It should be easy to create an analysis pass around it if there
// is a need (but D45420 needs to happen first).
//
+
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
using namespace llvm;
@@ -63,6 +66,7 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) {
return (Val <= 1);
case HK_ISVECTORIZED:
case HK_PREDICATE:
+ case HK_SCALABLE:
return (Val == 0 || Val == 1);
}
return false;
@@ -75,7 +79,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL),
Force("vectorize.enable", FK_Undefined, HK_FORCE),
IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
- Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), TheLoop(L),
+ Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
+ Scalable("vectorize.scalable.enable", false, HK_SCALABLE), TheLoop(L),
ORE(ORE) {
// Populate values with existing loop metadata.
getHintsFromMetadata();
@@ -88,7 +93,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
// If the vectorization width and interleaving count are both 1 then
// consider the loop to have been already vectorized because there's
// nothing more that we can do.
- IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
+ IsVectorized.Value =
+ getWidth() == ElementCount::getFixed(1) && Interleave.Value == 1;
LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs()
<< "LV: Interleaving disabled by the pass manager\n");
}
@@ -161,7 +167,7 @@ void LoopVectorizeHints::emitRemarkWithHints() const {
if (Force.Value == LoopVectorizeHints::FK_Enabled) {
R << " (Force=" << NV("Force", true);
if (Width.Value != 0)
- R << ", Vector Width=" << NV("VectorWidth", Width.Value);
+ R << ", Vector Width=" << NV("VectorWidth", getWidth());
if (Interleave.Value != 0)
R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
R << ")";
@@ -172,11 +178,11 @@ void LoopVectorizeHints::emitRemarkWithHints() const {
}
const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
- if (getWidth() == 1)
+ if (getWidth() == ElementCount::getFixed(1))
return LV_NAME;
if (getForce() == LoopVectorizeHints::FK_Disabled)
return LV_NAME;
- if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
+ if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
return LV_NAME;
return OptimizationRemarkAnalysis::AlwaysPrint;
}
@@ -227,7 +233,8 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
return;
unsigned Val = C->getZExtValue();
- Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate};
+ Hint *Hints[] = {&Width, &Interleave, &Force,
+ &IsVectorized, &Predicate, &Scalable};
for (auto H : Hints) {
if (Name == H->Name) {
if (H->validate(Val))
@@ -412,7 +419,11 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
const ValueToValueMap &Strides =
getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
- bool CanAddPredicate = !TheLoop->getHeader()->getParent()->hasOptSize();
+ Function *F = TheLoop->getHeader()->getParent();
+ bool OptForSize = F->hasOptSize() ||
+ llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
+ PGSOQueryType::IRPass);
+ bool CanAddPredicate = !OptForSize;
int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false);
if (Stride == 1 || Stride == -1)
return Stride;
@@ -424,7 +435,7 @@ bool LoopVectorizationLegality::isUniform(Value *V) {
}
bool LoopVectorizationLegality::canVectorizeOuterLoop() {
- assert(!TheLoop->empty() && "We are not vectorizing an outer loop.");
+ assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
// Store the result and return it at the end instead of exiting early, in case
// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
bool Result = true;
@@ -768,7 +779,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// supported on the target.
if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
// Arbitrarily try a vector of 2 elements.
- auto *VecTy = FixedVectorType::get(T, /*NumElements=*/2);
+ auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
assert(VecTy && "did not find vectorized version of stored type");
if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
reportVectorizationFailure(
@@ -783,7 +794,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
// For nontemporal loads, check that a nontemporal vector version is
// supported on the target (arbitrarily try a vector of 2 elements).
- auto *VecTy = FixedVectorType::get(I.getType(), /*NumElements=*/2);
+ auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
assert(VecTy && "did not find vectorized version of load type");
if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
reportVectorizationFailure(
@@ -912,7 +923,10 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
}
bool LoopVectorizationLegality::blockCanBePredicated(
- BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, bool PreserveGuards) {
+ BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
+ SmallPtrSetImpl<const Instruction *> &MaskedOp,
+ SmallPtrSetImpl<Instruction *> &ConditionalAssumes,
+ bool PreserveGuards) const {
const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
for (Instruction &I : *BB) {
@@ -930,6 +944,12 @@ bool LoopVectorizationLegality::blockCanBePredicated(
continue;
}
+ // Do not let llvm.experimental.noalias.scope.decl block the vectorization.
+ // TODO: there might be cases that it should block the vectorization. Let's
+ // ignore those for now.
+ if (isa<NoAliasScopeDeclInst>(&I))
+ continue;
+
// We might be able to hoist the load.
if (I.mayReadFromMemory()) {
auto *LI = dyn_cast<LoadInst>(&I);
@@ -999,7 +1019,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
ScalarEvolution &SE = *PSE.getSE();
for (Instruction &I : *BB) {
LoadInst *LI = dyn_cast<LoadInst>(&I);
- if (LI && !mustSuppressSpeculation(*LI) &&
+ if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) &&
isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT))
SafePointers.insert(LI->getPointerOperand());
}
@@ -1019,7 +1039,8 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
// We must be able to predicate all blocks that need to be predicated.
if (blockNeedsPredication(BB)) {
- if (!blockCanBePredicated(BB, SafePointers)) {
+ if (!blockCanBePredicated(BB, SafePointers, MaskedOp,
+ ConditionalAssumes)) {
reportVectorizationFailure(
"Control flow cannot be substituted for a select",
"control flow cannot be substituted for a select",
@@ -1044,7 +1065,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
// Helper function to canVectorizeLoopNestCFG.
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
bool UseVPlanNativePath) {
- assert((UseVPlanNativePath || Lp->empty()) &&
+ assert((UseVPlanNativePath || Lp->isInnermost()) &&
"VPlan-native path is not enabled.");
// TODO: ORE should be improved to show more accurate information when an
@@ -1080,22 +1101,14 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
return false;
}
- // We must have a single exiting block.
- if (!Lp->getExitingBlock()) {
- reportVectorizationFailure("The loop must have an exiting block",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We only handle bottom-tested loops, i.e. loop in which the condition is
- // checked at the end of each iteration. With that we can assume that all
- // instructions in the loop are executed the same number of times.
- if (Lp->getExitingBlock() != Lp->getLoopLatch()) {
- reportVectorizationFailure("The exiting block is not the loop latch",
+ // We currently must have a single "exit block" after the loop. Note that
+ // multiple "exiting blocks" inside the loop are allowed, provided they all
+ // reach the single exit block.
+ // TODO: This restriction can be relaxed in the near future, it's here solely
+ // to allow separation of changes for review. We need to generalize the phi
+ // update logic in a number of places.
+ if (!Lp->getUniqueExitBlock()) {
+ reportVectorizationFailure("The loop must have a unique exit block",
"loop control flow is not understood by vectorizer",
"CFGNotUnderstood", ORE, TheLoop);
if (DoExtraAnalysis)
@@ -1103,7 +1116,6 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
else
return false;
}
-
return Result;
}
@@ -1154,7 +1166,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
// Specific checks for outer loops. We skip the remaining legal checks at this
// point because they don't support outer loops.
- if (!TheLoop->empty()) {
+ if (!TheLoop->isInnermost()) {
assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
if (!canVectorizeOuterLoop()) {
@@ -1171,7 +1183,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
return Result;
}
- assert(TheLoop->empty() && "Inner loop expected.");
+ assert(TheLoop->isInnermost() && "Inner loop expected.");
// Check if we can if-convert non-single-bb loops.
unsigned NumBlocks = TheLoop->getNumBlocks();
if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
@@ -1246,10 +1258,10 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
Instruction *UI = cast<Instruction>(U);
if (TheLoop->contains(UI))
continue;
- reportVectorizationFailure(
- "Cannot fold tail by masking, loop has an outside user for",
- "Cannot fold tail by masking in the presence of live outs.",
- "LiveOutFoldingTailByMasking", ORE, TheLoop, UI);
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Cannot fold tail by masking, loop has an outside user for "
+ << *UI << "\n");
return false;
}
}
@@ -1257,20 +1269,26 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
// The list of pointers that we can safely read and write to remains empty.
SmallPtrSet<Value *, 8> SafePointers;
+ SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
+ SmallPtrSet<Instruction *, 8> TmpConditionalAssumes;
+
// Check and mark all blocks for predication, including those that ordinarily
// do not need predication such as the header block.
for (BasicBlock *BB : TheLoop->blocks()) {
- if (!blockCanBePredicated(BB, SafePointers, /* MaskAllLoads= */ true)) {
- reportVectorizationFailure(
- "Cannot fold tail by masking as required",
- "control flow cannot be substituted for a select",
- "NoCFGForSelect", ORE, TheLoop,
- BB->getTerminator());
+ if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp,
+ TmpConditionalAssumes,
+ /* MaskAllLoads= */ true)) {
+ LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n");
return false;
}
}
LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
+
+ MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end());
+ ConditionalAssumes.insert(TmpConditionalAssumes.begin(),
+ TmpConditionalAssumes.end());
+
return true;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 8dd06983cd84..1795470fa58c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -34,6 +34,7 @@ namespace llvm {
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class PredicatedScalarEvolution;
+class VPRecipeBuilder;
/// VPlan-based builder utility analogous to IRBuilder.
class VPBuilder {
@@ -171,16 +172,22 @@ public:
/// Information about vectorization costs
struct VectorizationFactor {
// Vector width with best cost
- unsigned Width;
+ ElementCount Width;
// Cost of the loop with that width
unsigned Cost;
// Width 1 means no vectorization, cost 0 means uncomputed cost.
- static VectorizationFactor Disabled() { return {1, 0}; }
+ static VectorizationFactor Disabled() {
+ return {ElementCount::getFixed(1), 0};
+ }
bool operator==(const VectorizationFactor &rhs) const {
return Width == rhs.Width && Cost == rhs.Cost;
}
+
+ bool operator!=(const VectorizationFactor &rhs) const {
+ return !(*this == rhs);
+ }
};
/// Planner drives the vectorization process after having passed
@@ -226,7 +233,10 @@ class LoopVectorizationPlanner {
/// A builder used to construct the current plan.
VPBuilder Builder;
- unsigned BestVF = 0;
+ /// The best number of elements of the vector types used in the
+ /// transformed loop. BestVF = None means that vectorization is
+ /// disabled.
+ Optional<ElementCount> BestVF = None;
unsigned BestUF = 0;
public:
@@ -241,14 +251,14 @@ public:
/// Plan how to best vectorize, return the best VF and its cost, or None if
/// vectorization and interleaving should be avoided up front.
- Optional<VectorizationFactor> plan(unsigned UserVF, unsigned UserIC);
+ Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC);
/// Use the VPlan-native path to plan how to best vectorize, return the best
/// VF and its cost.
- VectorizationFactor planInVPlanNativePath(unsigned UserVF);
+ VectorizationFactor planInVPlanNativePath(ElementCount UserVF);
/// Finalize the best decision and dispose of all other VPlans.
- void setBestPlan(unsigned VF, unsigned UF);
+ void setBestPlan(ElementCount VF, unsigned UF);
/// Generate the IR code for the body of the vectorized loop according to the
/// best selected VPlan.
@@ -259,11 +269,21 @@ public:
O << *Plan;
}
+ /// Look through the existing plans and return true if we have one with all
+ /// the vectorization factors in question.
+ bool hasPlanWithVFs(const ArrayRef<ElementCount> VFs) const {
+ return any_of(VPlans, [&](const VPlanPtr &Plan) {
+ return all_of(VFs, [&](const ElementCount &VF) {
+ return Plan->hasVF(VF);
+ });
+ });
+ }
+
/// Test a \p Predicate on a \p Range of VF's. Return the value of applying
/// \p Predicate on Range.Start, possibly decreasing Range.End such that the
/// returned value holds for the entire \p Range.
static bool
- getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
+ getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate,
VFRange &Range);
protected:
@@ -275,7 +295,7 @@ protected:
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
/// legal to vectorize the loop.
- void buildVPlans(unsigned MinVF, unsigned MaxVF);
+ void buildVPlans(ElementCount MinVF, ElementCount MaxVF);
private:
/// Build a VPlan according to the information gathered by Legal. \return a
@@ -286,14 +306,20 @@ private:
/// Build a VPlan using VPRecipes according to the information gather by
/// Legal. This method is only used for the legacy inner loop vectorizer.
VPlanPtr buildVPlanWithVPRecipes(
- VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
- SmallPtrSetImpl<Instruction *> &DeadInstructions,
+ VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
const DenseMap<Instruction *, Instruction *> &SinkAfter);
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
/// legal to vectorize the loop. This method creates VPlans using VPRecipes.
- void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF);
+ void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
+
+ /// Adjust the recipes for any inloop reductions. The chain of instructions
+ /// leading from the loop exit instr to the phi need to be converted to
+ /// reductions, with one operand being vector and the other being the scalar
+ /// reduction chain.
+ void adjustRecipesForInLoopReductions(VPlanPtr &Plan,
+ VPRecipeBuilder &RecipeBuilder);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 35af8e425778..ea0d7673edf6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -130,6 +130,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -157,18 +158,37 @@ using namespace llvm;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
+#ifndef NDEBUG
+const char VerboseDebug[] = DEBUG_TYPE "-verbose";
+#endif
+
/// @{
/// Metadata attribute names
-static const char *const LLVMLoopVectorizeFollowupAll =
- "llvm.loop.vectorize.followup_all";
-static const char *const LLVMLoopVectorizeFollowupVectorized =
+const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
+const char LLVMLoopVectorizeFollowupVectorized[] =
"llvm.loop.vectorize.followup_vectorized";
-static const char *const LLVMLoopVectorizeFollowupEpilogue =
+const char LLVMLoopVectorizeFollowupEpilogue[] =
"llvm.loop.vectorize.followup_epilogue";
/// @}
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
+STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
+
+static cl::opt<bool> EnableEpilogueVectorization(
+ "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
+ cl::desc("Enable vectorization of epilogue loops."));
+
+static cl::opt<unsigned> EpilogueVectorizationForceVF(
+ "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
+ cl::desc("When epilogue vectorization is enabled, and a value greater than "
+ "1 is specified, forces the given VF for all applicable epilogue "
+ "loops."));
+
+static cl::opt<unsigned> EpilogueVectorizationMinVF(
+ "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
+ cl::desc("Only loops with vectorization factor equal to or larger than "
+ "the specified value are considered for epilogue vectorization."));
/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
@@ -178,13 +198,36 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
"value are vectorized only if no scalar iteration overheads "
"are incurred."));
-// Indicates that an epilogue is undesired, predication is preferred.
-// This means that the vectorizer will try to fold the loop-tail (epilogue)
-// into the loop and predicate the loop body accordingly.
-static cl::opt<bool> PreferPredicateOverEpilog(
- "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
- cl::desc("Indicate that an epilogue is undesired, predication should be "
- "used instead."));
+// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
+// that predication is preferred, and this lists all options. I.e., the
+// vectorizer will try to fold the tail-loop (epilogue) into the vector body
+// and predicate the instructions accordingly. If tail-folding fails, there are
+// different fallback strategies depending on these values:
+namespace PreferPredicateTy {
+ enum Option {
+ ScalarEpilogue = 0,
+ PredicateElseScalarEpilogue,
+ PredicateOrDontVectorize
+ };
+} // namespace PreferPredicateTy
+
+static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
+ "prefer-predicate-over-epilogue",
+ cl::init(PreferPredicateTy::ScalarEpilogue),
+ cl::Hidden,
+ cl::desc("Tail-folding and predication preferences over creating a scalar "
+ "epilogue loop."),
+ cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
+ "scalar-epilogue",
+ "Don't tail-predicate loops, create scalar epilogue"),
+ clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
+ "predicate-else-scalar-epilogue",
+ "prefer tail-folding, create scalar epilogue if tail "
+ "folding fails."),
+ clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
+ "predicate-dont-vectorize",
+ "prefers tail-folding, don't attempt vectorization if "
+ "tail-folding fails.")));
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
@@ -196,7 +239,7 @@ static cl::opt<bool> EnableInterleavedMemAccesses(
cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
/// An interleave-group may need masking if it resides in a block that needs
-/// predication, or in order to mask away gaps.
+/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
"enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
@@ -230,6 +273,12 @@ static cl::opt<unsigned> ForceTargetInstructionCost(
"an instruction to a single constant value. Mostly "
"useful for getting consistent testing."));
+static cl::opt<bool> ForceTargetSupportsScalableVectors(
+ "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Pretend that scalable vectors are supported, even if the target does "
+ "not support them. This flag should only be used for testing."));
+
static cl::opt<unsigned> SmallLoopCost(
"small-loop-cost", cl::init(20), cl::Hidden,
cl::desc(
@@ -247,6 +296,12 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
cl::desc(
"Enable runtime interleaving until load/store ports are saturated"));
+/// Interleave small loops with scalar reductions.
+static cl::opt<bool> InterleaveSmallLoopScalarReduction(
+ "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
+ cl::desc("Enable interleaving for loops with small iteration counts that "
+ "contain scalar reductions to expose ILP."));
+
/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
"vectorize-num-stores-pred", cl::init(1), cl::Hidden,
@@ -265,6 +320,17 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC(
cl::desc("The maximum interleave count to use when interleaving a scalar "
"reduction in a nested loop."));
+static cl::opt<bool>
+ PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
+ cl::Hidden,
+ cl::desc("Prefer in-loop vector reductions, "
+ "overriding the targets preference."));
+
+static cl::opt<bool> PreferPredicatedReductionSelect(
+ "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Prefer predicating a reduction operation over an after loop select."));
+
cl::opt<bool> EnableVPlanNativePath(
"enable-vplan-native-path", cl::init(false), cl::Hidden,
cl::desc("Enable VPlan-native vectorization path with "
@@ -307,12 +373,14 @@ static Type *getMemInstValueType(Value *I) {
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
-static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
+static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
// Determine if an array of VF elements of type Ty is "bitcast compatible"
// with a <VF x Ty> vector.
- if (VF > 1) {
- auto *VectorTy = FixedVectorType::get(Ty, VF);
- return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
+ if (VF.isVector()) {
+ auto *VectorTy = VectorType::get(Ty, VF);
+ return TypeSize::get(VF.getKnownMinValue() *
+ DL.getTypeAllocSize(Ty).getFixedValue(),
+ VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
}
// If the vectorization factor is one, we just check if an array of type Ty
@@ -393,29 +461,42 @@ public:
LoopInfo *LI, DominatorTree *DT,
const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI, AssumptionCache *AC,
- OptimizationRemarkEmitter *ORE, unsigned VecWidth,
+ OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
- LoopVectorizationCostModel *CM)
+ LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()),
- VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
+ VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
+ BFI(BFI), PSI(PSI) {
+ // Query this against the original loop and save it here because the profile
+ // of the original loop header may change as the transformation happens.
+ OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
+ OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
+ }
+
virtual ~InnerLoopVectorizer() = default;
- /// Create a new empty loop. Unlink the old loop and connect the new one.
- /// Return the pre-header block of the new loop.
- BasicBlock *createVectorizedLoopSkeleton();
+ /// Create a new empty loop that will contain vectorized instructions later
+ /// on, while the old loop will be used as the scalar remainder. Control flow
+ /// is generated around the vectorized (and scalar epilogue) loops consisting
+ /// of various checks and bypasses. Return the pre-header block of the new
+ /// loop.
+ /// In the case of epilogue vectorization, this function is overriden to
+ /// handle the more complex control flow around the loops.
+ virtual BasicBlock *createVectorizedLoopSkeleton();
/// Widen a single instruction within the innermost loop.
- void widenInstruction(Instruction &I, VPUser &Operands,
+ void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
VPTransformState &State);
/// Widen a single call instruction within the innermost loop.
- void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
+ void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
VPTransformState &State);
/// Widen a single select instruction within the innermost loop.
- void widenSelectInstruction(SelectInst &I, VPUser &Operands,
+ void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
bool InvariantCond, VPTransformState &State);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
@@ -431,14 +512,15 @@ public:
/// Vectorize a single GetElementPtrInst based on information gathered and
/// decisions taken during planning.
- void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
- unsigned VF, bool IsPtrLoopInvariant,
+ void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
+ unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
- void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
+ void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
+ Value *StartV, unsigned UF, ElementCount VF);
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane
@@ -452,7 +534,8 @@ public:
/// Widen an integer or floating-point induction variable \p IV. If \p Trunc
/// is provided, the integer induction variable will first be truncated to
/// the corresponding type.
- void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
+ void widenIntOrFpInduction(PHINode *IV, Value *Start,
+ TruncInst *Trunc = nullptr);
/// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
/// vector or scalar value on-demand if one is not yet available. When
@@ -477,6 +560,10 @@ public:
/// value into a vector.
Value *getOrCreateVectorValue(Value *V, unsigned Part);
+ void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
+ VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
+ }
+
/// Return a value in the new loop corresponding to \p V from the original
/// loop at unroll and vector indices \p Instance. If the value has been
/// vectorized but not scalarized, the necessary extractelement instruction
@@ -491,7 +578,9 @@ public:
/// BlockInMask is non-null. Use \p State to translate given VPValues to IR
/// values in the vectorized loop.
void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
+ ArrayRef<VPValue *> VPDefs,
VPTransformState &State, VPValue *Addr,
+ ArrayRef<VPValue *> StoredValues,
VPValue *BlockInMask = nullptr);
/// Vectorize Load and Store instructions with the base address given in \p
@@ -499,8 +588,8 @@ public:
/// non-null. Use \p State to translate given VPValues to IR values in the
/// vectorized loop.
void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
- VPValue *Addr, VPValue *StoredValue,
- VPValue *BlockInMask);
+ VPValue *Def, VPValue *Addr,
+ VPValue *StoredValue, VPValue *BlockInMask);
/// Set the debug location in the builder using the debug location in
/// the instruction.
@@ -544,10 +633,11 @@ protected:
/// Clear NSW/NUW flags from reduction instructions if necessary.
void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
- /// The Loop exit block may have single value PHI nodes with some
- /// incoming value. While vectorizing we only handled real values
- /// that were defined inside the loop and we should have one value for
- /// each predecessor of its parent basic block. See PR14725.
+ /// Fixup the LCSSA phi nodes in the unique exit block. This simply
+ /// means we need to add the appropriate incoming value from the middle
+ /// block as exiting edges from the scalar epilogue loop (if present) are
+ /// already in place, and we exit the vector loop exclusively to the middle
+ /// block.
void fixLCSSAPHIs();
/// Iteratively sink the scalarized operands of a predicated instruction into
@@ -586,7 +676,8 @@ protected:
/// truncate instruction, instead of widening the original IV, we widen a
/// version of the IV truncated to \p EntryVal's type.
void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
- Value *Step, Instruction *EntryVal);
+ Value *Step, Value *Start,
+ Instruction *EntryVal);
/// Returns true if an instruction \p I should be scalarized instead of
/// vectorized for the chosen vectorization factor.
@@ -654,6 +745,28 @@ protected:
const DataLayout &DL,
const InductionDescriptor &ID) const;
+ /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
+ /// vector loop preheader, middle block and scalar preheader. Also
+ /// allocate a loop object for the new vector loop and return it.
+ Loop *createVectorLoopSkeleton(StringRef Prefix);
+
+ /// Create new phi nodes for the induction variables to resume iteration count
+ /// in the scalar epilogue, from where the vectorized loop left off (given by
+ /// \p VectorTripCount).
+ /// In cases where the loop skeleton is more complicated (eg. epilogue
+ /// vectorization) and the resume values can come from an additional bypass
+ /// block, the \p AdditionalBypass pair provides information about the bypass
+ /// block and the end value on the edge from bypass to this loop.
+ void createInductionResumeValues(
+ Loop *L, Value *VectorTripCount,
+ std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
+
+ /// Complete the loop skeleton by adding debug MDs, creating appropriate
+ /// conditional branches in the middle block, preparing the builder and
+ /// running the verifier. Take in the vector loop \p L as argument, and return
+ /// the preheader of the completed vector loop.
+ BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
+
/// Add additional metadata to \p To that was not present on \p Orig.
///
/// Currently this is used to add the noalias annotations based on the
@@ -672,6 +785,11 @@ protected:
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
+ /// Allow subclasses to override and print debug traces before/after vplan
+ /// execution, when trace information is requested.
+ virtual void printDebugTracesAtStart(){};
+ virtual void printDebugTracesAtEnd(){};
+
/// The original loop.
Loop *OrigLoop;
@@ -710,7 +828,7 @@ protected:
/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
- unsigned VF;
+ ElementCount VF;
/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
@@ -730,7 +848,8 @@ protected:
/// Middle Block between the vector and the scalar.
BasicBlock *LoopMiddleBlock;
- /// The ExitBlock of the scalar loop.
+ /// The (unique) ExitBlock of the scalar loop. Note that
+ /// there can be multiple exiting edges reaching this block.
BasicBlock *LoopExitBlock;
/// The vector loop body.
@@ -779,6 +898,14 @@ protected:
// Vector of original scalar PHIs whose corresponding widened PHIs need to be
// fixed up at the end of vector code generation.
SmallVector<PHINode *, 8> OrigPHIsToFix;
+
+ /// BFI and PSI are used to check for profile guided size optimizations.
+ BlockFrequencyInfo *BFI;
+ ProfileSummaryInfo *PSI;
+
+ // Whether this loop should be optimized for size based on profile guided size
+ // optimizations.
+ bool OptForSizeBasedOnProfile;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -789,9 +916,11 @@ public:
const TargetTransformInfo *TTI, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
LoopVectorizationLegality *LVL,
- LoopVectorizationCostModel *CM)
- : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
- UnrollFactor, LVL, CM) {}
+ LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI)
+ : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+ ElementCount::getFixed(1), UnrollFactor, LVL, CM,
+ BFI, PSI) {}
private:
Value *getBroadcastInstrs(Value *V) override;
@@ -801,6 +930,128 @@ private:
Value *reverseVector(Value *Vec) override;
};
+/// Encapsulate information regarding vectorization of a loop and its epilogue.
+/// This information is meant to be updated and used across two stages of
+/// epilogue vectorization.
+struct EpilogueLoopVectorizationInfo {
+ ElementCount MainLoopVF = ElementCount::getFixed(0);
+ unsigned MainLoopUF = 0;
+ ElementCount EpilogueVF = ElementCount::getFixed(0);
+ unsigned EpilogueUF = 0;
+ BasicBlock *MainLoopIterationCountCheck = nullptr;
+ BasicBlock *EpilogueIterationCountCheck = nullptr;
+ BasicBlock *SCEVSafetyCheck = nullptr;
+ BasicBlock *MemSafetyCheck = nullptr;
+ Value *TripCount = nullptr;
+ Value *VectorTripCount = nullptr;
+
+ EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
+ unsigned EUF)
+ : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
+ EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
+ assert(EUF == 1 &&
+ "A high UF for the epilogue loop is likely not beneficial.");
+ }
+};
+
+/// An extension of the inner loop vectorizer that creates a skeleton for a
+/// vectorized loop that has its epilogue (residual) also vectorized.
+/// The idea is to run the vplan on a given loop twice, firstly to setup the
+/// skeleton and vectorize the main loop, and secondly to complete the skeleton
+/// from the first step and vectorize the epilogue. This is achieved by
+/// deriving two concrete strategy classes from this base class and invoking
+/// them in succession from the loop vectorizer planner.
+class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
+public:
+ InnerLoopAndEpilogueVectorizer(
+ Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
+ DominatorTree *DT, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
+ LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
+ : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+ EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
+ EPI(EPI) {}
+
+ // Override this function to handle the more complex control flow around the
+ // three loops.
+ BasicBlock *createVectorizedLoopSkeleton() final override {
+ return createEpilogueVectorizedLoopSkeleton();
+ }
+
+ /// The interface for creating a vectorized skeleton using one of two
+ /// different strategies, each corresponding to one execution of the vplan
+ /// as described above.
+ virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
+
+ /// Holds and updates state information required to vectorize the main loop
+ /// and its epilogue in two separate passes. This setup helps us avoid
+ /// regenerating and recomputing runtime safety checks. It also helps us to
+ /// shorten the iteration-count-check path length for the cases where the
+ /// iteration count of the loop is so small that the main vector loop is
+ /// completely skipped.
+ EpilogueLoopVectorizationInfo &EPI;
+};
+
+/// A specialized derived class of inner loop vectorizer that performs
+/// vectorization of *main* loops in the process of vectorizing loops and their
+/// epilogues.
+class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
+public:
+ EpilogueVectorizerMainLoop(
+ Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
+ DominatorTree *DT, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
+ LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
+ : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+ EPI, LVL, CM, BFI, PSI) {}
+ /// Implements the interface for creating a vectorized skeleton using the
+ /// *main loop* strategy (ie the first pass of vplan execution).
+ BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
+
+protected:
+ /// Emits an iteration count bypass check once for the main loop (when \p
+ /// ForEpilogue is false) and once for the epilogue loop (when \p
+ /// ForEpilogue is true).
+ BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
+ bool ForEpilogue);
+ void printDebugTracesAtStart() override;
+ void printDebugTracesAtEnd() override;
+};
+
+// A specialized derived class of inner loop vectorizer that performs
+// vectorization of *epilogue* loops in the process of vectorizing loops and
+// their epilogues.
+class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
+public:
+ EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE,
+ EpilogueLoopVectorizationInfo &EPI,
+ LoopVectorizationLegality *LVL,
+ llvm::LoopVectorizationCostModel *CM,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
+ : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+ EPI, LVL, CM, BFI, PSI) {}
+ /// Implements the interface for creating a vectorized skeleton using the
+ /// *epilogue loop* strategy (ie the second pass of vplan execution).
+ BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
+
+protected:
+ /// Emits an iteration count bypass check after the main vector loop has
+ /// finished to see if there are any iterations left to execute by either
+ /// the vector epilogue or the scalar epilogue.
+ BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
+ BasicBlock *Bypass,
+ BasicBlock *Insert);
+ void printDebugTracesAtStart() override;
+ void printDebugTracesAtEnd() override;
+};
} // end namespace llvm
/// Look for a meaningful debug location on the instruction or it's
@@ -827,7 +1078,9 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr)
const DILocation *DIL = Inst->getDebugLoc();
if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
!isa<DbgInfoIntrinsic>(Inst)) {
- auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ auto NewDIL =
+ DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
if (NewDIL)
B.SetCurrentDebugLocation(NewDIL.getValue());
else
@@ -881,6 +1134,15 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
return R;
}
+/// Return a value for Step multiplied by VF.
+static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
+ assert(isa<ConstantInt>(Step) && "Expected an integer step");
+ Constant *StepVal = ConstantInt::get(
+ Step->getType(),
+ cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
+ return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
+}
+
namespace llvm {
void reportVectorizationFailure(const StringRef DebugMsg,
@@ -952,7 +1214,10 @@ enum ScalarEpilogueLowering {
CM_ScalarEpilogueNotAllowedLowTripLoop,
// Loop hint predicate indicating an epilogue is undesired.
- CM_ScalarEpilogueNotNeededUsePredicate
+ CM_ScalarEpilogueNotNeededUsePredicate,
+
+ // Directive indicating we must either tail fold or not vectorize
+ CM_ScalarEpilogueNotAllowedUsePredicate
};
/// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -979,7 +1244,7 @@ public:
/// \return An upper bound for the vectorization factor, or None if
/// vectorization and interleaving should be avoided up front.
- Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);
+ Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
/// \return True if runtime checks are required for vectorization, and false
/// otherwise.
@@ -989,10 +1254,13 @@ public:
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
/// possible.
- VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
+ VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
+ VectorizationFactor
+ selectEpilogueVectorizationFactor(const ElementCount MaxVF,
+ const LoopVectorizationPlanner &LVP);
/// Setup cost-based decisions for user vectorization factor.
- void selectUserVectorizationFactor(unsigned UserVF) {
+ void selectUserVectorizationFactor(ElementCount UserVF) {
collectUniformsAndScalars(UserVF);
collectInstsToScalarize(UserVF);
}
@@ -1006,7 +1274,7 @@ public:
/// If interleave count has been specified by metadata it will be returned.
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
/// are the selected vectorization factor and the cost of the selected VF.
- unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
+ unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
/// Memory access instruction may be vectorized in more than one way.
/// Form of instruction after vectorization depends on cost.
@@ -1015,7 +1283,7 @@ public:
/// the lists of loop-uniform and loop-scalar instructions.
/// The calculated cost is saved with widening decision in order to
/// avoid redundant calculations.
- void setCostBasedWideningDecision(unsigned VF);
+ void setCostBasedWideningDecision(ElementCount VF);
/// A struct that represents some properties of the register usage
/// of a loop.
@@ -1030,11 +1298,16 @@ public:
/// \return Returns information about the register usages of the loop for the
/// given vectorization factors.
- SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
+ SmallVector<RegisterUsage, 8>
+ calculateRegisterUsage(ArrayRef<ElementCount> VFs);
/// Collect values we want to ignore in the cost model.
void collectValuesToIgnore();
+ /// Split reductions into those that happen in the loop, and those that happen
+ /// outside. In loop reductions are collected into InLoopReductionChains.
+ void collectInLoopReductions();
+
/// \returns The smallest bitwidth each instruction can be represented with.
/// The vector equivalents of these instructions should be truncated to this
/// type.
@@ -1044,8 +1317,9 @@ public:
/// \returns True if it is more profitable to scalarize instruction \p I for
/// vectorization factor \p VF.
- bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
- assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
+ bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
+ assert(VF.isVector() &&
+ "Profitable to scalarize relevant only for VF > 1.");
// Cost model is not run in the VPlan-native path - return conservative
// result until this changes.
@@ -1059,8 +1333,8 @@ public:
}
/// Returns true if \p I is known to be uniform after vectorization.
- bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
- if (VF == 1)
+ bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
+ if (VF.isScalar())
return true;
// Cost model is not run in the VPlan-native path - return conservative
@@ -1075,8 +1349,8 @@ public:
}
/// Returns true if \p I is known to be scalar after vectorization.
- bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
- if (VF == 1)
+ bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
+ if (VF.isScalar())
return true;
// Cost model is not run in the VPlan-native path - return conservative
@@ -1092,8 +1366,8 @@ public:
/// \returns True if instruction \p I can be truncated to a smaller bitwidth
/// for vectorization factor \p VF.
- bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
- return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
+ bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
+ return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
!isProfitableToScalarize(I, VF) &&
!isScalarAfterVectorization(I, VF);
}
@@ -1110,17 +1384,18 @@ public:
/// Save vectorization decision \p W and \p Cost taken by the cost model for
/// instruction \p I and vector width \p VF.
- void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
- unsigned Cost) {
- assert(VF >= 2 && "Expected VF >=2");
+ void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
+ InstructionCost Cost) {
+ assert(VF.isVector() && "Expected VF >=2");
WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
}
/// Save vectorization decision \p W and \p Cost taken by the cost model for
/// interleaving group \p Grp and vector width \p VF.
- void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
- InstWidening W, unsigned Cost) {
- assert(VF >= 2 && "Expected VF >=2");
+ void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
+ ElementCount VF, InstWidening W,
+ InstructionCost Cost) {
+ assert(VF.isVector() && "Expected VF >=2");
/// Broadcast this decicion to all instructions inside the group.
/// But the cost will be assigned to one instruction only.
for (unsigned i = 0; i < Grp->getFactor(); ++i) {
@@ -1136,15 +1411,14 @@ public:
/// Return the cost model decision for the given instruction \p I and vector
/// width \p VF. Return CM_Unknown if this instruction did not pass
/// through the cost modeling.
- InstWidening getWideningDecision(Instruction *I, unsigned VF) {
- assert(VF >= 2 && "Expected VF >=2");
-
+ InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
+ assert(VF.isVector() && "Expected VF to be a vector VF");
// Cost model is not run in the VPlan-native path - return conservative
// result until this changes.
if (EnableVPlanNativePath)
return CM_GatherScatter;
- std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+ std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
auto Itr = WideningDecisions.find(InstOnVF);
if (Itr == WideningDecisions.end())
return CM_Unknown;
@@ -1153,9 +1427,9 @@ public:
/// Return the vectorization cost for the given instruction \p I and vector
/// width \p VF.
- unsigned getWideningCost(Instruction *I, unsigned VF) {
- assert(VF >= 2 && "Expected VF >=2");
- std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+ InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
+ assert(VF.isVector() && "Expected VF >=2");
+ std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
"The cost is not calculated");
return WideningDecisions[InstOnVF].second;
@@ -1164,7 +1438,7 @@ public:
/// Return True if instruction \p I is an optimizable truncate whose operand
/// is an induction variable. Such a truncate will be removed by adding a new
/// induction variable with the destination type.
- bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
+ bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
// If the instruction is not a truncate, return false.
auto *Trunc = dyn_cast<TruncInst>(I);
if (!Trunc)
@@ -1189,14 +1463,14 @@ public:
/// Collects the instructions to scalarize for each predicated instruction in
/// the loop.
- void collectInstsToScalarize(unsigned VF);
+ void collectInstsToScalarize(ElementCount VF);
/// Collect Uniform and Scalar values for the given \p VF.
/// The sets depend on CM decision for Load/Store instructions
/// that may be vectorized as interleave, gather-scatter or scalarized.
- void collectUniformsAndScalars(unsigned VF) {
+ void collectUniformsAndScalars(ElementCount VF) {
// Do the analysis once.
- if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
+ if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
return;
setCostBasedWideningDecision(VF);
collectLoopUniforms(VF);
@@ -1247,7 +1521,8 @@ public:
/// instructions that may divide by zero.
/// If a non-zero VF has been calculated, we check if I will be scalarized
/// predication for that VF.
- bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
+ bool isScalarWithPredication(Instruction *I,
+ ElementCount VF = ElementCount::getFixed(1));
// Returns true if \p I is an instruction that will be predicated either
// through scalar predication or masked load/store or masked gather/scatter.
@@ -1264,12 +1539,16 @@ public:
/// Returns true if \p I is a memory instruction with consecutive memory
/// access that can be widened.
- bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+ bool
+ memoryInstructionCanBeWidened(Instruction *I,
+ ElementCount VF = ElementCount::getFixed(1));
/// Returns true if \p I is a memory instruction in an interleaved-group
/// of memory accesses that can be vectorized with wide vector loads/stores
/// and shuffles.
- bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
+ bool
+ interleavedAccessCanBeWidened(Instruction *I,
+ ElementCount VF = ElementCount::getFixed(1));
/// Check if \p Instr belongs to any interleaved access group.
bool isAccessInterleaved(Instruction *Instr) {
@@ -1282,11 +1561,16 @@ public:
return InterleaveInfo.getInterleaveGroup(Instr);
}
- /// Returns true if an interleaved group requires a scalar iteration
- /// to handle accesses with gaps, and there is nothing preventing us from
- /// creating a scalar epilogue.
+ /// Returns true if we're required to use a scalar epilogue for at least
+ /// the final iteration of the original loop.
bool requiresScalarEpilogue() const {
- return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
+ if (!isScalarEpilogueAllowed())
+ return false;
+ // If we might exit from anywhere but the latch, must run the exiting
+ // iteration in scalar form.
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
+ return true;
+ return InterleaveInfo.requiresScalarEpilogue();
}
/// Returns true if a scalar epilogue is not allowed due to optsize or a
@@ -1302,17 +1586,34 @@ public:
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}
+ /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
+ /// nodes to the chain of instructions representing the reductions. Uses a
+ /// MapVector to ensure deterministic iteration order.
+ using ReductionChainMap =
+ SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
+
+ /// Return the chain of instructions representing an inloop reduction.
+ const ReductionChainMap &getInLoopReductionChains() const {
+ return InLoopReductionChains;
+ }
+
+ /// Returns true if the Phi is part of an inloop reduction.
+ bool isInLoopReduction(PHINode *Phi) const {
+ return InLoopReductionChains.count(Phi);
+ }
+
/// Estimate cost of an intrinsic call instruction CI if it were vectorized
/// with factor VF. Return the cost of the instruction, including
/// scalarization overhead if it's needed.
- unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
+ InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
/// Estimate cost of a call instruction CI if it were vectorized with factor
/// VF. Return the cost of the instruction, including scalarization overhead
/// if it's needed. The flag NeedToScalarize shows if the call needs to be
/// scalarized -
/// i.e. either vector version isn't available, or is too expensive.
- unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
+ InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
+ bool &NeedToScalarize);
/// Invalidates decisions already taken by the cost model.
void invalidateCostModelingDecisions() {
@@ -1327,7 +1628,8 @@ private:
/// \return An upper bound for the vectorization factor, a power-of-2 larger
/// than zero. One is returned if vectorization should best be avoided due
/// to cost.
- unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
+ ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
+ ElementCount UserVF);
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
@@ -1336,47 +1638,54 @@ private:
/// is
/// false, then all operations will be scalarized (i.e. no vectorization has
/// actually taken place).
- using VectorizationCostTy = std::pair<unsigned, bool>;
+ using VectorizationCostTy = std::pair<InstructionCost, bool>;
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width.
- VectorizationCostTy expectedCost(unsigned VF);
+ VectorizationCostTy expectedCost(ElementCount VF);
/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
- VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
+ VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
/// The cost-computation logic from getInstructionCost which provides
/// the vector type as an output parameter.
- unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
+ InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
+ Type *&VectorTy);
+
+ /// Return the cost of instructions in an inloop reduction pattern, if I is
+ /// part of that pattern.
+ InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
+ Type *VectorTy,
+ TTI::TargetCostKind CostKind);
/// Calculate vectorization cost of memory instruction \p I.
- unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
+ InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
/// The cost computation for scalarized memory instruction.
- unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
+ InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
/// The cost computation for interleaving group of memory instructions.
- unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
+ InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
/// The cost computation for Gather/Scatter instruction.
- unsigned getGatherScatterCost(Instruction *I, unsigned VF);
+ InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
/// The cost computation for widening instruction \p I with consecutive
/// memory access.
- unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
+ InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
/// The cost calculation for Load/Store instruction \p I with uniform pointer -
/// Load: scalar load + broadcast.
/// Store: scalar store + (loop invariant value stored? 0 : extract of last
/// element)
- unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
+ InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
- unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
+ InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);
/// Returns whether the instruction is a load or store and will be a emitted
/// as a vector operation.
@@ -1394,7 +1703,7 @@ private:
/// A type representing the costs for instructions if they were to be
/// scalarized rather than vectorized. The entries are Instruction-Cost
/// pairs.
- using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
+ using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
/// A set containing all BasicBlocks that are known to present after
/// vectorization as a predicated block.
@@ -1416,19 +1725,30 @@ private:
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
/// vectorization factor. The entries are VF-ScalarCostTy pairs.
- DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
+ DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
/// Holds the instructions known to be uniform after vectorization.
/// The data is collected per VF.
- DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
+ DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
/// Holds the instructions known to be scalar after vectorization.
/// The data is collected per VF.
- DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
+ DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
/// Holds the instructions (address computations) that are forced to be
/// scalarized.
- DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
+ DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
+
+ /// PHINodes of the reductions that should be expanded in-loop along with
+ /// their associated chains of reduction operations, in program order from top
+ /// (PHI) to bottom
+ ReductionChainMap InLoopReductionChains;
+
+ /// A Map of inloop reduction operations and their immediate chain operand.
+ /// FIXME: This can be removed once reductions can be costed correctly in
+ /// vplan. This was added to allow quick lookup to the inloop operations,
+ /// without having to loop through InLoopReductionChains.
+ DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
/// Returns the expected difference in cost from scalarizing the expression
/// feeding a predicated instruction \p PredInst. The instructions to
@@ -1436,7 +1756,7 @@ private:
/// non-negative return value implies the expression will be scalarized.
/// Currently, only single-use chains are considered for scalarization.
int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
- unsigned VF);
+ ElementCount VF);
/// Collect the instructions that are uniform after vectorization. An
/// instruction is uniform if we represent it with a single scalar value in
@@ -1447,27 +1767,28 @@ private:
/// scalarized instruction will be represented by VF scalar values in the
/// vectorized loop, each corresponding to an iteration of the original
/// scalar loop.
- void collectLoopUniforms(unsigned VF);
+ void collectLoopUniforms(ElementCount VF);
/// Collect the instructions that are scalar after vectorization. An
/// instruction is scalar if it is known to be uniform or will be scalarized
/// during vectorization. Non-uniform scalarized instructions will be
/// represented by VF values in the vectorized loop, each corresponding to an
/// iteration of the original scalar loop.
- void collectLoopScalars(unsigned VF);
+ void collectLoopScalars(ElementCount VF);
/// Keeps cost model vectorization decision and cost for instructions.
/// Right now it is used for memory instructions only.
- using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
- std::pair<InstWidening, unsigned>>;
+ using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
+ std::pair<InstWidening, InstructionCost>>;
DecisionList WideningDecisions;
/// Returns true if \p V is expected to be vectorized and it needs to be
/// extracted.
- bool needsExtract(Value *V, unsigned VF) const {
+ bool needsExtract(Value *V, ElementCount VF) const {
Instruction *I = dyn_cast<Instruction>(V);
- if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
+ if (VF.isScalar() || !I || !TheLoop->contains(I) ||
+ TheLoop->isLoopInvariant(I))
return false;
// Assume we can vectorize V (and hence we need extraction) if the
@@ -1482,11 +1803,21 @@ private:
/// Returns a range containing only operands needing to be extracted.
SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
- unsigned VF) {
+ ElementCount VF) {
return SmallVector<Value *, 4>(make_filter_range(
Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
}
+ /// Determines if we have the infrastructure to vectorize loop \p L and its
+ /// epilogue, assuming the main loop is vectorized by \p VF.
+ bool isCandidateForEpilogueVectorization(const Loop &L,
+ const ElementCount VF) const;
+
+ /// Returns true if epilogue vectorization is considered profitable, and
+ /// false otherwise.
+ /// \p VF is the vectorization factor chosen for the original loop.
+ bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
+
public:
/// The loop that we evaluate.
Loop *TheLoop;
@@ -1529,6 +1860,9 @@ public:
/// Values to ignore in the cost model when VF > 1.
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+
+ /// Profitable vector factors.
+ SmallVector<VectorizationFactor, 8> ProfitableVFs;
};
} // end namespace llvm
@@ -1549,7 +1883,7 @@ public:
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
OptimizationRemarkEmitter *ORE) {
- assert(!OuterLp->empty() && "This is not an outer loop");
+ assert(!OuterLp->isInnermost() && "This is not an outer loop");
LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
// Only outer loops with an explicit vectorization hint are supported.
@@ -1582,7 +1916,7 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI,
// now, only collect outer loops that have explicit vectorization hints. If we
// are stress testing the VPlan H-CFG construction, we collect the outermost
// loop of every loop nest.
- if (L.empty() || VPlanBuildStressTest ||
+ if (L.isInnermost() || VPlanBuildStressTest ||
(EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
LoopBlocksRPO RPOT(&L);
RPOT.perform(LI);
@@ -1696,10 +2030,10 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
}
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
- const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
+ const InductionDescriptor &II, Value *Step, Value *Start,
+ Instruction *EntryVal) {
assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
"Expected either an induction phi-node or a truncate of it!");
- Value *Start = II.getStartValue();
// Construct the initial value of the vector IV in the vector loop preheader
auto CurrIP = Builder.saveIP();
@@ -1729,7 +2063,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// Multiply the vectorization factor by the step using integer or
// floating-point arithmetic as appropriate.
- Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
+ Value *ConstVF =
+ getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
// Create a vector splat to use in the induction update.
@@ -1737,10 +2072,10 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// FIXME: If the step is non-constant, we create the vector splat with
// IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
// handle a constant vector splat.
- Value *SplatVF =
- isa<Constant>(Mul)
- ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
- : Builder.CreateVectorSplat(VF, Mul);
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ Value *SplatVF = isa<Constant>(Mul)
+ ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
+ : Builder.CreateVectorSplat(VF, Mul);
Builder.restoreIP(CurrIP);
// We may need to add the step a number of times, depending on the unroll
@@ -1816,7 +2151,8 @@ void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
}
-void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
+void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
+ TruncInst *Trunc) {
assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
"Primary induction variable must have an integer type");
@@ -1874,8 +2210,10 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part) {
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
Value *EntryPart =
- getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
+ getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
+ ID.getInductionOpcode());
VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
if (Trunc)
addMetadata(EntryPart, Trunc);
@@ -1885,7 +2223,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
// Now do the actual transformations, and start with creating the step value.
Value *Step = CreateStepValue(ID.getStep());
- if (VF <= 1) {
+ if (VF.isZero() || VF.isScalar()) {
Value *ScalarIV = CreateScalarIV(Step);
CreateSplatIV(ScalarIV, Step);
return;
@@ -1896,7 +2234,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
// least one user in the loop that is not widened.
auto NeedsScalarIV = needsScalarInduction(EntryVal);
if (!NeedsScalarIV) {
- createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
+ createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
return;
}
@@ -1904,7 +2242,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
if (!shouldScalarizeInstruction(EntryVal)) {
- createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
+ createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
Value *ScalarIV = CreateScalarIV(Step);
// Create scalar steps that can be used by instructions we will later
// scalarize. Note that the addition of the scalar steps will not increase
@@ -1926,7 +2264,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
Instruction::BinaryOps BinOp) {
// Create and check the types.
- auto *ValVTy = cast<VectorType>(Val->getType());
+ auto *ValVTy = cast<FixedVectorType>(Val->getType());
int VLen = ValVTy->getNumElements();
Type *STy = Val->getType()->getScalarType();
@@ -1983,8 +2321,7 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
Instruction *EntryVal,
const InductionDescriptor &ID) {
// We shouldn't have to build scalar steps if we aren't vectorizing.
- assert(VF > 1 && "VF should be greater than one");
-
+ assert(VF.isVector() && "VF should be greater than one");
// Get the value type and ensure it and the step have the same integer type.
Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
assert(ScalarIVTy == Step->getType() &&
@@ -2006,12 +2343,27 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
// iteration. If EntryVal is uniform, we only need to generate the first
// lane. Otherwise, we generate all VF values.
unsigned Lanes =
- Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
- : VF;
+ Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
+ ? 1
+ : VF.getKnownMinValue();
+ assert((!VF.isScalable() || Lanes == 1) &&
+ "Should never scalarize a scalable vector");
// Compute the scalar steps and save the results in VectorLoopValueMap.
for (unsigned Part = 0; Part < UF; ++Part) {
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
- auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
+ auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
+ ScalarIVTy->getScalarSizeInBits());
+ Value *StartIdx =
+ createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+ if (ScalarIVTy->isFloatingPointTy())
+ StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
+ StartIdx = addFastMathFlag(Builder.CreateBinOp(
+ AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
+ // The step returned by `createStepForVF` is a runtime-evaluated value
+ // when VF is scalable. Otherwise, it should be folded into a Constant.
+ assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
+ "Expected StartIdx to be folded to a constant when VF is not "
+ "scalable");
auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
@@ -2045,7 +2397,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
// If we aren't vectorizing, we can just copy the scalar map values over to
// the vector map.
- if (VF == 1) {
+ if (VF.isScalar()) {
VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
return ScalarValue;
}
@@ -2054,7 +2406,11 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
// is known to be uniform after vectorization, this corresponds to lane zero
// of the Part unroll iteration. Otherwise, the last instruction is the one
// we created for the last vector lane of the Part unroll iteration.
- unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
+ unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
+ ? 0
+ : VF.getKnownMinValue() - 1;
+ assert((!VF.isScalable() || LastLane == 0) &&
+ "Scalable vectorization can't lead to any scalarized values.");
auto *LastInst = cast<Instruction>(
VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
@@ -2075,10 +2431,11 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
VectorValue = getBroadcastInstrs(ScalarValue);
VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
} else {
- // Initialize packing with insertelements to start from undef.
- Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF));
- VectorLoopValueMap.setVectorValue(V, Part, Undef);
- for (unsigned Lane = 0; Lane < VF; ++Lane)
+ // Initialize packing with insertelements to start from poison.
+ assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+ Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF));
+ VectorLoopValueMap.setVectorValue(V, Part, Poison);
+ for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
packScalarIntoVectorValue(V, {Part, Lane});
VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
}
@@ -2117,7 +2474,7 @@ InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
// extractelement instruction.
auto *U = getOrCreateVectorValue(V, Instance.Part);
if (!U->getType()->isVectorTy()) {
- assert(VF == 1 && "Value not scalarized has non-vector type");
+ assert(VF.isScalar() && "Value not scalarized has non-vector type");
return U;
}
@@ -2142,12 +2499,12 @@ void InnerLoopVectorizer::packScalarIntoVectorValue(
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
assert(Vec->getType()->isVectorTy() && "Invalid type");
+ assert(!VF.isScalable() && "Cannot reverse scalable vectors");
SmallVector<int, 8> ShuffleMask;
- for (unsigned i = 0; i < VF; ++i)
- ShuffleMask.push_back(VF - i - 1);
+ for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
+ ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
- return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
- ShuffleMask, "reverse");
+ return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
}
// Return whether we allow using masked interleave-groups (for dealing with
@@ -2172,9 +2529,9 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
// }
// To:
// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
-// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
-// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
-// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
+// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
+// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
+// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
//
// Or translate following interleaved store group (factor = 3):
// for (i = 0; i < N; i+=3) {
@@ -2185,20 +2542,22 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
// }
// To:
// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
-// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
+// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(
- const InterleaveGroup<Instruction> *Group, VPTransformState &State,
- VPValue *Addr, VPValue *BlockInMask) {
+ const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
+ VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
+ VPValue *BlockInMask) {
Instruction *Instr = Group->getInsertPos();
const DataLayout &DL = Instr->getModule()->getDataLayout();
// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = getMemInstValueType(Instr);
unsigned InterleaveFactor = Group->getFactor();
- auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF);
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
// Prepare for the new pointers.
SmallVector<Value *, 2> AddrParts;
@@ -2214,8 +2573,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
// pointer operand of the interleaved access is supposed to be uniform. For
// uniform instructions, we're only required to generate a value for the
// first vector lane in each unroll iteration.
+ assert(!VF.isScalable() &&
+ "scalable vector reverse operation is not implemented");
if (Group->isReverse())
- Index += (VF - 1) * Group->getFactor();
+ Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
for (unsigned Part = 0; Part < UF; Part++) {
Value *AddrPart = State.get(Addr, {Part, 0});
@@ -2246,11 +2607,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
setDebugLocFromInst(Builder, Instr);
- Value *UndefVec = UndefValue::get(VecTy);
+ Value *PoisonVec = PoisonValue::get(VecTy);
Value *MaskForGaps = nullptr;
if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
- MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
assert(MaskForGaps && "Mask for Gaps is required but it is null");
}
@@ -2266,10 +2628,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
Value *GroupMask = MaskForGaps;
if (BlockInMask) {
Value *BlockInMaskPart = State.get(BlockInMask, Part);
- auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart, Undefs,
- createReplicatedMask(InterleaveFactor, VF), "interleaved.mask");
+ BlockInMaskPart,
+ createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
+ "interleaved.mask");
GroupMask = MaskForGaps
? Builder.CreateBinOp(Instruction::And, ShuffledMask,
MaskForGaps)
@@ -2277,7 +2640,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
NewLoad =
Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
- GroupMask, UndefVec, "wide.masked.vec");
+ GroupMask, PoisonVec, "wide.masked.vec");
}
else
NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
@@ -2288,6 +2651,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
// For each member in the group, shuffle out the appropriate data from the
// wide loads.
+ unsigned J = 0;
for (unsigned I = 0; I < InterleaveFactor; ++I) {
Instruction *Member = Group->getMember(I);
@@ -2295,28 +2659,33 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
if (!Member)
continue;
- auto StrideMask = createStrideMask(I, InterleaveFactor, VF);
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ auto StrideMask =
+ createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
for (unsigned Part = 0; Part < UF; Part++) {
Value *StridedVec = Builder.CreateShuffleVector(
- NewLoads[Part], UndefVec, StrideMask, "strided.vec");
+ NewLoads[Part], StrideMask, "strided.vec");
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
- VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF);
+ assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+ VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
}
if (Group->isReverse())
StridedVec = reverseVector(StridedVec);
- VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
+ State.set(VPDefs[J], Member, StridedVec, Part);
}
+ ++J;
}
return;
}
// The sub vector type for current instruction.
- auto *SubVT = FixedVectorType::get(ScalarTy, VF);
+ assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+ auto *SubVT = VectorType::get(ScalarTy, VF);
// Vectorize the interleaved store group.
for (unsigned Part = 0; Part < UF; Part++) {
@@ -2324,11 +2693,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
SmallVector<Value *, 4> StoredVecs;
for (unsigned i = 0; i < InterleaveFactor; i++) {
// Interleaved store group doesn't allow a gap, so each index has a member
- Instruction *Member = Group->getMember(i);
- assert(Member && "Fail to get a member from an interleaved store group");
+ assert(Group->getMember(i) && "Fail to get a member from an interleaved store group");
+
+ Value *StoredVec = State.get(StoredValues[i], Part);
- Value *StoredVec = getOrCreateVectorValue(
- cast<StoreInst>(Member)->getValueOperand(), Part);
if (Group->isReverse())
StoredVec = reverseVector(StoredVec);
@@ -2344,16 +2712,17 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
Value *WideVec = concatenateVectors(Builder, StoredVecs);
// Interleave the elements in the wide vector.
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
Value *IVec = Builder.CreateShuffleVector(
- WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor),
+ WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
"interleaved.vec");
Instruction *NewStoreInstr;
if (BlockInMask) {
Value *BlockInMaskPart = State.get(BlockInMask, Part);
- auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF),
+ BlockInMaskPart,
+ createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
"interleaved.mask");
NewStoreInstr = Builder.CreateMaskedStore(
IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
@@ -2366,11 +2735,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
}
-void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
- VPTransformState &State,
- VPValue *Addr,
- VPValue *StoredValue,
- VPValue *BlockInMask) {
+void InnerLoopVectorizer::vectorizeMemoryInstruction(
+ Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
+ VPValue *StoredValue, VPValue *BlockInMask) {
// Attempt to issue a wide load.
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);
@@ -2387,7 +2754,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
"CM decision is not to widen the memory instruction");
Type *ScalarDataTy = getMemInstValueType(Instr);
- auto *DataTy = FixedVectorType::get(ScalarDataTy, VF);
+
+ auto *DataTy = VectorType::get(ScalarDataTy, VF);
const Align Alignment = getLoadStoreAlignment(Instr);
// Determine if the pointer operand of the access is either consecutive or
@@ -2419,19 +2787,23 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
InBounds = gep->isInBounds();
if (Reverse) {
+ assert(!VF.isScalable() &&
+ "Reversing vectors is not yet supported for scalable vectors.");
+
// If the address is consecutive but reversed, then the
// wide store needs to start at the last vector element.
- PartPtr = cast<GetElementPtrInst>(
- Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
+ PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
+ ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
PartPtr->setIsInBounds(InBounds);
- PartPtr = cast<GetElementPtrInst>(
- Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
+ PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
+ ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
PartPtr->setIsInBounds(InBounds);
if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
} else {
+ Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
PartPtr = cast<GetElementPtrInst>(
- Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
+ Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
PartPtr->setIsInBounds(InBounds);
}
@@ -2486,7 +2858,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
if (isMaskRequired)
NewLI = Builder.CreateMaskedLoad(
- VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
+ VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
"wide.masked.load");
else
NewLI =
@@ -2497,7 +2869,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
if (Reverse)
NewLI = reverseVector(NewLI);
}
- VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
+
+ State.set(Def, Instr, NewLI, Part);
}
}
@@ -2507,6 +2880,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
VPTransformState &State) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+ // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
+ // the first lane and part.
+ if (isa<NoAliasScopeDeclInst>(Instr))
+ if (Instance.Lane != 0 || Instance.Part != 0)
+ return;
+
setDebugLocFromInst(Builder, Instr);
// Does this instruction return a value ?
@@ -2519,7 +2898,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
- auto *NewOp = State.get(User.getOperand(op), Instance);
+ auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
+ auto InputInstance = Instance;
+ if (!Operand || !OrigLoop->contains(Operand) ||
+ (Cost->isUniformAfterVectorization(Operand, State.VF)))
+ InputInstance.Lane = 0;
+ auto *NewOp = State.get(User.getOperand(op), InputInstance);
Cloned->setOperand(op, NewOp);
}
addNewMetadata(Cloned, Instr);
@@ -2527,7 +2911,9 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
// Place the cloned scalar in the new loop.
Builder.Insert(Cloned);
- // Add the cloned scalar to the scalar map entry.
+  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
+ // representing scalar values in VPTransformState. Add the cloned scalar to
+ // the scalar map entry.
VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
// If we just cloned a new assumption, add it the assumption cache.
@@ -2564,7 +2950,7 @@ PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
Induction->addIncoming(Next, Latch);
// Create the compare.
Value *ICmp = Builder.CreateICmpEQ(Next, End);
- Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
+ Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
// Now we have two terminators. Remove the old one from the block.
Latch->getTerminator()->eraseFromParent();
@@ -2581,7 +2967,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
// Find the loop boundaries.
ScalarEvolution *SE = PSE.getSE();
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
- assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
+ assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
"Invalid loop count");
Type *IdxTy = Legal->getWidestInductionType();
@@ -2627,7 +3013,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
Type *Ty = TC->getType();
- Constant *Step = ConstantInt::get(Ty, VF * UF);
+ // This is where we can make the step a runtime constant.
+ Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
// If the tail is to be folded by masking, round the number of iterations N
// up to a multiple of Step instead of rounding down. This is done by first
@@ -2636,9 +3023,12 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
// that it starts at zero and its Step is a power of two; the loop will then
// exit, with the last early-exit vector comparison also producing all-true.
if (Cost->foldTailByMasking()) {
- assert(isPowerOf2_32(VF * UF) &&
+ assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
"VF*UF must be a power of 2 when folding tail by masking");
- TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
+ assert(!VF.isScalable() &&
+ "Tail folding not yet supported for scalable vectors");
+ TC = Builder.CreateAdd(
+ TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
}
// Now we need to generate the expression for the part of the loop that the
@@ -2648,14 +3038,18 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
// unroll factor (number of SIMD instructions).
Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
- // If there is a non-reversed interleaved group that may speculatively access
- // memory out-of-bounds, we need to ensure that there will be at least one
- // iteration of the scalar epilogue loop. Thus, if the step evenly divides
+ // There are two cases where we need to ensure (at least) the last iteration
+ // runs in the scalar remainder loop. Thus, if the step evenly divides
// the trip count, we set the remainder to be equal to the step. If the step
// does not evenly divide the trip count, no adjustment is necessary since
// there will already be scalar iterations. Note that the minimum iterations
- // check ensures that N >= Step.
- if (VF > 1 && Cost->requiresScalarEpilogue()) {
+ // check ensures that N >= Step. The cases are:
+ // 1) If there is a non-reversed interleaved group that may speculatively
+ // access memory out-of-bounds.
+ // 2) If any instruction may follow a conditionally taken exit. That is, if
+ // the loop contains multiple exiting blocks, or a single exiting block
+ // which is not the latch.
+ if (VF.isVector() && Cost->requiresScalarEpilogue()) {
auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
R = Builder.CreateSelect(IsZero, Step, R);
}
@@ -2668,17 +3062,18 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
const DataLayout &DL) {
// Verify that V is a vector type with same number of elements as DstVTy.
- unsigned VF = DstVTy->getNumElements();
- VectorType *SrcVecTy = cast<VectorType>(V->getType());
+ auto *DstFVTy = cast<FixedVectorType>(DstVTy);
+ unsigned VF = DstFVTy->getNumElements();
+ auto *SrcVecTy = cast<FixedVectorType>(V->getType());
assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
Type *SrcElemTy = SrcVecTy->getElementType();
- Type *DstElemTy = DstVTy->getElementType();
+ Type *DstElemTy = DstFVTy->getElementType();
assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
"Vector elements must have same size");
// Do a direct cast if element types are castable.
if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
- return Builder.CreateBitOrPointerCast(V, DstVTy);
+ return Builder.CreateBitOrPointerCast(V, DstFVTy);
}
// V cannot be directly casted to desired vector type.
// May happen when V is a floating point vector but DstVTy is a vector of
@@ -2692,7 +3087,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
auto *VecIntTy = FixedVectorType::get(IntTy, VF);
Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
- return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
+ return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
}
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
@@ -2713,11 +3108,11 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
// If tail is to be folded, vector loop takes care of all iterations.
Value *CheckMinIters = Builder.getFalse();
- if (!Cost->foldTailByMasking())
- CheckMinIters = Builder.CreateICmp(
- P, Count, ConstantInt::get(Count->getType(), VF * UF),
- "min.iters.check");
-
+ if (!Cost->foldTailByMasking()) {
+ Value *Step =
+ createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
+ CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
+ }
// Create new preheader for vector loop.
LoopVectorPreHeader =
SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
@@ -2754,7 +3149,9 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
if (C->isZero())
return;
- assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
+ assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
+ (OptForSizeBasedOnProfile &&
+ Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
"Cannot SCEV check stride or overflow when optimizing for size");
SCEVCheckBlock->setName("vector.scevcheck");
@@ -2792,15 +3189,8 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
if (!RtPtrChecking.Need)
return;
- Instruction *FirstCheckInst;
- Instruction *MemRuntimeCheck;
- std::tie(FirstCheckInst, MemRuntimeCheck) =
- addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
- RtPtrChecking.getChecks(), RtPtrChecking.getSE());
- assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
- "claimed checks are required");
- if (MemCheckBlock->getParent()->hasOptSize()) {
+ if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
"Cannot emit memory checks when optimizing for size, unless forced "
"to vectorize.");
@@ -2820,22 +3210,33 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
"vector.ph");
+ auto *CondBranch = cast<BranchInst>(
+ Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
+ ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
+ LoopBypassBlocks.push_back(MemCheckBlock);
+ AddedSafetyChecks = true;
+
// Update dominator only if this is first RT check.
if (LoopBypassBlocks.empty()) {
DT->changeImmediateDominator(Bypass, MemCheckBlock);
DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
}
- ReplaceInstWithInst(
- MemCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
- LoopBypassBlocks.push_back(MemCheckBlock);
- AddedSafetyChecks = true;
+ Instruction *FirstCheckInst;
+ Instruction *MemRuntimeCheck;
+ std::tie(FirstCheckInst, MemRuntimeCheck) =
+ addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
+ RtPtrChecking.getChecks(), RtPtrChecking.getSE());
+ assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
+ "claimed checks are required");
+ CondBranch->setCondition(MemRuntimeCheck);
// We currently don't use LoopVersioning for the actual loop cloning but we
// still use it to add the noalias metadata.
- LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
- PSE.getSE());
+ LVer = std::make_unique<LoopVersioning>(
+ *Legal->getLAI(),
+ Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
+ DT, PSE.getSE());
LVer->prepareNoAliasMetadata();
}
@@ -2939,74 +3340,35 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
llvm_unreachable("invalid enum");
}
-BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
- /*
- In this function we generate a new loop. The new loop will contain
- the vectorized instructions while the old loop will continue to run the
- scalar remainder.
-
- [ ] <-- loop iteration number check.
- / |
- / v
- | [ ] <-- vector loop bypass (may consist of multiple blocks).
- | / |
- | / v
- || [ ] <-- vector pre header.
- |/ |
- | v
- | [ ] \
- | [ ]_| <-- vector loop.
- | |
- | v
- | -[ ] <--- middle-block.
- | / |
- | / v
- -|- >[ ] <--- new preheader.
- | |
- | v
- | [ ] \
- | [ ]_| <-- old scalar loop to handle remainder.
- \ |
- \ v
- >[ ] <-- exit block.
- ...
- */
-
- MDNode *OrigLoopID = OrigLoop->getLoopID();
-
- // Some loops have a single integer induction variable, while other loops
- // don't. One example is c++ iterators that often have multiple pointer
- // induction variables. In the code below we also support a case where we
- // don't have a single induction variable.
- //
- // We try to obtain an induction variable from the original loop as hard
- // as possible. However if we don't find one that:
- // - is an integer
- // - counts from zero, stepping by one
- // - is the size of the widest induction variable type
- // then we create a new one.
- OldInduction = Legal->getPrimaryInduction();
- Type *IdxTy = Legal->getWidestInductionType();
-
- // Split the single block loop into the two loop structure described above.
+Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopScalarBody = OrigLoop->getHeader();
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
- LoopExitBlock = OrigLoop->getExitBlock();
+ LoopExitBlock = OrigLoop->getUniqueExitBlock();
assert(LoopExitBlock && "Must have an exit block");
assert(LoopVectorPreHeader && "Invalid loop structure");
LoopMiddleBlock =
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
- LI, nullptr, "middle.block");
+ LI, nullptr, Twine(Prefix) + "middle.block");
LoopScalarPreHeader =
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
- nullptr, "scalar.ph");
+ nullptr, Twine(Prefix) + "scalar.ph");
+
+ // Set up branch from middle block to the exit and scalar preheader blocks.
+ // completeLoopSkeleton will update the condition to use an iteration check,
+ // if required to decide whether to execute the remainder.
+ BranchInst *BrInst =
+ BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
+ auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
+ BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
+ ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
+
// We intentionally don't let SplitBlock to update LoopInfo since
// LoopVectorBody should belong to another loop than LoopVectorPreHeader.
// LoopVectorBody is explicitly added to the correct place few lines later.
LoopVectorBody =
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
- nullptr, nullptr, "vector.body");
+ nullptr, nullptr, Twine(Prefix) + "vector.body");
// Update dominator for loop exit.
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
@@ -3023,37 +3385,16 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
LI->addTopLevelLoop(Lp);
}
Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
+ return Lp;
+}
- // Find the loop boundaries.
- Value *Count = getOrCreateTripCount(Lp);
-
- Value *StartIdx = ConstantInt::get(IdxTy, 0);
-
- // Now, compare the new count to zero. If it is zero skip the vector loop and
- // jump to the scalar loop. This check also covers the case where the
- // backedge-taken count is uint##_max: adding one to it will overflow leading
- // to an incorrect trip count of zero. In this (rare) case we will also jump
- // to the scalar loop.
- emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
-
- // Generate the code to check any assumptions that we've made for SCEV
- // expressions.
- emitSCEVChecks(Lp, LoopScalarPreHeader);
-
- // Generate the code that checks in runtime if arrays overlap. We put the
- // checks into a separate block to make the more common case of few elements
- // faster.
- emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
-
- // Generate the induction variable.
- // The loop step is equal to the vectorization factor (num of SIMD elements)
- // times the unroll factor (num of SIMD instructions).
- Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
- Constant *Step = ConstantInt::get(IdxTy, VF * UF);
- Induction =
- createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
- getDebugLocFromInstOrOperands(OldInduction));
-
+void InnerLoopVectorizer::createInductionResumeValues(
+ Loop *L, Value *VectorTripCount,
+ std::pair<BasicBlock *, Value *> AdditionalBypass) {
+ assert(VectorTripCount && L && "Expected valid arguments");
+ assert(((AdditionalBypass.first && AdditionalBypass.second) ||
+ (!AdditionalBypass.first && !AdditionalBypass.second)) &&
+ "Inconsistent information about additional bypass.");
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
// PHIs that are left in the scalar version of the loop.
@@ -3061,10 +3402,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// iteration in the vectorized loop.
// If we come from a bypass edge then we need to start from the original
// start value.
-
- // This variable saves the new starting index for the scalar loop. It is used
- // to test if there are any tail iterations left once the vector loop has
- // completed.
for (auto &InductionEntry : Legal->getInductionVars()) {
PHINode *OrigPhi = InductionEntry.first;
InductionDescriptor II = InductionEntry.second;
@@ -3076,20 +3413,32 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// Copy original phi DL over to the new one.
BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
Value *&EndValue = IVEndValues[OrigPhi];
+ Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
if (OrigPhi == OldInduction) {
// We know what the end value is.
- EndValue = CountRoundDown;
+ EndValue = VectorTripCount;
} else {
- IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
+ IRBuilder<> B(L->getLoopPreheader()->getTerminator());
Type *StepType = II.getStep()->getType();
Instruction::CastOps CastOp =
- CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
- Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
+ CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
+ Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
EndValue->setName("ind.end");
- }
+ // Compute the end value for the additional bypass (if applicable).
+ if (AdditionalBypass.first) {
+ B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
+ CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
+ StepType, true);
+ CRD =
+ B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
+ EndValueFromAdditionalBypass =
+ emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
+ EndValueFromAdditionalBypass->setName("ind.end");
+ }
+ }
// The new PHI merges the original incoming value, in case of a bypass,
// or the value at the end of the vectorized loop.
BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
@@ -3099,42 +3448,44 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// value.
for (BasicBlock *BB : LoopBypassBlocks)
BCResumeVal->addIncoming(II.getStartValue(), BB);
+
+ if (AdditionalBypass.first)
+ BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
+ EndValueFromAdditionalBypass);
+
OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
}
+}
+
+BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
+ MDNode *OrigLoopID) {
+ assert(L && "Expected valid loop.");
- // We need the OrigLoop (scalar loop part) latch terminator to help
- // produce correct debug info for the middle block BB instructions.
- // The legality check stage guarantees that the loop will have a single
- // latch.
- assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
- "Scalar loop latch terminator isn't a branch");
- BranchInst *ScalarLatchBr =
- cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
+ // The trip counts should be cached by now.
+ Value *Count = getOrCreateTripCount(L);
+ Value *VectorTripCount = getOrCreateVectorTripCount(L);
+
+ auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
// Add a check in the middle block to see if we have completed
// all of the iterations in the first vector loop.
// If (N - N%VF) == N, then we *don't* need to run the remainder.
// If tail is to be folded, we know we don't need to run the remainder.
- Value *CmpN = Builder.getTrue();
if (!Cost->foldTailByMasking()) {
- CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
- CountRoundDown, "cmp.n",
- LoopMiddleBlock->getTerminator());
+ Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+ Count, VectorTripCount, "cmp.n",
+ LoopMiddleBlock->getTerminator());
- // Here we use the same DebugLoc as the scalar loop latch branch instead
+ // Here we use the same DebugLoc as the scalar loop latch terminator instead
// of the corresponding compare because they may have ended up with
// different line numbers and we want to avoid awkward line stepping while
// debugging. Eg. if the compare has got a line number inside the loop.
- cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
+ CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
+ cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
}
- BranchInst *BrInst =
- BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
- BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
- ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
-
// Get ready to start creating new instructions into the vectorized body.
- assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
+ assert(LoopVectorPreHeader == L->getLoopPreheader() &&
"Inconsistent vector loop preheader");
Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
@@ -3142,7 +3493,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupVectorized});
if (VectorizedLoopID.hasValue()) {
- Lp->setLoopID(VectorizedLoopID.getValue());
+ L->setLoopID(VectorizedLoopID.getValue());
// Do not setAlreadyVectorized if loop attributes have been defined
// explicitly.
@@ -3152,9 +3503,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
if (MDNode *LID = OrigLoop->getLoopID())
- Lp->setLoopID(LID);
+ L->setLoopID(LID);
- LoopVectorizeHints Hints(Lp, true, *ORE);
+ LoopVectorizeHints Hints(L, true, *ORE);
Hints.setAlreadyVectorized();
#ifdef EXPENSIVE_CHECKS
@@ -3165,6 +3516,91 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
return LoopVectorPreHeader;
}
+BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
+ /*
+ In this function we generate a new loop. The new loop will contain
+ the vectorized instructions while the old loop will continue to run the
+ scalar remainder.
+
+ [ ] <-- loop iteration number check.
+ / |
+ / v
+ | [ ] <-- vector loop bypass (may consist of multiple blocks).
+ | / |
+ | / v
+ || [ ] <-- vector pre header.
+ |/ |
+ | v
+ | [ ] \
+ | [ ]_| <-- vector loop.
+ | |
+ | v
+ | -[ ] <--- middle-block.
+ | / |
+ | / v
+ -|- >[ ] <--- new preheader.
+ | |
+ | v
+ | [ ] \
+ | [ ]_| <-- old scalar loop to handle remainder.
+ \ |
+ \ v
+ >[ ] <-- exit block.
+ ...
+ */
+
+ // Get the metadata of the original loop before it gets modified.
+ MDNode *OrigLoopID = OrigLoop->getLoopID();
+
+ // Create an empty vector loop, and prepare basic blocks for the runtime
+ // checks.
+ Loop *Lp = createVectorLoopSkeleton("");
+
+ // Now, compare the new count to zero. If it is zero skip the vector loop and
+ // jump to the scalar loop. This check also covers the case where the
+ // backedge-taken count is uint##_max: adding one to it will overflow leading
+ // to an incorrect trip count of zero. In this (rare) case we will also jump
+ // to the scalar loop.
+ emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
+
+ // Generate the code to check any assumptions that we've made for SCEV
+ // expressions.
+ emitSCEVChecks(Lp, LoopScalarPreHeader);
+
+ // Generate the code that checks in runtime if arrays overlap. We put the
+ // checks into a separate block to make the more common case of few elements
+ // faster.
+ emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
+
+ // Some loops have a single integer induction variable, while other loops
+ // don't. One example is c++ iterators that often have multiple pointer
+ // induction variables. In the code below we also support a case where we
+ // don't have a single induction variable.
+ //
+ // We try to obtain an induction variable from the original loop as hard
+ // as possible. However if we don't find one that:
+ // - is an integer
+ // - counts from zero, stepping by one
+ // - is the size of the widest induction variable type
+ // then we create a new one.
+ OldInduction = Legal->getPrimaryInduction();
+ Type *IdxTy = Legal->getWidestInductionType();
+ Value *StartIdx = ConstantInt::get(IdxTy, 0);
+ // The loop step is equal to the vectorization factor (num of SIMD elements)
+ // times the unroll factor (num of SIMD instructions).
+ Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
+ Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
+ Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
+ Induction =
+ createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+ getDebugLocFromInstOrOperands(OldInduction));
+
+ // Emit phis for the new starting index of the scalar loop.
+ createInductionResumeValues(Lp, CountRoundDown);
+
+ return completeLoopSkeleton(Lp, OrigLoopID);
+}
+
// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
@@ -3178,7 +3614,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
// value (the value that feeds into the phi from the loop latch).
// We allow both, but they, obviously, have different values.
- assert(OrigLoop->getExitBlock() && "Expected a single exit block");
+ assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
DenseMap<Value *, Value *> MissingVals;
@@ -3284,9 +3720,10 @@ static void cse(BasicBlock *BB) {
}
}
-unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
- unsigned VF,
- bool &NeedToScalarize) {
+InstructionCost
+LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
+ bool &NeedToScalarize) {
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
Function *F = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
@@ -3297,9 +3734,9 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
// to be vectors, so we need to extract individual elements from there,
// execute VF scalar calls, and then gather the result into the vector return
// value.
- unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
- TTI::TCK_RecipThroughput);
- if (VF == 1)
+ InstructionCost ScalarCallCost =
+ TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
+ if (VF.isScalar())
return ScalarCallCost;
// Compute corresponding vector type for return value and arguments.
@@ -3309,31 +3746,33 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
- unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
+ InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
- unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
+ InstructionCost Cost =
+ ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
// If we can't emit a vector call for this function, then the currently found
// cost is the cost we need to return.
NeedToScalarize = true;
- VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
+ VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
if (!TLI || CI->isNoBuiltin() || !VecFunc)
return Cost;
// If the corresponding vector cost is cheaper, return its cost.
- unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
- TTI::TCK_RecipThroughput);
+ InstructionCost VectorCallCost =
+ TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
if (VectorCallCost < Cost) {
NeedToScalarize = false;
- return VectorCallCost;
+ Cost = VectorCallCost;
}
return Cost;
}
-unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
- unsigned VF) {
+InstructionCost
+LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
+ ElementCount VF) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
@@ -3373,7 +3812,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
Type *ScalarTruncatedTy =
IntegerType::get(OriginalTy->getContext(), KV.second);
auto *TruncatedTy = FixedVectorType::get(
- ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
+ ScalarTruncatedTy,
+ cast<FixedVectorType>(OriginalTy)->getNumElements());
if (TruncatedTy == OriginalTy)
continue;
@@ -3423,13 +3863,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
break;
}
} else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
- auto Elements0 =
- cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
+ auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
+ ->getNumElements();
auto *O0 = B.CreateZExtOrTrunc(
SI->getOperand(0),
FixedVectorType::get(ScalarTruncatedTy, Elements0));
- auto Elements1 =
- cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
+ auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
+ ->getNumElements();
auto *O1 = B.CreateZExtOrTrunc(
SI->getOperand(1),
FixedVectorType::get(ScalarTruncatedTy, Elements1));
@@ -3439,16 +3879,16 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
// Don't do anything with the operands, just extend the result.
continue;
} else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
- auto Elements =
- cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
+ auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
+ ->getNumElements();
auto *O0 = B.CreateZExtOrTrunc(
IE->getOperand(0),
FixedVectorType::get(ScalarTruncatedTy, Elements));
auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
} else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
- auto Elements =
- cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
+ auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
+ ->getNumElements();
auto *O0 = B.CreateZExtOrTrunc(
EE->getOperand(0),
FixedVectorType::get(ScalarTruncatedTy, Elements));
@@ -3490,7 +3930,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
void InnerLoopVectorizer::fixVectorizedLoop() {
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
- if (VF > 1)
+ if (VF.isVector())
truncateToMinimalBitwidths();
// Fix widened non-induction PHIs by setting up the PHI operands.
@@ -3531,9 +3971,13 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
// profile is not inherently precise anyway. Note also possible bypass of
// vector code caused by legality checks is ignored, assigning all the weight
// to the vector loop, optimistically.
- setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
- LI->getLoopFor(LoopVectorBody),
- LI->getLoopFor(LoopScalarBody), VF * UF);
+ //
+ // For scalable vectorization we can't know at compile time how many iterations
+ // of the loop are handled in one vector iteration, so instead assume a pessimistic
+ // vscale of '1'.
+ setProfileInfoAfterUnrolling(
+ LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
+ LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}
void InnerLoopVectorizer::fixCrossIterationPHIs() {
@@ -3612,11 +4056,12 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Create a vector from the initial value.
auto *VectorInit = ScalarInit;
- if (VF > 1) {
+ if (VF.isVector()) {
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ assert(!VF.isScalable() && "VF is assumed to be non scalable.");
VectorInit = Builder.CreateInsertElement(
- UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)),
- VectorInit, Builder.getInt32(VF - 1), "vector.recur.init");
+ PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
+ Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
}
// We constructed a temporary phi node in the first phase of vectorization.
@@ -3657,10 +4102,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// We will construct a vector for the recurrence by combining the values for
// the current and previous iterations. This is the required shuffle mask.
- SmallVector<int, 8> ShuffleMask(VF);
- ShuffleMask[0] = VF - 1;
- for (unsigned I = 1; I < VF; ++I)
- ShuffleMask[I] = I + VF - 1;
+ assert(!VF.isScalable());
+ SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
+ ShuffleMask[0] = VF.getKnownMinValue() - 1;
+ for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
+ ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
// The vector from which to take the initial value for the current iteration
// (actual or unrolled). Initially, this is the vector phi node.
@@ -3670,9 +4116,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
for (unsigned Part = 0; Part < UF; ++Part) {
Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
- auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
- ShuffleMask)
- : Incoming;
+ auto *Shuffle =
+ VF.isVector()
+ ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
+ : Incoming;
PhiPart->replaceAllUsesWith(Shuffle);
cast<Instruction>(PhiPart)->eraseFromParent();
VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
@@ -3685,10 +4132,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Extract the last vector element in the middle block. This will be the
// initial value for the recurrence when jumping to the scalar loop.
auto *ExtractForScalar = Incoming;
- if (VF > 1) {
+ if (VF.isVector()) {
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
ExtractForScalar = Builder.CreateExtractElement(
- ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
+ ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
+ "vector.recur.extract");
}
// Extract the second last element in the middle block if the
// Phi is used outside the loop. We need to extract the phi itself
@@ -3696,9 +4144,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// will be the value when jumping to the exit block from the LoopMiddleBlock,
// when the scalar loop is not run at all.
Value *ExtractForPhiUsedOutsideLoop = nullptr;
- if (VF > 1)
+ if (VF.isVector())
ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
- Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
+ Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
+ "vector.recur.extract.for.phi");
// When loop is unrolled without vectorizing, initialize
// ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
// `Incoming`. This is analogous to the vectorized case above: extracting the
@@ -3722,69 +4171,31 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// vector recurrence we extracted in the middle block. Since the loop is in
// LCSSA form, we just need to find all the phi nodes for the original scalar
// recurrence in the exit block, and then add an edge for the middle block.
- for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
- if (LCSSAPhi.getIncomingValue(0) == Phi) {
+ // Note that LCSSA does not imply single entry when the original scalar loop
+ // had multiple exiting edges (as we always run the last iteration in the
+ // scalar epilogue); in that case, the exiting path through middle will be
+ // dynamically dead and the value picked for the phi doesn't matter.
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis())
+ if (any_of(LCSSAPhi.incoming_values(),
+ [Phi](Value *V) { return V == Phi; }))
LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
- }
- }
}
void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
- Constant *Zero = Builder.getInt32(0);
-
// Get it's reduction variable descriptor.
assert(Legal->isReductionVariable(Phi) &&
"Unable to find the reduction variable");
RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
- RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
+ RecurKind RK = RdxDesc.getRecurrenceKind();
TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
- RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
- RdxDesc.getMinMaxRecurrenceKind();
setDebugLocFromInst(Builder, ReductionStartValue);
-
- // We need to generate a reduction vector from the incoming scalar.
- // To do so, we need to generate the 'identity' vector and override
- // one of the elements with the incoming scalar reduction. We need
- // to do it in the vector-loop preheader.
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
// This is the vector-clone of the value that leaves the loop.
Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
- // Find the reduction identity variable. Zero for addition, or, xor,
- // one for multiplication, -1 for And.
- Value *Identity;
- Value *VectorStart;
- if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
- RK == RecurrenceDescriptor::RK_FloatMinMax) {
- // MinMax reduction have the start value as their identify.
- if (VF == 1) {
- VectorStart = Identity = ReductionStartValue;
- } else {
- VectorStart = Identity =
- Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
- }
- } else {
- // Handle other reduction kinds:
- Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
- RK, VecTy->getScalarType());
- if (VF == 1) {
- Identity = Iden;
- // This vector is the Identity vector where the first element is the
- // incoming scalar reduction.
- VectorStart = ReductionStartValue;
- } else {
- Identity = ConstantVector::getSplat({VF, false}, Iden);
-
- // This vector is the Identity vector where the first element is the
- // incoming scalar reduction.
- VectorStart =
- Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
- }
- }
-
// Wrap flags are in general invalid after vectorization, clear them.
clearReductionWrapFlags(RdxDesc);
@@ -3798,10 +4209,6 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
for (unsigned Part = 0; Part < UF; ++Part) {
Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
Value *Val = getOrCreateVectorValue(LoopVal, Part);
- // Make sure to add the reduction start value only to the
- // first unroll part.
- Value *StartVal = (Part == 0) ? VectorStart : Identity;
- cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
cast<PHINode>(VecRdxPhi)
->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
}
@@ -3816,8 +4223,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// If tail is folded by masking, the vector value to leave the loop should be
// a Select choosing between the vectorized LoopExitInst and vectorized Phi,
- // instead of the former.
- if (Cost->foldTailByMasking()) {
+ // instead of the former. For an inloop reduction the reduction will already
+ // be predicated, and does not need to be handled here.
+ if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
for (unsigned Part = 0; Part < UF; ++Part) {
Value *VecLoopExitInst =
VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
@@ -3831,14 +4239,31 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
}
assert(Sel && "Reduction exit feeds no select");
VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
+
+ // If the target can create a predicated operator for the reduction at no
+ // extra cost in the loop (for example a predicated vadd), it can be
+ // cheaper for the select to remain in the loop than be sunk out of it,
+ // and so use the select value for the phi instead of the old
+ // LoopExitValue.
+ RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
+ if (PreferPredicatedReductionSelect ||
+ TTI->preferPredicatedReductionSelect(
+ RdxDesc.getOpcode(), Phi->getType(),
+ TargetTransformInfo::ReductionFlags())) {
+ auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
+ VecRdxPhi->setIncomingValueForBlock(
+ LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
+ }
}
}
// If the vector reduction can be performed in a smaller type, we truncate
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
- if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
- Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF);
+ if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
+ assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
Builder.SetInsertPoint(
LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
VectorParts RdxParts(UF);
@@ -3865,7 +4290,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// Reduce all of the unrolled parts into a single vector.
Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
- unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
+ unsigned Op = RecurrenceDescriptor::getOpcode(RK);
// The middle block terminator has already been assigned a DebugLoc here (the
// OrigLoop's single latch terminator). We want the whole middle block to
@@ -3884,14 +4309,14 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
ReducedPartRdx, "bin.rdx"),
RdxDesc.getFastMathFlags());
else
- ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
- RdxPart);
+ ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
}
- if (VF > 1) {
- bool NoNaN = Legal->hasFunNoNaNAttr();
+ // Create the reduction after the loop. Note that inloop reductions create the
+ // target reduction in the loop using a Reduction recipe.
+ if (VF.isVector() && !IsInLoopReductionPhi) {
ReducedPartRdx =
- createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
+ createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
// If the reduction can be performed in a smaller type, we need to extend
// the reduction to the wider type before we branch to the original loop.
if (Phi->getType() != RdxDesc.getRecurrenceType())
@@ -3911,21 +4336,17 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
- // We know that the loop is in LCSSA form. We need to update the
- // PHI nodes in the exit blocks.
- for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
- // All PHINodes need to have a single entry edge, or two if
- // we already fixed them.
- assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
- // We found a reduction value exit-PHI. Update it with the
- // incoming bypass edge.
- if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
+ // We know that the loop is in LCSSA form. We need to update the PHI nodes
+ // in the exit blocks. See comment on analogous loop in
+  // fixFirstOrderRecurrence for a more complete explanation of the logic.
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis())
+ if (any_of(LCSSAPhi.incoming_values(),
+ [LoopExitInst](Value *V) { return V == LoopExitInst; }))
LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
- } // end of the LCSSA phi scan.
- // Fix the scalar loop reduction variable with the incoming reduction sum
- // from the vector body and from the backedge value.
+ // Fix the scalar loop reduction variable with the incoming reduction sum
+ // from the vector body and from the backedge value.
int IncomingEdgeBlockIdx =
Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
@@ -3937,9 +4358,8 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
void InnerLoopVectorizer::clearReductionWrapFlags(
RecurrenceDescriptor &RdxDesc) {
- RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
- if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
- RK != RecurrenceDescriptor::RK_IntegerMult)
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ if (RK != RecurKind::Add && RK != RecurKind::Mul)
return;
Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
@@ -3968,22 +4388,27 @@ void InnerLoopVectorizer::clearReductionWrapFlags(
void InnerLoopVectorizer::fixLCSSAPHIs() {
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
- if (LCSSAPhi.getNumIncomingValues() == 1) {
- auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
- // Non-instruction incoming values will have only one value.
- unsigned LastLane = 0;
- if (isa<Instruction>(IncomingValue))
- LastLane = Cost->isUniformAfterVectorization(
- cast<Instruction>(IncomingValue), VF)
- ? 0
- : VF - 1;
- // Can be a loop invariant incoming value or the last scalar value to be
- // extracted from the vectorized loop.
- Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- Value *lastIncomingValue =
- getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
- LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
- }
+ if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
+ // Some phis were already hand updated by the reduction and recurrence
+ // code above, leave them alone.
+ continue;
+
+ auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
+ // Non-instruction incoming values will have only one value.
+ unsigned LastLane = 0;
+ if (isa<Instruction>(IncomingValue))
+ LastLane = Cost->isUniformAfterVectorization(
+ cast<Instruction>(IncomingValue), VF)
+ ? 0
+ : VF.getKnownMinValue() - 1;
+ assert((!VF.isScalable() || LastLane == 0) &&
+ "scalable vectors dont support non-uniform scalars yet");
+ // Can be a loop invariant incoming value or the last scalar value to be
+ // extracted from the vectorized loop.
+ Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
+ Value *lastIncomingValue =
+ getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
+ LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
}
}
@@ -4087,9 +4512,9 @@ void InnerLoopVectorizer::fixNonInductionPHIs() {
}
}
-void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
- unsigned UF, unsigned VF,
- bool IsPtrLoopInvariant,
+void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
+ VPUser &Operands, unsigned UF,
+ ElementCount VF, bool IsPtrLoopInvariant,
SmallBitVector &IsIndexLoopInvariant,
VPTransformState &State) {
// Construct a vector GEP by widening the operands of the scalar GEP as
@@ -4098,7 +4523,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
// is vector-typed. Thus, to keep the representation compact, we only use
// vector-typed operands for loop-varying values.
- if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
+ if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
// If we are vectorizing, but the GEP has only loop-invariant operands,
// the GEP we build (by only using vector-typed operands for
// loop-varying values) would be a scalar pointer. Thus, to ensure we
@@ -4114,7 +4539,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
auto *Clone = Builder.Insert(GEP->clone());
for (unsigned Part = 0; Part < UF; ++Part) {
Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
- VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
+ State.set(VPDef, GEP, EntryPart, Part);
addMetadata(EntryPart, GEP);
}
} else {
@@ -4149,16 +4574,19 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
Indices)
: Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
- assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+ assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
"NewGEP is not a pointer vector");
- VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
+ State.set(VPDef, GEP, NewGEP, Part);
addMetadata(NewGEP, GEP);
}
}
}
-void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
- unsigned VF) {
+void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
+ RecurrenceDescriptor *RdxDesc,
+ Value *StartV, unsigned UF,
+ ElementCount VF) {
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
PHINode *P = cast<PHINode>(PN);
if (EnableVPlanNativePath) {
// Currently we enter here in the VPlan-native path for non-induction
@@ -4166,7 +4594,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
// Create a vector phi with no operands - the vector phi operands will be
// set at the end of vector code generation.
Type *VecTy =
- (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
+ (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
OrigPHIsToFix.push_back(P);
@@ -4181,18 +4609,60 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
// this value when we vectorize all of the instructions that use the PHI.
- if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
+ if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
+ Value *Iden = nullptr;
+ bool ScalarPHI =
+ (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
+ Type *VecTy =
+ ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
+
+ if (RdxDesc) {
+ assert(Legal->isReductionVariable(P) && StartV &&
+ "RdxDesc should only be set for reduction variables; in that case "
+ "a StartV is also required");
+ RecurKind RK = RdxDesc->getRecurrenceKind();
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
+        // MinMax reductions have the start value as their identity.
+ if (ScalarPHI) {
+ Iden = StartV;
+ } else {
+ IRBuilderBase::InsertPointGuard IPBuilder(Builder);
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident");
+ }
+ } else {
+ Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
+ RK, VecTy->getScalarType());
+ Iden = IdenC;
+
+ if (!ScalarPHI) {
+ Iden = ConstantVector::getSplat(VF, IdenC);
+ IRBuilderBase::InsertPointGuard IPBuilder(Builder);
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ Constant *Zero = Builder.getInt32(0);
+ StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
+ }
+ }
+ }
+
for (unsigned Part = 0; Part < UF; ++Part) {
// This is phase one of vectorizing PHIs.
- Type *VecTy =
- (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
Value *EntryPart = PHINode::Create(
VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
+ if (StartV) {
+ // Make sure to add the reduction start value only to the
+ // first unroll part.
+ Value *StartVal = (Part == 0) ? StartV : Iden;
+ cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
+ }
}
return;
}
+ assert(!Legal->isReductionVariable(P) &&
+ "reductions should be handled above");
+
setDebugLocFromInst(Builder, P);
// This PHINode must be an induction variable.
@@ -4213,26 +4683,74 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
case InductionDescriptor::IK_PtrInduction: {
// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");
- // This is the normalized GEP that starts counting at zero.
- Value *PtrInd = Induction;
- PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
- // Determine the number of scalars we need to generate for each unroll
- // iteration. If the instruction is uniform, we only need to generate the
- // first lane. Otherwise, we generate all VF values.
- unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
- // These are the scalar results. Notice that we don't generate vector GEPs
- // because scalar GEPs result in better code.
- for (unsigned Part = 0; Part < UF; ++Part) {
- for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
- Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
- Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
- Value *SclrGep =
- emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
- SclrGep->setName("next.gep");
- VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
+
+ if (Cost->isScalarAfterVectorization(P, VF)) {
+ // This is the normalized GEP that starts counting at zero.
+ Value *PtrInd =
+ Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If the instruction is uniform, we only need to generate the
+ // first lane. Otherwise, we generate all VF values.
+ unsigned Lanes =
+ Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ Constant *Idx = ConstantInt::get(PtrInd->getType(),
+ Lane + Part * VF.getKnownMinValue());
+ Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
+ Value *SclrGep =
+ emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
+ SclrGep->setName("next.gep");
+ VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
+ }
}
+ return;
+ }
+ assert(isa<SCEVConstant>(II.getStep()) &&
+ "Induction step not a SCEV constant!");
+ Type *PhiType = II.getStep()->getType();
+
+ // Build a pointer phi
+ Value *ScalarStartValue = II.getStartValue();
+ Type *ScStValueType = ScalarStartValue->getType();
+ PHINode *NewPointerPhi =
+ PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
+ NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
+
+ // A pointer induction, performed by using a gep
+ BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
+ Instruction *InductionLoc = LoopLatch->getTerminator();
+ const SCEV *ScalarStep = II.getStep();
+ SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+ Value *ScalarStepValue =
+ Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
+ Value *InductionGEP = GetElementPtrInst::Create(
+ ScStValueType->getPointerElementType(), NewPointerPhi,
+ Builder.CreateMul(
+ ScalarStepValue,
+ ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
+ "ptr.ind", InductionLoc);
+ NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
+
+ // Create UF many actual address geps that use the pointer
+ // phi as base and a vectorized version of the step value
+ // (<step*0, ..., step*N>) as offset.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Constant *, 8> Indices;
+ // Create a vector of consecutive numbers from zero to VF.
+ for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
+ Indices.push_back(
+ ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
+ Constant *StartOffset = ConstantVector::get(Indices);
+
+ Value *GEP = Builder.CreateGEP(
+ ScStValueType->getPointerElementType(), NewPointerPhi,
+ Builder.CreateMul(
+ StartOffset,
+ Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
+ "vector.gep"));
+ VectorLoopValueMap.setVectorValue(P, Part, GEP);
}
- return;
}
}
}
@@ -4255,7 +4773,8 @@ static bool mayDivideByZero(Instruction &I) {
return !CInt || CInt->isZero();
}
-void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
+void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
+ VPUser &User,
VPTransformState &State) {
switch (I.getOpcode()) {
case Instruction::Call:
@@ -4297,7 +4816,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
VecOp->copyIRFlags(&I);
// Use this vector value for all users of the original instruction.
- VectorLoopValueMap.setVectorValue(&I, Part, V);
+ State.set(Def, &I, V, Part);
addMetadata(V, &I);
}
@@ -4321,7 +4840,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
} else {
C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
}
- VectorLoopValueMap.setVectorValue(&I, Part, C);
+ State.set(Def, &I, C, Part);
addMetadata(C, &I);
}
@@ -4345,12 +4864,12 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
/// Vectorize casts.
Type *DestTy =
- (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF);
+ (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
for (unsigned Part = 0; Part < UF; ++Part) {
Value *A = State.get(User.getOperand(0), Part);
Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
- VectorLoopValueMap.setVectorValue(&I, Part, Cast);
+ State.set(Def, &I, Cast, Part);
addMetadata(Cast, &I);
}
break;
@@ -4362,7 +4881,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
} // end of switch.
}
-void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
+void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
+ VPUser &ArgOperands,
VPTransformState &State) {
assert(!isa<DbgInfoIntrinsic>(I) &&
"DbgInfoIntrinsic should have been dropped during VPlan construction");
@@ -4373,7 +4893,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
SmallVector<Type *, 4> Tys;
for (Value *ArgOperand : CI->arg_operands())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+ Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
@@ -4381,11 +4901,13 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
// version of the instruction.
// Is it beneficial to perform intrinsic call compared to lib call?
bool NeedToScalarize = false;
- unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
- bool UseVectorIntrinsic =
- ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
+ InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
+ InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
+ bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
assert((UseVectorIntrinsic || !NeedToScalarize) &&
"Instruction should be scalarized elsewhere.");
+ assert(IntrinsicCost.isValid() && CallCost.isValid() &&
+ "Cannot have invalid costs while widening");
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 4> Args;
@@ -4404,15 +4926,15 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
if (UseVectorIntrinsic) {
// Use vector version of the intrinsic.
Type *TysForDecl[] = {CI->getType()};
- if (VF > 1)
- TysForDecl[0] =
- FixedVectorType::get(CI->getType()->getScalarType(), VF);
+ if (VF.isVector()) {
+ assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+ TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+ }
VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
assert(VectorF && "Can't retrieve vector intrinsic.");
} else {
// Use vector version of the function call.
- const VFShape Shape =
- VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
+ const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
#ifndef NDEBUG
assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
"Can't create vector function.");
@@ -4426,12 +4948,12 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
if (isa<FPMathOperator>(V))
V->copyFastMathFlags(CI);
- VectorLoopValueMap.setVectorValue(&I, Part, V);
+ State.set(Def, &I, V, Part);
addMetadata(V, &I);
}
}
-void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
+void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
VPUser &Operands,
bool InvariantCond,
VPTransformState &State) {
@@ -4450,16 +4972,16 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
Value *Op0 = State.get(Operands.getOperand(1), Part);
Value *Op1 = State.get(Operands.getOperand(2), Part);
Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
- VectorLoopValueMap.setVectorValue(&I, Part, Sel);
+ State.set(VPDef, &I, Sel, Part);
addMetadata(Sel, &I);
}
}
-void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
+void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
// this check. Collecting Scalars for VF=1 does not make any sense.
- assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
+ assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
"This function should not be visited twice for the same VF");
SmallSetVector<Instruction *, 8> Worklist;
@@ -4468,6 +4990,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// accesses that will remain scalar.
SmallSetVector<Instruction *, 8> ScalarPtrs;
SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
+ auto *Latch = TheLoop->getLoopLatch();
// A helper that returns true if the use of Ptr by MemAccess will be scalar.
// The pointer operands of loads and stores will be scalar as long as the
@@ -4493,11 +5016,33 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
!TheLoop->isLoopInvariant(V);
};
- // A helper that evaluates a memory access's use of a pointer. If the use
- // will be a scalar use, and the pointer is only used by memory accesses, we
- // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
- // PossibleNonScalarPtrs.
+ auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
+ if (!isa<PHINode>(Ptr) ||
+ !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
+ return false;
+ auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
+ if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
+ return false;
+ return isScalarUse(MemAccess, Ptr);
+ };
+
+ // A helper that evaluates a memory access's use of a pointer. If the
+ // pointer is actually the pointer induction of a loop, it is being
+ // inserted into Worklist. If the use will be a scalar use, and the
+ // pointer is only used by memory accesses, we place the pointer in
+ // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
+ if (isScalarPtrInduction(MemAccess, Ptr)) {
+ Worklist.insert(cast<Instruction>(Ptr));
+ Instruction *Update = cast<Instruction>(
+ cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
+ Worklist.insert(Update);
+ LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
+ << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
+ << "\n");
+ return;
+ }
// We only care about bitcast and getelementptr instructions contained in
// the loop.
if (!isLoopVaryingBitCastOrGEP(Ptr))
@@ -4521,10 +5066,9 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
};
// We seed the scalars analysis with three classes of instructions: (1)
- // instructions marked uniform-after-vectorization, (2) bitcast and
- // getelementptr instructions used by memory accesses requiring a scalar use,
- // and (3) pointer induction variables and their update instructions (we
- // currently only scalarize these).
+ // instructions marked uniform-after-vectorization and (2) bitcast,
+ // getelementptr and (pointer) phi instructions used by memory accesses
+ // requiring a scalar use.
//
// (1) Add to the worklist all instructions that have been identified as
// uniform-after-vectorization.
@@ -4550,24 +5094,6 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
Worklist.insert(I);
}
- // (3) Add to the worklist all pointer induction variables and their update
- // instructions.
- //
- // TODO: Once we are able to vectorize pointer induction variables we should
- // no longer insert them into the worklist here.
- auto *Latch = TheLoop->getLoopLatch();
- for (auto &Induction : Legal->getInductionVars()) {
- auto *Ind = Induction.first;
- auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
- if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
- continue;
- Worklist.insert(Ind);
- Worklist.insert(IndUpdate);
- LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
- LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
- << "\n");
- }
-
// Insert the forced scalars.
// FIXME: Currently widenPHIInstruction() often creates a dead vector
// induction variable when the PHI user is scalarized.
@@ -4603,14 +5129,6 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
- // We already considered pointer induction variables, so there's no reason
- // to look at their users again.
- //
- // TODO: Once we are able to vectorize pointer induction variables we
- // should no longer skip over them here.
- if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
- continue;
-
// If tail-folding is applied, the primary induction variable will be used
// to feed a vector compare.
if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
@@ -4646,7 +5164,8 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
-bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
+bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
+ ElementCount VF) {
if (!blockNeedsPredication(I->getParent()))
return false;
switch(I->getOpcode()) {
@@ -4660,7 +5179,7 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
auto *Ty = getMemInstValueType(I);
// We have already decided how to vectorize this instruction, get that
// result.
- if (VF > 1) {
+ if (VF.isVector()) {
InstWidening WideningDecision = getWideningDecision(I, VF);
assert(WideningDecision != CM_Unknown &&
"Widening decision should be ready at this moment");
@@ -4681,8 +5200,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
return false;
}
-bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
- unsigned VF) {
+bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
+ Instruction *I, ElementCount VF) {
assert(isAccessInterleaved(I) && "Expecting interleaved access.");
assert(getWideningDecision(I, VF) == CM_Unknown &&
"Decision should not be set yet.");
@@ -4718,8 +5237,8 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
: TTI.isLegalMaskedStore(Ty, Alignment);
}
-bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
- unsigned VF) {
+bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
+ Instruction *I, ElementCount VF) {
// Get and ensure we have a valid memory instruction.
LoadInst *LI = dyn_cast<LoadInst>(I);
StoreInst *SI = dyn_cast<StoreInst>(I);
@@ -4746,13 +5265,13 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
return true;
}
-void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
+void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// We should not collect Uniforms more than once per VF. Right now,
// this function is called from collectUniformsAndScalars(), which
// already does this check. Collecting Uniforms for VF=1 does not make any
// sense.
- assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
+ assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
"This function should not be visited twice for the same VF");
// Visit the list of Uniforms. If we'll not find any uniform value, we'll
@@ -4778,6 +5297,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// replicating region where only a single instance out of VF should be formed.
// TODO: optimize such seldom cases if found important, see PR40816.
auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
+ if (isOutOfScope(I)) {
+ LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
+ << *I << "\n");
+ return;
+ }
if (isScalarWithPredication(I, VF)) {
LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
<< *I << "\n");
@@ -4794,65 +5318,71 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
addToWorklistIfAllowed(Cmp);
- // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
- // are pointers that are treated like consecutive pointers during
- // vectorization. The pointer operands of interleaved accesses are an
- // example.
- SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
-
- // Holds pointer operands of instructions that are possibly non-uniform.
- SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
-
- auto isUniformDecision = [&](Instruction *I, unsigned VF) {
+ auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
InstWidening WideningDecision = getWideningDecision(I, VF);
assert(WideningDecision != CM_Unknown &&
"Widening decision should be ready at this moment");
+ // A uniform memory op is itself uniform. We exclude uniform stores
+ // here as they demand the last lane, not the first one.
+ if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
+ assert(WideningDecision == CM_Scalarize);
+ return true;
+ }
+
return (WideningDecision == CM_Widen ||
WideningDecision == CM_Widen_Reverse ||
WideningDecision == CM_Interleave);
};
- // Iterate over the instructions in the loop, and collect all
- // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
- // that a consecutive-like pointer operand will be scalarized, we collect it
- // in PossibleNonUniformPtrs instead. We use two sets here because a single
- // getelementptr instruction can be used by both vectorized and scalarized
- // memory instructions. For example, if a loop loads and stores from the same
- // location, but the store is conditional, the store will be scalarized, and
- // the getelementptr won't remain uniform.
+
+
+ // Returns true if Ptr is the pointer operand of a memory access instruction
+ // I, and I is known to not require scalarization.
+ auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
+ return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
+ };
+
+ // Holds a list of values which are known to have at least one uniform use.
+ // Note that there may be other uses which aren't uniform. A "uniform use"
+ // here is something which only demands lane 0 of the unrolled iterations;
+ // it does not imply that all lanes produce the same value (e.g. this is not
+ // the usual meaning of uniform)
+ SmallPtrSet<Value *, 8> HasUniformUse;
+
+ // Scan the loop for instructions which are either a) known to have only
+ // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
// If there's no pointer operand, there's nothing to do.
- auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
+ auto *Ptr = getLoadStorePointerOperand(&I);
if (!Ptr)
continue;
- // True if all users of Ptr are memory accesses that have Ptr as their
- // pointer operand.
- auto UsersAreMemAccesses =
- llvm::all_of(Ptr->users(), [&](User *U) -> bool {
- return getLoadStorePointerOperand(U) == Ptr;
- });
-
- // Ensure the memory instruction will not be scalarized or used by
- // gather/scatter, making its pointer operand non-uniform. If the pointer
- // operand is used by any instruction other than a memory access, we
- // conservatively assume the pointer operand may be non-uniform.
- if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
- PossibleNonUniformPtrs.insert(Ptr);
+ // A uniform memory op is itself uniform. We exclude uniform stores
+ // here as they demand the last lane, not the first one.
+ if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
+ addToWorklistIfAllowed(&I);
- // If the memory instruction will be vectorized and its pointer operand
- // is consecutive-like, or interleaving - the pointer operand should
- // remain uniform.
- else
- ConsecutiveLikePtrs.insert(Ptr);
+ if (isUniformDecision(&I, VF)) {
+ assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
+ HasUniformUse.insert(Ptr);
+ }
}
- // Add to the Worklist all consecutive and consecutive-like pointers that
- // aren't also identified as possibly non-uniform.
- for (auto *V : ConsecutiveLikePtrs)
- if (!PossibleNonUniformPtrs.count(V))
- addToWorklistIfAllowed(V);
+ // Add to the worklist any operands which have *only* uniform (e.g. lane 0
+ // demanding) users. Since loops are assumed to be in LCSSA form, this
+ // disallows uses outside the loop as well.
+ for (auto *V : HasUniformUse) {
+ if (isOutOfScope(V))
+ continue;
+ auto *I = cast<Instruction>(V);
+ auto UsersAreMemAccesses =
+ llvm::all_of(I->users(), [&](User *U) -> bool {
+ return isVectorizedMemAccessUse(cast<Instruction>(U), V);
+ });
+ if (UsersAreMemAccesses)
+ addToWorklistIfAllowed(I);
+ }
// Expand Worklist in topological order: whenever a new instruction
// is added , its users should be already inside Worklist. It ensures
@@ -4875,20 +5405,12 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
auto *OI = cast<Instruction>(OV);
if (llvm::all_of(OI->users(), [&](User *U) -> bool {
auto *J = cast<Instruction>(U);
- return Worklist.count(J) ||
- (OI == getLoadStorePointerOperand(J) &&
- isUniformDecision(J, VF));
+ return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
}))
addToWorklistIfAllowed(OI);
}
}
- // Returns true if Ptr is the pointer operand of a memory access instruction
- // I, and I is known to not require scalarization.
- auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
- return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
- };
-
// For an instruction to be added into Worklist above, all its users inside
// the loop should also be in Worklist. However, this condition cannot be
// true for phi nodes that form a cyclic dependence. We must process phi
@@ -4961,8 +5483,8 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
return false;
}
-Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
- unsigned UserIC) {
+Optional<ElementCount>
+LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
// TODO: It may by useful to do since it's still likely to be dynamically
// uniform if the target can skip.
@@ -4982,9 +5504,13 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
return None;
}
+ ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
+
switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed:
- return UserVF ? UserVF : computeFeasibleMaxVF(TC);
+ return MaxVF;
+ case CM_ScalarEpilogueNotAllowedUsePredicate:
+ LLVM_FALLTHROUGH;
case CM_ScalarEpilogueNotNeededUsePredicate:
LLVM_DEBUG(
dbgs() << "LV: vector predicate hint/switch found.\n"
@@ -5005,9 +5531,26 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
// for size.
if (runtimeChecksRequired())
return None;
+
break;
}
+ // The only loops we can vectorize without a scalar epilogue, are loops with
+ // a bottom-test and a single exiting block. We'd have to handle the fact
+ // that not every instruction executes on the last iteration. This will
+ // require a lane mask which varies through the vector loop body. (TODO)
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ // If there was a tail-folding hint/switch, but we can't fold the tail by
+ // masking, fallback to a vectorization with a scalar epilogue.
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+ LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
+ "scalar epilogue instead.\n");
+ ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+ return MaxVF;
+ }
+ return None;
+ }
+
// Now try the tail folding
// Invalidate interleave groups that require an epilogue if we can't mask
@@ -5020,10 +5563,21 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
- unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
- assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
- unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
- if (TC > 0 && TC % MaxVFtimesIC == 0) {
+ assert(!MaxVF.isScalable() &&
+ "Scalable vectors do not yet support tail folding");
+ assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
+ "MaxVF must be a power of 2");
+ unsigned MaxVFtimesIC =
+ UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
+ // Avoid tail folding if the trip count is known to be a multiple of any VF we
+ // chose.
+ ScalarEvolution *SE = PSE.getSE();
+ const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
+ const SCEV *ExitCount = SE->getAddExpr(
+ BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
+ const SCEV *Rem = SE->getURemExpr(
+ ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
+ if (Rem->isZero()) {
// Accept MaxVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
return MaxVF;
@@ -5038,6 +5592,20 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
return MaxVF;
}
+ // If there was a tail-folding hint/switch, but we can't fold the tail by
+ // masking, fallback to a vectorization with a scalar epilogue.
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+ LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
+ "scalar epilogue instead.\n");
+ ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+ return MaxVF;
+ }
+
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
+ LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
+ return None;
+ }
+
if (TC == 0) {
reportVectorizationFailure(
"Unable to calculate the loop count due to complex control flow",
@@ -5055,8 +5623,33 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
return None;
}
-unsigned
-LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
+ElementCount
+LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
+ ElementCount UserVF) {
+ bool IgnoreScalableUserVF = UserVF.isScalable() &&
+ !TTI.supportsScalableVectors() &&
+ !ForceTargetSupportsScalableVectors;
+ if (IgnoreScalableUserVF) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Ignoring VF=" << UserVF
+ << " because target does not support scalable vectors.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "Ignoring VF=" << ore::NV("UserVF", UserVF)
+ << " because target does not support scalable vectors.";
+ });
+ }
+
+ // Beyond this point two scenarios are handled. If UserVF isn't specified
+ // then a suitable VF is chosen. If UserVF is specified and there are
+ // dependencies, check if it's legal. However, if a UserVF is specified and
+ // there are no dependencies, then there's nothing to do.
+ if (UserVF.isNonZero() && !IgnoreScalableUserVF &&
+ Legal->isSafeForAnyVectorWidth())
+ return UserVF;
+
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -5066,9 +5659,62 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
// the memory accesses that is most restrictive (involved in the smallest
// dependence distance).
- unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
+ unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
+
+ // If the user vectorization factor is legally unsafe, clamp it to a safe
+ // value. Otherwise, return as is.
+ if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
+ unsigned MaxSafeElements =
+ PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
+ ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
- WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
+ if (UserVF.isScalable()) {
+ Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+
+ // Scale VF by vscale before checking if it's safe.
+ MaxSafeVF = ElementCount::getScalable(
+ MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
+
+ if (MaxSafeVF.isZero()) {
+ // The dependence distance is too small to use scalable vectors,
+ // fallback on fixed.
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Max legal vector width too small, scalable vectorization "
+ "unfeasible. Using fixed-width vectorization instead.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "Max legal vector width too small, scalable vectorization "
+ << "unfeasible. Using fixed-width vectorization instead.";
+ });
+ return computeFeasibleMaxVF(
+ ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
+
+ if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
+ return UserVF;
+
+ LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+ << " is unsafe, clamping to max safe VF=" << MaxSafeVF
+ << ".\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "User-specified vectorization factor "
+ << ore::NV("UserVectorizationFactor", UserVF)
+ << " is unsafe, clamping to maximum safe vectorization factor "
+ << ore::NV("VectorizationFactor", MaxSafeVF);
+ });
+ return MaxSafeVF;
+ }
+
+ WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
// Ensure MaxVF is a power of 2; the dependence distance bound may not be.
// Note that both WidestRegister and WidestType may not be a powers of 2.
@@ -5079,12 +5725,13 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
<< WidestRegister << " bits.\n");
- assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
- " into one vector!");
+ assert(MaxVectorSize <= WidestRegister &&
+ "Did not expect to pack so many elements"
+ " into one vector!");
if (MaxVectorSize == 0) {
LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
MaxVectorSize = 1;
- return MaxVectorSize;
+ return ElementCount::getFixed(MaxVectorSize);
} else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
isPowerOf2_32(ConstTripCount)) {
// We need to clamp the VF to be the ConstTripCount. There is no point in
@@ -5092,7 +5739,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
<< ConstTripCount << "\n");
MaxVectorSize = ConstTripCount;
- return MaxVectorSize;
+ return ElementCount::getFixed(MaxVectorSize);
}
unsigned MaxVF = MaxVectorSize;
@@ -5100,10 +5747,10 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
(MaximizeBandwidth && isScalarEpilogueAllowed())) {
// Collect all viable vectorization factors larger than the default MaxVF
// (i.e. MaxVectorSize).
- SmallVector<unsigned, 8> VFs;
+ SmallVector<ElementCount, 8> VFs;
unsigned NewMaxVectorSize = WidestRegister / SmallestType;
for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
- VFs.push_back(VS);
+ VFs.push_back(ElementCount::getFixed(VS));
// For each VF calculate its register usage.
auto RUs = calculateRegisterUsage(VFs);
@@ -5118,7 +5765,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
Selected = false;
}
if (Selected) {
- MaxVF = VFs[i];
+ MaxVF = VFs[i].getKnownMinValue();
break;
}
}
@@ -5130,30 +5777,39 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
}
}
}
- return MaxVF;
+ return ElementCount::getFixed(MaxVF);
}
VectorizationFactor
-LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
- float Cost = expectedCost(1).first;
- const float ScalarCost = Cost;
+LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
+ // FIXME: This can be fixed for scalable vectors later, because at this stage
+ // the LoopVectorizer will only consider vectorizing a loop with scalable
+ // vectors when the loop has a hint to enable vectorization for a given VF.
+ assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
+
+ InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
+ LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
+ assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
+
unsigned Width = 1;
- LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
+ const float ScalarCost = *ExpectedCost.getValue();
+ float Cost = ScalarCost;
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
- if (ForceVectorization && MaxVF > 1) {
+ if (ForceVectorization && MaxVF.isVector()) {
// Ignore scalar width, because the user explicitly wants vectorization.
// Initialize cost to max so that VF = 2 is, at least, chosen during cost
// evaluation.
Cost = std::numeric_limits<float>::max();
}
- for (unsigned i = 2; i <= MaxVF; i *= 2) {
+ for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
// Notice that the vector loop needs to be executed less times, so
// we need to divide the cost of the vector loops by the width of
// the vector elements.
- VectorizationCostTy C = expectedCost(i);
- float VectorCost = C.first / (float)i;
+ VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
+ assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
+ float VectorCost = *C.first.getValue() / (float)i;
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
<< " costs: " << (int)VectorCost << ".\n");
if (!C.second && !ForceVectorization) {
@@ -5162,6 +5818,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
<< " because it will not generate any vector instructions.\n");
continue;
}
+
+ // If profitable add it to ProfitableVF list.
+ if (VectorCost < ScalarCost) {
+ ProfitableVFs.push_back(VectorizationFactor(
+ {ElementCount::getFixed(i), (unsigned)VectorCost}));
+ }
+
if (VectorCost < Cost) {
Cost = VectorCost;
Width = i;
@@ -5180,10 +5843,131 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
- VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
+ VectorizationFactor Factor = {ElementCount::getFixed(Width),
+ (unsigned)(Width * Cost)};
return Factor;
}
+bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
+ const Loop &L, ElementCount VF) const {
+ // Cross iteration phis such as reductions need special handling and are
+ // currently unsupported.
+ if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
+ return Legal->isFirstOrderRecurrence(&Phi) ||
+ Legal->isReductionVariable(&Phi);
+ }))
+ return false;
+
+ // Phis with uses outside of the loop require special handling and are
+ // currently unsupported.
+ for (auto &Entry : Legal->getInductionVars()) {
+ // Look for uses of the value of the induction at the last iteration.
+ Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
+ for (User *U : PostInc->users())
+ if (!L.contains(cast<Instruction>(U)))
+ return false;
+ // Look for uses of penultimate value of the induction.
+ for (User *U : Entry.first->users())
+ if (!L.contains(cast<Instruction>(U)))
+ return false;
+ }
+
+ // Induction variables that are widened require special handling that is
+ // currently not supported.
+ if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
+ return !(this->isScalarAfterVectorization(Entry.first, VF) ||
+ this->isProfitableToScalarize(Entry.first, VF));
+ }))
+ return false;
+
+ return true;
+}
+
+bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
+ const ElementCount VF) const {
+ // FIXME: We need a much better cost-model to take different parameters such
+ // as register pressure, code size increase and cost of extra branches into
+ // account. For now we apply a very crude heuristic and only consider loops
+ // with vectorization factors larger than a certain value.
+ // We also consider epilogue vectorization unprofitable for targets that don't
+ // consider interleaving beneficial (eg. MVE).
+ if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
+ return false;
+ if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
+ return true;
+ return false;
+}
+
+VectorizationFactor
+LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
+ const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
+ VectorizationFactor Result = VectorizationFactor::Disabled();
+ if (!EnableEpilogueVectorization) {
+ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
+ return Result;
+ }
+
+ if (!isScalarEpilogueAllowed()) {
+ LLVM_DEBUG(
+ dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
+ "allowed.\n";);
+ return Result;
+ }
+
+ // FIXME: This can be fixed for scalable vectors later, because at this stage
+ // the LoopVectorizer will only consider vectorizing a loop with scalable
+ // vectors when the loop has a hint to enable vectorization for a given VF.
+ if (MainLoopVF.isScalable()) {
+ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
+ "yet supported.\n");
+ return Result;
+ }
+
+ // Not really a cost consideration, but check for unsupported cases here to
+ // simplify the logic.
+ if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
+ LLVM_DEBUG(
+ dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
+ "not a supported candidate.\n";);
+ return Result;
+ }
+
+ if (EpilogueVectorizationForceVF > 1) {
+ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
+ if (LVP.hasPlanWithVFs(
+ {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
+ return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
+ else {
+ LLVM_DEBUG(
+ dbgs()
+ << "LEV: Epilogue vectorization forced factor is not viable.\n";);
+ return Result;
+ }
+ }
+
+ if (TheLoop->getHeader()->getParent()->hasOptSize() ||
+ TheLoop->getHeader()->getParent()->hasMinSize()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
+ return Result;
+ }
+
+ if (!isEpilogueVectorizationProfitable(MainLoopVF))
+ return Result;
+
+ for (auto &NextVF : ProfitableVFs)
+ if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
+ (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
+ LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
+ Result = NextVF;
+
+ if (Result != VectorizationFactor::Disabled())
+ LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
+ << Result.Width.getFixedValue() << "\n";);
+ return Result;
+}
+
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
unsigned MinWidth = -1U;
@@ -5210,6 +5994,11 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
if (!Legal->isReductionVariable(PN))
continue;
RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
+ if (PreferInLoopReductions ||
+ TTI.preferInLoopReduction(RdxDesc.getOpcode(),
+ RdxDesc.getRecurrenceType(),
+ TargetTransformInfo::ReductionFlags()))
+ continue;
T = RdxDesc.getRecurrenceType();
}
@@ -5240,7 +6029,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
return {MinWidth, MaxWidth};
}
-unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
+unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
unsigned LoopCost) {
// -- The interleave heuristics --
// We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -5263,10 +6052,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
if (Legal->getMaxSafeDepDistBytes() != -1U)
return 1;
- // Do not interleave loops with a relatively small known or estimated trip
- // count.
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
- if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
+ const bool HasReductions = !Legal->getReductionVars().empty();
+ // Do not interleave loops with a relatively small known or estimated trip
+ // count. But we will interleave when InterleaveSmallLoopScalarReduction is
+ // enabled, and the code has scalar reductions(HasReductions && VF = 1),
+ // because with the above conditions interleaving can expose ILP and break
+ // cross iteration dependences for reductions.
+ if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
+ !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
return 1;
RegisterUsage R = calculateRegisterUsage({VF})[0];
@@ -5294,7 +6088,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
<< " registers of "
<< TTI.getRegisterClassName(pair.first) << " register class\n");
- if (VF == 1) {
+ if (VF.isScalar()) {
if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
TargetNumRegisters = ForceTargetNumScalarRegs;
} else {
@@ -5318,10 +6112,11 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
}
// Clamp the interleave ranges to reasonable counts.
- unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
+ unsigned MaxInterleaveCount =
+ TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
// Check if the user has overridden the max.
- if (VF == 1) {
+ if (VF.isScalar()) {
if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
} else {
@@ -5330,28 +6125,47 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
}
// If trip count is known or estimated compile time constant, limit the
- // interleave count to be less than the trip count divided by VF.
+ // interleave count to be less than the trip count divided by VF, provided it
+ // is at least 1.
+ //
+ // For scalable vectors we can't know if interleaving is beneficial. It may
+ // not be beneficial for small loops if none of the lanes in the second vector
+ // iterations is enabled. However, for larger loops, there is likely to be a
+ // similar benefit as for fixed-width vectors. For now, we choose to leave
+ // the InterleaveCount as if vscale is '1', although if some information about
+ // the vector is known (e.g. min vector size), we can make a better decision.
if (BestKnownTC) {
- MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
+ MaxInterleaveCount =
+ std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
+ // Make sure MaxInterleaveCount is greater than 0.
+ MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
}
- // If we did not calculate the cost for VF (because the user selected the VF)
- // then we calculate the cost of VF here.
- if (LoopCost == 0)
- LoopCost = expectedCost(VF).first;
-
- assert(LoopCost && "Non-zero loop cost expected");
+ assert(MaxInterleaveCount > 0 &&
+ "Maximum interleave count must be greater than 0");
// Clamp the calculated IC to be between the 1 and the max interleave count
// that the target and trip count allows.
if (IC > MaxInterleaveCount)
IC = MaxInterleaveCount;
- else if (IC < 1)
- IC = 1;
+ else
+ // Make sure IC is greater than 0.
+ IC = std::max(1u, IC);
+
+ assert(IC > 0 && "Interleave count must be greater than 0.");
+
+ // If we did not calculate the cost for VF (because the user selected the VF)
+ // then we calculate the cost of VF here.
+ if (LoopCost == 0) {
+ assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
+ LoopCost = *expectedCost(VF).first.getValue();
+ }
+
+ assert(LoopCost && "Non-zero loop cost expected");
// Interleave if we vectorized this loop and there is a reduction that could
// benefit from interleaving.
- if (VF > 1 && !Legal->getReductionVars().empty()) {
+ if (VF.isVector() && HasReductions) {
LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
return IC;
}
@@ -5359,11 +6173,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
// Note that if we've already vectorized the loop we will have done the
// runtime check and so interleaving won't require further checks.
bool InterleavingRequiresRuntimePointerCheck =
- (VF == 1 && Legal->getRuntimePointerChecking()->Need);
+ (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
// We want to interleave small loops in order to reduce the loop overhead and
// potentially expose ILP opportunities.
- LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
+ << "LV: IC is " << IC << '\n'
+ << "LV: VF is " << VF << '\n');
+ const bool AggressivelyInterleaveReductions =
+ TTI.enableAggressiveInterleaving(HasReductions);
if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
// We assume that the cost overhead is 1 and we use the cost model
// to estimate the cost of the loop and interleave until the cost of the
@@ -5382,7 +6200,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
// by this point), we can increase the critical path length if the loop
// we're interleaving is inside another loop. Limit, by default to 2, so the
// critical path only gets increased by one reduction operation.
- if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
+ if (HasReductions && TheLoop->getLoopDepth() > 1) {
unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
SmallIC = std::min(SmallIC, F);
StoresIC = std::min(StoresIC, F);
@@ -5396,14 +6214,23 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
return std::max(StoresIC, LoadsIC);
}
- LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
- return SmallIC;
+ // If there are scalar reductions and TTI has enabled aggressive
+ // interleaving for reductions, we will interleave to expose ILP.
+ if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
+ AggressivelyInterleaveReductions) {
+ LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+ // Interleave no less than SmallIC but not as aggressive as the normal IC
+ // to satisfy the rare situation when resources are too limited.
+ return std::max(IC / 2, SmallIC);
+ } else {
+ LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+ return SmallIC;
+ }
}
// Interleave if this is a large loop (small loops are already dealt with by
// this point) that could benefit from interleaving.
- bool HasReductions = !Legal->getReductionVars().empty();
- if (TTI.enableAggressiveInterleaving(HasReductions)) {
+ if (AggressivelyInterleaveReductions) {
LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
return IC;
}
@@ -5413,7 +6240,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
}
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
-LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
+LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
// This function calculates the register usage by measuring the highest number
// of values that are alive at a single location. Obviously, this is a very
// rough estimation. We scan the loop in a topological order in order and
@@ -5485,26 +6312,17 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
TransposeEnds[Interval.second].push_back(Interval.first);
SmallPtrSet<Instruction *, 8> OpenIntervals;
-
- // Get the size of the widest register.
- unsigned MaxSafeDepDist = -1U;
- if (Legal->getMaxSafeDepDistBytes() != -1U)
- MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
- unsigned WidestRegister =
- std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
- const DataLayout &DL = TheFunction->getParent()->getDataLayout();
-
SmallVector<RegisterUsage, 8> RUs(VFs.size());
SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
// A lambda that gets the register usage for the given type and VF.
- auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
- if (Ty->isTokenTy())
+ const auto &TTICapture = TTI;
+ auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
+ if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
return 0U;
- unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
- return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
+ return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
};
for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
@@ -5528,7 +6346,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
// Count the number of live intervals.
SmallMapVector<unsigned, unsigned, 4> RegUsage;
- if (VFs[j] == 1) {
+ if (VFs[j].isScalar()) {
for (auto Inst : OpenIntervals) {
unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
if (RegUsage.find(ClassID) == RegUsage.end())
@@ -5557,7 +6375,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
}
}
}
-
+
for (auto& pair : RegUsage) {
if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
@@ -5575,10 +6393,12 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
SmallMapVector<unsigned, unsigned, 4> Invariant;
-
+
for (auto Inst : LoopInvariants) {
- unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
- unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
+ unsigned Usage =
+ VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
+ unsigned ClassID =
+ TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
if (Invariant.find(ClassID) == Invariant.end())
Invariant[ClassID] = Usage;
else
@@ -5626,12 +6446,13 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
NumPredStores > NumberOfStoresToPredicate);
}
-void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
+void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
// If we aren't vectorizing the loop, or if we've already collected the
// instructions to scalarize, there's nothing to do. Collection may already
// have occurred if we have a user-selected VF and are now computing the
// expected cost for interleaving.
- if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
+ if (VF.isScalar() || VF.isZero() ||
+ InstsToScalarize.find(VF) != InstsToScalarize.end())
return;
// Initialize a mapping for VF in InstsToScalalarize. If we find that it's
@@ -5660,14 +6481,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
}
int LoopVectorizationCostModel::computePredInstDiscount(
- Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
- unsigned VF) {
+ Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
assert(!isUniformAfterVectorization(PredInst, VF) &&
"Instruction marked uniform-after-vectorization will be predicated");
// Initialize the discount to zero, meaning that the scalar version and the
// vector version cost the same.
- int Discount = 0;
+ InstructionCost Discount = 0;
// Holds instructions to analyze. The instructions we visit are mapped in
// ScalarCosts. Those instructions are the ones that would be scalarized if
@@ -5722,22 +6542,27 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the cost of the vector instruction. Note that this cost already
// includes the scalarization overhead of the predicated instruction.
- unsigned VectorCost = getInstructionCost(I, VF).first;
+ InstructionCost VectorCost = getInstructionCost(I, VF).first;
// Compute the cost of the scalarized instruction. This cost is the cost of
// the instruction as if it wasn't if-converted and instead remained in the
// predicated block. We will scale this cost by block probability after
// computing the scalarization overhead.
- unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ InstructionCost ScalarCost =
+ VF.getKnownMinValue() *
+ getInstructionCost(I, ElementCount::getFixed(1)).first;
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(I->getType(), VF)),
- APInt::getAllOnesValue(VF), true, false);
- ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI,
- TTI::TCK_RecipThroughput);
+ APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ ScalarCost +=
+ VF.getKnownMinValue() *
+ TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
}
// Compute the scalarization overhead of needed extractelement
@@ -5750,10 +6575,12 @@ int LoopVectorizationCostModel::computePredInstDiscount(
"Instruction has non-scalar type");
if (canBeScalarized(J))
Worklist.push_back(J);
- else if (needsExtract(J, VF))
+ else if (needsExtract(J, VF)) {
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(J->getType(), VF)),
- APInt::getAllOnesValue(VF), false, true);
+ APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
+ }
}
// Scale the total scalar cost by block probability.
@@ -5765,11 +6592,11 @@ int LoopVectorizationCostModel::computePredInstDiscount(
ScalarCosts[I] = ScalarCost;
}
- return Discount;
+ return *Discount.getValue();
}
LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::expectedCost(unsigned VF) {
+LoopVectorizationCostModel::expectedCost(ElementCount VF) {
VectorizationCostTy Cost;
// For each block.
@@ -5779,14 +6606,15 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
// For each instruction in the old loop.
for (Instruction &I : BB->instructionsWithoutDebug()) {
// Skip ignored values.
- if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I)))
+ if (ValuesToIgnore.count(&I) ||
+ (VF.isVector() && VecValuesToIgnore.count(&I)))
continue;
VectorizationCostTy C = getInstructionCost(&I, VF);
// Check if we should override the cost.
if (ForceTargetInstructionCost.getNumOccurrences() > 0)
- C.first = ForceTargetInstructionCost;
+ C.first = InstructionCost(ForceTargetInstructionCost);
BlockCost.first += C.first;
BlockCost.second |= C.second;
@@ -5799,9 +6627,10 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
// if-converted. This means that the block's instructions (aside from
// stores and instructions that may divide by zero) will now be
// unconditionally executed. For the scalar case, we may not always execute
- // the predicated block. Thus, scale the block's cost by the probability of
- // executing it.
- if (VF == 1 && blockNeedsPredication(BB))
+ // the predicated block, if it is an if-else block. Thus, scale the block's
+ // cost by the probability of executing it. blockNeedsPredication from
+ // Legal is used so as to not include all blocks in tail folded loops.
+ if (VF.isScalar() && Legal->blockNeedsPredication(BB))
BlockCost.first /= getReciprocalPredBlockProb();
Cost.first += BlockCost.first;
@@ -5846,9 +6675,12 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
Legal->hasStride(I->getOperand(1));
}
-unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
- unsigned VF) {
- assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
+InstructionCost
+LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
+ ElementCount VF) {
+ assert(VF.isVector() &&
+ "Scalarization cost of instruction implies vectorization.");
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
Type *ValTy = getMemInstValueType(I);
auto SE = PSE.getSE();
@@ -5861,14 +6693,15 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
// Get the cost of the scalar memory instruction and address computation.
- unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+ InstructionCost Cost =
+ VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
const Align Alignment = getLoadStoreAlignment(I);
- Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
- Alignment, AS,
- TTI::TCK_RecipThroughput);
+ Cost += VF.getKnownMinValue() *
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
+ AS, TTI::TCK_RecipThroughput);
// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
@@ -5889,8 +6722,9 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
return Cost;
}
-unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
- unsigned VF) {
+InstructionCost
+LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
+ ElementCount VF) {
Type *ValTy = getMemInstValueType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
Value *Ptr = getLoadStorePointerOperand(I);
@@ -5901,7 +6735,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Stride should be 1 or -1 for consecutive memory access");
const Align Alignment = getLoadStoreAlignment(I);
- unsigned Cost = 0;
+ InstructionCost Cost = 0;
if (Legal->isMaskRequired(I))
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
CostKind);
@@ -5915,8 +6749,11 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
return Cost;
}
-unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
- unsigned VF) {
+InstructionCost
+LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
+ ElementCount VF) {
+ assert(Legal->isUniformMemOp(*I));
+
Type *ValTy = getMemInstValueType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
@@ -5937,11 +6774,12 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
(isLoopInvariantStoreValue
? 0
: TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
- VF - 1));
+ VF.getKnownMinValue() - 1));
}
-unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
- unsigned VF) {
+InstructionCost
+LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
+ ElementCount VF) {
Type *ValTy = getMemInstValueType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
@@ -5953,8 +6791,9 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
TargetTransformInfo::TCK_RecipThroughput, I);
}
-unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
- unsigned VF) {
+InstructionCost
+LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
+ ElementCount VF) {
Type *ValTy = getMemInstValueType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(I);
@@ -5963,7 +6802,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
assert(Group && "Fail to get an interleaved access group.");
unsigned InterleaveFactor = Group->getFactor();
- auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor);
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
// Holds the indices of existing members in an interleaved load group.
// An interleaved store group doesn't need this as it doesn't allow gaps.
@@ -5977,7 +6817,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
// Calculate the cost of the whole interleaved group.
bool UseMaskForGaps =
Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
- unsigned Cost = TTI.getInterleavedMemoryOpCost(
+ InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
@@ -5991,11 +6831,122 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
return Cost;
}
-unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
- unsigned VF) {
+InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
+ Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
+ // Early exit for no inloop reductions
+ if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
+ return InstructionCost::getInvalid();
+ auto *VectorTy = cast<VectorType>(Ty);
+
+ // We are looking for a pattern of, and finding the minimal acceptable cost:
+ // reduce(mul(ext(A), ext(B))) or
+ // reduce(mul(A, B)) or
+ // reduce(ext(A)) or
+ // reduce(A).
+ // The basic idea is that we walk down the tree to do that, finding the root
+ // reduction instruction in InLoopReductionImmediateChains. From there we find
+ // the pattern of mul/ext and test the cost of the entire pattern vs the cost
+ // of the components. If the reduction cost is lower then we return it for the
+ // reduction instruction and 0 for the other instructions in the pattern. If
+ // it is not we return an invalid cost specifying the orignal cost method
+ // should be used.
+ Instruction *RetI = I;
+ if ((RetI->getOpcode() == Instruction::SExt ||
+ RetI->getOpcode() == Instruction::ZExt)) {
+ if (!RetI->hasOneUser())
+ return InstructionCost::getInvalid();
+ RetI = RetI->user_back();
+ }
+ if (RetI->getOpcode() == Instruction::Mul &&
+ RetI->user_back()->getOpcode() == Instruction::Add) {
+ if (!RetI->hasOneUser())
+ return InstructionCost::getInvalid();
+ RetI = RetI->user_back();
+ }
+
+ // Test if the found instruction is a reduction, and if not return an invalid
+ // cost specifying the parent to use the original cost modelling.
+ if (!InLoopReductionImmediateChains.count(RetI))
+ return InstructionCost::getInvalid();
+
+ // Find the reduction this chain is a part of and calculate the basic cost of
+ // the reduction on its own.
+ Instruction *LastChain = InLoopReductionImmediateChains[RetI];
+ Instruction *ReductionPhi = LastChain;
+ while (!isa<PHINode>(ReductionPhi))
+ ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
+
+ RecurrenceDescriptor RdxDesc =
+ Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
+ unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(),
+ VectorTy, false, CostKind);
+
+ // Get the operand that was not the reduction chain and match it to one of the
+ // patterns, returning the better cost if it is found.
+ Instruction *RedOp = RetI->getOperand(1) == LastChain
+ ? dyn_cast<Instruction>(RetI->getOperand(0))
+ : dyn_cast<Instruction>(RetI->getOperand(1));
+
+ VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
+
+ if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) &&
+ !TheLoop->isLoopInvariant(RedOp)) {
+ bool IsUnsigned = isa<ZExtInst>(RedOp);
+ auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
+ InstructionCost RedCost = TTI.getExtendedAddReductionCost(
+ /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
+ CostKind);
+
+ unsigned ExtCost =
+ TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
+ TTI::CastContextHint::None, CostKind, RedOp);
+ if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
+ return I == RetI ? *RedCost.getValue() : 0;
+ } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) {
+ Instruction *Mul = RedOp;
+ Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0));
+ Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1));
+ if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) &&
+ Op0->getOpcode() == Op1->getOpcode() &&
+ Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
+ !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
+ bool IsUnsigned = isa<ZExtInst>(Op0);
+ auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
+ // reduce(mul(ext, ext))
+ unsigned ExtCost =
+ TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
+ TTI::CastContextHint::None, CostKind, Op0);
+ unsigned MulCost =
+ TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
+
+ InstructionCost RedCost = TTI.getExtendedAddReductionCost(
+ /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
+ CostKind);
+
+ if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
+ return I == RetI ? *RedCost.getValue() : 0;
+ } else {
+ unsigned MulCost =
+ TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
+
+ InstructionCost RedCost = TTI.getExtendedAddReductionCost(
+ /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
+ CostKind);
+
+ if (RedCost.isValid() && RedCost < MulCost + BaseCost)
+ return I == RetI ? *RedCost.getValue() : 0;
+ }
+ }
+
+ return I == RetI ? BaseCost : InstructionCost::getInvalid();
+}
+
+InstructionCost
+LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
+ ElementCount VF) {
// Calculate scalar cost only. Vectorization cost should be ready at this
// moment.
- if (VF == 1) {
+ if (VF.isScalar()) {
Type *ValTy = getMemInstValueType(I);
const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
@@ -6008,43 +6959,52 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
}
LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
+LoopVectorizationCostModel::getInstructionCost(Instruction *I,
+ ElementCount VF) {
// If we know that this instruction will remain uniform, check the cost of
// the scalar version.
if (isUniformAfterVectorization(I, VF))
- VF = 1;
+ VF = ElementCount::getFixed(1);
- if (VF > 1 && isProfitableToScalarize(I, VF))
+ if (VF.isVector() && isProfitableToScalarize(I, VF))
return VectorizationCostTy(InstsToScalarize[VF][I], false);
// Forced scalars do not have any scalarization overhead.
auto ForcedScalar = ForcedScalars.find(VF);
- if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
+ if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
auto InstSet = ForcedScalar->second;
if (InstSet.count(I))
- return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
+ return VectorizationCostTy(
+ (getInstructionCost(I, ElementCount::getFixed(1)).first *
+ VF.getKnownMinValue()),
+ false);
}
Type *VectorTy;
- unsigned C = getInstructionCost(I, VF, VectorTy);
+ InstructionCost C = getInstructionCost(I, VF, VectorTy);
bool TypeNotScalarized =
- VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
+ VF.isVector() && VectorTy->isVectorTy() &&
+ TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
return VectorizationCostTy(C, TypeNotScalarized);
}
-unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
- unsigned VF) {
+InstructionCost
+LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
+ ElementCount VF) {
- if (VF == 1)
+ assert(!VF.isScalable() &&
+ "cannot compute scalarization overhead for scalable vectorization");
+ if (VF.isScalar())
return 0;
- unsigned Cost = 0;
+ InstructionCost Cost = 0;
Type *RetTy = ToVectorTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(
- cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false);
+ cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
+ true, false);
// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6061,11 +7021,11 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
// Skip operands that do not require extraction/scalarization and do not incur
// any overhead.
return Cost + TTI.getOperandsScalarizationOverhead(
- filterExtractingOperands(Ops, VF), VF);
+ filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
}
-void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
- if (VF == 1)
+void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
+ if (VF.isScalar())
return;
NumPredStores = 0;
for (BasicBlock *BB : TheLoop->blocks()) {
@@ -6082,23 +7042,19 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
NumPredStores++;
- if (Legal->isUniform(Ptr) &&
- // Conditional loads and stores should be scalarized and predicated.
- // isScalarWithPredication cannot be used here since masked
- // gather/scatters are not considered scalar with predication.
- !Legal->blockNeedsPredication(I.getParent())) {
+ if (Legal->isUniformMemOp(I)) {
// TODO: Avoid replicating loads and stores instead of
// relying on instcombine to remove them.
// Load: Scalar load + broadcast
// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
- unsigned Cost = getUniformMemOpCost(&I, VF);
+ InstructionCost Cost = getUniformMemOpCost(&I, VF);
setWideningDecision(&I, VF, CM_Scalarize, Cost);
continue;
}
// We assume that widening is the best solution when possible.
if (memoryInstructionCanBeWidened(&I, VF)) {
- unsigned Cost = getConsecutiveMemOpCost(&I, VF);
+ InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
int ConsecutiveStride =
Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
@@ -6110,7 +7066,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
}
// Choose between Interleaving, Gather/Scatter or Scalarization.
- unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
+ InstructionCost InterleaveCost = std::numeric_limits<int>::max();
unsigned NumAccesses = 1;
if (isAccessInterleaved(&I)) {
auto Group = getInterleavedAccessGroup(&I);
@@ -6125,17 +7081,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
InterleaveCost = getInterleaveGroupCost(&I, VF);
}
- unsigned GatherScatterCost =
+ InstructionCost GatherScatterCost =
isLegalGatherOrScatter(&I)
? getGatherScatterCost(&I, VF) * NumAccesses
- : std::numeric_limits<unsigned>::max();
+ : std::numeric_limits<int>::max();
- unsigned ScalarizationCost =
+ InstructionCost ScalarizationCost =
getMemInstScalarizationCost(&I, VF) * NumAccesses;
// Choose better solution for the current VF,
// write down this decision and use it during vectorization.
- unsigned Cost;
+ InstructionCost Cost;
InstWidening Decision;
if (InterleaveCost <= GatherScatterCost &&
InterleaveCost < ScalarizationCost) {
@@ -6179,8 +7135,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
// Add all instructions used to generate the addresses.
SmallVector<Instruction *, 4> Worklist;
- for (auto *I : AddrDefs)
- Worklist.push_back(I);
+ append_range(Worklist, AddrDefs);
while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
for (auto &Op : I->operands())
@@ -6199,14 +7154,18 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
InstWidening Decision = getWideningDecision(I, VF);
if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
// Scalarize a widened load of address.
- setWideningDecision(I, VF, CM_Scalarize,
- (VF * getMemoryInstructionCost(I, 1)));
+ setWideningDecision(
+ I, VF, CM_Scalarize,
+ (VF.getKnownMinValue() *
+ getMemoryInstructionCost(I, ElementCount::getFixed(1))));
else if (auto Group = getInterleavedAccessGroup(I)) {
// Scalarize an interleave group of address loads.
for (unsigned I = 0; I < Group->getFactor(); ++I) {
if (Instruction *Member = Group->getMember(I))
- setWideningDecision(Member, VF, CM_Scalarize,
- (VF * getMemoryInstructionCost(Member, 1)));
+ setWideningDecision(
+ Member, VF, CM_Scalarize,
+ (VF.getKnownMinValue() *
+ getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
}
}
} else
@@ -6216,9 +7175,9 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
}
}
-unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
- unsigned VF,
- Type *&VectorTy) {
+InstructionCost
+LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
+ Type *&VectorTy) {
Type *RetTy = I->getType();
if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -6240,19 +7199,22 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// blocks requires also an extract of its vector compare i1 element.
bool ScalarPredicatedBB = false;
BranchInst *BI = cast<BranchInst>(I);
- if (VF > 1 && BI->isConditional() &&
+ if (VF.isVector() && BI->isConditional() &&
(PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
ScalarPredicatedBB = true;
if (ScalarPredicatedBB) {
// Return cost for branches around scalarized and predicated blocks.
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto *Vec_i1Ty =
- FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
- return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF),
- false, true) +
- (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF));
- } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
+ VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
+ return (TTI.getScalarizationOverhead(
+ Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
+ false, true) +
+ (TTI.getCFInstrCost(Instruction::Br, CostKind) *
+ VF.getKnownMinValue()));
+ } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
// The back-edge branch will remain, as will all scalar branches.
return TTI.getCFInstrCost(Instruction::Br, CostKind);
else
@@ -6267,20 +7229,20 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// First-order recurrences are replaced by vector shuffles inside the loop.
// NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
- if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
- return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- cast<VectorType>(VectorTy), VF - 1,
- FixedVectorType::get(RetTy, 1));
+ if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
+ return TTI.getShuffleCost(
+ TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
+ VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
// Phi nodes in non-header blocks (not inductions, reductions, etc.) are
// converted into select instructions. We require N - 1 selects per phi
// node, where N is the number of incoming values.
- if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
+ if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
return (Phi->getNumIncomingValues() - 1) *
TTI.getCmpSelInstrCost(
Instruction::Select, ToVectorTy(Phi->getType(), VF),
ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
- CostKind);
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
return TTI.getCFInstrCost(Instruction::PHI, CostKind);
}
@@ -6292,17 +7254,19 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// vector lane. Get the scalarization cost and scale this amount by the
// probability of executing the predicated block. If the instruction is not
// predicated, we fall through to the next case.
- if (VF > 1 && isScalarWithPredication(I)) {
- unsigned Cost = 0;
+ if (VF.isVector() && isScalarWithPredication(I)) {
+ InstructionCost Cost = 0;
// These instructions have a non-void type, so account for the phi nodes
// that we will create. This cost is likely to be zero. The phi node
// cost, if any, should be scaled by the block probability because it
// models a copy at the end of each predicated block.
- Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind);
+ Cost += VF.getKnownMinValue() *
+ TTI.getCFInstrCost(Instruction::PHI, CostKind);
// The cost of the non-predicated instruction.
- Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
+ Cost += VF.getKnownMinValue() *
+ TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
// The cost of insertelement and extractelement instructions needed for
// scalarization.
@@ -6331,6 +7295,13 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// Since we will replace the stride by 1 the multiplication should go away.
if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
return 0;
+
+ // Detect reduction patterns
+ InstructionCost RedCost;
+ if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+ .isValid())
+ return RedCost;
+
// Certain instructions can be cheaper to vectorize if they have a constant
// second vector operand. One example of this are shifts on x86.
Value *Op2 = I->getOperand(1);
@@ -6341,14 +7312,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
Op2VK = TargetTransformInfo::OK_UniformValue;
SmallVector<const Value *, 4> Operands(I->operand_values());
- unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
+ unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
return N * TTI.getArithmeticInstrCost(
I->getOpcode(), VectorTy, CostKind,
TargetTransformInfo::OK_AnyValue,
Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
}
case Instruction::FNeg: {
- unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
+ assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+ unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
return N * TTI.getArithmeticInstrCost(
I->getOpcode(), VectorTy, CostKind,
TargetTransformInfo::OK_AnyValue,
@@ -6362,10 +7334,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
Type *CondTy = SI->getCondition()->getType();
if (!ScalarCond)
- CondTy = FixedVectorType::get(CondTy, VF);
-
+ CondTy = VectorType::get(CondTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
- CostKind, I);
+ CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
@@ -6374,18 +7345,18 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
VectorTy = ToVectorTy(ValTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
- I);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
}
case Instruction::Store:
case Instruction::Load: {
- unsigned Width = VF;
- if (Width > 1) {
+ ElementCount Width = VF;
+ if (Width.isVector()) {
InstWidening Decision = getWideningDecision(I, Width);
assert(Decision != CM_Unknown &&
"CM decision should be taken at this point");
if (Decision == CM_Scalarize)
- Width = 1;
+ Width = ElementCount::getFixed(1);
}
VectorTy = ToVectorTy(getMemInstValueType(I), Width);
return getMemoryInstructionCost(I, VF);
@@ -6402,15 +7373,62 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
+ // Computes the CastContextHint from a Load/Store instruction.
+ auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected a load or a store!");
+
+ if (VF.isScalar() || !TheLoop->contains(I))
+ return TTI::CastContextHint::Normal;
+
+ switch (getWideningDecision(I, VF)) {
+ case LoopVectorizationCostModel::CM_GatherScatter:
+ return TTI::CastContextHint::GatherScatter;
+ case LoopVectorizationCostModel::CM_Interleave:
+ return TTI::CastContextHint::Interleave;
+ case LoopVectorizationCostModel::CM_Scalarize:
+ case LoopVectorizationCostModel::CM_Widen:
+ return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
+ : TTI::CastContextHint::Normal;
+ case LoopVectorizationCostModel::CM_Widen_Reverse:
+ return TTI::CastContextHint::Reversed;
+ case LoopVectorizationCostModel::CM_Unknown:
+ llvm_unreachable("Instr did not go through cost modelling?");
+ }
+
+ llvm_unreachable("Unhandled case!");
+ };
+
+ unsigned Opcode = I->getOpcode();
+ TTI::CastContextHint CCH = TTI::CastContextHint::None;
+ // For Trunc, the context is the only user, which must be a StoreInst.
+ if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
+ if (I->hasOneUse())
+ if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
+ CCH = ComputeCCH(Store);
+ }
+ // For Z/Sext, the context is the operand, which must be a LoadInst.
+ else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
+ Opcode == Instruction::FPExt) {
+ if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
+ CCH = ComputeCCH(Load);
+ }
+
// We optimize the truncation of induction variables having constant
// integer steps. The cost of these truncations is the same as the scalar
// operation.
if (isOptimizableIVTruncate(I, VF)) {
auto *Trunc = cast<TruncInst>(I);
return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
- Trunc->getSrcTy(), CostKind, Trunc);
+ Trunc->getSrcTy(), CCH, CostKind, Trunc);
}
+ // Detect reduction patterns
+ InstructionCost RedCost;
+ if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+ .isValid())
+ return RedCost;
+
Type *SrcScalarTy = I->getOperand(0)->getType();
Type *SrcVecTy =
VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
@@ -6421,35 +7439,39 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
//
// Calculate the modified src and dest types.
Type *MinVecTy = VectorTy;
- if (I->getOpcode() == Instruction::Trunc) {
+ if (Opcode == Instruction::Trunc) {
SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
VectorTy =
largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
- } else if (I->getOpcode() == Instruction::ZExt ||
- I->getOpcode() == Instruction::SExt) {
+ } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
VectorTy =
smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
}
}
- unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
- return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy,
- CostKind, I);
+ assert(!VF.isScalable() && "VF is assumed to be non scalable");
+ unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
+ return N *
+ TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}
case Instruction::Call: {
bool NeedToScalarize;
CallInst *CI = cast<CallInst>(I);
- unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
- if (getVectorIntrinsicIDForCall(CI, TLI))
- return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
+ InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
+ if (getVectorIntrinsicIDForCall(CI, TLI)) {
+ InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
+ return std::min(CallCost, IntrinsicCost);
+ }
return CallCost;
}
+ case Instruction::ExtractValue:
+ return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
default:
// The cost of executing VF copies of the scalar instruction. This opcode
// is unknown. Assume that it is the same as 'mul'.
- return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
- CostKind) +
+ return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
+ Instruction::Mul, VectorTy, CostKind) +
getScalarizationOverhead(I, VF);
} // end of switch.
}
@@ -6502,7 +7524,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// detection.
for (auto &Reduction : Legal->getReductionVars()) {
RecurrenceDescriptor &RedDes = Reduction.second;
- SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
+ const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
// Ignore type-casting instructions we identified during induction
@@ -6514,6 +7536,43 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
}
}
+void LoopVectorizationCostModel::collectInLoopReductions() {
+ for (auto &Reduction : Legal->getReductionVars()) {
+ PHINode *Phi = Reduction.first;
+ RecurrenceDescriptor &RdxDesc = Reduction.second;
+
+ // We don't collect reductions that are type promoted (yet).
+ if (RdxDesc.getRecurrenceType() != Phi->getType())
+ continue;
+
+ // If the target would prefer this reduction to happen "in-loop", then we
+ // want to record it as such.
+ unsigned Opcode = RdxDesc.getOpcode();
+ if (!PreferInLoopReductions &&
+ !TTI.preferInLoopReduction(Opcode, Phi->getType(),
+ TargetTransformInfo::ReductionFlags()))
+ continue;
+
+ // Check that we can correctly put the reductions into the loop, by
+ // finding the chain of operations that leads from the phi to the loop
+ // exit value.
+ SmallVector<Instruction *, 4> ReductionOperations =
+ RdxDesc.getReductionOpChain(Phi, TheLoop);
+ bool InLoop = !ReductionOperations.empty();
+ if (InLoop) {
+ InLoopReductionChains[Phi] = ReductionOperations;
+ // Add the elements to InLoopReductionImmediateChains for cost modelling.
+ Instruction *LastChain = Phi;
+ for (auto *I : ReductionOperations) {
+ InLoopReductionImmediateChains[I] = LastChain;
+ LastChain = I;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
+ << " reduction for phi: " << *Phi << "\n");
+ }
+}
+
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
@@ -6527,37 +7586,40 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
}
VectorizationFactor
-LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
- unsigned VF = UserVF;
+LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
+ assert(!UserVF.isScalable() && "scalable vectors not yet supported");
+ ElementCount VF = UserVF;
// Outer loop handling: They may require CFG and instruction level
// transformations before even evaluating whether vectorization is profitable.
// Since we cannot modify the incoming IR, we need to build VPlan upfront in
// the vectorization pipeline.
- if (!OrigLoop->empty()) {
+ if (!OrigLoop->isInnermost()) {
// If the user doesn't provide a vectorization factor, determine a
// reasonable one.
- if (!UserVF) {
- VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
+ if (UserVF.isZero()) {
+ VF = ElementCount::getFixed(
+ determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
// Make sure we have a VF > 1 for stress testing.
- if (VPlanBuildStressTest && VF < 2) {
+ if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
<< "overriding computed VF.\n");
- VF = 4;
+ VF = ElementCount::getFixed(4);
}
}
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
- assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
- LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
- << " to build VPlans.\n");
+ assert(isPowerOf2_32(VF.getKnownMinValue()) &&
+ "VF needs to be a power of two");
+ LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
+ << "VF " << VF << " to build VPlans.\n");
buildVPlans(VF, VF);
// For VPlan build stress testing, we bail out after VPlan construction.
if (VPlanBuildStressTest)
return VectorizationFactor::Disabled();
- return {VF, 0};
+ return {VF, 0 /*Cost*/};
}
LLVM_DEBUG(
@@ -6566,10 +7628,10 @@ LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
return VectorizationFactor::Disabled();
}
-Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF,
- unsigned UserIC) {
- assert(OrigLoop->empty() && "Inner loop expected.");
- Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
+Optional<VectorizationFactor>
+LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
+ assert(OrigLoop->isInnermost() && "Inner loop expected.");
+ Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
return None;
@@ -6587,40 +7649,55 @@ Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF,
CM.invalidateCostModelingDecisions();
}
- if (UserVF) {
- LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
- assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+ ElementCount MaxVF = MaybeMaxVF.getValue();
+ assert(MaxVF.isNonZero() && "MaxVF is zero.");
+
+ bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
+ if (!UserVF.isZero() &&
+ (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
+    // FIXME: MaxVF is temporarily used in place of UserVF for illegal scalable
+ // VFs here, this should be reverted to only use legal UserVFs once the
+ // loop below supports scalable VFs.
+ ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
+ LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
+ << " VF " << VF << ".\n");
+ assert(isPowerOf2_32(VF.getKnownMinValue()) &&
+ "VF needs to be a power of two");
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
- CM.selectUserVectorizationFactor(UserVF);
- buildVPlansWithVPRecipes(UserVF, UserVF);
+ CM.selectUserVectorizationFactor(VF);
+ CM.collectInLoopReductions();
+ buildVPlansWithVPRecipes(VF, VF);
LLVM_DEBUG(printPlans(dbgs()));
- return {{UserVF, 0}};
+ return {{VF, 0}};
}
- unsigned MaxVF = MaybeMaxVF.getValue();
- assert(MaxVF != 0 && "MaxVF is zero.");
+ assert(!MaxVF.isScalable() &&
+ "Scalable vectors not yet supported beyond this point");
- for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
+ for (ElementCount VF = ElementCount::getFixed(1);
+ ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
// Collect Uniform and Scalar instructions after vectorization with VF.
CM.collectUniformsAndScalars(VF);
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
- if (VF > 1)
+ if (VF.isVector())
CM.collectInstsToScalarize(VF);
}
- buildVPlansWithVPRecipes(1, MaxVF);
+ CM.collectInLoopReductions();
+
+ buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
LLVM_DEBUG(printPlans(dbgs()));
- if (MaxVF == 1)
+ if (MaxVF.isScalar())
return VectorizationFactor::Disabled();
// Select the optimal vectorization factor.
return CM.selectVectorizationFactor(MaxVF);
}
-void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
+void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
<< '\n');
BestVF = VF;
@@ -6639,13 +7716,23 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
// 1. Create a new empty loop. Unlink the old loop and connect the new one.
VPCallbackILV CallbackILV(ILV);
- VPTransformState State{BestVF, BestUF, LI,
- DT, ILV.Builder, ILV.VectorLoopValueMap,
- &ILV, CallbackILV};
+ assert(BestVF.hasValue() && "Vectorization Factor is missing");
+
+ VPTransformState State{*BestVF,
+ BestUF,
+ OrigLoop,
+ LI,
+ DT,
+ ILV.Builder,
+ ILV.VectorLoopValueMap,
+ &ILV,
+ CallbackILV};
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
State.TripCount = ILV.getOrCreateTripCount(nullptr);
State.CanonicalIV = ILV.Induction;
+ ILV.printDebugTracesAtStart();
+
//===------------------------------------------------===//
//
// Notice: any optimization or new instruction that go
@@ -6661,25 +7748,48 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
ILV.fixVectorizedLoop();
+
+ ILV.printDebugTracesAtEnd();
}
void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
SmallPtrSetImpl<Instruction *> &DeadInstructions) {
- BasicBlock *Latch = OrigLoop->getLoopLatch();
- // We create new control-flow for the vectorized loop, so the original
- // condition will be dead after vectorization if it's only used by the
- // branch.
- auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
- if (Cmp && Cmp->hasOneUse())
- DeadInstructions.insert(Cmp);
+  // We create new control-flow for the vectorized loop, so the original exit
+  // conditions will be dead after vectorization if they are only used by the
+  // terminator.
+ SmallVector<BasicBlock*> ExitingBlocks;
+ OrigLoop->getExitingBlocks(ExitingBlocks);
+ for (auto *BB : ExitingBlocks) {
+ auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
+ if (!Cmp || !Cmp->hasOneUse())
+ continue;
+
+ // TODO: we should introduce a getUniqueExitingBlocks on Loop
+ if (!DeadInstructions.insert(Cmp).second)
+ continue;
+
+    // The operands of the icmp are often a dead trunc, used by IndUpdate.
+ // TODO: can recurse through operands in general
+ for (Value *Op : Cmp->operands()) {
+ if (isa<TruncInst>(Op) && Op->hasOneUse())
+ DeadInstructions.insert(cast<Instruction>(Op));
+ }
+ }
// We create new "steps" for induction variable updates to which the original
// induction variables map. An original update instruction will be dead if
// all its users except the induction variable are dead.
+ auto *Latch = OrigLoop->getLoopLatch();
for (auto &Induction : Legal->getInductionVars()) {
PHINode *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+
+    // If the tail is to be folded by masking, the primary induction variable,
+    // if it exists, isn't dead: it will be used for masking. Don't kill it.
+ if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
+ continue;
+
if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
return U == Ind || DeadInstructions.count(cast<Instruction>(U));
}))
@@ -6754,12 +7864,284 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
}
}
+//===--------------------------------------------------------------------===//
+// EpilogueVectorizerMainLoop
+//===--------------------------------------------------------------------===//
+
+/// This function is partially responsible for generating the control flow
+/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
+BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
+ MDNode *OrigLoopID = OrigLoop->getLoopID();
+ Loop *Lp = createVectorLoopSkeleton("");
+
+ // Generate the code to check the minimum iteration count of the vector
+ // epilogue (see below).
+ EPI.EpilogueIterationCountCheck =
+ emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
+ EPI.EpilogueIterationCountCheck->setName("iter.check");
+
+ // Generate the code to check any assumptions that we've made for SCEV
+ // expressions.
+ BasicBlock *SavedPreHeader = LoopVectorPreHeader;
+ emitSCEVChecks(Lp, LoopScalarPreHeader);
+
+  // If a safety check was generated, save it.
+ if (SavedPreHeader != LoopVectorPreHeader)
+ EPI.SCEVSafetyCheck = SavedPreHeader;
+
+ // Generate the code that checks at runtime if arrays overlap. We put the
+ // checks into a separate block to make the more common case of few elements
+ // faster.
+ SavedPreHeader = LoopVectorPreHeader;
+ emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
+
+  // If a safety check was generated, save/overwrite it.
+ if (SavedPreHeader != LoopVectorPreHeader)
+ EPI.MemSafetyCheck = SavedPreHeader;
+
+ // Generate the iteration count check for the main loop, *after* the check
+ // for the epilogue loop, so that the path-length is shorter for the case
+ // that goes directly through the vector epilogue. The longer-path length for
+ // the main loop is compensated for, by the gain from vectorizing the larger
+ // trip count. Note: the branch will get updated later on when we vectorize
+ // the epilogue.
+ EPI.MainLoopIterationCountCheck =
+ emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
+
+ // Generate the induction variable.
+ OldInduction = Legal->getPrimaryInduction();
+ Type *IdxTy = Legal->getWidestInductionType();
+ Value *StartIdx = ConstantInt::get(IdxTy, 0);
+ Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+ Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
+ EPI.VectorTripCount = CountRoundDown;
+ Induction =
+ createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+ getDebugLocFromInstOrOperands(OldInduction));
+
+ // Skip induction resume value creation here because they will be created in
+ // the second pass. If we created them here, they wouldn't be used anyway,
+ // because the vplan in the second pass still contains the inductions from the
+ // original loop.
+
+ return completeLoopSkeleton(Lp, OrigLoopID);
+}
+
+void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
+ LLVM_DEBUG({
+ dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
+ << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
+ << ", Main Loop UF:" << EPI.MainLoopUF
+ << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
+ << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
+ });
+}
+
+void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
+ DEBUG_WITH_TYPE(VerboseDebug, {
+ dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
+ });
+}
+
+BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
+ Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
+ assert(L && "Expected valid Loop.");
+ assert(Bypass && "Expected valid bypass basic block.");
+ unsigned VFactor =
+ ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
+ unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
+ Value *Count = getOrCreateTripCount(L);
+ // Reuse existing vector loop preheader for TC checks.
+ // Note that new preheader block is generated for vector loop.
+ BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+ IRBuilder<> Builder(TCCheckBlock->getTerminator());
+
+ // Generate code to check if the loop's trip count is less than VF * UF of the
+ // main vector loop.
+ auto P =
+ Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+
+ Value *CheckMinIters = Builder.CreateICmp(
+ P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
+ "min.iters.check");
+
+ if (!ForEpilogue)
+ TCCheckBlock->setName("vector.main.loop.iter.check");
+
+ // Create new preheader for vector loop.
+ LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
+ DT, LI, nullptr, "vector.ph");
+
+ if (ForEpilogue) {
+ assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
+ DT->getNode(Bypass)->getIDom()) &&
+ "TC check is expected to dominate Bypass");
+
+ // Update dominator for Bypass & LoopExit.
+ DT->changeImmediateDominator(Bypass, TCCheckBlock);
+ DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
+
+ LoopBypassBlocks.push_back(TCCheckBlock);
+
+ // Save the trip count so we don't have to regenerate it in the
+ // vec.epilog.iter.check. This is safe to do because the trip count
+ // generated here dominates the vector epilog iter check.
+ EPI.TripCount = Count;
+ }
+
+ ReplaceInstWithInst(
+ TCCheckBlock->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+
+ return TCCheckBlock;
+}
+
+//===--------------------------------------------------------------------===//
+// EpilogueVectorizerEpilogueLoop
+//===--------------------------------------------------------------------===//
+
+/// This function is partially responsible for generating the control flow
+/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
+BasicBlock *
+EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
+ MDNode *OrigLoopID = OrigLoop->getLoopID();
+ Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
+
+ // Now, compare the remaining count and if there aren't enough iterations to
+ // execute the vectorized epilogue skip to the scalar part.
+ BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
+ VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
+ LoopVectorPreHeader =
+ SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+ LI, nullptr, "vec.epilog.ph");
+ emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
+ VecEpilogueIterationCountCheck);
+
+ // Adjust the control flow taking the state info from the main loop
+ // vectorization into account.
+ assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
+ "expected this to be saved from the previous pass.");
+ EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
+ VecEpilogueIterationCountCheck, LoopVectorPreHeader);
+
+ DT->changeImmediateDominator(LoopVectorPreHeader,
+ EPI.MainLoopIterationCountCheck);
+
+ EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
+ VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+
+ if (EPI.SCEVSafetyCheck)
+ EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
+ VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+ if (EPI.MemSafetyCheck)
+ EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
+ VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+
+ DT->changeImmediateDominator(
+ VecEpilogueIterationCountCheck,
+ VecEpilogueIterationCountCheck->getSinglePredecessor());
+
+ DT->changeImmediateDominator(LoopScalarPreHeader,
+ EPI.EpilogueIterationCountCheck);
+ DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
+
+ // Keep track of bypass blocks, as they feed start values to the induction
+ // phis in the scalar loop preheader.
+ if (EPI.SCEVSafetyCheck)
+ LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
+ if (EPI.MemSafetyCheck)
+ LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
+ LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
+
+ // Generate a resume induction for the vector epilogue and put it in the
+ // vector epilogue preheader
+ Type *IdxTy = Legal->getWidestInductionType();
+ PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
+ LoopVectorPreHeader->getFirstNonPHI());
+ EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
+ EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
+ EPI.MainLoopIterationCountCheck);
+
+ // Generate the induction variable.
+ OldInduction = Legal->getPrimaryInduction();
+ Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
+ Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+ Value *StartIdx = EPResumeVal;
+ Induction =
+ createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+ getDebugLocFromInstOrOperands(OldInduction));
+
+ // Generate induction resume values. These variables save the new starting
+ // indexes for the scalar loop. They are used to test if there are any tail
+ // iterations left once the vector loop has completed.
+ // Note that when the vectorized epilogue is skipped due to iteration count
+ // check, then the resume value for the induction variable comes from
+ // the trip count of the main vector loop, hence passing the AdditionalBypass
+ // argument.
+ createInductionResumeValues(Lp, CountRoundDown,
+ {VecEpilogueIterationCountCheck,
+ EPI.VectorTripCount} /* AdditionalBypass */);
+
+ AddRuntimeUnrollDisableMetaData(Lp);
+ return completeLoopSkeleton(Lp, OrigLoopID);
+}
+
+BasicBlock *
+EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
+ Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
+
+ assert(EPI.TripCount &&
+         "Expected trip count to have been saved in the first pass.");
+ assert(
+ (!isa<Instruction>(EPI.TripCount) ||
+ DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
+ "saved trip count does not dominate insertion point.");
+ Value *TC = EPI.TripCount;
+ IRBuilder<> Builder(Insert->getTerminator());
+ Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
+
+ // Generate code to check if the loop's trip count is less than VF * UF of the
+ // vector epilogue loop.
+ auto P =
+ Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+
+ Value *CheckMinIters = Builder.CreateICmp(
+ P, Count,
+ ConstantInt::get(Count->getType(),
+ EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
+ "min.epilog.iters.check");
+
+ ReplaceInstWithInst(
+ Insert->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+
+ LoopBypassBlocks.push_back(Insert);
+ return Insert;
+}
+
+void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
+ LLVM_DEBUG({
+ dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
+ << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
+ << ", Main Loop UF:" << EPI.MainLoopUF
+ << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
+ << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
+ });
+}
+
+void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
+ DEBUG_WITH_TYPE(VerboseDebug, {
+ dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
+ });
+}
+
bool LoopVectorizationPlanner::getDecisionAndClampRange(
- const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
- assert(Range.End > Range.Start && "Trying to test an empty VF range.");
+ const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
+ assert(!Range.isEmpty() && "Trying to test an empty VF range.");
bool PredicateAtRangeStart = Predicate(Range.Start);
- for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
+ for (ElementCount TmpVF = Range.Start * 2;
+ ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
if (Predicate(TmpVF) != PredicateAtRangeStart) {
Range.End = TmpVF;
break;
@@ -6773,9 +8155,11 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange(
/// of VF's starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
-void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
- for (unsigned VF = MinVF; VF < MaxVF + 1;) {
- VFRange SubRange = {VF, MaxVF + 1};
+void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
+ ElementCount MaxVF) {
+ auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
+ for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
+ VFRange SubRange = {VF, MaxVFPlusOne};
VPlans.push_back(buildVPlan(SubRange));
VF = SubRange.End;
}
@@ -6800,7 +8184,13 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
return EdgeMaskCache[Edge] = SrcMask;
- VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
+ // If source is an exiting block, we know the exit edge is dynamically dead
+ // in the vector loop, and thus we don't need to restrict the mask. Avoid
+ // adding uses of an otherwise potentially dead instruction.
+ if (OrigLoop->isLoopExiting(Src))
+ return EdgeMaskCache[Edge] = SrcMask;
+
+ VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
assert(EdgeMask && "No Edge Mask found for condition");
if (BI->getSuccessor(0) != Dst)
@@ -6828,23 +8218,34 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
if (!CM.blockNeedsPredication(BB))
return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
+ // Create the block in mask as the first non-phi instruction in the block.
+ VPBuilder::InsertPointGuard Guard(Builder);
+ auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
+ Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
+
// Introduce the early-exit compare IV <= BTC to form header block mask.
// This is used instead of IV < TC because TC may wrap, unlike BTC.
// Start by constructing the desired canonical IV.
VPValue *IV = nullptr;
if (Legal->getPrimaryInduction())
- IV = Plan->getVPValue(Legal->getPrimaryInduction());
+ IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
else {
auto IVRecipe = new VPWidenCanonicalIVRecipe();
- Builder.getInsertBlock()->appendRecipe(IVRecipe);
+ Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
IV = IVRecipe->getVPValue();
}
VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
bool TailFolded = !CM.isScalarEpilogueAllowed();
- if (TailFolded && CM.TTI.emitGetActiveLaneMask())
- BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
- else
+
+ if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
+ // While ActiveLaneMask is a binary op that consumes the loop tripcount
+ // as a second argument, we only pass the IV here and extract the
+ // tripcount from the transform state where codegen of the VP instructions
+ // happen.
+ BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
+ } else {
BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+ }
return BlockMaskCache[BB] = BlockMask;
}
@@ -6865,14 +8266,13 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
return BlockMaskCache[BB] = BlockMask;
}
-VPWidenMemoryInstructionRecipe *
-VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
- VPlanPtr &Plan) {
+VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
+ VPlanPtr &Plan) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
"Must be called with either a load or store");
- auto willWiden = [&](unsigned VF) -> bool {
- if (VF == 1)
+ auto willWiden = [&](ElementCount VF) -> bool {
+ if (VF.isScalar())
return false;
LoopVectorizationCostModel::InstWidening Decision =
CM.getWideningDecision(I, VF);
@@ -6903,20 +8303,22 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
}
VPWidenIntOrFpInductionRecipe *
-VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
+VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const {
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
if (II.getKind() == InductionDescriptor::IK_IntInduction ||
- II.getKind() == InductionDescriptor::IK_FpInduction)
- return new VPWidenIntOrFpInductionRecipe(Phi);
+ II.getKind() == InductionDescriptor::IK_FpInduction) {
+ VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start);
+ }
return nullptr;
}
VPWidenIntOrFpInductionRecipe *
-VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
- VFRange &Range) const {
+VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range,
+ VPlan &Plan) const {
// Optimize the special case where the source is a constant integer
// induction variable. Notice that we can only optimize the 'trunc' case
// because (a) FP conversions lose precision, (b) sext/zext may wrap, and
@@ -6925,15 +8327,21 @@ VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
// Determine whether \p K is a truncation based on an induction variable that
// can be optimized.
auto isOptimizableIVTruncate =
- [&](Instruction *K) -> std::function<bool(unsigned)> {
- return
- [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
+ [&](Instruction *K) -> std::function<bool(ElementCount)> {
+ return [=](ElementCount VF) -> bool {
+ return CM.isOptimizableIVTruncate(K, VF);
+ };
};
if (LoopVectorizationPlanner::getDecisionAndClampRange(
- isOptimizableIVTruncate(I), Range))
+ isOptimizableIVTruncate(I), Range)) {
+
+ InductionDescriptor II =
+ Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
+ VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
- I);
+ Start, I);
+ }
return nullptr;
}
@@ -6962,7 +8370,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
VPlan &Plan) const {
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); },
+ [this, CI](ElementCount VF) {
+ return CM.isScalarWithPredication(CI, VF);
+ },
Range);
if (IsPredicated)
@@ -6970,19 +8380,23 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
- ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
+ ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
+ ID == Intrinsic::pseudoprobe ||
+ ID == Intrinsic::experimental_noalias_scope_decl))
return nullptr;
- auto willWiden = [&](unsigned VF) -> bool {
+ auto willWiden = [&](ElementCount VF) -> bool {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// The following case may be scalarized depending on the VF.
// The flag shows whether we use Intrinsic or a usual Call for vectorized
// version of the instruction.
// Is it beneficial to perform intrinsic call compared to lib call?
bool NeedToScalarize = false;
- unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
- bool UseVectorIntrinsic =
- ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
+ InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
+ InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
+ bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
+ assert(IntrinsicCost.isValid() && CallCost.isValid() &&
+ "Cannot have invalid costs while widening");
return UseVectorIntrinsic || !NeedToScalarize;
};
@@ -6997,7 +8411,7 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
!isa<StoreInst>(I) && "Instruction should have been handled earlier");
// Instruction should be widened, unless it is scalar after vectorization,
// scalarization is profitable or it is predicated.
- auto WillScalarize = [this, I](unsigned VF) -> bool {
+ auto WillScalarize = [this, I](ElementCount VF) -> bool {
return CM.isScalarAfterVectorization(I, VF) ||
CM.isProfitableToScalarize(I, VF) ||
CM.isScalarWithPredication(I, VF);
@@ -7060,15 +8474,17 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
VPlanPtr &Plan) {
bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
+ [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
+ [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
+ Range);
auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
IsUniform, IsPredicated);
setRecipe(I, Recipe);
+ Plan->addVPValue(I, Recipe);
// Find if I uses a predicated instruction. If so, it will use its scalar
// value. Avoid hoisting the insert-element which packs the scalar value into
@@ -7110,8 +8526,9 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
assert(Instr->getParent() && "Predicated instruction not in any basic block");
auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
- auto *PHIRecipe =
- Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
+ auto *PHIRecipe = Instr->getType()->isVoidTy()
+ ? nullptr
+ : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
@@ -7139,13 +8556,21 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
if (auto Phi = dyn_cast<PHINode>(Instr)) {
if (Phi->getParent() != OrigLoop->getHeader())
return tryToBlend(Phi, Plan);
- if ((Recipe = tryToOptimizeInductionPHI(Phi)))
+ if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
return Recipe;
+
+ if (Legal->isReductionVariable(Phi)) {
+ RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
+ VPValue *StartV =
+ Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
+ return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
+ }
+
return new VPWidenPHIRecipe(Phi);
}
- if (isa<TruncInst>(Instr) &&
- (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
+ if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
+ cast<TruncInst>(Instr), Range, *Plan)))
return Recipe;
if (!shouldWiden(Instr, Range))
@@ -7165,35 +8590,9 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return tryToWiden(Instr, *Plan);
}
-void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
- unsigned MaxVF) {
- assert(OrigLoop->empty() && "Inner loop expected.");
-
- // Collect conditions feeding internal conditional branches; they need to be
- // represented in VPlan for it to model masking.
- SmallPtrSet<Value *, 1> NeedDef;
-
- auto *Latch = OrigLoop->getLoopLatch();
- for (BasicBlock *BB : OrigLoop->blocks()) {
- if (BB == Latch)
- continue;
- BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
- if (Branch && Branch->isConditional())
- NeedDef.insert(Branch->getCondition());
- }
-
- // If the tail is to be folded by masking, the primary induction variable, if
- // exists needs to be represented in VPlan for it to model early-exit masking.
- // Also, both the Phi and the live-out instruction of each reduction are
- // required in order to introduce a select between them in VPlan.
- if (CM.foldTailByMasking()) {
- if (Legal->getPrimaryInduction())
- NeedDef.insert(Legal->getPrimaryInduction());
- for (auto &Reduction : Legal->getReductionVars()) {
- NeedDef.insert(Reduction.first);
- NeedDef.insert(Reduction.second.getLoopExitInstr());
- }
- }
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
+ ElementCount MaxVF) {
+ assert(OrigLoop->isInnermost() && "Inner loop expected.");
// Collect instructions from the original loop that will become trivially dead
// in the vectorized loop. We don't need to vectorize these instructions. For
@@ -7216,17 +8615,17 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
for (Instruction *I : DeadInstructions)
SinkAfter.erase(I);
- for (unsigned VF = MinVF; VF < MaxVF + 1;) {
- VFRange SubRange = {VF, MaxVF + 1};
- VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
- DeadInstructions, SinkAfter));
+ auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
+ for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
+ VFRange SubRange = {VF, MaxVFPlusOne};
+ VPlans.push_back(
+ buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
VF = SubRange.End;
}
}
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
- VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
- SmallPtrSetImpl<Instruction *> &DeadInstructions,
+ VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
const DenseMap<Instruction *, Instruction *> &SinkAfter) {
// Hold a mapping from predicated instructions to their recipes, in order to
@@ -7249,14 +8648,28 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
RecipeBuilder.recordRecipeOf(Entry.first);
RecipeBuilder.recordRecipeOf(Entry.second);
}
+ for (auto &Reduction : CM.getInLoopReductionChains()) {
+ PHINode *Phi = Reduction.first;
+ RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
+ const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
+
+ RecipeBuilder.recordRecipeOf(Phi);
+ for (auto &R : ReductionOperations) {
+ RecipeBuilder.recordRecipeOf(R);
+ // For min/max reductions, where we have a pair of icmp/select, we also
+ // need to record the ICmp recipe, so it can be removed later.
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
+ RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
+ }
+ }
// For each interleave group which is relevant for this (possibly trimmed)
// Range, add it to the set of groups to be later applied to the VPlan and add
// placeholders for its members' Recipes which we'll be replacing with a
// single VPInterleaveRecipe.
for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
- auto applyIG = [IG, this](unsigned VF) -> bool {
- return (VF >= 2 && // Query is illegal for VF == 1
+ auto applyIG = [IG, this](ElementCount VF) -> bool {
+ return (VF.isVector() && // Query is illegal for VF == 1
CM.getWideningDecision(IG->getInsertPos(), VF) ==
LoopVectorizationCostModel::CM_Interleave);
};
@@ -7278,10 +8691,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
Plan->setEntry(VPBB);
- // Represent values that will have defs inside VPlan.
- for (Value *V : NeedDef)
- Plan->addVPValue(V);
-
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
@@ -7308,6 +8717,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
if (auto Recipe =
RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
+ for (auto *Def : Recipe->definedValues()) {
+ auto *UV = Def->getUnderlyingValue();
+ Plan->addVPValue(UV, Def);
+ }
+
RecipeBuilder.setRecipe(Instr, Recipe);
VPBB->appendRecipe(Recipe);
continue;
@@ -7343,6 +8757,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
for (auto &Entry : SinkAfter) {
VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
+ // If the target is in a replication region, make sure to move Sink to the
+ // block after it, not into the replication region itself.
+ if (auto *Region =
+ dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) {
+ if (Region->isReplicator()) {
+ assert(Region->getNumSuccessors() == 1 && "Expected SESE region!");
+ VPBasicBlock *NextBlock =
+ cast<VPBasicBlock>(Region->getSuccessors().front());
+ Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
+ continue;
+ }
+ }
Sink->moveAfter(Target);
}
@@ -7352,33 +8778,52 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
for (auto IG : InterleaveGroups) {
auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
RecipeBuilder.getRecipe(IG->getInsertPos()));
- (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
- ->insertBefore(Recipe);
+ SmallVector<VPValue *, 4> StoredValues;
+ for (unsigned i = 0; i < IG->getFactor(); ++i)
+ if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
+ StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
+ auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
+ Recipe->getMask());
+ VPIG->insertBefore(Recipe);
+ unsigned J = 0;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (Instruction *Member = IG->getMember(i)) {
+ if (!Member->getType()->isVoidTy()) {
+ VPValue *OriginalV = Plan->getVPValue(Member);
+ Plan->removeVPValueFor(Member);
+ Plan->addVPValue(Member, VPIG->getVPValue(J));
+ OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
+ J++;
+ }
RecipeBuilder.getRecipe(Member)->eraseFromParent();
}
}
+ // Adjust the recipes for any inloop reductions.
+ if (Range.Start.isVector())
+ adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
+
// Finally, if tail is folded by masking, introduce selects between the phi
// and the live-out instruction of each reduction, at the end of the latch.
- if (CM.foldTailByMasking()) {
+ if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
Builder.setInsertPoint(VPBB);
auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
for (auto &Reduction : Legal->getReductionVars()) {
- VPValue *Phi = Plan->getVPValue(Reduction.first);
- VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
+ if (CM.isInLoopReduction(Reduction.first))
+ continue;
+ VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
+ VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
}
}
std::string PlanName;
raw_string_ostream RSO(PlanName);
- unsigned VF = Range.Start;
+ ElementCount VF = Range.Start;
Plan->addVF(VF);
RSO << "Initial VPlan for VF={" << VF;
- for (VF *= 2; VF < Range.End; VF *= 2) {
+ for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
Plan->addVF(VF);
RSO << "," << VF;
}
@@ -7394,7 +8839,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
// transformations before even evaluating whether vectorization is profitable.
// Since we cannot modify the incoming IR, we need to build VPlan upfront in
// the vectorization pipeline.
- assert(!OrigLoop->empty());
+ assert(!OrigLoop->isInnermost());
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
// Create new empty VPlan
@@ -7404,7 +8849,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
HCFGBuilder.buildHierarchicalCFG();
- for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
+ for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
+ VF *= 2)
Plan->addVF(VF);
if (EnableVPlanPredication) {
@@ -7422,6 +8868,67 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
return Plan;
}
+// Adjust the recipes for any inloop reductions. The chain of instructions
+// leading from the loop exit instr to the phi need to be converted to
+// reductions, with one operand being vector and the other being the scalar
+// reduction chain.
+void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
+ VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
+ for (auto &Reduction : CM.getInLoopReductionChains()) {
+ PHINode *Phi = Reduction.first;
+ RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
+ const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
+
+ // ReductionOperations are ordered top-down from the phi's use to the
+ // LoopExitValue. We keep a track of the previous item (the Chain) to tell
+ // which of the two operands will remain scalar and which will be reduced.
+ // For minmax the chain will be the select instructions.
+ Instruction *Chain = Phi;
+ for (Instruction *R : ReductionOperations) {
+ VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
+ RecurKind Kind = RdxDesc.getRecurrenceKind();
+
+ VPValue *ChainOp = Plan->getVPValue(Chain);
+ unsigned FirstOpId;
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
+ assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
+ "Expected to replace a VPWidenSelectSC");
+ FirstOpId = 1;
+ } else {
+ assert(isa<VPWidenRecipe>(WidenRecipe) &&
+ "Expected to replace a VPWidenSC");
+ FirstOpId = 0;
+ }
+ unsigned VecOpId =
+ R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
+ VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
+
+ auto *CondOp = CM.foldTailByMasking()
+ ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
+ : nullptr;
+ VPReductionRecipe *RedRecipe = new VPReductionRecipe(
+ &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
+ WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
+ Plan->removeVPValueFor(R);
+ Plan->addVPValue(R, RedRecipe);
+ WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
+ WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
+ WidenRecipe->eraseFromParent();
+
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
+ VPRecipeBase *CompareRecipe =
+ RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
+ assert(isa<VPWidenRecipe>(CompareRecipe) &&
+ "Expected to replace a VPWidenSC");
+ assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
+ "Expected no remaining users");
+ CompareRecipe->eraseFromParent();
+ }
+ Chain = R;
+ }
+ }
+}
+
Value* LoopVectorizationPlanner::VPCallbackILV::
getOrCreateVectorValues(Value *V, unsigned Part) {
return ILV.getOrCreateVectorValue(V, Part);
@@ -7449,29 +8956,35 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
}
void VPWidenCallRecipe::execute(VPTransformState &State) {
- State.ILV->widenCallInstruction(Ingredient, User, State);
+ State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
+ *this, State);
}
void VPWidenSelectRecipe::execute(VPTransformState &State) {
- State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State);
+ State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
+ this, *this, InvariantCond, State);
}
void VPWidenRecipe::execute(VPTransformState &State) {
- State.ILV->widenInstruction(Ingredient, User, State);
+ State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
}
void VPWidenGEPRecipe::execute(VPTransformState &State) {
- State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant,
+ State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
+ *this, State.UF, State.VF, IsPtrLoopInvariant,
IsIndexLoopInvariant, State);
}
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
- State.ILV->widenIntOrFpInduction(IV, Trunc);
+ State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
+ Trunc);
}
void VPWidenPHIRecipe::execute(VPTransformState &State) {
- State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
+ Value *StartV =
+ getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr;
+ State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF);
}
void VPBlendRecipe::execute(VPTransformState &State) {
@@ -7515,22 +9028,59 @@ void VPBlendRecipe::execute(VPTransformState &State) {
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Interleave group being replicated.");
- State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
+ State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
+ getStoredValues(), getMask());
+}
+
+void VPReductionRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Reduction being replicated.");
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ RecurKind Kind = RdxDesc->getRecurrenceKind();
+ Value *NewVecOp = State.get(getVecOp(), Part);
+ if (VPValue *Cond = getCondOp()) {
+ Value *NewCond = State.get(Cond, Part);
+ VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
+ Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
+ Kind, VecTy->getElementType());
+ Constant *IdenVec =
+ ConstantVector::getSplat(VecTy->getElementCount(), Iden);
+ Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
+ NewVecOp = Select;
+ }
+ Value *NewRed =
+ createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
+ Value *PrevInChain = State.get(getChainOp(), Part);
+ Value *NextInChain;
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
+ NextInChain =
+ createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
+ NewRed, PrevInChain);
+ } else {
+ NextInChain = State.Builder.CreateBinOp(
+ (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
+ PrevInChain);
+ }
+ State.set(this, getUnderlyingInstr(), NextInChain, Part);
+ }
}
void VPReplicateRecipe::execute(VPTransformState &State) {
if (State.Instance) { // Generate a single instance.
- State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
- IsPredicated, State);
+ assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
+ State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
+ *State.Instance, IsPredicated, State);
// Insert scalar instance packing it into a vector.
- if (AlsoPack && State.VF > 1) {
- // If we're constructing lane 0, initialize to start from undef.
+ if (AlsoPack && State.VF.isVector()) {
+ // If we're constructing lane 0, initialize to start from poison.
if (State.Instance->Lane == 0) {
- Value *Undef = UndefValue::get(
- FixedVectorType::get(Ingredient->getType(), State.VF));
- State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
+ assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
+ Value *Poison = PoisonValue::get(
+ VectorType::get(getUnderlyingValue()->getType(), State.VF));
+ State.ValueMap.setVectorValue(getUnderlyingInstr(),
+ State.Instance->Part, Poison);
}
- State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
+ State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
+ *State.Instance);
}
return;
}
@@ -7538,10 +9088,12 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
// Generate scalar instances for all VF lanes of all UF parts, unless the
// instruction is uniform inwhich case generate only the first lane for each
// of the UF parts.
- unsigned EndLane = IsUniform ? 1 : State.VF;
+ unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
+ assert((!State.VF.isScalable() || IsUniform) &&
+ "Can't scalarize a scalable vector");
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
- State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
+ State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
IsPredicated, State);
}
@@ -7573,8 +9125,8 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
assert(State.Instance && "Predicated instruction PHI works per instance.");
- Instruction *ScalarPredInst = cast<Instruction>(
- State.ValueMap.getScalarValue(PredInst, *State.Instance));
+ Instruction *ScalarPredInst =
+ cast<Instruction>(State.get(getOperand(0), *State.Instance));
BasicBlock *PredicatedBB = ScalarPredInst->getParent();
BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
assert(PredicatingBB && "Predicated block has no single predecessor.");
@@ -7586,6 +9138,8 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
// also do that packing, thereby "hoisting" the insert-element sequence.
// Otherwise, a phi node for the scalar value is needed.
unsigned Part = State.Instance->Part;
+ Instruction *PredInst =
+ cast<Instruction>(getOperand(0)->getUnderlyingValue());
if (State.ValueMap.hasVectorValue(PredInst, Part)) {
Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
@@ -7596,16 +9150,17 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
} else {
Type *PredInstType = PredInst->getType();
PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
- Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
+ Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB);
Phi->addIncoming(ScalarPredInst, PredicatedBB);
State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
}
}
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
- VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
- State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
- getMask());
+ VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
+ State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
+ StoredValue ? nullptr : getVPValue(),
+ getAddr(), StoredValue, getMask());
}
// Determine how to lower the scalar epilogue, which depends on 1) optimising
@@ -7617,35 +9172,53 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
LoopVectorizationLegality &LVL) {
- bool OptSize =
- F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
- PGSOQueryType::IRPass);
// 1) OptSize takes precedence over all other options, i.e. if this is set,
// don't look at hints or options, and don't request a scalar epilogue.
- if (OptSize)
+ // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
+ // LoopAccessInfo (due to code dependency and not being able to reliably get
+ // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
+ // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
+ // versioning when the vectorization is forced, unlike hasOptSize. So revert
+ // back to the old way and vectorize with versioning when forced. See D81345.)
+ if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+ PGSOQueryType::IRPass) &&
+ Hints.getForce() != LoopVectorizeHints::FK_Enabled))
return CM_ScalarEpilogueNotAllowedOptSize;
- bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
- !PreferPredicateOverEpilog;
+ // 2) If set, obey the directives
+ if (PreferPredicateOverEpilogue.getNumOccurrences()) {
+ switch (PreferPredicateOverEpilogue) {
+ case PreferPredicateTy::ScalarEpilogue:
+ return CM_ScalarEpilogueAllowed;
+ case PreferPredicateTy::PredicateElseScalarEpilogue:
+ return CM_ScalarEpilogueNotNeededUsePredicate;
+ case PreferPredicateTy::PredicateOrDontVectorize:
+ return CM_ScalarEpilogueNotAllowedUsePredicate;
+ };
+ }
- // 2) Next, if disabling predication is requested on the command line, honour
- // this and request a scalar epilogue.
- if (PredicateOptDisabled)
+ // 3) If set, obey the hints
+ switch (Hints.getPredicate()) {
+ case LoopVectorizeHints::FK_Enabled:
+ return CM_ScalarEpilogueNotNeededUsePredicate;
+ case LoopVectorizeHints::FK_Disabled:
return CM_ScalarEpilogueAllowed;
+ };
- // 3) and 4) look if enabling predication is requested on the command line,
- // with a loop hint, or if the TTI hook indicates this is profitable, request
- // predication .
- if (PreferPredicateOverEpilog ||
- Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
- (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
- LVL.getLAI()) &&
- Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
+ // 4) if the TTI hook indicates this is profitable, request predication.
+ if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
+ LVL.getLAI()))
return CM_ScalarEpilogueNotNeededUsePredicate;
return CM_ScalarEpilogueAllowed;
}
+void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
+ unsigned Part) {
+ set(Def, V, Part);
+ ILV->setVectorValue(IRDef, Part, V);
+}
+
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
@@ -7657,7 +9230,7 @@ static bool processLoopInVPlanNativePath(
OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
- if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
+ if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
return false;
}
@@ -7676,7 +9249,7 @@ static bool processLoopInVPlanNativePath(
LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
// Get user vectorization factor.
- const unsigned UserVF = Hints.getWidth();
+ ElementCount UserVF = Hints.getWidth();
// Plan how to best vectorize, return the best VF and its cost.
const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
@@ -7691,7 +9264,7 @@ static bool processLoopInVPlanNativePath(
LVP.setBestPlan(VF.Width, 1);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
- &CM);
+ &CM, BFI, PSI);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
LVP.executePlan(LB, DT);
@@ -7710,7 +9283,7 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
!EnableLoopVectorization) {}
bool LoopVectorizePass::processLoop(Loop *L) {
- assert((EnableVPlanNativePath || L->empty()) &&
+ assert((EnableVPlanNativePath || L->isInnermost()) &&
"VPlan-native path is not enabled. Only process inner loops.");
#ifndef NDEBUG
@@ -7755,7 +9328,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements(*ORE);
LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
- &Requirements, &Hints, DB, AC);
+ &Requirements, &Hints, DB, AC, BFI, PSI);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
Hints.emitRemarkWithHints();
@@ -7772,11 +9345,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// even evaluating whether vectorization is profitable. Since we cannot modify
// the incoming IR, we need to build VPlan upfront in the vectorization
// pipeline.
- if (!L->empty())
+ if (!L->isInnermost())
return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
ORE, BFI, PSI, Hints);
- assert(L->empty() && "Inner loop expected.");
+ assert(L->isInnermost() && "Inner loop expected.");
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
@@ -7841,7 +9414,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
// Get user vectorization factor and interleave count.
- unsigned UserVF = Hints.getWidth();
+ ElementCount UserVF = Hints.getWidth();
unsigned UserIC = Hints.getInterleave();
// Plan how to best vectorize, return the best VF and its cost.
@@ -7866,7 +9439,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- if (VF.Width == 1) {
+ if (VF.Width.isScalar()) {
LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
VecDiagMsg = std::make_pair(
"VectorizationNotBeneficial",
@@ -7955,8 +9528,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
assert(IC > 1 && "interleave count should not be 1 or 0");
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
- InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
- &CM);
+ InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
+ BFI, PSI);
LVP.executePlan(Unroller, DT);
ORE->emit([&]() {
@@ -7967,16 +9540,51 @@ bool LoopVectorizePass::processLoop(Loop *L) {
});
} else {
// If we decided that it is *legal* to vectorize the loop, then do it.
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
- &LVL, &CM);
- LVP.executePlan(LB, DT);
- ++LoopsVectorized;
- // Add metadata to disable runtime unrolling a scalar loop when there are
- // no runtime checks about strides and memory. A scalar loop that is
- // rarely used is not worth unrolling.
- if (!LB.areSafetyChecksAdded())
- DisableRuntimeUnroll = true;
+ // Consider vectorizing the epilogue too if it's profitable.
+ VectorizationFactor EpilogueVF =
+ CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
+ if (EpilogueVF.Width.isVector()) {
+
+ // The first pass vectorizes the main loop and creates a scalar epilogue
+ // to be vectorized by executing the plan (potentially with a different
+ // factor) again shortly afterwards.
+ EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
+ EpilogueVF.Width.getKnownMinValue(), 1);
+ EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
+ &LVL, &CM, BFI, PSI);
+
+ LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
+ LVP.executePlan(MainILV, DT);
+ ++LoopsVectorized;
+
+ simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
+ formLCSSARecursively(*L, *DT, LI, SE);
+
+ // Second pass vectorizes the epilogue and adjusts the control flow
+ // edges from the first pass.
+ LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
+ EPI.MainLoopVF = EPI.EpilogueVF;
+ EPI.MainLoopUF = EPI.EpilogueUF;
+ EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
+ ORE, EPI, &LVL, &CM, BFI, PSI);
+ LVP.executePlan(EpilogILV, DT);
+ ++LoopsEpilogueVectorized;
+
+ if (!MainILV.areSafetyChecksAdded())
+ DisableRuntimeUnroll = true;
+ } else {
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
+ &LVL, &CM, BFI, PSI);
+ LVP.executePlan(LB, DT);
+ ++LoopsVectorized;
+
+ // Add metadata to disable runtime unrolling a scalar loop when there are
+ // no runtime checks about strides and memory. A scalar loop that is
+ // rarely used is not worth unrolling.
+ if (!LB.areSafetyChecksAdded())
+ DisableRuntimeUnroll = true;
+ }
// Report the vectorization decision.
ORE->emit([&]() {
@@ -8090,7 +9698,8 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & {
- LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
+ TLI, TTI, nullptr, MSSA};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5bc35aa4695f..0b630197911a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -17,11 +17,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
-#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
@@ -29,14 +26,16 @@
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
@@ -47,7 +46,6 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
@@ -66,7 +64,6 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
@@ -83,6 +80,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -130,6 +128,10 @@ static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
+static cl::opt<unsigned>
+MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
+ cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
+
static cl::opt<int>
MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
cl::desc("Maximum depth of the lookup for consecutive stores."));
@@ -204,12 +206,12 @@ static bool allSameBlock(ArrayRef<Value *> VL) {
if (!I0)
return false;
BasicBlock *BB = I0->getParent();
- for (int i = 1, e = VL.size(); i < e; i++) {
- Instruction *I = dyn_cast<Instruction>(VL[i]);
- if (!I)
+ for (int I = 1, E = VL.size(); I < E; I++) {
+ auto *II = dyn_cast<Instruction>(VL[I]);
+ if (!II)
return false;
- if (BB != I->getParent())
+ if (BB != II->getParent())
return false;
}
return true;
@@ -234,11 +236,16 @@ static bool isSplat(ArrayRef<Value *> VL) {
return true;
}
-/// \returns True if \p I is commutative, handles CmpInst as well as Instruction.
+/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
- if (auto *IC = dyn_cast<CmpInst>(I))
- return IC->isCommutative();
- return I->isCommutative();
+ if (auto *Cmp = dyn_cast<CmpInst>(I))
+ return Cmp->isCommutative();
+ if (auto *BO = dyn_cast<BinaryOperator>(I))
+ return BO->isCommutative();
+ // TODO: This should check for generic Instruction::isCommutative(), but
+ // we need to confirm that the caller code correctly handles Intrinsics
+ // for example (does not have 2 operands).
+ return false;
}
/// Checks if the vector of instructions can be represented as a shuffle, like:
@@ -250,7 +257,7 @@ static bool isCommutative(Instruction *I) {
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
-/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
+/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
@@ -265,13 +272,13 @@ static bool isCommutative(Instruction *I) {
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
-/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
+/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
/// %5 = mul <4 x i8> %4, %4
/// %6 = extractelement <4 x i8> %5, i32 0
-/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
+/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
/// %7 = extractelement <4 x i8> %5, i32 1
/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
/// %8 = extractelement <4 x i8> %5, i32 2
@@ -285,7 +292,8 @@ static bool isCommutative(Instruction *I) {
static Optional<TargetTransformInfo::ShuffleKind>
isShuffle(ArrayRef<Value *> VL) {
auto *EI0 = cast<ExtractElementInst>(VL[0]);
- unsigned Size = EI0->getVectorOperandType()->getNumElements();
+ unsigned Size =
+ cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
enum ShuffleMode { Unknown, Select, Permute };
@@ -294,7 +302,7 @@ isShuffle(ArrayRef<Value *> VL) {
auto *EI = cast<ExtractElementInst>(VL[I]);
auto *Vec = EI->getVectorOperand();
// All vector operands must have the same number of vector elements.
- if (cast<VectorType>(Vec->getType())->getNumElements() != Size)
+ if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
return None;
auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
if (!Idx)
@@ -303,7 +311,7 @@ isShuffle(ArrayRef<Value *> VL) {
if (Idx->getValue().uge(Size))
continue;
unsigned IntIdx = Idx->getValue().getZExtValue();
- // We can extractelement from undef vector.
+ // We can extractelement from undef or poison vector.
if (isa<UndefValue>(Vec))
continue;
// For correct shuffling we have to have at most 2 different vector operands
@@ -500,7 +508,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
}
/// \returns the AA location that is being access by the instruction.
-static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
+static MemoryLocation getLocation(Instruction *I, AAResults *AA) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return MemoryLocation::get(SI);
if (LoadInst *LI = dyn_cast<LoadInst>(I))
@@ -521,6 +529,15 @@ static bool isSimple(Instruction *I) {
namespace llvm {
+static void inversePermutation(ArrayRef<unsigned> Indices,
+ SmallVectorImpl<int> &Mask) {
+ Mask.clear();
+ const unsigned E = Indices.size();
+ Mask.resize(E, E + 1);
+ for (unsigned I = 0; I < E; ++I)
+ Mask[Indices[I]] = I;
+}
+
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
@@ -535,9 +552,10 @@ public:
using StoreList = SmallVector<StoreInst *, 8>;
using ExtraValueToDebugLocsMap =
MapVector<Value *, SmallVector<Instruction *, 2>>;
+ using OrdersType = SmallVector<unsigned, 4>;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
- TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
+ TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
const DataLayout *DL, OptimizationRemarkEmitter *ORE)
: F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
@@ -571,11 +589,11 @@ public:
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
- int getSpillCost() const;
+ InstructionCost getSpillCost() const;
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
- int getTreeCost();
+ InstructionCost getTreeCost();
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -612,6 +630,14 @@ public:
/// \returns The best order of instructions for vectorization.
Optional<ArrayRef<unsigned>> bestOrder() const {
+ assert(llvm::all_of(
+ NumOpsWantToKeepOrder,
+ [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) {
+ return D.getFirst().size() ==
+ VectorizableTree[0]->Scalars.size();
+ }) &&
+ "All orders must have the same size as number of instructions in "
+ "tree node.");
auto I = std::max_element(
NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
[](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
@@ -625,6 +651,81 @@ public:
return makeArrayRef(I->getFirst());
}
+ /// Builds the correct order for root instructions.
+ /// If some leaves have the same instructions to be vectorized, we may
+ /// incorrectly evaluate the best order for the root node (it is built for the
+/// vector of instructions without repeated instructions and, thus, has fewer
+/// elements than the root node). This function builds the correct order for
+ /// the root node.
+ /// For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves
+ /// are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first
+/// leaf, it will be shrunk to \<a, b\>. If instructions in this leaf should
+ /// be reordered, the best order will be \<1, 0\>. We need to extend this
+ /// order for the root node. For the root node this order should look like
+ /// \<3, 0, 1, 2\>. This function extends the order for the reused
+ /// instructions.
+ void findRootOrder(OrdersType &Order) {
+ // If the leaf has the same number of instructions to vectorize as the root
+ // - order must be set already.
+ unsigned RootSize = VectorizableTree[0]->Scalars.size();
+ if (Order.size() == RootSize)
+ return;
+ SmallVector<unsigned, 4> RealOrder(Order.size());
+ std::swap(Order, RealOrder);
+ SmallVector<int, 4> Mask;
+ inversePermutation(RealOrder, Mask);
+ Order.assign(Mask.begin(), Mask.end());
+ // The leaf has fewer instructions - need to find the true order of
+ // the root.
+ // Scan the nodes starting from the leaf back to the root.
+ const TreeEntry *PNode = VectorizableTree.back().get();
+ SmallVector<const TreeEntry *, 4> Nodes(1, PNode);
+ SmallPtrSet<const TreeEntry *, 4> Visited;
+ while (!Nodes.empty() && Order.size() != RootSize) {
+ const TreeEntry *PNode = Nodes.pop_back_val();
+ if (!Visited.insert(PNode).second)
+ continue;
+ const TreeEntry &Node = *PNode;
+ for (const EdgeInfo &EI : Node.UserTreeIndices)
+ if (EI.UserTE)
+ Nodes.push_back(EI.UserTE);
+ if (Node.ReuseShuffleIndices.empty())
+ continue;
+ // Build the order for the parent node.
+ OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize);
+ SmallVector<unsigned, 4> OrderCounter(Order.size(), 0);
+ // The algorithm of the order extension is:
+ // 1. Calculate the number of the same instructions for the order.
+ // 2. Calculate the index of the new order: total number of instructions
+ // with order less than the order of the current instruction + reuse
+ // number of the current instruction.
+ // 3. The new order is just the index of the instruction in the original
+ // vector of the instructions.
+ for (unsigned I : Node.ReuseShuffleIndices)
+ ++OrderCounter[Order[I]];
+ SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0);
+ for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) {
+ unsigned ReusedIdx = Node.ReuseShuffleIndices[I];
+ unsigned OrderIdx = Order[ReusedIdx];
+ unsigned NewIdx = 0;
+ for (unsigned J = 0; J < OrderIdx; ++J)
+ NewIdx += OrderCounter[J];
+ NewIdx += CurrentCounter[OrderIdx];
+ ++CurrentCounter[OrderIdx];
+ assert(NewOrder[NewIdx] == RootSize &&
+ "The order index should not be written already.");
+ NewOrder[NewIdx] = I;
+ }
+ std::swap(Order, NewOrder);
+ }
+ assert(Order.size() == RootSize &&
+ "Root node is expected or the size of the order must be the same as "
+ "the number of elements in the root node.");
+ assert(llvm::all_of(Order,
+ [RootSize](unsigned Val) { return Val != RootSize; }) &&
+ "All indices must be initialized");
+ }
+
/// \return The vector element size in bits to use when vectorizing the
/// expression tree ending at \p V. If V is a store, the size is the width of
/// the stored value. Otherwise, the size is the width of the largest loaded
@@ -646,6 +747,12 @@ public:
return MinVecRegSize;
}
+ unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
+ unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
+ MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
+ return MaxVF ? MaxVF : UINT_MAX;
+ }
+
/// Check if homogeneous aggregate is isomorphic to some VectorType.
/// Accepts homogeneous multidimensional aggregate of scalars/vectors like
/// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
@@ -665,7 +772,7 @@ public:
/// effectively impossible for the backend to undo.
/// TODO: If load combining is allowed in the IR optimizer, this analysis
/// may not be necessary.
- bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;
+ bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
/// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
/// can be load combined in the backend. Load combining may not be allowed in
@@ -880,6 +987,14 @@ public:
std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
Value *V = Values[Idx].first;
+ if (isa<Constant>(V)) {
+ // Since this is a function pass, it doesn't make semantic sense to
+ // walk the users of a subclass of Constant. The users could be in
+ // another function, or even another module that happens to be in
+ // the same LLVMContext.
+ continue;
+ }
+
// Calculate the absolute lane, using the minimum relative lane of LHS
// and RHS as base and Idx as the offset.
int Ln = std::min(LHS.second, RHS.second) + Idx;
@@ -1388,7 +1503,7 @@ private:
bool areAllUsersVectorized(Instruction *I) const;
/// \returns the cost of the vectorizable entry.
- int getEntryCost(TreeEntry *E);
+ InstructionCost getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
@@ -1410,20 +1525,21 @@ private:
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
- int getGatherCost(VectorType *Ty,
- const DenseSet<unsigned> &ShuffledIndices) const;
+ InstructionCost
+ getGatherCost(FixedVectorType *Ty,
+ const DenseSet<unsigned> &ShuffledIndices) const;
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
- int getGatherCost(ArrayRef<Value *> VL) const;
+ InstructionCost getGatherCost(ArrayRef<Value *> VL) const;
/// Set the Builder insert point to one after the last instruction in
/// the bundle
void setInsertPointAfterBundle(TreeEntry *E);
/// \returns a vector from a collection of scalars in \p VL.
- Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
+ Value *gather(ArrayRef<Value *> VL);
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
@@ -1457,15 +1573,17 @@ private:
/// The Scalars are vectorized into this value. It is initialized to Null.
Value *VectorizedValue = nullptr;
- /// Do we need to gather this sequence ?
- enum EntryState { Vectorize, NeedToGather };
+ /// Do we need to gather this sequence or vectorize it
+ /// (either with vector instruction or with scatter/gather
+ /// intrinsics for store/load)?
+ enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
EntryState State;
/// Does this sequence require some shuffling?
SmallVector<int, 4> ReuseShuffleIndices;
/// Does this entry require reordering?
- ArrayRef<unsigned> ReorderIndices;
+ SmallVector<unsigned, 4> ReorderIndices;
/// Points back to the VectorizableTree.
///
@@ -1606,6 +1724,9 @@ private:
case Vectorize:
dbgs() << "Vectorize\n";
break;
+ case ScatterVectorize:
+ dbgs() << "ScatterVectorize\n";
+ break;
case NeedToGather:
dbgs() << "NeedToGather\n";
break;
@@ -1627,7 +1748,7 @@ private:
dbgs() << "NULL\n";
dbgs() << "ReuseShuffleIndices: ";
if (ReuseShuffleIndices.empty())
- dbgs() << "Emtpy";
+ dbgs() << "Empty";
else
for (unsigned ReuseIdx : ReuseShuffleIndices)
dbgs() << ReuseIdx << ", ";
@@ -1644,26 +1765,55 @@ private:
#endif
};
+#ifndef NDEBUG
+ void dumpTreeCosts(TreeEntry *E, InstructionCost ReuseShuffleCost,
+ InstructionCost VecCost,
+ InstructionCost ScalarCost) const {
+ dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump();
+ dbgs() << "SLP: Costs:\n";
+ dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
+ dbgs() << "SLP: VectorCost = " << VecCost << "\n";
+ dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
+ dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " <<
+ ReuseShuffleCost + VecCost - ScalarCost << "\n";
+ }
+#endif
+
/// Create a new VectorizableTree entry.
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
const InstructionsState &S,
const EdgeInfo &UserTreeIdx,
ArrayRef<unsigned> ReuseShuffleIndices = None,
ArrayRef<unsigned> ReorderIndices = None) {
- bool Vectorized = (bool)Bundle;
+ TreeEntry::EntryState EntryState =
+ Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
+ return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
+ ReuseShuffleIndices, ReorderIndices);
+ }
+
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
+ TreeEntry::EntryState EntryState,
+ Optional<ScheduleData *> Bundle,
+ const InstructionsState &S,
+ const EdgeInfo &UserTreeIdx,
+ ArrayRef<unsigned> ReuseShuffleIndices = None,
+ ArrayRef<unsigned> ReorderIndices = None) {
+ assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
+ (Bundle && EntryState != TreeEntry::NeedToGather)) &&
+ "Need to vectorize gather entry?");
VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
TreeEntry *Last = VectorizableTree.back().get();
Last->Idx = VectorizableTree.size() - 1;
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
- Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
+ Last->State = EntryState;
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
ReuseShuffleIndices.end());
- Last->ReorderIndices = ReorderIndices;
+ Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
Last->setOperations(S);
- if (Vectorized) {
- for (int i = 0, e = VL.size(); i != e; ++i) {
- assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
- ScalarToTreeEntry[VL[i]] = Last;
+ if (Last->State != TreeEntry::NeedToGather) {
+ for (Value *V : VL) {
+ assert(!getTreeEntry(V) && "Scalar already in tree!");
+ ScalarToTreeEntry[V] = Last;
}
// Update the scheduler bundle to point to this TreeEntry.
unsigned Lane = 0;
@@ -1699,18 +1849,10 @@ private:
}
#endif
- TreeEntry *getTreeEntry(Value *V) {
- auto I = ScalarToTreeEntry.find(V);
- if (I != ScalarToTreeEntry.end())
- return I->second;
- return nullptr;
- }
+ TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
const TreeEntry *getTreeEntry(Value *V) const {
- auto I = ScalarToTreeEntry.find(V);
- if (I != ScalarToTreeEntry.end())
- return I->second;
- return nullptr;
+ return ScalarToTreeEntry.lookup(V);
}
/// Maps a specific scalar to its tree entry.
@@ -2195,7 +2337,6 @@ private:
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
- using OrdersType = SmallVector<unsigned, 4>;
/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
/// sorted SmallVectors of unsigned.
struct OrdersTypeDenseMapInfo {
@@ -2233,7 +2374,7 @@ private:
ScalarEvolution *SE;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
- AliasAnalysis *AA;
+ AAResults *AA;
LoopInfo *LI;
DominatorTree *DT;
AssumptionCache *AC;
@@ -2332,9 +2473,9 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
}
for (auto V : Entry->Scalars) {
OS << *V;
- if (std::any_of(
- R->ExternalUses.begin(), R->ExternalUses.end(),
- [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
+ if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
+ return EU.Scalar == V;
+ }))
OS << " <extract>";
OS << "\n";
}
@@ -2366,13 +2507,17 @@ BoUpSLP::~BoUpSLP() {
"trying to erase instruction with users.");
Pair.getFirst()->eraseFromParent();
}
+#ifdef EXPENSIVE_CHECKS
+ // If we could guarantee that this call is not extremely slow, we could
+ // remove the ifdef limitation (see PR47712).
assert(!verifyFunction(*F, &dbgs()));
+#endif
}
void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
for (auto *V : AV) {
if (auto *I = dyn_cast<Instruction>(V))
- eraseInstruction(I, /*ReplaceWithUndef=*/true);
+ eraseInstruction(I, /*ReplaceOpsWithUndef=*/true);
};
}
@@ -2597,11 +2742,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
auto *PH = cast<PHINode>(VL0);
// Check for terminator values (e.g. invoke).
- for (unsigned j = 0; j < VL.size(); ++j)
- for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ for (Value *V : VL)
+ for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
Instruction *Term = dyn_cast<Instruction>(
- cast<PHINode>(VL[j])->getIncomingValueForBlock(
- PH->getIncomingBlock(i)));
+ cast<PHINode>(V)->getIncomingValueForBlock(
+ PH->getIncomingBlock(I)));
if (Term && Term->isTerminator()) {
LLVM_DEBUG(dbgs()
<< "SLP: Need to swizzle PHINodes (terminator use).\n");
@@ -2618,13 +2763,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Keeps the reordered operands to avoid code duplication.
SmallVector<ValueList, 2> OperandsVec;
- for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
ValueList Operands;
// Prepare the operand vector.
- for (Value *j : VL)
- Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
- PH->getIncomingBlock(i)));
- TE->setOperand(i, Operands);
+ for (Value *V : VL)
+ Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
+ PH->getIncomingBlock(I)));
+ TE->setOperand(I, Operands);
OperandsVec.push_back(Operands);
}
for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
@@ -2657,12 +2802,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
});
// Insert new order with initial value 0, if it does not exist,
// otherwise return the iterator to the existing one.
- auto StoredCurrentOrderAndNum =
- NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
- ++StoredCurrentOrderAndNum->getSecond();
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies,
- StoredCurrentOrderAndNum->getFirst());
+ ReuseShuffleIndicies, CurrentOrder);
+ findRootOrder(CurrentOrder);
+ ++NumOpsWantToKeepOrder[CurrentOrder];
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
ValueList Op0;
@@ -2739,16 +2882,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
} else {
// Need to reorder.
- auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
- ++I->getSecond();
TreeEntry *TE =
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies, I->getFirst());
+ ReuseShuffleIndicies, CurrentOrder);
TE->setOperandsInOrder();
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
+ findRootOrder(CurrentOrder);
+ ++NumOpsWantToKeepOrder[CurrentOrder];
}
return;
}
+ // Vectorizing non-consecutive loads with `llvm.masked.gather`.
+ TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
+ UserTreeIdx, ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ buildTree_rec(PointerOps, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
+ return;
}
LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
@@ -2883,8 +3033,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (Value *j : VL)
- Operands.push_back(cast<Instruction>(j)->getOperand(i));
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(i));
buildTree_rec(Operands, Depth + 1, {TE, i});
}
@@ -2952,6 +3102,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::Store: {
// Check if the stores are consecutive or if we need to swizzle them.
llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
+ // Avoid types that are padded when being allocated as scalars, while
+ // being packed together in a vector (such as i1).
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
+ return;
+ }
// Make sure all stores in the bundle are simple - we can't vectorize
// atomic or volatile stores.
SmallVector<Value *, 4> PointerOps(VL.size());
@@ -3001,15 +3161,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
buildTree_rec(Operands, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
} else {
- // Need to reorder.
- auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
- ++(I->getSecond());
TreeEntry *TE =
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies, I->getFirst());
+ ReuseShuffleIndicies, CurrentOrder);
TE->setOperandsInOrder();
buildTree_rec(Operands, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
+ findRootOrder(CurrentOrder);
+ ++NumOpsWantToKeepOrder[CurrentOrder];
}
return;
}
@@ -3028,7 +3187,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
VFShape Shape = VFShape::get(
- *CI, {static_cast<unsigned int>(VL.size()), false /*Scalable*/},
+ *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
@@ -3165,7 +3324,7 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
N *= AT->getNumElements();
EltTy = AT->getElementType();
} else {
- auto *VT = cast<VectorType>(EltTy);
+ auto *VT = cast<FixedVectorType>(EltTy);
N *= VT->getNumElements();
EltTy = VT->getElementType();
}
@@ -3203,7 +3362,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
return false;
} else {
- NElts = cast<VectorType>(Vec->getType())->getNumElements();
+ NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
}
if (NElts != VL.size())
@@ -3247,27 +3406,26 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
}
bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
- return I->hasOneUse() ||
- std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
+ return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) {
return ScalarToTreeEntry.count(U) > 0;
});
}
-static std::pair<unsigned, unsigned>
-getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI,
- TargetLibraryInfo *TLI) {
+static std::pair<InstructionCost, InstructionCost>
+getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
+ TargetTransformInfo *TTI, TargetLibraryInfo *TLI) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getNumElements());
- int IntrinsicCost =
+ IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount());
+ auto IntrinsicCost =
TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
- auto Shape =
- VFShape::get(*CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
- false /*HasGlobalPred*/);
+ auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
+ VecTy->getNumElements())),
+ false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
- int LibCost = IntrinsicCost;
+ auto LibCost = IntrinsicCost;
if (!CI->isNoBuiltin() && VecFunc) {
// Calculate the cost of the vector library call.
SmallVector<Type *, 4> VecTys;
@@ -3282,7 +3440,7 @@ getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI,
return {IntrinsicCost, LibCost};
}
-int BoUpSLP::getEntryCost(TreeEntry *E) {
+InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
Type *ScalarTy = VL[0]->getType();
@@ -3301,7 +3459,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
- int ReuseShuffleCost = 0;
+ InstructionCost ReuseShuffleCost = 0;
if (NeedToShuffleReuses) {
ReuseShuffleCost =
TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
@@ -3317,7 +3475,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
allSameType(VL) && allSameBlock(VL)) {
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
if (ShuffleKind.hasValue()) {
- int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
+ InstructionCost Cost =
+ TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
for (auto *V : VL) {
// If all users of instruction are going to be vectorized and this
// instruction itself is not going to be vectorized, consider this
@@ -3336,7 +3495,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
return ReuseShuffleCost + getGatherCost(VL);
}
- assert(E->State == TreeEntry::Vectorize && "Unhandled state");
+ assert((E->State == TreeEntry::Vectorize ||
+ E->State == TreeEntry::ScatterVectorize) &&
+ "Unhandled state");
assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
@@ -3375,37 +3536,37 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
}
}
- int DeadCost = ReuseShuffleCost;
+ InstructionCost DeadCost = ReuseShuffleCost;
if (!E->ReorderIndices.empty()) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
DeadCost += TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
}
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- Instruction *E = cast<Instruction>(VL[i]);
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ Instruction *EI = cast<Instruction>(VL[I]);
// If all users are going to be vectorized, instruction can be
// considered as dead.
// The same, if have only one user, it will be vectorized for sure.
- if (areAllUsersVectorized(E)) {
+ if (areAllUsersVectorized(EI)) {
// Take credit for instruction that will become dead.
- if (E->hasOneUse()) {
- Instruction *Ext = E->user_back();
+ if (EI->hasOneUse()) {
+ Instruction *Ext = EI->user_back();
if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
all_of(Ext->users(),
[](User *U) { return isa<GetElementPtrInst>(U); })) {
// Use getExtractWithExtendCost() to calculate the cost of
// extractelement/ext pair.
DeadCost -= TTI->getExtractWithExtendCost(
- Ext->getOpcode(), Ext->getType(), VecTy, i);
+ Ext->getOpcode(), Ext->getType(), VecTy, I);
// Add back the cost of s|zext which is subtracted separately.
DeadCost += TTI->getCastInstrCost(
- Ext->getOpcode(), Ext->getType(), E->getType(), CostKind,
- Ext);
+ Ext->getOpcode(), Ext->getType(), EI->getType(),
+ TTI::getCastContextHint(Ext), CostKind, Ext);
continue;
}
}
DeadCost -=
- TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
}
}
return DeadCost;
@@ -3423,40 +3584,78 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
- int ScalarEltCost =
- TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, CostKind,
- VL0);
+ InstructionCost ScalarEltCost =
+ TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
+ TTI::getCastContextHint(VL0), CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
// Calculate the cost of this instruction.
- int ScalarCost = VL.size() * ScalarEltCost;
+ InstructionCost ScalarCost = VL.size() * ScalarEltCost;
auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
- int VecCost = 0;
+ InstructionCost VecCost = 0;
// Check if the values are candidates to demote.
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
- VecCost = ReuseShuffleCost +
- TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
- CostKind, VL0);
+ VecCost =
+ ReuseShuffleCost +
+ TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
+ TTI::getCastContextHint(VL0), CostKind, VL0);
}
+ LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
return VecCost - ScalarCost;
}
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select: {
// Calculate the cost of this instruction.
- int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
- Builder.getInt1Ty(),
- CostKind, VL0);
+ InstructionCost ScalarEltCost =
+ TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+ CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
- int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
- int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
- CostKind, VL0);
+ InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+
+ // Check if all entries in VL are either compares or selects with compares
+ // as condition that have the same predicates.
+ CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE;
+ bool First = true;
+ for (auto *V : VL) {
+ CmpInst::Predicate CurrentPred;
+ auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
+ if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) &&
+ !match(V, MatchCmp)) ||
+ (!First && VecPred != CurrentPred)) {
+ VecPred = CmpInst::BAD_ICMP_PREDICATE;
+ break;
+ }
+ First = false;
+ VecPred = CurrentPred;
+ }
+
+ InstructionCost VecCost = TTI->getCmpSelInstrCost(
+ E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
+ // Check if it is possible and profitable to use min/max for selects in
+ // VL.
+ //
+ auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
+ if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
+ IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
+ {VecTy, VecTy});
+ InstructionCost IntrinsicCost =
+ TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
+ // If the selects are the only uses of the compares, they will be dead
+ // and we can adjust the cost by removing their cost.
+ if (IntrinsicAndUse.second)
+ IntrinsicCost -=
+ TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ VecCost = std::min(VecCost, IntrinsicCost);
+ }
+ LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::FNeg:
@@ -3516,16 +3715,17 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
SmallVector<const Value *, 4> Operands(VL0->operand_values());
- int ScalarEltCost = TTI->getArithmeticInstrCost(
- E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
- Operands, VL0);
+ InstructionCost ScalarEltCost =
+ TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
+ Op2VK, Op1VP, Op2VP, Operands, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
- int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
- int VecCost = TTI->getArithmeticInstrCost(
- E->getOpcode(), VecTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
- Operands, VL0);
+ InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ InstructionCost VecCost =
+ TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
+ Op2VK, Op1VP, Op2VP, Operands, VL0);
+ LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
@@ -3534,36 +3734,42 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue;
- int ScalarEltCost =
- TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind,
- Op1VK, Op2VK);
+ InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
+ Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
- int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
- int VecCost =
- TTI->getArithmeticInstrCost(Instruction::Add, VecTy, CostKind,
- Op1VK, Op2VK);
+ InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ InstructionCost VecCost = TTI->getArithmeticInstrCost(
+ Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
+ LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
Align alignment = cast<LoadInst>(VL0)->getAlign();
- int ScalarEltCost =
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0,
- CostKind, VL0);
+ InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
+ Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
- int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
- int VecLdCost =
- TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
- CostKind, VL0);
+ InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
+ InstructionCost VecLdCost;
+ if (E->State == TreeEntry::Vectorize) {
+ VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
+ CostKind, VL0);
+ } else {
+ assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
+ VecLdCost = TTI->getGatherScatterOpCost(
+ Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
+ /*VariableMask=*/false, alignment, CostKind, VL0);
+ }
if (!E->ReorderIndices.empty()) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecLdCost += TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
}
+ LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost));
return ReuseShuffleCost + VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
@@ -3572,19 +3778,19 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
auto *SI =
cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
Align Alignment = SI->getAlign();
- int ScalarEltCost =
- TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0,
- CostKind, VL0);
+ InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
+ Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0);
if (NeedToShuffleReuses)
ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
- int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
- int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
- VecTy, Alignment, 0, CostKind, VL0);
+ InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
+ InstructionCost VecStCost = TTI->getMemoryOpCost(
+ Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
if (IsReorder) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecStCost += TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
}
+ LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));
return ReuseShuffleCost + VecStCost - ScalarStCost;
}
case Instruction::Call: {
@@ -3592,15 +3798,17 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- IntrinsicCostAttributes CostAttrs(ID, *CI, 1, 1);
- int ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
+ IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1);
+ InstructionCost ScalarEltCost =
+ TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
- int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
+ InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
- int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second);
+ InstructionCost VecCallCost =
+ std::min(VecCallCosts.first, VecCallCosts.second);
LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
@@ -3615,7 +3823,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
(Instruction::isCast(E->getOpcode()) &&
Instruction::isCast(E->getAltOpcode()))) &&
"Invalid Shuffle Vector Operand");
- int ScalarCost = 0;
+ InstructionCost ScalarCost = 0;
if (NeedToShuffleReuses) {
for (unsigned Idx : E->ReuseShuffleIndices) {
Instruction *I = cast<Instruction>(VL[Idx]);
@@ -3633,7 +3841,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
- int VecCost = 0;
+ InstructionCost VecCost = 0;
if (Instruction::isBinaryOp(E->getOpcode())) {
VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
@@ -3644,11 +3852,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
- CostKind);
+ TTI::CastContextHint::None, CostKind);
VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
- CostKind);
+ TTI::CastContextHint::None, CostKind);
}
VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
+ LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
return ReuseShuffleCost + VecCost - ScalarCost;
}
default:
@@ -3686,11 +3895,13 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
TargetTransformInfo *TTI) {
// Look past the root to find a source value. Arbitrarily follow the
// path through operand 0 of any 'or'. Also, peek through optional
- // shift-left-by-constant.
+ // shift-left-by-multiple-of-8-bits.
Value *ZextLoad = Root;
+ const APInt *ShAmtC;
while (!isa<ConstantExpr>(ZextLoad) &&
(match(ZextLoad, m_Or(m_Value(), m_Value())) ||
- match(ZextLoad, m_Shl(m_Value(), m_Constant()))))
+ (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
+ ShAmtC->urem(8) == 0)))
ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
// Check if the input is an extended load of the required or/shift expression.
@@ -3714,8 +3925,8 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
return true;
}
-bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
- if (RdxOpcode != Instruction::Or)
+bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
+ if (RdxKind != RecurKind::Or)
return false;
unsigned NumElts = VectorizableTree[0]->Scalars.size();
@@ -3756,22 +3967,35 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
return true;
}
-int BoUpSLP::getSpillCost() const {
+InstructionCost BoUpSLP::getSpillCost() const {
// Walk from the bottom of the tree to the top, tracking which values are
// live. When we see a call instruction that is not part of our tree,
// query TTI to see if there is a cost to keeping values live over it
// (for example, if spills and fills are required).
unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
- int Cost = 0;
+ InstructionCost Cost = 0;
SmallPtrSet<Instruction*, 4> LiveValues;
Instruction *PrevInst = nullptr;
+ // The entries in VectorizableTree are not necessarily ordered by their
+ // position in basic blocks. Collect them and order them by dominance so later
+ // instructions are guaranteed to be visited first. For instructions in
+ // different basic blocks, we only scan to the beginning of the block, so
+ // their order does not matter, as long as all instructions in a basic block
+ // are grouped together. Using dominance ensures a deterministic order.
+ SmallVector<Instruction *, 16> OrderedScalars;
for (const auto &TEPtr : VectorizableTree) {
Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
if (!Inst)
continue;
+ OrderedScalars.push_back(Inst);
+ }
+ llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) {
+ return DT->dominates(B, A);
+ });
+ for (Instruction *Inst : OrderedScalars) {
if (!PrevInst) {
PrevInst = Inst;
continue;
@@ -3825,8 +4049,8 @@ int BoUpSLP::getSpillCost() const {
return Cost;
}
-int BoUpSLP::getTreeCost() {
- int Cost = 0;
+InstructionCost BoUpSLP::getTreeCost() {
+ InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
@@ -3856,15 +4080,16 @@ int BoUpSLP::getTreeCost() {
}))
continue;
- int C = getEntryCost(&TE);
+ InstructionCost C = getEntryCost(&TE);
+ Cost += C;
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for bundle that starts with " << *TE.Scalars[0]
- << ".\n");
- Cost += C;
+ << ".\n"
+ << "SLP: Current total cost = " << Cost << "\n");
}
SmallPtrSet<Value *, 16> ExtractCostCalculated;
- int ExtractCost = 0;
+ InstructionCost ExtractCost = 0;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!ExtractCostCalculated.insert(EU.Scalar).second)
@@ -3894,39 +4119,42 @@ int BoUpSLP::getTreeCost() {
}
}
- int SpillCost = getSpillCost();
+ InstructionCost SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
- std::string Str;
+#ifndef NDEBUG
+ SmallString<256> Str;
{
- raw_string_ostream OS(Str);
+ raw_svector_ostream OS(Str);
OS << "SLP: Spill Cost = " << SpillCost << ".\n"
<< "SLP: Extract Cost = " << ExtractCost << ".\n"
<< "SLP: Total Cost = " << Cost << ".\n";
}
LLVM_DEBUG(dbgs() << Str);
-
if (ViewSLPTree)
ViewGraph(this, "SLP" + F->getName(), false, Str);
+#endif
return Cost;
}
-int BoUpSLP::getGatherCost(VectorType *Ty,
- const DenseSet<unsigned> &ShuffledIndices) const {
+InstructionCost
+BoUpSLP::getGatherCost(FixedVectorType *Ty,
+ const DenseSet<unsigned> &ShuffledIndices) const {
unsigned NumElts = Ty->getNumElements();
APInt DemandedElts = APInt::getNullValue(NumElts);
- for (unsigned i = 0; i < NumElts; ++i)
- if (!ShuffledIndices.count(i))
- DemandedElts.setBit(i);
- int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
- /*Extract*/ false);
+ for (unsigned I = 0; I < NumElts; ++I)
+ if (!ShuffledIndices.count(I))
+ DemandedElts.setBit(I);
+ InstructionCost Cost =
+ TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
+ /*Extract*/ false);
if (!ShuffledIndices.empty())
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
}
-int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
+InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// Find the type of the operands in VL.
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -3968,11 +4196,10 @@ void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
// should be in this block.
auto *Front = E->getMainOp();
auto *BB = Front->getParent();
- assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()),
- [=](Value *V) -> bool {
- auto *I = cast<Instruction>(V);
- return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
- }));
+ assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
+ auto *I = cast<Instruction>(V);
+ return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
+ }));
// The last instruction in the bundle in program order.
Instruction *LastInst = nullptr;
@@ -4025,34 +4252,30 @@ void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
-Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
- Value *Vec = UndefValue::get(Ty);
- // Generate the 'InsertElement' instruction.
- for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
- Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
- if (auto *Insrt = dyn_cast<InsertElementInst>(Vec)) {
- GatherSeq.insert(Insrt);
- CSEBlocks.insert(Insrt->getParent());
-
- // Add to our 'need-to-extract' list.
- if (TreeEntry *E = getTreeEntry(VL[i])) {
- // Find which lane we need to extract.
- int FoundLane = -1;
- for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {
- // Is this the lane of the scalar that we are looking for ?
- if (E->Scalars[Lane] == VL[i]) {
- FoundLane = Lane;
- break;
- }
- }
- assert(FoundLane >= 0 && "Could not find the correct lane");
- if (!E->ReuseShuffleIndices.empty()) {
- FoundLane =
- std::distance(E->ReuseShuffleIndices.begin(),
- llvm::find(E->ReuseShuffleIndices, FoundLane));
- }
- ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
+Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
+ Value *Val0 =
+ isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
+ FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
+ Value *Vec = PoisonValue::get(VecTy);
+ unsigned InsIndex = 0;
+ for (Value *Val : VL) {
+ Vec = Builder.CreateInsertElement(Vec, Val, Builder.getInt32(InsIndex++));
+ auto *InsElt = dyn_cast<InsertElementInst>(Vec);
+ if (!InsElt)
+ continue;
+ GatherSeq.insert(InsElt);
+ CSEBlocks.insert(InsElt->getParent());
+ // Add to our 'need-to-extract' list.
+ if (TreeEntry *Entry = getTreeEntry(Val)) {
+ // Find which lane we need to extract.
+ unsigned FoundLane = std::distance(Entry->Scalars.begin(),
+ find(Entry->Scalars, Val));
+ assert(FoundLane < Entry->Scalars.size() && "Couldn't find extract lane");
+ if (!Entry->ReuseShuffleIndices.empty()) {
+ FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(),
+ find(Entry->ReuseShuffleIndices, FoundLane));
}
+ ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane));
}
}
@@ -4076,8 +4299,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
for (int Idx : E->ReuseShuffleIndices)
if (UsedIdxs.insert(Idx).second)
UniqueIdxs.emplace_back(Idx);
- V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
- UniqueIdxs);
+ V = Builder.CreateShuffleVector(V, UniqueIdxs);
}
}
return V;
@@ -4085,10 +4307,6 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
}
}
- Type *ScalarTy = S.OpValue->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
- ScalarTy = SI->getValueOperand()->getType();
-
// Check that every instruction appears once in this bundle.
SmallVector<int, 4> ReuseShuffleIndicies;
SmallVector<Value *, 4> UniqueValues;
@@ -4108,27 +4326,16 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
else
VL = UniqueValues;
}
- auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
- Value *V = Gather(VL, VecTy);
+ Value *Vec = gather(VL);
if (!ReuseShuffleIndicies.empty()) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- ReuseShuffleIndicies, "shuffle");
- if (auto *I = dyn_cast<Instruction>(V)) {
+ Vec = Builder.CreateShuffleVector(Vec, ReuseShuffleIndicies, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
GatherSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
- return V;
-}
-
-static void inversePermutation(ArrayRef<unsigned> Indices,
- SmallVectorImpl<int> &Mask) {
- Mask.clear();
- const unsigned E = Indices.size();
- Mask.resize(E);
- for (unsigned I = 0; I < E; ++I)
- Mask[Indices[I]] = I;
+ return Vec;
}
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
@@ -4139,32 +4346,31 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return E->VectorizedValue;
}
- Instruction *VL0 = E->getMainOp();
- Type *ScalarTy = VL0->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
- ScalarTy = SI->getValueOperand()->getType();
- auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
-
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
-
if (E->State == TreeEntry::NeedToGather) {
setInsertPointAfterBundle(E);
- auto *V = Gather(E->Scalars, VecTy);
+ Value *Vec = gather(E->Scalars);
if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- if (auto *I = dyn_cast<Instruction>(V)) {
+ Vec = Builder.CreateShuffleVector(Vec, E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
GatherSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
- E->VectorizedValue = V;
- return V;
+ E->VectorizedValue = Vec;
+ return Vec;
}
- assert(E->State == TreeEntry::Vectorize && "Unhandled state");
+ assert((E->State == TreeEntry::Vectorize ||
+ E->State == TreeEntry::ScatterVectorize) &&
+ "Unhandled state");
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
+ Instruction *VL0 = E->getMainOp();
+ Type *ScalarTy = VL0->getType();
+ if (auto *Store = dyn_cast<StoreInst>(VL0))
+ ScalarTy = Store->getValueOperand()->getType();
+ auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
switch (ShuffleOrOp) {
case Instruction::PHI: {
auto *PH = cast<PHINode>(VL0);
@@ -4172,10 +4378,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
Value *V = NewPhi;
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
// PHINodes may have multiple entries from the same block. We want to
@@ -4208,37 +4413,33 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<int, 4> Mask;
inversePermutation(E->ReorderIndices, Mask);
Builder.SetInsertPoint(VL0);
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
- "reorder_shuffle");
+ V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle");
}
if (NeedToShuffleReuses) {
// TODO: Merge this shuffle with the ReorderShuffleMask.
if (E->ReorderIndices.empty())
Builder.SetInsertPoint(VL0);
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
- LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
+ auto *LI = cast<LoadInst>(E->getSingleOperand(0));
Builder.SetInsertPoint(LI);
- PointerType *PtrTy =
- PointerType::get(VecTy, LI->getPointerAddressSpace());
+ auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
Value *NewV = propagateMetadata(V, E->Scalars);
if (!E->ReorderIndices.empty()) {
SmallVector<int, 4> Mask;
inversePermutation(E->ReorderIndices, Mask);
- NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
- "reorder_shuffle");
+ NewV = Builder.CreateShuffleVector(NewV, Mask, "reorder_shuffle");
}
if (NeedToShuffleReuses) {
// TODO: Merge this shuffle with the ReorderShuffleMask.
- NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
+ NewV = Builder.CreateShuffleVector(NewV, E->ReuseShuffleIndices,
+ "shuffle");
}
E->VectorizedValue = NewV;
return NewV;
@@ -4266,10 +4467,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
auto *CI = cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -4289,10 +4489,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V = Builder.CreateCmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -4310,10 +4509,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *V = Builder.CreateSelect(Cond, True, False);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -4334,10 +4532,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4378,10 +4575,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4396,30 +4592,40 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E);
LoadInst *LI = cast<LoadInst>(VL0);
+ Instruction *NewLI;
unsigned AS = LI->getPointerAddressSpace();
+ Value *PO = LI->getPointerOperand();
+ if (E->State == TreeEntry::Vectorize) {
- Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
- VecTy->getPointerTo(AS));
+ Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));
- // The pointer operand uses an in-tree scalar so we add the new BitCast to
- // ExternalUses list to make sure that an extract will be generated in the
- // future.
- Value *PO = LI->getPointerOperand();
- if (getTreeEntry(PO))
- ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
+ // The pointer operand uses an in-tree scalar so we add the new BitCast
+ // to ExternalUses list to make sure that an extract will be generated
+ // in the future.
+ if (getTreeEntry(PO))
+ ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0);
+
+ NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
+ } else {
+ assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
+ Value *VecPtr = vectorizeTree(E->getOperand(0));
+ // Use the minimum alignment of the gathered loads.
+ Align CommonAlignment = LI->getAlign();
+ for (Value *V : E->Scalars)
+ CommonAlignment =
+ commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
+ NewLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment);
+ }
+ Value *V = propagateMetadata(NewLI, E->Scalars);
- LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
- Value *V = propagateMetadata(LI, E->Scalars);
if (IsReorder) {
SmallVector<int, 4> Mask;
inversePermutation(E->ReorderIndices, Mask);
- V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
- Mask, "reorder_shuffle");
+ V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle");
}
if (NeedToShuffleReuses) {
// TODO: Merge this shuffle with the ReorderShuffleMask.
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4437,9 +4643,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (IsReorder) {
SmallVector<int, 4> Mask(E->ReorderIndices.begin(),
E->ReorderIndices.end());
- VecValue = Builder.CreateShuffleVector(
- VecValue, UndefValue::get(VecValue->getType()), Mask,
- "reorder_shuffle");
+ VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf");
}
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(
@@ -4454,10 +4658,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
Value *V = propagateMetadata(ST, E->Scalars);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -4494,10 +4697,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4537,9 +4739,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Function *CF;
if (!UseIntrinsic) {
- VFShape Shape = VFShape::get(
- *CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
- false /*HasGlobalPred*/);
+ VFShape Shape =
+ VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
+ VecTy->getNumElements())),
+ false /*HasGlobalPred*/);
CF = VFDatabase(*CI).getVectorizedFunction(Shape);
} else {
Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
@@ -4557,10 +4760,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
propagateIRFlags(V, E->Scalars, VL0);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -4625,10 +4827,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4693,7 +4894,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
continue;
TreeEntry *E = getTreeEntry(Scalar);
assert(E && "Invalid scalar");
- assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list");
+ assert(E->State != TreeEntry::NeedToGather &&
+ "Extracting from a gather list");
Value *Vec = E->VectorizedValue;
assert(Vec && "Can't find vectorizable value");
@@ -4851,7 +5053,8 @@ void BoUpSLP::optimizeGatherSequence() {
// instructions into different buckets based on the insert lane.
SmallVector<Instruction *, 16> Visited;
for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
- assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
+ assert(*I &&
+ (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
"Worklist not sorted properly!");
BasicBlock *BB = (*I)->getBlock();
// For all instructions in blocks containing gather sequences:
@@ -4961,8 +5164,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// cancelScheduling).
while (!Bundle->isReady() && !ReadyInsts.empty()) {
- ScheduleData *pickedSD = ReadyInsts.back();
- ReadyInsts.pop_back();
+ ScheduleData *pickedSD = ReadyInsts.pop_back_val();
if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
schedule(pickedSD, ReadyInsts);
@@ -5106,7 +5308,9 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
if (I->mayReadOrWriteMemory() &&
(!isa<IntrinsicInst>(I) ||
- cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) {
+ (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
+ cast<IntrinsicInst>(I)->getIntrinsicID() !=
+ Intrinsic::pseudoprobe))) {
// Update the linked list of memory accessing instructions.
if (CurrentLoadStore) {
CurrentLoadStore->NextLoadStore = SD;
@@ -5133,8 +5337,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
WorkList.push_back(SD);
while (!WorkList.empty()) {
- ScheduleData *SD = WorkList.back();
- WorkList.pop_back();
+ ScheduleData *SD = WorkList.pop_back_val();
ScheduleData *BundleMember = SD;
while (BundleMember) {
@@ -5331,10 +5534,15 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
}
unsigned BoUpSLP::getVectorElementSize(Value *V) {
- // If V is a store, just return the width of the stored value without
- // traversing the expression tree. This is the common case.
- if (auto *Store = dyn_cast<StoreInst>(V))
- return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+ // If V is a store, just return the width of the stored value (or value
+ // truncated just before storing) without traversing the expression tree.
+ // This is the common case.
+ if (auto *Store = dyn_cast<StoreInst>(V)) {
+ if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
+ return DL->getTypeSizeInBits(Trunc->getSrcTy());
+ else
+ return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+ }
auto E = InstrElementSize.find(V);
if (E != InstrElementSize.end())
@@ -5683,7 +5891,7 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A
bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
TargetTransformInfo *TTI_,
- TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
+ TargetLibraryInfo *TLI_, AAResults *AA_,
LoopInfo *LI_, DominatorTree *DT_,
AssumptionCache *AC_, DemandedBits *DB_,
OptimizationRemarkEmitter *ORE_) {
@@ -5783,11 +5991,11 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
R.computeMinimumValueSizes();
- int Cost = R.getTreeCost();
+ InstructionCost Cost = R.getTreeCost();
- LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF = " << VF << "\n");
if (Cost < -SLPCostThreshold) {
- LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
using namespace ore;
@@ -5860,7 +6068,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
// If a vector register can't hold 1 element, we are done.
unsigned MaxVecRegSize = R.getMaxVecRegSize();
- unsigned EltSize = R.getVectorElementSize(Stores[0]);
+ unsigned EltSize = R.getVectorElementSize(Operands[0]);
if (MaxVecRegSize % EltSize != 0)
continue;
@@ -5911,7 +6119,7 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
continue;
if (!isValidElementType(SI->getValueOperand()->getType()))
continue;
- Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
+ Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
}
// Ignore getelementptr instructions that have more than one index, a
@@ -5975,6 +6183,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+ MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
if (MaxVF < 2) {
R.getORE()->emit([&]() {
return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
@@ -5986,7 +6195,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
bool Changed = false;
bool CandidateFound = false;
- int MinCost = SLPCostThreshold;
+ InstructionCost MinCost = SLPCostThreshold.getValue();
bool CompensateUseCost =
!InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
@@ -6042,7 +6251,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
continue;
R.computeMinimumValueSizes();
- int Cost = R.getTreeCost();
+ InstructionCost Cost = R.getTreeCost();
CandidateFound = true;
if (CompensateUseCost) {
// TODO: Use TTI's getScalarizationOverhead for sequence of inserts
@@ -6052,7 +6261,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
// part should also switch to same interface.
// For example, the following case is projected code after SLP:
// %4 = extractelement <4 x i64> %3, i32 0
- // %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
+ // %v0 = insertelement <4 x i64> poison, i64 %4, i32 0
// %5 = extractelement <4 x i64> %3, i32 1
// %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
// %6 = extractelement <4 x i64> %3, i32 2
@@ -6072,7 +6281,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
// Switching to the TTI interface might help a bit.
// Alternative solution could be pattern-match to detect a no-op or
// shuffle.
- unsigned UserCost = 0;
+ InstructionCost UserCost = 0;
for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
@@ -6163,50 +6372,20 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
return false;
}
-/// Generate a shuffle mask to be used in a reduction tree.
-///
-/// \param VecLen The length of the vector to be reduced.
-/// \param NumEltsToRdx The number of elements that should be reduced in the
-/// vector.
-/// \param IsPairwise Whether the reduction is a pairwise or splitting
-/// reduction. A pairwise reduction will generate a mask of
-/// <0,2,...> or <1,3,..> while a splitting reduction will generate
-/// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
-/// \param IsLeft True will generate a mask of even elements, odd otherwise.
-static SmallVector<int, 32> createRdxShuffleMask(unsigned VecLen,
- unsigned NumEltsToRdx,
- bool IsPairwise, bool IsLeft) {
- assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
-
- SmallVector<int, 32> ShuffleMask(VecLen, -1);
-
- if (IsPairwise)
- // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
- for (unsigned i = 0; i != NumEltsToRdx; ++i)
- ShuffleMask[i] = 2 * i + !IsLeft;
- else
- // Move the upper half of the vector to the lower half.
- for (unsigned i = 0; i != NumEltsToRdx; ++i)
- ShuffleMask[i] = NumEltsToRdx + i;
-
- return ShuffleMask;
-}
-
namespace {
/// Model horizontal reductions.
///
-/// A horizontal reduction is a tree of reduction operations (currently add and
-/// fadd) that has operations that can be put into a vector as its leaf.
-/// For example, this tree:
+/// A horizontal reduction is a tree of reduction instructions that has values
+/// that can be put into a vector as its leaves. For example:
///
/// mul mul mul mul
/// \ / \ /
/// + +
/// \ /
/// +
-/// This tree has "mul" as its reduced values and "+" as its reduction
-/// operations. A reduction might be feeding into a store or a binary operation
+/// This tree has "mul" as its leaf values and "+" as its reduction
+/// instructions. A reduction can feed into a store or a binary operation
/// feeding a phi.
/// ...
/// \ /
@@ -6224,458 +6403,284 @@ namespace {
class HorizontalReduction {
using ReductionOpsType = SmallVector<Value *, 16>;
using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
- ReductionOpsListType ReductionOps;
+ ReductionOpsListType ReductionOps;
SmallVector<Value *, 32> ReducedVals;
// Use map vector to make stable output.
MapVector<Instruction *, Value *> ExtraArgs;
+ WeakTrackingVH ReductionRoot;
+ /// The type of reduction operation.
+ RecurKind RdxKind;
- /// Kind of the reduction data.
- enum ReductionKind {
- RK_None, /// Not a reduction.
- RK_Arithmetic, /// Binary reduction data.
- RK_Min, /// Minimum reduction data.
- RK_UMin, /// Unsigned minimum reduction data.
- RK_Max, /// Maximum reduction data.
- RK_UMax, /// Unsigned maximum reduction data.
- };
-
- /// Contains info about operation, like its opcode, left and right operands.
- class OperationData {
- /// Opcode of the instruction.
- unsigned Opcode = 0;
-
- /// Left operand of the reduction operation.
- Value *LHS = nullptr;
-
- /// Right operand of the reduction operation.
- Value *RHS = nullptr;
-
- /// Kind of the reduction operation.
- ReductionKind Kind = RK_None;
-
- /// True if float point min/max reduction has no NaNs.
- bool NoNaN = false;
-
- /// Checks if the reduction operation can be vectorized.
- bool isVectorizable() const {
- return LHS && RHS &&
- // We currently only support add/mul/logical && min/max reductions.
- ((Kind == RK_Arithmetic &&
- (Opcode == Instruction::Add || Opcode == Instruction::FAdd ||
- Opcode == Instruction::Mul || Opcode == Instruction::FMul ||
- Opcode == Instruction::And || Opcode == Instruction::Or ||
- Opcode == Instruction::Xor)) ||
- ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
- (Kind == RK_Min || Kind == RK_Max)) ||
- (Opcode == Instruction::ICmp &&
- (Kind == RK_UMin || Kind == RK_UMax)));
- }
+ /// Checks if instruction is associative and can be vectorized.
+ static bool isVectorizable(RecurKind Kind, Instruction *I) {
+ if (Kind == RecurKind::None)
+ return false;
+ if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind))
+ return true;
- /// Creates reduction operation with the current opcode.
- Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {
- assert(isVectorizable() &&
- "Expected add|fadd or min/max reduction operation.");
- Value *Cmp = nullptr;
- switch (Kind) {
- case RK_Arithmetic:
- return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
- Name);
- case RK_Min:
- Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
- : Builder.CreateFCmpOLT(LHS, RHS);
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- case RK_Max:
- Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
- : Builder.CreateFCmpOGT(LHS, RHS);
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- case RK_UMin:
- assert(Opcode == Instruction::ICmp && "Expected integer types.");
- Cmp = Builder.CreateICmpULT(LHS, RHS);
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- case RK_UMax:
- assert(Opcode == Instruction::ICmp && "Expected integer types.");
- Cmp = Builder.CreateICmpUGT(LHS, RHS);
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- case RK_None:
- break;
- }
- llvm_unreachable("Unknown reduction operation.");
+ if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
+ // FP min/max are associative except for NaN and -0.0. We do not
+ // have to rule out -0.0 here because the intrinsic semantics do not
+ // specify a fixed result for it.
+ return I->getFastMathFlags().noNaNs();
}
- public:
- explicit OperationData() = default;
-
- /// Construction for reduced values. They are identified by opcode only and
- /// don't have associated LHS/RHS values.
- explicit OperationData(Value *V) {
- if (auto *I = dyn_cast<Instruction>(V))
- Opcode = I->getOpcode();
- }
+ return I->isAssociative();
+ }
- /// Constructor for reduction operations with opcode and its left and
- /// right operands.
- OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind,
- bool NoNaN = false)
- : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) {
- assert(Kind != RK_None && "One of the reduction operations is expected.");
+ /// Checks if the ParentStackElem.first should be marked as a reduction
+ /// operation with an extra argument or as extra argument itself.
+ void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
+ Value *ExtraArg) {
+ if (ExtraArgs.count(ParentStackElem.first)) {
+ ExtraArgs[ParentStackElem.first] = nullptr;
+ // We ran into something like:
+ // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
+ // The whole ParentStackElem.first should be considered as an extra value
+ // in this case.
+ // Do not perform analysis of remaining operands of ParentStackElem.first
+ // instruction, this whole instruction is an extra argument.
+ RecurKind ParentRdxKind = getRdxKind(ParentStackElem.first);
+ ParentStackElem.second = getNumberOfOperands(ParentRdxKind);
+ } else {
+ // We ran into something like:
+ // ParentStackElem.first += ... + ExtraArg + ...
+ ExtraArgs[ParentStackElem.first] = ExtraArg;
}
+ }
- explicit operator bool() const { return Opcode; }
+ /// Creates reduction operation with the current opcode.
+ static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
+ Value *RHS, const Twine &Name) {
+ unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
+ switch (Kind) {
+ case RecurKind::Add:
+ case RecurKind::Mul:
+ case RecurKind::Or:
+ case RecurKind::And:
+ case RecurKind::Xor:
+ case RecurKind::FAdd:
+ case RecurKind::FMul:
+ return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
+ Name);
+ case RecurKind::FMax:
+ return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
+ case RecurKind::FMin:
+ return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
- /// Return true if this operation is any kind of minimum or maximum.
- bool isMinMax() const {
- switch (Kind) {
- case RK_Arithmetic:
- return false;
- case RK_Min:
- case RK_Max:
- case RK_UMin:
- case RK_UMax:
- return true;
- case RK_None:
- break;
- }
- llvm_unreachable("Reduction kind is not set");
+ case RecurKind::SMax: {
+ Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
-
- /// Get the index of the first operand.
- unsigned getFirstOperandIndex() const {
- assert(!!*this && "The opcode is not set.");
- // We allow calling this before 'Kind' is set, so handle that specially.
- if (Kind == RK_None)
- return 0;
- return isMinMax() ? 1 : 0;
+ case RecurKind::SMin: {
+ Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
-
- /// Total number of operands in the reduction operation.
- unsigned getNumberOfOperands() const {
- assert(Kind != RK_None && !!*this && LHS && RHS &&
- "Expected reduction operation.");
- return isMinMax() ? 3 : 2;
+ case RecurKind::UMax: {
+ Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
-
- /// Checks if the operation has the same parent as \p P.
- bool hasSameParent(Instruction *I, Value *P, bool IsRedOp) const {
- assert(Kind != RK_None && !!*this && LHS && RHS &&
- "Expected reduction operation.");
- if (!IsRedOp)
- return I->getParent() == P;
- if (isMinMax()) {
- // SelectInst must be used twice while the condition op must have single
- // use only.
- auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
- return I->getParent() == P && Cmp && Cmp->getParent() == P;
- }
- // Arithmetic reduction operation must be used once only.
- return I->getParent() == P;
+ case RecurKind::UMin: {
+ Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
}
-
- /// Expected number of uses for reduction operations/reduced values.
- bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const {
- assert(Kind != RK_None && !!*this && LHS && RHS &&
- "Expected reduction operation.");
- if (isMinMax())
- return I->hasNUses(2) &&
- (!IsReductionOp ||
- cast<SelectInst>(I)->getCondition()->hasOneUse());
- return I->hasOneUse();
+ default:
+ llvm_unreachable("Unknown reduction operation.");
}
+ }
- /// Initializes the list of reduction operations.
- void initReductionOps(ReductionOpsListType &ReductionOps) {
- assert(Kind != RK_None && !!*this && LHS && RHS &&
- "Expected reduction operation.");
- if (isMinMax())
- ReductionOps.assign(2, ReductionOpsType());
- else
- ReductionOps.assign(1, ReductionOpsType());
+ /// Creates reduction operation with the current opcode with the IR flags
+ /// from \p ReductionOps.
+ static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
+ Value *RHS, const Twine &Name,
+ const ReductionOpsListType &ReductionOps) {
+ Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name);
+ if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
+ if (auto *Sel = dyn_cast<SelectInst>(Op))
+ propagateIRFlags(Sel->getCondition(), ReductionOps[0]);
+ propagateIRFlags(Op, ReductionOps[1]);
+ return Op;
}
-
- /// Add all reduction operations for the reduction instruction \p I.
- void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) {
- assert(Kind != RK_None && !!*this && LHS && RHS &&
- "Expected reduction operation.");
- if (isMinMax()) {
- ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
- ReductionOps[1].emplace_back(I);
- } else {
- ReductionOps[0].emplace_back(I);
+ propagateIRFlags(Op, ReductionOps[0]);
+ return Op;
+ }
+ /// Creates reduction operation with the current opcode with the IR flags
+ /// from \p I.
+ static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
+ Value *RHS, const Twine &Name, Instruction *I) {
+ Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name);
+ if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
+ if (auto *Sel = dyn_cast<SelectInst>(Op)) {
+ propagateIRFlags(Sel->getCondition(),
+ cast<SelectInst>(I)->getCondition());
}
}
+ propagateIRFlags(Op, I);
+ return Op;
+ }
- /// Checks if instruction is associative and can be vectorized.
- bool isAssociative(Instruction *I) const {
- assert(Kind != RK_None && *this && LHS && RHS &&
- "Expected reduction operation.");
- switch (Kind) {
- case RK_Arithmetic:
- return I->isAssociative();
- case RK_Min:
- case RK_Max:
- return Opcode == Instruction::ICmp ||
- cast<Instruction>(I->getOperand(0))->isFast();
- case RK_UMin:
- case RK_UMax:
- assert(Opcode == Instruction::ICmp &&
- "Only integer compare operation is expected.");
- return true;
- case RK_None:
- break;
- }
- llvm_unreachable("Reduction kind is not set");
- }
+ static RecurKind getRdxKind(Instruction *I) {
+ assert(I && "Expected instruction for reduction matching");
+ TargetTransformInfo::ReductionFlags RdxFlags;
+ if (match(I, m_Add(m_Value(), m_Value())))
+ return RecurKind::Add;
+ if (match(I, m_Mul(m_Value(), m_Value())))
+ return RecurKind::Mul;
+ if (match(I, m_And(m_Value(), m_Value())))
+ return RecurKind::And;
+ if (match(I, m_Or(m_Value(), m_Value())))
+ return RecurKind::Or;
+ if (match(I, m_Xor(m_Value(), m_Value())))
+ return RecurKind::Xor;
+ if (match(I, m_FAdd(m_Value(), m_Value())))
+ return RecurKind::FAdd;
+ if (match(I, m_FMul(m_Value(), m_Value())))
+ return RecurKind::FMul;
- /// Checks if the reduction operation can be vectorized.
- bool isVectorizable(Instruction *I) const {
- return isVectorizable() && isAssociative(I);
- }
+ if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
+ return RecurKind::FMax;
+ if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
+ return RecurKind::FMin;
- /// Checks if two operation data are both a reduction op or both a reduced
- /// value.
- bool operator==(const OperationData &OD) const {
- assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
- "One of the comparing operations is incorrect.");
- return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode);
- }
- bool operator!=(const OperationData &OD) const { return !(*this == OD); }
- void clear() {
- Opcode = 0;
- LHS = nullptr;
- RHS = nullptr;
- Kind = RK_None;
- NoNaN = false;
- }
+ if (match(I, m_SMax(m_Value(), m_Value())))
+ return RecurKind::SMax;
+ if (match(I, m_SMin(m_Value(), m_Value())))
+ return RecurKind::SMin;
+ if (match(I, m_UMax(m_Value(), m_Value())))
+ return RecurKind::UMax;
+ if (match(I, m_UMin(m_Value(), m_Value())))
+ return RecurKind::UMin;
- /// Get the opcode of the reduction operation.
- unsigned getOpcode() const {
- assert(isVectorizable() && "Expected vectorizable operation.");
- return Opcode;
- }
+ if (auto *Select = dyn_cast<SelectInst>(I)) {
+ // Try harder: look for min/max pattern based on instructions producing
+ // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
+ // During the intermediate stages of SLP, it's very common to have
+ // pattern like this (since optimizeGatherSequence is run only once
+ // at the end):
+ // %1 = extractelement <2 x i32> %a, i32 0
+ // %2 = extractelement <2 x i32> %a, i32 1
+ // %cond = icmp sgt i32 %1, %2
+ // %3 = extractelement <2 x i32> %a, i32 0
+ // %4 = extractelement <2 x i32> %a, i32 1
+ // %select = select i1 %cond, i32 %3, i32 %4
+ CmpInst::Predicate Pred;
+ Instruction *L1;
+ Instruction *L2;
- /// Get kind of reduction data.
- ReductionKind getKind() const { return Kind; }
- Value *getLHS() const { return LHS; }
- Value *getRHS() const { return RHS; }
- Type *getConditionType() const {
- return isMinMax() ? CmpInst::makeCmpResultType(LHS->getType()) : nullptr;
- }
+ Value *LHS = Select->getTrueValue();
+ Value *RHS = Select->getFalseValue();
+ Value *Cond = Select->getCondition();
- /// Creates reduction operation with the current opcode with the IR flags
- /// from \p ReductionOps.
- Value *createOp(IRBuilder<> &Builder, const Twine &Name,
- const ReductionOpsListType &ReductionOps) const {
- assert(isVectorizable() &&
- "Expected add|fadd or min/max reduction operation.");
- auto *Op = createOp(Builder, Name);
- switch (Kind) {
- case RK_Arithmetic:
- propagateIRFlags(Op, ReductionOps[0]);
- return Op;
- case RK_Min:
- case RK_Max:
- case RK_UMin:
- case RK_UMax:
- if (auto *SI = dyn_cast<SelectInst>(Op))
- propagateIRFlags(SI->getCondition(), ReductionOps[0]);
- propagateIRFlags(Op, ReductionOps[1]);
- return Op;
- case RK_None:
- break;
- }
- llvm_unreachable("Unknown reduction operation.");
- }
- /// Creates reduction operation with the current opcode with the IR flags
- /// from \p I.
- Value *createOp(IRBuilder<> &Builder, const Twine &Name,
- Instruction *I) const {
- assert(isVectorizable() &&
- "Expected add|fadd or min/max reduction operation.");
- auto *Op = createOp(Builder, Name);
- switch (Kind) {
- case RK_Arithmetic:
- propagateIRFlags(Op, I);
- return Op;
- case RK_Min:
- case RK_Max:
- case RK_UMin:
- case RK_UMax:
- if (auto *SI = dyn_cast<SelectInst>(Op)) {
- propagateIRFlags(SI->getCondition(),
- cast<SelectInst>(I)->getCondition());
- }
- propagateIRFlags(Op, I);
- return Op;
- case RK_None:
- break;
+ // TODO: Support inverse predicates.
+ if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
+ if (!isa<ExtractElementInst>(RHS) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return RecurKind::None;
+ } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
+ if (!isa<ExtractElementInst>(LHS) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)))
+ return RecurKind::None;
+ } else {
+ if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
+ return RecurKind::None;
+ if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return RecurKind::None;
}
- llvm_unreachable("Unknown reduction operation.");
- }
- TargetTransformInfo::ReductionFlags getFlags() const {
- TargetTransformInfo::ReductionFlags Flags;
- Flags.NoNaN = NoNaN;
- switch (Kind) {
- case RK_Arithmetic:
- break;
- case RK_Min:
- Flags.IsSigned = Opcode == Instruction::ICmp;
- Flags.IsMaxOp = false;
- break;
- case RK_Max:
- Flags.IsSigned = Opcode == Instruction::ICmp;
- Flags.IsMaxOp = true;
- break;
- case RK_UMin:
- Flags.IsSigned = false;
- Flags.IsMaxOp = false;
- break;
- case RK_UMax:
- Flags.IsSigned = false;
- Flags.IsMaxOp = true;
- break;
- case RK_None:
- llvm_unreachable("Reduction kind is not set");
+ TargetTransformInfo::ReductionFlags RdxFlags;
+ switch (Pred) {
+ default:
+ return RecurKind::None;
+ case CmpInst::ICMP_SGT:
+ case CmpInst::ICMP_SGE:
+ return RecurKind::SMax;
+ case CmpInst::ICMP_SLT:
+ case CmpInst::ICMP_SLE:
+ return RecurKind::SMin;
+ case CmpInst::ICMP_UGT:
+ case CmpInst::ICMP_UGE:
+ return RecurKind::UMax;
+ case CmpInst::ICMP_ULT:
+ case CmpInst::ICMP_ULE:
+ return RecurKind::UMin;
}
- return Flags;
}
- };
-
- WeakTrackingVH ReductionRoot;
-
- /// The operation data of the reduction operation.
- OperationData ReductionData;
-
- /// The operation data of the values we perform a reduction on.
- OperationData ReducedValueData;
+ return RecurKind::None;
+ }
- /// Should we model this reduction as a pairwise reduction tree or a tree that
- /// splits the vector in halves and adds those halves.
- bool IsPairwiseReduction = false;
+ /// Return true if this operation is a cmp+select idiom.
+ static bool isCmpSel(RecurKind Kind) {
+ return RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind);
+ }
- /// Checks if the ParentStackElem.first should be marked as a reduction
- /// operation with an extra argument or as extra argument itself.
- void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
- Value *ExtraArg) {
- if (ExtraArgs.count(ParentStackElem.first)) {
- ExtraArgs[ParentStackElem.first] = nullptr;
- // We ran into something like:
- // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
- // The whole ParentStackElem.first should be considered as an extra value
- // in this case.
- // Do not perform analysis of remaining operands of ParentStackElem.first
- // instruction, this whole instruction is an extra argument.
- ParentStackElem.second = ParentStackElem.first->getNumOperands();
- } else {
- // We ran into something like:
- // ParentStackElem.first += ... + ExtraArg + ...
- ExtraArgs[ParentStackElem.first] = ExtraArg;
- }
+ /// Get the index of the first operand.
+ static unsigned getFirstOperandIndex(RecurKind Kind) {
+ // We allow calling this before 'Kind' is set, so handle that specially.
+ if (Kind == RecurKind::None)
+ return 0;
+ return isCmpSel(Kind) ? 1 : 0;
}
- static OperationData getOperationData(Value *V) {
- if (!V)
- return OperationData();
+ /// Total number of operands in the reduction operation.
+ static unsigned getNumberOfOperands(RecurKind Kind) {
+ return isCmpSel(Kind) ? 3 : 2;
+ }
- Value *LHS;
- Value *RHS;
- if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) {
- return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,
- RK_Arithmetic);
+ /// Checks if the instruction is in basic block \p BB.
+ /// For a min/max reduction check that both compare and select are in \p BB.
+ static bool hasSameParent(RecurKind Kind, Instruction *I, BasicBlock *BB,
+ bool IsRedOp) {
+ if (IsRedOp && isCmpSel(Kind)) {
+ auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
+ return I->getParent() == BB && Cmp && Cmp->getParent() == BB;
}
- if (auto *Select = dyn_cast<SelectInst>(V)) {
- // Look for a min/max pattern.
- if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
- return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
- } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
- return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
- } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
- m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
- return OperationData(
- Instruction::FCmp, LHS, RHS, RK_Min,
- cast<Instruction>(Select->getCondition())->hasNoNaNs());
- } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
- return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
- } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
- return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
- } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
- m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
- return OperationData(
- Instruction::FCmp, LHS, RHS, RK_Max,
- cast<Instruction>(Select->getCondition())->hasNoNaNs());
- } else {
- // Try harder: look for min/max pattern based on instructions producing
- // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
- // During the intermediate stages of SLP, it's very common to have
- // pattern like this (since optimizeGatherSequence is run only once
- // at the end):
- // %1 = extractelement <2 x i32> %a, i32 0
- // %2 = extractelement <2 x i32> %a, i32 1
- // %cond = icmp sgt i32 %1, %2
- // %3 = extractelement <2 x i32> %a, i32 0
- // %4 = extractelement <2 x i32> %a, i32 1
- // %select = select i1 %cond, i32 %3, i32 %4
- CmpInst::Predicate Pred;
- Instruction *L1;
- Instruction *L2;
-
- LHS = Select->getTrueValue();
- RHS = Select->getFalseValue();
- Value *Cond = Select->getCondition();
-
- // TODO: Support inverse predicates.
- if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
- if (!isa<ExtractElementInst>(RHS) ||
- !L2->isIdenticalTo(cast<Instruction>(RHS)))
- return OperationData(V);
- } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
- if (!isa<ExtractElementInst>(LHS) ||
- !L1->isIdenticalTo(cast<Instruction>(LHS)))
- return OperationData(V);
- } else {
- if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
- return OperationData(V);
- if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
- !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
- !L2->isIdenticalTo(cast<Instruction>(RHS)))
- return OperationData(V);
- }
- switch (Pred) {
- default:
- return OperationData(V);
-
- case CmpInst::ICMP_ULT:
- case CmpInst::ICMP_ULE:
- return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
-
- case CmpInst::ICMP_SLT:
- case CmpInst::ICMP_SLE:
- return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
+ return I->getParent() == BB;
+ }
- case CmpInst::FCMP_OLT:
- case CmpInst::FCMP_OLE:
- case CmpInst::FCMP_ULT:
- case CmpInst::FCMP_ULE:
- return OperationData(Instruction::FCmp, LHS, RHS, RK_Min,
- cast<Instruction>(Cond)->hasNoNaNs());
+ /// Expected number of uses for reduction operations/reduced values.
+ static bool hasRequiredNumberOfUses(RecurKind Kind, Instruction *I,
+ bool IsReductionOp) {
+ // SelectInst must be used twice while the condition op must have single
+ // use only.
+ if (isCmpSel(Kind))
+ return I->hasNUses(2) &&
+ (!IsReductionOp ||
+ cast<SelectInst>(I)->getCondition()->hasOneUse());
- case CmpInst::ICMP_UGT:
- case CmpInst::ICMP_UGE:
- return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
+ // Arithmetic reduction operation must be used once only.
+ return I->hasOneUse();
+ }
- case CmpInst::ICMP_SGT:
- case CmpInst::ICMP_SGE:
- return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
+ /// Initializes the list of reduction operations.
+ void initReductionOps(RecurKind Kind) {
+ if (isCmpSel(Kind))
+ ReductionOps.assign(2, ReductionOpsType());
+ else
+ ReductionOps.assign(1, ReductionOpsType());
+ }
- case CmpInst::FCMP_OGT:
- case CmpInst::FCMP_OGE:
- case CmpInst::FCMP_UGT:
- case CmpInst::FCMP_UGE:
- return OperationData(Instruction::FCmp, LHS, RHS, RK_Max,
- cast<Instruction>(Cond)->hasNoNaNs());
- }
- }
+ /// Add all reduction operations for the reduction instruction \p I.
+ void addReductionOps(RecurKind Kind, Instruction *I) {
+ assert(Kind != RecurKind::None && "Expected reduction operation.");
+ if (isCmpSel(Kind)) {
+ ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
+ ReductionOps[1].emplace_back(I);
+ } else {
+ ReductionOps[0].emplace_back(I);
}
- return OperationData(V);
+ }
+
+ static Value *getLHS(RecurKind Kind, Instruction *I) {
+ if (Kind == RecurKind::None)
+ return nullptr;
+ return I->getOperand(getFirstOperandIndex(Kind));
+ }
+ static Value *getRHS(RecurKind Kind, Instruction *I) {
+ if (Kind == RecurKind::None)
+ return nullptr;
+ return I->getOperand(getFirstOperandIndex(Kind) + 1);
}
public:
@@ -6684,50 +6689,59 @@ public:
/// Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
assert((!Phi || is_contained(Phi->operands(), B)) &&
- "Thi phi needs to use the binary operator");
+ "Phi needs to use the binary operator");
- ReductionData = getOperationData(B);
+ RdxKind = getRdxKind(B);
 // We could have an initial reduction that is not an add.
// r *= v1 + v2 + v3 + v4
// In such a case start looking for a tree rooted in the first '+'.
if (Phi) {
- if (ReductionData.getLHS() == Phi) {
+ if (getLHS(RdxKind, B) == Phi) {
Phi = nullptr;
- B = dyn_cast<Instruction>(ReductionData.getRHS());
- ReductionData = getOperationData(B);
- } else if (ReductionData.getRHS() == Phi) {
+ B = dyn_cast<Instruction>(getRHS(RdxKind, B));
+ if (!B)
+ return false;
+ RdxKind = getRdxKind(B);
+ } else if (getRHS(RdxKind, B) == Phi) {
Phi = nullptr;
- B = dyn_cast<Instruction>(ReductionData.getLHS());
- ReductionData = getOperationData(B);
+ B = dyn_cast<Instruction>(getLHS(RdxKind, B));
+ if (!B)
+ return false;
+ RdxKind = getRdxKind(B);
}
}
- if (!ReductionData.isVectorizable(B))
+ if (!isVectorizable(RdxKind, B))
return false;
+ // Analyze "regular" integer/FP types for reductions - no target-specific
+ // types or pointers.
Type *Ty = B->getType();
- if (!isValidElementType(Ty))
- return false;
- if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy())
+ if (!isValidElementType(Ty) || Ty->isPointerTy())
return false;
- ReducedValueData.clear();
ReductionRoot = B;
+ // The opcode for leaf values that we perform a reduction on.
+ // For example: load(x) + load(y) + load(z) + fptoui(w)
+ // The leaf opcode for 'w' does not match, so we don't include it as a
+ // potential candidate for the reduction.
+ unsigned LeafOpcode = 0;
+
// Post order traverse the reduction tree starting at B. We only handle true
// trees containing only binary operators.
SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
- Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex()));
- ReductionData.initReductionOps(ReductionOps);
+ Stack.push_back(std::make_pair(B, getFirstOperandIndex(RdxKind)));
+ initReductionOps(RdxKind);
while (!Stack.empty()) {
Instruction *TreeN = Stack.back().first;
- unsigned EdgeToVist = Stack.back().second++;
- OperationData OpData = getOperationData(TreeN);
- bool IsReducedValue = OpData != ReductionData;
+ unsigned EdgeToVisit = Stack.back().second++;
+ const RecurKind TreeRdxKind = getRdxKind(TreeN);
+ bool IsReducedValue = TreeRdxKind != RdxKind;
- // Postorder vist.
- if (IsReducedValue || EdgeToVist == OpData.getNumberOfOperands()) {
+ // Postorder visit.
+ if (IsReducedValue || EdgeToVisit == getNumberOfOperands(TreeRdxKind)) {
if (IsReducedValue)
ReducedVals.push_back(TreeN);
else {
@@ -6745,7 +6759,7 @@ public:
markExtraArg(Stack[Stack.size() - 2], TreeN);
ExtraArgs.erase(TreeN);
} else
- ReductionData.addReductionOps(TreeN, ReductionOps);
+ addReductionOps(RdxKind, TreeN);
}
// Retract.
Stack.pop_back();
@@ -6753,91 +6767,72 @@ public:
}
// Visit left or right.
- Value *NextV = TreeN->getOperand(EdgeToVist);
- if (NextV != Phi) {
- auto *I = dyn_cast<Instruction>(NextV);
- OpData = getOperationData(I);
- // Continue analysis if the next operand is a reduction operation or
- // (possibly) a reduced value. If the reduced value opcode is not set,
- // the first met operation != reduction operation is considered as the
- // reduced value class.
- if (I && (!ReducedValueData || OpData == ReducedValueData ||
- OpData == ReductionData)) {
- const bool IsReductionOperation = OpData == ReductionData;
- // Only handle trees in the current basic block.
- if (!ReductionData.hasSameParent(I, B->getParent(),
- IsReductionOperation)) {
- // I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), I);
- continue;
- }
-
- // Each tree node needs to have minimal number of users except for the
- // ultimate reduction.
- if (!ReductionData.hasRequiredNumberOfUses(I,
- OpData == ReductionData) &&
- I != B) {
+ Value *EdgeVal = TreeN->getOperand(EdgeToVisit);
+ auto *I = dyn_cast<Instruction>(EdgeVal);
+ if (!I) {
+ // Edge value is not a reduction instruction or a leaf instruction.
+ // (It may be a constant, function argument, or something else.)
+ markExtraArg(Stack.back(), EdgeVal);
+ continue;
+ }
+ RecurKind EdgeRdxKind = getRdxKind(I);
+ // Continue analysis if the next operand is a reduction operation or
+ // (possibly) a leaf value. If the leaf value opcode is not set,
+ // the first met operation != reduction operation is considered as the
+ // leaf opcode.
+ // Only handle trees in the current basic block.
+ // Each tree node needs to have minimal number of users except for the
+ // ultimate reduction.
+ const bool IsRdxInst = EdgeRdxKind == RdxKind;
+ if (I != Phi && I != B &&
+ hasSameParent(RdxKind, I, B->getParent(), IsRdxInst) &&
+ hasRequiredNumberOfUses(RdxKind, I, IsRdxInst) &&
+ (!LeafOpcode || LeafOpcode == I->getOpcode() || IsRdxInst)) {
+ if (IsRdxInst) {
+ // We need to be able to reassociate the reduction operations.
+ if (!isVectorizable(EdgeRdxKind, I)) {
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
continue;
}
-
- if (IsReductionOperation) {
- // We need to be able to reassociate the reduction operations.
- if (!OpData.isAssociative(I)) {
- // I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), I);
- continue;
- }
- } else if (ReducedValueData &&
- ReducedValueData != OpData) {
- // Make sure that the opcodes of the operations that we are going to
- // reduce match.
- // I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), I);
- continue;
- } else if (!ReducedValueData)
- ReducedValueData = OpData;
-
- Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex()));
- continue;
+ } else if (!LeafOpcode) {
+ LeafOpcode = I->getOpcode();
}
+ Stack.push_back(std::make_pair(I, getFirstOperandIndex(EdgeRdxKind)));
+ continue;
}
- // NextV is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), NextV);
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
}
return true;
}
- /// Attempt to vectorize the tree found by
- /// matchAssociativeReduction.
+ /// Attempt to vectorize the tree found by matchAssociativeReduction.
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
- if (ReducedVals.empty())
- return false;
-
- // If there is a sufficient number of reduction values, reduce
- // to a nearby power-of-2. Can safely generate oversized
+ // If there are a sufficient number of reduction values, reduce
+ // to a nearby power-of-2. We can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.
unsigned NumReducedVals = ReducedVals.size();
if (NumReducedVals < 4)
return false;
- unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
-
- Value *VectorizedTree = nullptr;
+ // Intersect the fast-math-flags from all reduction operations.
+ FastMathFlags RdxFMF;
+ RdxFMF.set();
+ for (ReductionOpsType &RdxOp : ReductionOps) {
+ for (Value *RdxVal : RdxOp) {
+ if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal))
+ RdxFMF &= FPMO->getFastMathFlags();
+ }
+ }
- // FIXME: Fast-math-flags should be set based on the instructions in the
- // reduction (not all of 'fast' are required).
IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
- FastMathFlags Unsafe;
- Unsafe.setFast();
- Builder.setFastMathFlags(Unsafe);
- unsigned i = 0;
+ Builder.setFastMathFlags(RdxFMF);
BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
- // The same extra argument may be used several time, so log each attempt
+ // The same extra argument may be used several times, so log each attempt
// to use it.
- for (auto &Pair : ExtraArgs) {
+ for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
assert(Pair.first && "DebugLoc must be set.");
ExternallyUsedValues[Pair.second].push_back(Pair.first);
}
@@ -6857,14 +6852,48 @@ public:
// so set it as externally used to prevent it from being deleted.
ExternallyUsedValues[ReductionRoot];
SmallVector<Value *, 16> IgnoreList;
- for (auto &V : ReductionOps)
- IgnoreList.append(V.begin(), V.end());
+ for (ReductionOpsType &RdxOp : ReductionOps)
+ IgnoreList.append(RdxOp.begin(), RdxOp.end());
+
+ unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
+ if (NumReducedVals > ReduxWidth) {
+ // In the loop below, we are building a tree based on a window of
+ // 'ReduxWidth' values.
+ // If the operands of those values have common traits (compare predicate,
+ // constant operand, etc), then we want to group those together to
+ // minimize the cost of the reduction.
+
+ // TODO: This should be extended to count common operands for
+ // compares and binops.
+
+ // Step 1: Count the number of times each compare predicate occurs.
+ SmallDenseMap<unsigned, unsigned> PredCountMap;
+ for (Value *RdxVal : ReducedVals) {
+ CmpInst::Predicate Pred;
+ if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value())))
+ ++PredCountMap[Pred];
+ }
+ // Step 2: Sort the values so the most common predicates come first.
+ stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) {
+ CmpInst::Predicate PredA, PredB;
+ if (match(A, m_Cmp(PredA, m_Value(), m_Value())) &&
+ match(B, m_Cmp(PredB, m_Value(), m_Value()))) {
+ return PredCountMap[PredA] > PredCountMap[PredB];
+ }
+ return false;
+ });
+ }
+
+ Value *VectorizedTree = nullptr;
+ unsigned i = 0;
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
- auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
+ ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth);
V.buildTree(VL, ExternallyUsedValues, IgnoreList);
Optional<ArrayRef<unsigned>> Order = V.bestOrder();
- // TODO: Handle orders of size less than number of elements in the vector.
- if (Order && Order->size() == VL.size()) {
+ if (Order) {
+ assert(Order->size() == VL.size() &&
+ "Order size must be the same as number of vectorized "
+ "instructions.");
// TODO: reorder tree nodes without tree rebuilding.
SmallVector<Value *, 4> ReorderedOps(VL.size());
llvm::transform(*Order, ReorderedOps.begin(),
@@ -6873,60 +6902,66 @@ public:
}
if (V.isTreeTinyAndNotFullyVectorizable())
break;
- if (V.isLoadCombineReductionCandidate(ReductionData.getOpcode()))
+ if (V.isLoadCombineReductionCandidate(RdxKind))
break;
V.computeMinimumValueSizes();
// Estimate cost.
- int TreeCost = V.getTreeCost();
- int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth);
- int Cost = TreeCost + ReductionCost;
+ InstructionCost TreeCost = V.getTreeCost();
+ InstructionCost ReductionCost =
+ getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+ InstructionCost Cost = TreeCost + ReductionCost;
+ if (!Cost.isValid()) {
+ LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
+ return false;
+ }
if (Cost >= -SLPCostThreshold) {
- V.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0]))
- << "Vectorizing horizontal reduction is possible"
- << "but not beneficial with cost "
- << ore::NV("Cost", Cost) << " and threshold "
- << ore::NV("Threshold", -SLPCostThreshold);
- });
- break;
+ V.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
+ cast<Instruction>(VL[0]))
+ << "Vectorizing horizontal reduction is possible"
+ << "but not beneficial with cost " << ore::NV("Cost", Cost)
+ << " and threshold "
+ << ore::NV("Threshold", -SLPCostThreshold);
+ });
+ break;
}
LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
<< Cost << ". (HorRdx)\n");
V.getORE()->emit([&]() {
- return OptimizationRemark(
- SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
- << "Vectorized horizontal reduction with cost "
- << ore::NV("Cost", Cost) << " and with tree size "
- << ore::NV("TreeSize", V.getTreeSize());
+ return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
+ cast<Instruction>(VL[0]))
+ << "Vectorized horizontal reduction with cost "
+ << ore::NV("Cost", Cost) << " and with tree size "
+ << ore::NV("TreeSize", V.getTreeSize());
});
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
- // Emit a reduction. For min/max, the root is a select, but the insertion
+ // Emit a reduction. If the root is a select (min/max idiom), the insert
// point is the compare condition of that select.
Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
- if (ReductionData.isMinMax())
+ if (isCmpSel(RdxKind))
Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
else
Builder.SetInsertPoint(RdxRootInst);
Value *ReducedSubTree =
emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
- if (VectorizedTree) {
- Builder.SetCurrentDebugLocation(Loc);
- OperationData VectReductionData(ReductionData.getOpcode(),
- VectorizedTree, ReducedSubTree,
- ReductionData.getKind());
- VectorizedTree =
- VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
- } else
+
+ if (!VectorizedTree) {
+ // Initialize the final value in the reduction.
VectorizedTree = ReducedSubTree;
+ } else {
+ // Update the final value in the reduction.
+ Builder.SetCurrentDebugLocation(Loc);
+ VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
+ ReducedSubTree, "op.rdx", ReductionOps);
+ }
i += ReduxWidth;
ReduxWidth = PowerOf2Floor(NumReducedVals - i);
}
@@ -6936,19 +6971,15 @@ public:
for (; i < NumReducedVals; ++i) {
auto *I = cast<Instruction>(ReducedVals[i]);
Builder.SetCurrentDebugLocation(I->getDebugLoc());
- OperationData VectReductionData(ReductionData.getOpcode(),
- VectorizedTree, I,
- ReductionData.getKind());
- VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps);
+ VectorizedTree =
+ createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps);
}
for (auto &Pair : ExternallyUsedValues) {
// Add each externally used value to the final reduction.
for (auto *I : Pair.second) {
Builder.SetCurrentDebugLocation(I->getDebugLoc());
- OperationData VectReductionData(ReductionData.getOpcode(),
- VectorizedTree, Pair.first,
- ReductionData.getKind());
- VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I);
+ VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
+ Pair.first, "op.extra", I);
}
}
@@ -6956,7 +6987,7 @@ public:
// select, we also have to RAUW for the compare instruction feeding the
// reduction root. That's because the original compare may have extra uses
// besides the final select of the reduction.
- if (ReductionData.isMinMax()) {
+ if (isCmpSel(RdxKind)) {
if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) {
Instruction *ScalarCmp =
getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot));
@@ -6972,77 +7003,68 @@ public:
return VectorizedTree != nullptr;
}
- unsigned numReductionValues() const {
- return ReducedVals.size();
- }
+ unsigned numReductionValues() const { return ReducedVals.size(); }
private:
/// Calculate the cost of a reduction.
- int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
- unsigned ReduxWidth) {
+ InstructionCost getReductionCost(TargetTransformInfo *TTI,
+ Value *FirstReducedVal,
+ unsigned ReduxWidth) {
Type *ScalarTy = FirstReducedVal->getType();
- auto *VecTy = FixedVectorType::get(ScalarTy, ReduxWidth);
-
- int PairwiseRdxCost;
- int SplittingRdxCost;
- switch (ReductionData.getKind()) {
- case RK_Arithmetic:
- PairwiseRdxCost =
- TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
- /*IsPairwiseForm=*/true);
- SplittingRdxCost =
- TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
- /*IsPairwiseForm=*/false);
+ FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
+ InstructionCost VectorCost, ScalarCost;
+ switch (RdxKind) {
+ case RecurKind::Add:
+ case RecurKind::Mul:
+ case RecurKind::Or:
+ case RecurKind::And:
+ case RecurKind::Xor:
+ case RecurKind::FAdd:
+ case RecurKind::FMul: {
+ unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
+ VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
+ /*IsPairwiseForm=*/false);
+ ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
break;
- case RK_Min:
- case RK_Max:
- case RK_UMin:
- case RK_UMax: {
- auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VecTy));
- bool IsUnsigned = ReductionData.getKind() == RK_UMin ||
- ReductionData.getKind() == RK_UMax;
- PairwiseRdxCost =
- TTI->getMinMaxReductionCost(VecTy, VecCondTy,
- /*IsPairwiseForm=*/true, IsUnsigned);
- SplittingRdxCost =
- TTI->getMinMaxReductionCost(VecTy, VecCondTy,
- /*IsPairwiseForm=*/false, IsUnsigned);
- break;
- }
- case RK_None:
- llvm_unreachable("Expected arithmetic or min/max reduction operation");
}
-
- IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
- int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
-
- int ScalarReduxCost = 0;
- switch (ReductionData.getKind()) {
- case RK_Arithmetic:
- ScalarReduxCost =
- TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
+ case RecurKind::FMax:
+ case RecurKind::FMin: {
+ auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+ VectorCost =
+ TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+ /*pairwise=*/false, /*unsigned=*/false);
+ ScalarCost =
+ TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
+ TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy));
break;
- case RK_Min:
- case RK_Max:
- case RK_UMin:
- case RK_UMax:
- ScalarReduxCost =
- TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +
+ }
+ case RecurKind::SMax:
+ case RecurKind::SMin:
+ case RecurKind::UMax:
+ case RecurKind::UMin: {
+ auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+ bool IsUnsigned =
+ RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
+ VectorCost =
+ TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+ /*IsPairwiseForm=*/false, IsUnsigned);
+ ScalarCost =
+ TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) +
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
CmpInst::makeCmpResultType(ScalarTy));
break;
- case RK_None:
+ }
+ default:
llvm_unreachable("Expected arithmetic or min/max reduction operation");
}
- ScalarReduxCost *= (ReduxWidth - 1);
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+ // Scalar cost is repeated for N-1 elements.
+ ScalarCost *= (ReduxWidth - 1);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
<< " for reduction that starts with " << *FirstReducedVal
- << " (It is a "
- << (IsPairwiseReduction ? "pairwise" : "splitting")
- << " reduction)\n");
-
- return VecReduxCost - ScalarReduxCost;
+ << " (It is a splitting reduction)\n");
+ return VectorCost - ScalarCost;
}
/// Emit a horizontal reduction of the vectorized value.
@@ -7052,92 +7074,142 @@ private:
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
- if (!IsPairwiseReduction) {
- // FIXME: The builder should use an FMF guard. It should not be hard-coded
- // to 'fast'.
- assert(Builder.getFastMathFlags().isFast() && "Expected 'fast' FMF");
- return createSimpleTargetReduction(
- Builder, TTI, ReductionData.getOpcode(), VectorizedValue,
- ReductionData.getFlags(), ReductionOps.back());
- }
+ return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind,
+ ReductionOps.back());
+ }
+};
+
+} // end anonymous namespace
- Value *TmpVec = VectorizedValue;
- for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
- auto LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true);
- auto RightMask = createRdxShuffleMask(ReduxWidth, i, true, false);
+static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
+ if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
+ return cast<FixedVectorType>(IE->getType())->getNumElements();
- Value *LeftShuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
- Value *RightShuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
- "rdx.shuf.r");
- OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf,
- RightShuf, ReductionData.getKind());
- TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
+ unsigned AggregateSize = 1;
+ auto *IV = cast<InsertValueInst>(InsertInst);
+ Type *CurrentType = IV->getType();
+ do {
+ if (auto *ST = dyn_cast<StructType>(CurrentType)) {
+ for (auto *Elt : ST->elements())
+ if (Elt != ST->getElementType(0)) // check homogeneity
+ return None;
+ AggregateSize *= ST->getNumElements();
+ CurrentType = ST->getElementType(0);
+ } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
+ AggregateSize *= AT->getNumElements();
+ CurrentType = AT->getElementType();
+ } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
+ AggregateSize *= VT->getNumElements();
+ return AggregateSize;
+ } else if (CurrentType->isSingleValueType()) {
+ return AggregateSize;
+ } else {
+ return None;
}
+ } while (true);
+}
- // The result is in the first element of the vector.
- return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+static Optional<unsigned> getOperandIndex(Instruction *InsertInst,
+ unsigned OperandOffset) {
+ unsigned OperandIndex = OperandOffset;
+ if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
+ if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
+ auto *VT = cast<FixedVectorType>(IE->getType());
+ OperandIndex *= VT->getNumElements();
+ OperandIndex += CI->getZExtValue();
+ return OperandIndex;
+ }
+ return None;
}
-};
-} // end anonymous namespace
+ auto *IV = cast<InsertValueInst>(InsertInst);
+ Type *CurrentType = IV->getType();
+ for (unsigned int Index : IV->indices()) {
+ if (auto *ST = dyn_cast<StructType>(CurrentType)) {
+ OperandIndex *= ST->getNumElements();
+ CurrentType = ST->getElementType(Index);
+ } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
+ OperandIndex *= AT->getNumElements();
+ CurrentType = AT->getElementType();
+ } else {
+ return None;
+ }
+ OperandIndex += Index;
+ }
+ return OperandIndex;
+}
+
+static bool findBuildAggregate_rec(Instruction *LastInsertInst,
+ TargetTransformInfo *TTI,
+ SmallVectorImpl<Value *> &BuildVectorOpds,
+ SmallVectorImpl<Value *> &InsertElts,
+ unsigned OperandOffset) {
+ do {
+ Value *InsertedOperand = LastInsertInst->getOperand(1);
+ Optional<unsigned> OperandIndex =
+ getOperandIndex(LastInsertInst, OperandOffset);
+ if (!OperandIndex)
+ return false;
+ if (isa<InsertElementInst>(InsertedOperand) ||
+ isa<InsertValueInst>(InsertedOperand)) {
+ if (!findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
+ BuildVectorOpds, InsertElts, *OperandIndex))
+ return false;
+ } else {
+ BuildVectorOpds[*OperandIndex] = InsertedOperand;
+ InsertElts[*OperandIndex] = LastInsertInst;
+ }
+ if (isa<UndefValue>(LastInsertInst->getOperand(0)))
+ return true;
+ LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
+ } while (LastInsertInst != nullptr &&
+ (isa<InsertValueInst>(LastInsertInst) ||
+ isa<InsertElementInst>(LastInsertInst)) &&
+ LastInsertInst->hasOneUse());
+ return false;
+}
/// Recognize construction of vectors like
-/// %ra = insertelement <4 x float> undef, float %s0, i32 0
+/// %ra = insertelement <4 x float> poison, float %s0, i32 0
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
/// starting from the last insertelement or insertvalue instruction.
///
-/// Also recognize aggregates like {<2 x float>, <2 x float>},
+/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
///
/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
///
/// \return true if it matches.
-static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
+static bool findBuildAggregate(Instruction *LastInsertInst,
+ TargetTransformInfo *TTI,
SmallVectorImpl<Value *> &BuildVectorOpds,
SmallVectorImpl<Value *> &InsertElts) {
+
assert((isa<InsertElementInst>(LastInsertInst) ||
isa<InsertValueInst>(LastInsertInst)) &&
"Expected insertelement or insertvalue instruction!");
- do {
- Value *InsertedOperand;
- auto *IE = dyn_cast<InsertElementInst>(LastInsertInst);
- if (IE) {
- InsertedOperand = IE->getOperand(1);
- LastInsertInst = IE->getOperand(0);
- } else {
- auto *IV = cast<InsertValueInst>(LastInsertInst);
- InsertedOperand = IV->getInsertedValueOperand();
- LastInsertInst = IV->getAggregateOperand();
- }
- if (isa<InsertElementInst>(InsertedOperand) ||
- isa<InsertValueInst>(InsertedOperand)) {
- SmallVector<Value *, 8> TmpBuildVectorOpds;
- SmallVector<Value *, 8> TmpInsertElts;
- if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds,
- TmpInsertElts))
- return false;
- BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(),
- TmpBuildVectorOpds.rend());
- InsertElts.append(TmpInsertElts.rbegin(), TmpInsertElts.rend());
- } else {
- BuildVectorOpds.push_back(InsertedOperand);
- InsertElts.push_back(IE);
- }
- if (isa<UndefValue>(LastInsertInst))
- break;
- if ((!isa<InsertValueInst>(LastInsertInst) &&
- !isa<InsertElementInst>(LastInsertInst)) ||
- !LastInsertInst->hasOneUse())
- return false;
- } while (true);
- std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
- std::reverse(InsertElts.begin(), InsertElts.end());
- return true;
+
+ assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
+ "Expected empty result vectors!");
+
+ Optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
+ if (!AggregateSize)
+ return false;
+ BuildVectorOpds.resize(*AggregateSize);
+ InsertElts.resize(*AggregateSize);
+
+ if (findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts,
+ 0)) {
+ llvm::erase_value(BuildVectorOpds, nullptr);
+ llvm::erase_value(InsertElts, nullptr);
+ if (BuildVectorOpds.size() >= 2)
+ return true;
+ }
+
+ return false;
}
static bool PhiTypeSorterFunc(Value *V, Value *V2) {
@@ -7195,6 +7267,16 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
return nullptr;
}
+static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
+ if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
+ return true;
+ if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
+ return true;
+ if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
+ return true;
+ return false;
+}
+
/// Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding the phi node \a P
/// with reduction operators \a Root (or one of its operands) in a basic block
@@ -7234,9 +7316,10 @@ static bool tryToVectorizeHorReductionOrInstOperands(
Instruction *Inst;
unsigned Level;
std::tie(Inst, Level) = Stack.pop_back_val();
- auto *BI = dyn_cast<BinaryOperator>(Inst);
- auto *SI = dyn_cast<SelectInst>(Inst);
- if (BI || SI) {
+ Value *B0, *B1;
+ bool IsBinop = matchRdxBop(Inst, B0, B1);
+ bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
+ if (IsBinop || IsSelect) {
HorizontalReduction HorRdx;
if (HorRdx.matchAssociativeReduction(P, Inst)) {
if (HorRdx.tryToReduce(R, TTI)) {
@@ -7247,10 +7330,10 @@ static bool tryToVectorizeHorReductionOrInstOperands(
continue;
}
}
- if (P && BI) {
- Inst = dyn_cast<Instruction>(BI->getOperand(0));
+ if (P && IsBinop) {
+ Inst = dyn_cast<Instruction>(B0);
if (Inst == P)
- Inst = dyn_cast<Instruction>(BI->getOperand(1));
+ Inst = dyn_cast<Instruction>(B1);
if (!Inst) {
// Set P to nullptr to avoid re-analysis of phi node in
// matchAssociativeReduction function unless this is the root node.
@@ -7283,9 +7366,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
BasicBlock *BB, BoUpSLP &R,
TargetTransformInfo *TTI) {
- if (!V)
- return false;
- auto *I = dyn_cast<Instruction>(V);
+ auto *I = dyn_cast_or_null<Instruction>(V);
if (!I)
return false;
@@ -7307,8 +7388,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
SmallVector<Value *, 16> BuildVectorOpds;
SmallVector<Value *, 16> BuildVectorInsts;
- if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts) ||
- BuildVectorOpds.size() < 2)
+ if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
return false;
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
@@ -7323,7 +7403,6 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
SmallVector<Value *, 16> BuildVectorInsts;
SmallVector<Value *, 16> BuildVectorOpds;
if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
- BuildVectorOpds.size() < 2 ||
(llvm::all_of(BuildVectorOpds,
[](Value *V) { return isa<ExtractElementInst>(V); }) &&
isShuffle(BuildVectorOpds)))
@@ -7369,7 +7448,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
SmallPtrSet<Value *, 16> VisitedInstrs;
- unsigned MaxVecRegSize = R.getMaxVecRegSize();
bool HaveVectorizedPhiNodes = true;
while (HaveVectorizedPhiNodes) {
@@ -7396,18 +7474,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Look for the next elements with the same type.
SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
- Type *EltTy = (*IncIt)->getType();
- unsigned EltSize = EltTy->isSized() ? DL->getTypeSizeInBits(EltTy)
- : MaxVecRegSize;
- unsigned MaxNumElts = MaxVecRegSize / EltSize;
- if (MaxNumElts < 2) {
- ++IncIt;
- continue;
- }
-
while (SameTypeIt != E &&
- (*SameTypeIt)->getType() == EltTy &&
- (SameTypeIt - IncIt) < MaxNumElts) {
+ (*SameTypeIt)->getType() == (*IncIt)->getType()) {
VisitedInstrs.insert(*SameTypeIt);
++SameTypeIt;
}
@@ -7439,12 +7507,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
SmallVector<Instruction *, 8> PostProcessInstructions;
SmallDenseSet<Instruction *, 4> KeyNodes;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ // Skip instructions with scalable type. The num of elements is unknown at
+ // compile-time for scalable type.
+ if (isa<ScalableVectorType>(it->getType()))
+ continue;
+
// Skip instructions marked for the deletion.
if (R.isDeleted(&*it))
continue;
// We may go through BB multiple times so skip the one we have checked.
if (!VisitedInstrs.insert(&*it).second) {
- if (it->use_empty() && KeyNodes.count(&*it) > 0 &&
+ if (it->use_empty() && KeyNodes.contains(&*it) &&
vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
@@ -7461,16 +7534,29 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Try to vectorize reductions that use PHINodes.
if (PHINode *P = dyn_cast<PHINode>(it)) {
// Check that the PHI is a reduction PHI.
- if (P->getNumIncomingValues() != 2)
- return Changed;
+ if (P->getNumIncomingValues() == 2) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
+ TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+ }
+ // Try to vectorize the incoming values of the PHI, to catch reductions
+ // that feed into PHIs.
+ for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
+ // Skip if the incoming block is the current BB for now. Also, bypass
+ // unreachable IR for efficiency and to avoid crashing.
+ // TODO: Collect the skipped incoming values and try to vectorize them
+ // after processing BB.
+ if (BB == P->getIncomingBlock(I) ||
+ !DT->isReachableFromEntry(P->getIncomingBlock(I)))
+ continue;
- // Try to match and vectorize a horizontal reduction.
- if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
- TTI)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
+ Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
+ P->getIncomingBlock(I), R, TTI);
}
continue;
}
@@ -7534,7 +7620,7 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
unsigned MaxElts = MaxVecRegSize / EltSize;
for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
auto Len = std::min<unsigned>(BE - BI, MaxElts);
- auto GEPList = makeArrayRef(&Entry.second[BI], Len);
+ ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
// Initialize a set a candidate getelementptrs. Note that we use a
// SetVector here to preserve program order. If the index computations
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 6f055ca80ff2..873701676067 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -61,17 +61,19 @@ class VPRecipeBuilder {
/// Check if the load or store instruction \p I should widened for \p
/// Range.Start and potentially masked. Such instructions are handled by a
/// recipe that takes an additional VPInstruction for the mask.
- VPWidenMemoryInstructionRecipe *
- tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan);
+ VPRecipeBase *tryToWidenMemory(Instruction *I, VFRange &Range,
+ VPlanPtr &Plan);
/// Check if an induction recipe should be constructed for \I. If so build and
/// return it. If not, return null.
- VPWidenIntOrFpInductionRecipe *tryToOptimizeInductionPHI(PHINode *Phi) const;
+ VPWidenIntOrFpInductionRecipe *tryToOptimizeInductionPHI(PHINode *Phi,
+ VPlan &Plan) const;
/// Optimize the special case where the operand of \p I is a constant integer
/// induction variable.
VPWidenIntOrFpInductionRecipe *
- tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range) const;
+ tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range,
+ VPlan &Plan) const;
/// Handle non-loop phi nodes. Currently all such phi nodes are turned into
/// a sequence of select instructions as the vectorizer currently performs
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index f5f28a3bffa1..b26399e0ae58 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -20,8 +20,10 @@
#include "VPlanDominatorTree.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -56,13 +58,69 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
return OS;
}
+VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def)
+ : SubclassID(SC), UnderlyingVal(UV), Def(Def) {
+ if (Def)
+ Def->addDefinedValue(this);
+}
+
+VPValue::~VPValue() {
+ assert(Users.empty() && "trying to delete a VPValue with remaining users");
+ if (Def)
+ Def->removeDefinedValue(this);
+}
+
void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
- if (const VPInstruction *Instr = dyn_cast<VPInstruction>(this))
- Instr->print(OS, SlotTracker);
+ if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def))
+ R->print(OS, "", SlotTracker);
else
printAsOperand(OS, SlotTracker);
}
+void VPValue::dump() const {
+ const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this->Def);
+ VPSlotTracker SlotTracker(
+ (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
+ print(dbgs(), SlotTracker);
+ dbgs() << "\n";
+}
+
+void VPDef::dump() const {
+ const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this);
+ VPSlotTracker SlotTracker(
+ (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
+ print(dbgs(), "", SlotTracker);
+ dbgs() << "\n";
+}
+
+VPUser *VPRecipeBase::toVPUser() {
+ if (auto *U = dyn_cast<VPInstruction>(this))
+ return U;
+ if (auto *U = dyn_cast<VPWidenRecipe>(this))
+ return U;
+ if (auto *U = dyn_cast<VPWidenCallRecipe>(this))
+ return U;
+ if (auto *U = dyn_cast<VPWidenSelectRecipe>(this))
+ return U;
+ if (auto *U = dyn_cast<VPWidenGEPRecipe>(this))
+ return U;
+ if (auto *U = dyn_cast<VPBlendRecipe>(this))
+ return U;
+ if (auto *U = dyn_cast<VPInterleaveRecipe>(this))
+ return U;
+ if (auto *U = dyn_cast<VPReplicateRecipe>(this))
+ return U;
+ if (auto *U = dyn_cast<VPBranchOnMaskRecipe>(this))
+ return U;
+ if (auto *U = dyn_cast<VPWidenMemoryInstructionRecipe>(this))
+ return U;
+ if (auto *U = dyn_cast<VPReductionRecipe>(this))
+ return U;
+ if (auto *U = dyn_cast<VPPredInstPHIRecipe>(this))
+ return U;
+ return nullptr;
+}
+
// Get the top-most entry block of \p Start. This is the entry block of the
// containing VPlan. This function is templated to support both const and non-const blocks
template <typename T> static T *getPlanEntry(T *Start) {
@@ -142,14 +200,43 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
}
void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
- SmallVector<VPBlockBase *, 8> Blocks;
- for (VPBlockBase *Block : depth_first(Entry))
- Blocks.push_back(Block);
+ SmallVector<VPBlockBase *, 8> Blocks(depth_first(Entry));
for (VPBlockBase *Block : Blocks)
delete Block;
}
+VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
+ iterator It = begin();
+ while (It != end() && (isa<VPWidenPHIRecipe>(&*It) ||
+ isa<VPWidenIntOrFpInductionRecipe>(&*It) ||
+ isa<VPPredInstPHIRecipe>(&*It) ||
+ isa<VPWidenCanonicalIVRecipe>(&*It)))
+ It++;
+ return It;
+}
+
+Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
+ if (!Def->getDef() && OrigLoop->isLoopInvariant(Def->getLiveInIRValue()))
+ return Def->getLiveInIRValue();
+
+ if (hasScalarValue(Def, Instance))
+ return Data.PerPartScalars[Def][Instance.Part][Instance.Lane];
+
+ if (hasVectorValue(Def, Instance.Part)) {
+ assert(Data.PerPartOutput.count(Def));
+ auto *VecPart = Data.PerPartOutput[Def][Instance.Part];
+ if (!VecPart->getType()->isVectorTy()) {
+ assert(Instance.Lane == 0 && "cannot get lane > 0 for scalar");
+ return VecPart;
+ }
+ // TODO: Cache created scalar values.
+ return Builder.CreateExtractElement(VecPart,
+ Builder.getInt32(Instance.Lane));
+ }
+ return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance);
+}
+
BasicBlock *
VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
// BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
@@ -267,6 +354,24 @@ void VPBasicBlock::execute(VPTransformState *State) {
LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
}
+void VPBasicBlock::dropAllReferences(VPValue *NewValue) {
+ for (VPRecipeBase &R : Recipes) {
+ for (auto *Def : R.definedValues())
+ Def->replaceAllUsesWith(NewValue);
+
+ if (auto *User = R.toVPUser())
+ for (unsigned I = 0, E = User->getNumOperands(); I != E; I++)
+ User->setOperand(I, NewValue);
+ }
+}
+
+void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
+ for (VPBlockBase *Block : depth_first(Entry))
+ // Drop all references in VPBasicBlocks and replace all uses with
+ // DummyValue.
+ Block->dropAllReferences(NewValue);
+}
+
void VPRegionBlock::execute(VPTransformState *State) {
ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry);
@@ -300,7 +405,9 @@ void VPRegionBlock::execute(VPTransformState *State) {
for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
State->Instance->Part = Part;
- for (unsigned Lane = 0, VF = State->VF; Lane < VF; ++Lane) {
+ assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
+ for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
+ ++Lane) {
State->Instance->Lane = Lane;
// Visit the VPBlocks connected to \p this, starting from it.
for (VPBlockBase *Block : RPOT) {
@@ -346,6 +453,14 @@ void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
insertAfter(InsertPos);
}
+void VPRecipeBase::moveBefore(VPBasicBlock &BB,
+ iplist<VPRecipeBase>::iterator I) {
+ assert(I == BB.end() || I->getParent() == &BB);
+ removeFromParent();
+ Parent = &BB;
+ BB.getRecipeList().insert(I, this);
+}
+
void VPInstruction::generateInstruction(VPTransformState &State,
unsigned Part) {
IRBuilder<> &Builder = State.Builder;
@@ -383,14 +498,14 @@ void VPInstruction::generateInstruction(VPTransformState &State,
case VPInstruction::ActiveLaneMask: {
// Get first lane of vector induction variable.
Value *VIVElem0 = State.get(getOperand(0), {Part, 0});
- // Get first lane of backedge-taken-count.
- Value *ScalarBTC = State.get(getOperand(1), {Part, 0});
+ // Get the original loop tripcount.
+ Value *ScalarTC = State.TripCount;
auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
- auto *PredTy = FixedVectorType::get(Int1Ty, State.VF);
+ auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue());
Instruction *Call = Builder.CreateIntrinsic(
- Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()},
- {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask");
+ Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
+ {VIVElem0, ScalarTC}, nullptr, "active.lane.mask");
State.set(this, Call, Part);
break;
}
@@ -405,18 +520,15 @@ void VPInstruction::execute(VPTransformState &State) {
generateInstruction(State, Part);
}
-void VPInstruction::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << "\"EMIT ";
- print(O, SlotTracker);
-}
-
-void VPInstruction::print(raw_ostream &O) const {
+void VPInstruction::dump() const {
VPSlotTracker SlotTracker(getParent()->getPlan());
- print(O, SlotTracker);
+ print(dbgs(), "", SlotTracker);
}
-void VPInstruction::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
+void VPInstruction::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "EMIT ";
+
if (hasResult()) {
printAsOperand(O, SlotTracker);
O << " = ";
@@ -461,7 +573,7 @@ void VPlan::execute(VPTransformState *State) {
"trip.count.minus.1");
auto VF = State->VF;
Value *VTCMO =
- VF == 1 ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
+ VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part)
State->set(BackedgeTakenCount, VTCMO, Part);
}
@@ -666,7 +778,7 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
// Dump the block predicate.
const VPValue *Pred = BasicBlock->getPredicate();
if (Pred) {
- OS << " +\n" << Indent << " \"BlockPredicate: ";
+ OS << " +\n" << Indent << " \"BlockPredicate: \"";
if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) {
PredI->printAsOperand(OS, SlotTracker);
OS << " (" << DOT::EscapeString(PredI->getParent()->getName())
@@ -676,7 +788,7 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
}
for (const VPRecipeBase &Recipe : *BasicBlock) {
- OS << " +\n" << Indent;
+ OS << " +\n" << Indent << "\"";
Recipe.print(OS, Indent, SlotTracker);
OS << "\\l\"";
}
@@ -715,7 +827,7 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
dumpEdges(Region);
}
-void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) {
+void VPlanPrinter::printAsIngredient(raw_ostream &O, const Value *V) {
std::string IngredientString;
raw_string_ostream RSO(IngredientString);
if (auto *Inst = dyn_cast<Instruction>(V)) {
@@ -738,24 +850,45 @@ void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) {
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << "\"WIDEN-CALL " << VPlanIngredient(&Ingredient);
+ O << "WIDEN-CALL ";
+
+ auto *CI = cast<CallInst>(getUnderlyingInstr());
+ if (CI->getType()->isVoidTy())
+ O << "void ";
+ else {
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ }
+
+ O << "call @" << CI->getCalledFunction()->getName() << "(";
+ printOperands(O, SlotTracker);
+ O << ")";
}
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << "\"WIDEN-SELECT" << VPlanIngredient(&Ingredient)
- << (InvariantCond ? " (condition is loop invariant)" : "");
+ O << "WIDEN-SELECT ";
+ printAsOperand(O, SlotTracker);
+ O << " = select ";
+ getOperand(0)->printAsOperand(O, SlotTracker);
+ O << ", ";
+ getOperand(1)->printAsOperand(O, SlotTracker);
+ O << ", ";
+ getOperand(2)->printAsOperand(O, SlotTracker);
+ O << (InvariantCond ? " (condition is loop invariant)" : "");
}
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << "\"WIDEN\\l\"";
- O << "\" " << VPlanIngredient(&Ingredient);
+ O << "WIDEN ";
+ printAsOperand(O, SlotTracker);
+ O << " = " << getUnderlyingInstr()->getOpcodeName() << " ";
+ printOperands(O, SlotTracker);
}
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << "\"WIDEN-INDUCTION";
+ O << "WIDEN-INDUCTION";
if (Trunc) {
O << "\\l\"";
O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
@@ -766,23 +899,26 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << "\"WIDEN-GEP ";
+ O << "WIDEN-GEP ";
O << (IsPtrLoopInvariant ? "Inv" : "Var");
size_t IndicesNumber = IsIndexLoopInvariant.size();
for (size_t I = 0; I < IndicesNumber; ++I)
O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]";
- O << "\\l\"";
- O << " +\n" << Indent << "\" " << VPlanIngredient(GEP);
+
+ O << " ";
+ printAsOperand(O, SlotTracker);
+ O << " = getelementptr ";
+ printOperands(O, SlotTracker);
}
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << "\"WIDEN-PHI " << VPlanIngredient(Phi);
+ O << "WIDEN-PHI " << VPlanIngredient(Phi);
}
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << "\"BLEND ";
+ O << "BLEND ";
Phi->printAsOperand(O, false);
O << " =";
if (getNumIncomingValues() == 1) {
@@ -800,46 +936,75 @@ void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
}
}
+void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "REDUCE ";
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ getChainOp()->printAsOperand(O, SlotTracker);
+ O << " + reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode())
+ << " (";
+ getVecOp()->printAsOperand(O, SlotTracker);
+ if (getCondOp()) {
+ O << ", ";
+ getCondOp()->printAsOperand(O, SlotTracker);
+ }
+ O << ")";
+}
+
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << "\"" << (IsUniform ? "CLONE " : "REPLICATE ")
- << VPlanIngredient(Ingredient);
+ O << (IsUniform ? "CLONE " : "REPLICATE ");
+
+ if (!getUnderlyingInstr()->getType()->isVoidTy()) {
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ }
+ O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " ";
+ printOperands(O, SlotTracker);
+
if (AlsoPack)
O << " (S->V)";
}
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst);
+ O << "PHI-PREDICATED-INSTRUCTION ";
+ printOperands(O, SlotTracker);
}
void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << "\"WIDEN " << VPlanIngredient(&Instr);
- O << ", ";
- getAddr()->printAsOperand(O, SlotTracker);
- VPValue *Mask = getMask();
- if (Mask) {
- O << ", ";
- Mask->printAsOperand(O, SlotTracker);
+ O << "WIDEN ";
+
+ if (!isStore()) {
+ getVPValue()->printAsOperand(O, SlotTracker);
+ O << " = ";
}
+ O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
+
+ printOperands(O, SlotTracker);
}
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
Value *CanonicalIV = State.CanonicalIV;
Type *STy = CanonicalIV->getType();
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
- auto VF = State.VF;
- Value *VStart = VF == 1
+ ElementCount VF = State.VF;
+ assert(!VF.isScalable() && "the code following assumes non scalables ECs");
+ Value *VStart = VF.isScalar()
? CanonicalIV
- : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
+ : Builder.CreateVectorSplat(VF.getKnownMinValue(),
+ CanonicalIV, "broadcast");
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
SmallVector<Constant *, 8> Indices;
- for (unsigned Lane = 0; Lane < VF; ++Lane)
- Indices.push_back(ConstantInt::get(STy, Part * VF + Lane));
+ for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
+ Indices.push_back(
+ ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane));
// If VF == 1, there is only one iteration in the loop above, thus the
// element pushed back into Indices is ConstantInt::get(STy, Part)
- Constant *VStep = VF == 1 ? Indices.back() : ConstantVector::get(Indices);
+ Constant *VStep =
+ VF.isScalar() ? Indices.back() : ConstantVector::get(Indices);
// Add the consecutive indices to the vector value.
Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
State.set(getVPValue(), CanonicalVectorIV, Part);
@@ -848,7 +1013,7 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << "\"EMIT ";
+ O << "EMIT ";
getVPValue()->printAsOperand(O, SlotTracker);
O << " = WIDEN-CANONICAL-INDUCTION";
}
@@ -856,10 +1021,18 @@ void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
void VPValue::replaceAllUsesWith(VPValue *New) {
- for (VPUser *User : users())
+ for (unsigned J = 0; J < getNumUsers();) {
+ VPUser *User = Users[J];
+ unsigned NumUsers = getNumUsers();
for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
if (User->getOperand(I) == this)
User->setOperand(I, New);
+ // If a user got removed after updating the current user, the next user to
+ // update will be moved to the current position, so we only need to
+ // increment the index if the number of users did not change.
+ if (NumUsers == getNumUsers())
+ J++;
+ }
}
void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
@@ -877,6 +1050,12 @@ void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
OS << "vp<%" << Tracker.getSlot(this) << ">";
}
+void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const {
+ interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
+ Op->printAsOperand(O, SlotTracker);
+ });
+}
+
void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
Old2NewTy &Old2New,
InterleavedAccessInfo &IAI) {
@@ -925,13 +1104,6 @@ VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
void VPSlotTracker::assignSlot(const VPValue *V) {
assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!");
- const Value *UV = V->getUnderlyingValue();
- if (UV)
- return;
- const auto *VPI = dyn_cast<VPInstruction>(V);
- if (VPI && !VPI->hasResult())
- return;
-
Slots[V] = NextSlot++;
}
@@ -950,10 +1122,8 @@ void VPSlotTracker::assignSlots(const VPRegionBlock *Region) {
void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) {
for (const VPRecipeBase &Recipe : *VPBB) {
- if (const auto *VPI = dyn_cast<VPInstruction>(&Recipe))
- assignSlot(VPI);
- else if (const auto *VPIV = dyn_cast<VPWidenCanonicalIVRecipe>(&Recipe))
- assignSlot(VPIV->getVPValue());
+ for (VPValue *Def : Recipe.definedValues())
+ assignSlot(Def);
}
}
@@ -962,10 +1132,6 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) {
for (const VPValue *V : Plan.VPExternalDefs)
assignSlot(V);
- for (auto &E : Plan.Value2VPValue)
- if (!isa<VPInstruction>(E.second))
- assignSlot(E.second);
-
for (const VPValue *V : Plan.VPCBVs)
assignSlot(V);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f07c94e7a3c7..2cce127cd4ce 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -51,13 +51,12 @@ namespace llvm {
class BasicBlock;
class DominatorTree;
class InnerLoopVectorizer;
-template <class T> class InterleaveGroup;
class LoopInfo;
class raw_ostream;
+class RecurrenceDescriptor;
class Value;
class VPBasicBlock;
class VPRegionBlock;
-class VPSlotTracker;
class VPlan;
class VPlanSlp;
@@ -66,10 +65,22 @@ class VPlanSlp;
/// [1, 9) = {1, 2, 4, 8}
struct VFRange {
// A power of 2.
- const unsigned Start;
+ const ElementCount Start;
// Need not be a power of 2. If End <= Start range is empty.
- unsigned End;
+ ElementCount End;
+
+ bool isEmpty() const {
+ return End.getKnownMinValue() <= Start.getKnownMinValue();
+ }
+
+ VFRange(const ElementCount &Start, const ElementCount &End)
+ : Start(Start), End(End) {
+ assert(Start.isScalable() == End.isScalable() &&
+ "Both Start and End should have the same scalable flag");
+ assert(isPowerOf2_32(Start.getKnownMinValue()) &&
+ "Expected Start to be a power of 2");
+ }
};
using VPlanPtr = std::unique_ptr<VPlan>;
@@ -114,7 +125,7 @@ private:
/// The vectorization factor. Each entry in the scalar map contains UF x VF
/// scalar values.
- unsigned VF;
+ ElementCount VF;
/// The vector and scalar map storage. We use std::map and not DenseMap
/// because insertions to DenseMap invalidate its iterators.
@@ -125,7 +136,7 @@ private:
public:
/// Construct an empty map with the given unroll and vectorization factors.
- VectorizerValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {}
+ VectorizerValueMap(unsigned UF, ElementCount VF) : UF(UF), VF(VF) {}
/// \return True if the map has any vector entry for \p Key.
bool hasAnyVectorValue(Value *Key) const {
@@ -150,12 +161,14 @@ public:
/// \return True if the map has a scalar entry for \p Key and \p Instance.
bool hasScalarValue(Value *Key, const VPIteration &Instance) const {
assert(Instance.Part < UF && "Queried Scalar Part is too large.");
- assert(Instance.Lane < VF && "Queried Scalar Lane is too large.");
+ assert(Instance.Lane < VF.getKnownMinValue() &&
+ "Queried Scalar Lane is too large.");
+
if (!hasAnyScalarValue(Key))
return false;
const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
- assert(Entry[Instance.Part].size() == VF &&
+ assert(Entry[Instance.Part].size() == VF.getKnownMinValue() &&
"ScalarParts has wrong dimensions.");
return Entry[Instance.Part][Instance.Lane] != nullptr;
}
@@ -194,7 +207,7 @@ public:
// TODO: Consider storing uniform values only per-part, as they occupy
// lane 0 only, keeping the other VF-1 redundant entries null.
for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part].resize(VF, nullptr);
+ Entry[Part].resize(VF.getKnownMinValue(), nullptr);
ScalarMapStorage[Key] = Entry;
}
ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
@@ -233,14 +246,15 @@ struct VPCallback {
/// VPTransformState holds information passed down when "executing" a VPlan,
/// needed for generating the output IR.
struct VPTransformState {
- VPTransformState(unsigned VF, unsigned UF, LoopInfo *LI, DominatorTree *DT,
- IRBuilder<> &Builder, VectorizerValueMap &ValueMap,
- InnerLoopVectorizer *ILV, VPCallback &Callback)
- : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder),
- ValueMap(ValueMap), ILV(ILV), Callback(Callback) {}
+ VPTransformState(ElementCount VF, unsigned UF, Loop *OrigLoop, LoopInfo *LI,
+ DominatorTree *DT, IRBuilder<> &Builder,
+ VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV,
+ VPCallback &Callback)
+ : VF(VF), UF(UF), Instance(), OrigLoop(OrigLoop), LI(LI), DT(DT),
+ Builder(Builder), ValueMap(ValueMap), ILV(ILV), Callback(Callback) {}
/// The chosen Vectorization and Unroll Factors of the loop being vectorized.
- unsigned VF;
+ ElementCount VF;
unsigned UF;
/// Hold the indices to generate specific scalar instructions. Null indicates
@@ -255,6 +269,9 @@ struct VPTransformState {
typedef SmallVector<Value *, 2> PerPartValuesTy;
DenseMap<VPValue *, PerPartValuesTy> PerPartOutput;
+
+ using ScalarsPerPartValuesTy = SmallVector<SmallVector<Value *, 4>, 2>;
+ DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars;
} Data;
/// Get the generated Value for a given VPValue and a given Part. Note that
@@ -271,20 +288,21 @@ struct VPTransformState {
}
/// Get the generated Value for a given VPValue and given Part and Lane.
- Value *get(VPValue *Def, const VPIteration &Instance) {
- // If the Def is managed directly by VPTransformState, extract the lane from
- // the relevant part. Note that currently only VPInstructions and external
- // defs are managed by VPTransformState. Other Defs are still created by ILV
- // and managed in its ValueMap. For those this method currently just
- // delegates the call to ILV below.
- if (Data.PerPartOutput.count(Def)) {
- auto *VecPart = Data.PerPartOutput[Def][Instance.Part];
- // TODO: Cache created scalar values.
- return Builder.CreateExtractElement(VecPart,
- Builder.getInt32(Instance.Lane));
- }
+ Value *get(VPValue *Def, const VPIteration &Instance);
+
+ bool hasVectorValue(VPValue *Def, unsigned Part) {
+ auto I = Data.PerPartOutput.find(Def);
+ return I != Data.PerPartOutput.end() && Part < I->second.size() &&
+ I->second[Part];
+ }
- return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance);
+ bool hasScalarValue(VPValue *Def, VPIteration Instance) {
+ auto I = Data.PerPartScalars.find(Def);
+ if (I == Data.PerPartScalars.end())
+ return false;
+ return Instance.Part < I->second.size() &&
+ Instance.Lane < I->second[Instance.Part].size() &&
+ I->second[Instance.Part][Instance.Lane];
}
/// Set the generated Value for a given VPValue and a given Part.
@@ -295,6 +313,18 @@ struct VPTransformState {
}
Data.PerPartOutput[Def][Part] = V;
}
+ void set(VPValue *Def, Value *IRDef, Value *V, unsigned Part);
+
+ void set(VPValue *Def, Value *V, const VPIteration &Instance) {
+ auto Iter = Data.PerPartScalars.insert({Def, {}});
+ auto &PerPartVec = Iter.first->second;
+ while (PerPartVec.size() <= Instance.Part)
+ PerPartVec.emplace_back();
+ auto &Scalars = PerPartVec[Instance.Part];
+ while (Scalars.size() <= Instance.Lane)
+ Scalars.push_back(nullptr);
+ Scalars[Instance.Lane] = V;
+ }
/// Hold state information used when constructing the CFG of the output IR,
/// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
@@ -321,6 +351,9 @@ struct VPTransformState {
CFGState() = default;
} CFG;
+ /// Hold a pointer to the original loop.
+ Loop *OrigLoop;
+
/// Hold a pointer to LoopInfo to register new basic blocks in the loop.
LoopInfo *LI;
@@ -394,14 +427,14 @@ class VPBlockBase {
/// Remove \p Predecessor from the predecessors of this block.
void removePredecessor(VPBlockBase *Predecessor) {
- auto Pos = std::find(Predecessors.begin(), Predecessors.end(), Predecessor);
+ auto Pos = find(Predecessors, Predecessor);
assert(Pos && "Predecessor does not exist");
Predecessors.erase(Pos);
}
/// Remove \p Successor from the successors of this block.
void removeSuccessor(VPBlockBase *Successor) {
- auto Pos = std::find(Successors.begin(), Successors.end(), Successor);
+ auto Pos = find(Successors, Successor);
assert(Pos && "Successor does not exist");
Successors.erase(Pos);
}
@@ -594,49 +627,30 @@ public:
// hoisted into a VPBlockBase.
return true;
}
+
+ /// Replace all operands of VPUsers in the block with \p NewValue and also
+ /// replaces all uses of VPValues defined in the block with NewValue.
+ virtual void dropAllReferences(VPValue *NewValue) = 0;
};
/// VPRecipeBase is a base class modeling a sequence of one or more output IR
-/// instructions.
-class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> {
+/// instructions. VPRecipeBase owns the the VPValues it defines through VPDef
+/// and is responsible for deleting its defined values. Single-value
+/// VPRecipeBases that also inherit from VPValue must make sure to inherit from
+/// VPRecipeBase before VPValue.
+class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
+ public VPDef {
friend VPBasicBlock;
friend class VPBlockUtils;
- const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
/// Each VPRecipe belongs to a single VPBasicBlock.
VPBasicBlock *Parent = nullptr;
public:
- /// An enumeration for keeping track of the concrete subclass of VPRecipeBase
- /// that is actually instantiated. Values of this enumeration are kept in the
- /// SubclassID field of the VPRecipeBase objects. They are used for concrete
- /// type identification.
- using VPRecipeTy = enum {
- VPBlendSC,
- VPBranchOnMaskSC,
- VPInstructionSC,
- VPInterleaveSC,
- VPPredInstPHISC,
- VPReplicateSC,
- VPWidenCallSC,
- VPWidenCanonicalIVSC,
- VPWidenGEPSC,
- VPWidenIntOrFpInductionSC,
- VPWidenMemoryInstructionSC,
- VPWidenPHISC,
- VPWidenSC,
- VPWidenSelectSC
- };
-
- VPRecipeBase(const unsigned char SC) : SubclassID(SC) {}
+ VPRecipeBase(const unsigned char SC) : VPDef(SC) {}
virtual ~VPRecipeBase() = default;
- /// \return an ID for the concrete type of this object.
- /// This is used to implement the classof checks. This should not be used
- /// for any other purpose, as the values may change as LLVM evolves.
- unsigned getVPRecipeID() const { return SubclassID; }
-
/// \return the VPBasicBlock which this VPRecipe belongs to.
VPBasicBlock *getParent() { return Parent; }
const VPBasicBlock *getParent() const { return Parent; }
@@ -645,10 +659,6 @@ public:
/// this VPRecipe, thereby "executing" the VPlan.
virtual void execute(struct VPTransformState &State) = 0;
- /// Each recipe prints itself.
- virtual void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const = 0;
-
/// Insert an unlinked recipe into a basic block immediately before
/// the specified recipe.
void insertBefore(VPRecipeBase *InsertPos);
@@ -661,6 +671,11 @@ public:
/// the VPBasicBlock that MovePos lives in, right after MovePos.
void moveAfter(VPRecipeBase *MovePos);
+ /// Unlink this recipe and insert into BB before I.
+ ///
+ /// \pre I is a valid iterator into BB.
+ void moveBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator I);
+
/// This method unlinks 'this' from the containing basic block, but does not
/// delete it.
void removeFromParent();
@@ -669,13 +684,46 @@ public:
///
/// \returns an iterator pointing to the element after the erased one
iplist<VPRecipeBase>::iterator eraseFromParent();
+
+ /// Returns a pointer to a VPUser, if the recipe inherits from VPUser or
+ /// nullptr otherwise.
+ VPUser *toVPUser();
+
+ /// Returns the underlying instruction, if the recipe is a VPValue or nullptr
+ /// otherwise.
+ Instruction *getUnderlyingInstr() {
+ return cast<Instruction>(getVPValue()->getUnderlyingValue());
+ }
+ const Instruction *getUnderlyingInstr() const {
+ return cast<Instruction>(getVPValue()->getUnderlyingValue());
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPDef *D) {
+ // All VPDefs are also VPRecipeBases.
+ return true;
+ }
};
+inline bool VPUser::classof(const VPDef *Def) {
+ return Def->getVPDefID() == VPRecipeBase::VPInstructionSC ||
+ Def->getVPDefID() == VPRecipeBase::VPWidenSC ||
+ Def->getVPDefID() == VPRecipeBase::VPWidenCallSC ||
+ Def->getVPDefID() == VPRecipeBase::VPWidenSelectSC ||
+ Def->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
+ Def->getVPDefID() == VPRecipeBase::VPBlendSC ||
+ Def->getVPDefID() == VPRecipeBase::VPInterleaveSC ||
+ Def->getVPDefID() == VPRecipeBase::VPReplicateSC ||
+ Def->getVPDefID() == VPRecipeBase::VPReductionSC ||
+ Def->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC ||
+ Def->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC;
+}
+
/// This is a concrete Recipe that models a single VPlan-level instruction.
/// While as any Recipe it may generate a sequence of IR instructions when
/// executed, these instructions would always form a single-def expression as
/// the VPInstruction is also a single def-use vertex.
-class VPInstruction : public VPUser, public VPRecipeBase {
+class VPInstruction : public VPRecipeBase, public VPUser, public VPValue {
friend class VPlanSlp;
public:
@@ -697,23 +745,26 @@ private:
void generateInstruction(VPTransformState &State, unsigned Part);
protected:
- Instruction *getUnderlyingInstr() {
- return cast_or_null<Instruction>(getUnderlyingValue());
- }
-
void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
public:
VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
- : VPUser(VPValue::VPInstructionSC, Operands),
- VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode) {}
+ : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser(Operands),
+ VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {}
+
+ VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands)
+ : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser({}),
+ VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {
+ for (auto *I : Operands)
+ addOperand(I->getVPValue());
+ }
VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
: VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {}
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPValue *V) {
- return V->getVPValueID() == VPValue::VPInstructionSC;
+ return V->getVPValueID() == VPValue::VPVInstructionSC;
}
VPInstruction *clone() const {
@@ -722,8 +773,8 @@ public:
}
/// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPRecipeBase *R) {
- return R->getVPRecipeID() == VPRecipeBase::VPInstructionSC;
+ static inline bool classof(const VPDef *R) {
+ return R->getVPDefID() == VPRecipeBase::VPInstructionSC;
}
unsigned getOpcode() const { return Opcode; }
@@ -733,13 +784,12 @@ public:
/// provided.
void execute(VPTransformState &State) override;
- /// Print the Recipe.
+ /// Print the VPInstruction to \p O.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
- /// Print the VPInstruction.
- void print(raw_ostream &O) const;
- void print(raw_ostream &O, VPSlotTracker &SlotTracker) const;
+ /// Print the VPInstruction to dbgs() (for debugging).
+ void dump() const;
/// Return true if this instruction may modify memory.
bool mayWriteToMemory() const {
@@ -773,23 +823,21 @@ public:
/// VPWidenRecipe is a recipe for producing a copy of vector type its
/// ingredient. This recipe covers most of the traditional vectorization cases
/// where each ingredient transforms into a vectorized version of itself.
-class VPWidenRecipe : public VPRecipeBase {
- /// Hold the instruction to be widened.
- Instruction &Ingredient;
-
- /// Hold VPValues for the operands of the ingredient.
- VPUser User;
-
+class VPWidenRecipe : public VPRecipeBase, public VPValue, public VPUser {
public:
template <typename IterT>
VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
- : VPRecipeBase(VPWidenSC), Ingredient(I), User(Operands) {}
+ : VPRecipeBase(VPRecipeBase::VPWidenSC),
+ VPValue(VPValue::VPVWidenSC, &I, this), VPUser(Operands) {}
~VPWidenRecipe() override = default;
/// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPRecipeBase *V) {
- return V->getVPRecipeID() == VPRecipeBase::VPWidenSC;
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPWidenSC;
+ }
+ static inline bool classof(const VPValue *V) {
+ return V->getVPValueID() == VPValue::VPVWidenSC;
}
/// Produce widened copies of all Ingredients.
@@ -801,23 +849,19 @@ public:
};
/// A recipe for widening Call instructions.
-class VPWidenCallRecipe : public VPRecipeBase {
- /// Hold the call to be widened.
- CallInst &Ingredient;
-
- /// Hold VPValues for the arguments of the call.
- VPUser User;
+class VPWidenCallRecipe : public VPRecipeBase, public VPUser, public VPValue {
public:
template <typename IterT>
VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments)
- : VPRecipeBase(VPWidenCallSC), Ingredient(I), User(CallArguments) {}
+ : VPRecipeBase(VPRecipeBase::VPWidenCallSC), VPUser(CallArguments),
+ VPValue(VPValue::VPVWidenCallSC, &I, this) {}
~VPWidenCallRecipe() override = default;
/// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPRecipeBase *V) {
- return V->getVPRecipeID() == VPRecipeBase::VPWidenCallSC;
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPWidenCallSC;
}
/// Produce a widened version of the call instruction.
@@ -829,13 +873,7 @@ public:
};
/// A recipe for widening select instructions.
-class VPWidenSelectRecipe : public VPRecipeBase {
-private:
- /// Hold the select to be widened.
- SelectInst &Ingredient;
-
- /// Hold VPValues for the operands of the select.
- VPUser User;
+class VPWidenSelectRecipe : public VPRecipeBase, public VPUser, public VPValue {
/// Is the condition of the select loop invariant?
bool InvariantCond;
@@ -844,14 +882,15 @@ public:
template <typename IterT>
VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands,
bool InvariantCond)
- : VPRecipeBase(VPWidenSelectSC), Ingredient(I), User(Operands),
+ : VPRecipeBase(VPRecipeBase::VPWidenSelectSC), VPUser(Operands),
+ VPValue(VPValue::VPVWidenSelectSC, &I, this),
InvariantCond(InvariantCond) {}
~VPWidenSelectRecipe() override = default;
/// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPRecipeBase *V) {
- return V->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC;
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPWidenSelectSC;
}
/// Produce a widened version of the select instruction.
@@ -863,20 +902,24 @@ public:
};
/// A recipe for handling GEP instructions.
-class VPWidenGEPRecipe : public VPRecipeBase {
- GetElementPtrInst *GEP;
-
- /// Hold VPValues for the base and indices of the GEP.
- VPUser User;
-
+class VPWidenGEPRecipe : public VPRecipeBase,
+ public VPUser,
+ public VPValue {
bool IsPtrLoopInvariant;
SmallBitVector IsIndexLoopInvariant;
public:
template <typename IterT>
+ VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands)
+ : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands),
+ VPValue(VPWidenGEPSC, GEP, this),
+ IsIndexLoopInvariant(GEP->getNumIndices(), false) {}
+
+ template <typename IterT>
VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands,
Loop *OrigLoop)
- : VPRecipeBase(VPWidenGEPSC), GEP(GEP), User(Operands),
+ : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands),
+ VPValue(VPValue::VPVWidenGEPSC, GEP, this),
IsIndexLoopInvariant(GEP->getNumIndices(), false) {
IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand());
for (auto Index : enumerate(GEP->indices()))
@@ -886,8 +929,8 @@ public:
~VPWidenGEPRecipe() override = default;
/// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPRecipeBase *V) {
- return V->getVPRecipeID() == VPRecipeBase::VPWidenGEPSC;
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPWidenGEPSC;
}
/// Generate the gep nodes.
@@ -900,18 +943,25 @@ public:
/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their vector and scalar values.
-class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
+class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPUser {
PHINode *IV;
TruncInst *Trunc;
public:
- VPWidenIntOrFpInductionRecipe(PHINode *IV, TruncInst *Trunc = nullptr)
- : VPRecipeBase(VPWidenIntOrFpInductionSC), IV(IV), Trunc(Trunc) {}
+ VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
+ TruncInst *Trunc = nullptr)
+ : VPRecipeBase(VPWidenIntOrFpInductionSC), VPUser({Start}), IV(IV),
+ Trunc(Trunc) {
+ if (Trunc)
+ new VPValue(Trunc, this);
+ else
+ new VPValue(IV, this);
+ }
~VPWidenIntOrFpInductionRecipe() override = default;
/// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPRecipeBase *V) {
- return V->getVPRecipeID() == VPRecipeBase::VPWidenIntOrFpInductionSC;
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC;
}
/// Generate the vectorized and scalarized versions of the phi node as
@@ -921,19 +971,38 @@ public:
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
+
+ /// Returns the start value of the induction.
+ VPValue *getStartValue() { return getOperand(0); }
};
/// A recipe for handling all phi nodes except for integer and FP inductions.
-class VPWidenPHIRecipe : public VPRecipeBase {
+/// For reduction PHIs, RdxDesc must point to the corresponding recurrence
+/// descriptor and the start value is the first operand of the recipe.
+class VPWidenPHIRecipe : public VPRecipeBase, public VPUser {
PHINode *Phi;
+ /// Descriptor for a reduction PHI.
+ RecurrenceDescriptor *RdxDesc = nullptr;
+
public:
- VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {}
+ /// Create a new VPWidenPHIRecipe for the reduction \p Phi described by \p
+ /// RdxDesc.
+ VPWidenPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, VPValue &Start)
+ : VPWidenPHIRecipe(Phi) {
+ this->RdxDesc = &RdxDesc;
+ addOperand(&Start);
+ }
+
+ /// Create a VPWidenPHIRecipe for \p Phi
+ VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {
+ new VPValue(Phi, this);
+ }
~VPWidenPHIRecipe() override = default;
/// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPRecipeBase *V) {
- return V->getVPRecipeID() == VPRecipeBase::VPWidenPHISC;
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPWidenPHISC;
}
/// Generate the phi/select nodes.
@@ -942,21 +1011,25 @@ public:
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
+
+ /// Returns the start value of the phi, if it is a reduction.
+ VPValue *getStartValue() {
+ return getNumOperands() == 0 ? nullptr : getOperand(0);
+ }
};
/// A recipe for vectorizing a phi-node as a sequence of mask-based select
/// instructions.
-class VPBlendRecipe : public VPRecipeBase {
+class VPBlendRecipe : public VPRecipeBase, public VPUser {
PHINode *Phi;
+public:
/// The blend operation is a User of the incoming values and of their
/// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value
/// might be incoming with a full mask for which there is no VPValue.
- VPUser User;
-
-public:
VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
- : VPRecipeBase(VPBlendSC), Phi(Phi), User(Operands) {
+ : VPRecipeBase(VPBlendSC), VPUser(Operands), Phi(Phi) {
+ new VPValue(Phi, this);
assert(Operands.size() > 0 &&
((Operands.size() == 1) || (Operands.size() % 2 == 0)) &&
"Expected either a single incoming value or a positive even number "
@@ -964,23 +1037,19 @@ public:
}
/// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPRecipeBase *V) {
- return V->getVPRecipeID() == VPRecipeBase::VPBlendSC;
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPBlendSC;
}
/// Return the number of incoming values, taking into account that a single
/// incoming value has no mask.
- unsigned getNumIncomingValues() const {
- return (User.getNumOperands() + 1) / 2;
- }
+ unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; }
/// Return incoming value number \p Idx.
- VPValue *getIncomingValue(unsigned Idx) const {
- return User.getOperand(Idx * 2);
- }
+ VPValue *getIncomingValue(unsigned Idx) const { return getOperand(Idx * 2); }
/// Return mask number \p Idx.
- VPValue *getMask(unsigned Idx) const { return User.getOperand(Idx * 2 + 1); }
+ VPValue *getMask(unsigned Idx) const { return getOperand(Idx * 2 + 1); }
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
@@ -991,35 +1060,58 @@ public:
};
/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
-/// or stores into one wide load/store and shuffles.
-class VPInterleaveRecipe : public VPRecipeBase {
+/// or stores into one wide load/store and shuffles. The first operand of a
+/// VPInterleave recipe is the address, followed by the stored values, followed
+/// by an optional mask.
+class VPInterleaveRecipe : public VPRecipeBase, public VPUser {
const InterleaveGroup<Instruction> *IG;
- VPUser User;
+
+ bool HasMask = false;
public:
VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
- VPValue *Mask)
- : VPRecipeBase(VPInterleaveSC), IG(IG), User({Addr}) {
- if (Mask)
- User.addOperand(Mask);
+ ArrayRef<VPValue *> StoredValues, VPValue *Mask)
+ : VPRecipeBase(VPInterleaveSC), VPUser(Addr), IG(IG) {
+ for (unsigned i = 0; i < IG->getFactor(); ++i)
+ if (Instruction *I = IG->getMember(i)) {
+ if (I->getType()->isVoidTy())
+ continue;
+ new VPValue(I, this);
+ }
+
+ for (auto *SV : StoredValues)
+ addOperand(SV);
+ if (Mask) {
+ HasMask = true;
+ addOperand(Mask);
+ }
}
~VPInterleaveRecipe() override = default;
/// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPRecipeBase *V) {
- return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC;
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPInterleaveSC;
}
/// Return the address accessed by this recipe.
VPValue *getAddr() const {
- return User.getOperand(0); // Address is the 1st, mandatory operand.
+ return getOperand(0); // Address is the 1st, mandatory operand.
}
/// Return the mask used by this recipe. Note that a full mask is represented
/// by a nullptr.
VPValue *getMask() const {
// Mask is optional and therefore the last, currently 2nd operand.
- return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr;
+ return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
+ }
+
+ /// Return the VPValues stored by this interleave group. If it is a load
+ /// interleave group, return an empty ArrayRef.
+ ArrayRef<VPValue *> getStoredValues() const {
+ // The first operand is the address, followed by the stored values, followed
+ // by an optional mask.
+ return ArrayRef<VPValue *>(op_begin(), getNumOperands())
+ .slice(1, getNumOperands() - (HasMask ? 2 : 1));
}
/// Generate the wide load or store, and shuffles.
@@ -1032,17 +1124,61 @@ public:
const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
};
+/// A recipe to represent inloop reduction operations, performing a reduction on
+/// a vector operand into a scalar value, and adding the result to a chain.
+/// The Operands are {ChainOp, VecOp, [Condition]}.
+class VPReductionRecipe : public VPRecipeBase, public VPUser, public VPValue {
+ /// The recurrence decriptor for the reduction in question.
+ RecurrenceDescriptor *RdxDesc;
+ /// Fast math flags to use for the resulting reduction operation.
+ bool NoNaN;
+ /// Pointer to the TTI, needed to create the target reduction
+ const TargetTransformInfo *TTI;
+
+public:
+ VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp,
+ VPValue *VecOp, VPValue *CondOp, bool NoNaN,
+ const TargetTransformInfo *TTI)
+ : VPRecipeBase(VPRecipeBase::VPReductionSC), VPUser({ChainOp, VecOp}),
+ VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), NoNaN(NoNaN),
+ TTI(TTI) {
+ if (CondOp)
+ addOperand(CondOp);
+ }
+
+ ~VPReductionRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPValue *V) {
+ return V->getVPValueID() == VPValue::VPVReductionSC;
+ }
+
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPReductionSC;
+ }
+
+ /// Generate the reduction in the loop
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+
+ /// The VPValue of the scalar Chain being accumulated.
+ VPValue *getChainOp() const { return getOperand(0); }
+ /// The VPValue of the vector value to be reduced.
+ VPValue *getVecOp() const { return getOperand(1); }
+ /// The VPValue of the condition for the block.
+ VPValue *getCondOp() const {
+ return getNumOperands() > 2 ? getOperand(2) : nullptr;
+ }
+};
+
/// VPReplicateRecipe replicates a given instruction producing multiple scalar
/// copies of the original scalar type, one per lane, instead of producing a
/// single copy of widened type for all lanes. If the instruction is known to be
/// uniform only one copy, per lane zero, will be generated.
-class VPReplicateRecipe : public VPRecipeBase {
- /// The instruction being replicated.
- Instruction *Ingredient;
-
- /// Hold VPValues for the operands of the ingredient.
- VPUser User;
-
+class VPReplicateRecipe : public VPRecipeBase, public VPUser, public VPValue {
/// Indicator if only a single replica per lane is needed.
bool IsUniform;
@@ -1056,8 +1192,9 @@ public:
template <typename IterT>
VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
bool IsUniform, bool IsPredicated = false)
- : VPRecipeBase(VPReplicateSC), Ingredient(I), User(Operands),
- IsUniform(IsUniform), IsPredicated(IsPredicated) {
+ : VPRecipeBase(VPReplicateSC), VPUser(Operands),
+ VPValue(VPVReplicateSC, I, this), IsUniform(IsUniform),
+ IsPredicated(IsPredicated) {
// Retain the previous behavior of predicateInstructions(), where an
// insert-element of a predicated instruction got hoisted into the
// predicated basic block iff it was its only user. This is achieved by
@@ -1069,8 +1206,12 @@ public:
~VPReplicateRecipe() override = default;
/// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPRecipeBase *V) {
- return V->getVPRecipeID() == VPRecipeBase::VPReplicateSC;
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPReplicateSC;
+ }
+
+ static inline bool classof(const VPValue *V) {
+ return V->getVPValueID() == VPValue::VPVReplicateSC;
}
/// Generate replicas of the desired Ingredient. Replicas will be generated
@@ -1083,21 +1224,21 @@ public:
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
+
+ bool isUniform() const { return IsUniform; }
};
/// A recipe for generating conditional branches on the bits of a mask.
-class VPBranchOnMaskRecipe : public VPRecipeBase {
- VPUser User;
-
+class VPBranchOnMaskRecipe : public VPRecipeBase, public VPUser {
public:
VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) {
if (BlockInMask) // nullptr means all-one mask.
- User.addOperand(BlockInMask);
+ addOperand(BlockInMask);
}
/// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPRecipeBase *V) {
- return V->getVPRecipeID() == VPRecipeBase::VPBranchOnMaskSC;
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC;
}
/// Generate the extraction of the appropriate bit from the block mask and the
@@ -1109,7 +1250,7 @@ public:
VPSlotTracker &SlotTracker) const override {
O << " +\n" << Indent << "\"BRANCH-ON-MASK ";
if (VPValue *Mask = getMask())
- Mask->print(O, SlotTracker);
+ Mask->printAsOperand(O, SlotTracker);
else
O << " All-One";
O << "\\l\"";
@@ -1118,9 +1259,9 @@ public:
/// Return the mask used by this recipe. Note that a full mask is represented
/// by a nullptr.
VPValue *getMask() const {
- assert(User.getNumOperands() <= 1 && "should have either 0 or 1 operands");
+ assert(getNumOperands() <= 1 && "should have either 0 or 1 operands");
// Mask is optional.
- return User.getNumOperands() == 1 ? User.getOperand(0) : nullptr;
+ return getNumOperands() == 1 ? getOperand(0) : nullptr;
}
};
@@ -1129,19 +1270,20 @@ public:
/// order to merge values that are set under such a branch and feed their uses.
/// The phi nodes can be scalar or vector depending on the users of the value.
/// This recipe works in concert with VPBranchOnMaskRecipe.
-class VPPredInstPHIRecipe : public VPRecipeBase {
- Instruction *PredInst;
+class VPPredInstPHIRecipe : public VPRecipeBase, public VPUser {
public:
/// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi
/// nodes after merging back from a Branch-on-Mask.
- VPPredInstPHIRecipe(Instruction *PredInst)
- : VPRecipeBase(VPPredInstPHISC), PredInst(PredInst) {}
+ VPPredInstPHIRecipe(VPValue *PredV)
+ : VPRecipeBase(VPPredInstPHISC), VPUser(PredV) {
+ new VPValue(PredV->getUnderlyingValue(), this);
+ }
~VPPredInstPHIRecipe() override = default;
/// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPRecipeBase *V) {
- return V->getVPRecipeID() == VPRecipeBase::VPPredInstPHISC;
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPPredInstPHISC;
}
/// Generates phi nodes for live-outs as needed to retain SSA form.
@@ -1158,56 +1300,59 @@ public:
/// - For store: Address, stored value, optional mask
/// TODO: We currently execute only per-part unless a specific instance is
/// provided.
-class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
- Instruction &Instr;
- VPUser User;
+class VPWidenMemoryInstructionRecipe : public VPRecipeBase,
+ public VPUser {
+ Instruction &Ingredient;
void setMask(VPValue *Mask) {
if (!Mask)
return;
- User.addOperand(Mask);
+ addOperand(Mask);
}
bool isMasked() const {
- return (isa<LoadInst>(Instr) && User.getNumOperands() == 2) ||
- (isa<StoreInst>(Instr) && User.getNumOperands() == 3);
+ return isStore() ? getNumOperands() == 3 : getNumOperands() == 2;
}
public:
VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask)
- : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Load), User({Addr}) {
+ : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr}),
+ Ingredient(Load) {
+ new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this);
setMask(Mask);
}
VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
VPValue *StoredValue, VPValue *Mask)
- : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Store),
- User({Addr, StoredValue}) {
+ : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr, StoredValue}),
+ Ingredient(Store) {
setMask(Mask);
}
/// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPRecipeBase *V) {
- return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC;
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC;
}
/// Return the address accessed by this recipe.
VPValue *getAddr() const {
- return User.getOperand(0); // Address is the 1st, mandatory operand.
+ return getOperand(0); // Address is the 1st, mandatory operand.
}
/// Return the mask used by this recipe. Note that a full mask is represented
/// by a nullptr.
VPValue *getMask() const {
// Mask is optional and therefore the last operand.
- return isMasked() ? User.getOperand(User.getNumOperands() - 1) : nullptr;
+ return isMasked() ? getOperand(getNumOperands() - 1) : nullptr;
}
+ /// Returns true if this recipe is a store.
+ bool isStore() const { return isa<StoreInst>(Ingredient); }
+
/// Return the address accessed by this recipe.
VPValue *getStoredValue() const {
- assert(isa<StoreInst>(Instr) &&
- "Stored value only available for store instructions");
- return User.getOperand(1); // Stored value is the 2nd, mandatory operand.
+ assert(isStore() && "Stored value only available for store instructions");
+ return getOperand(1); // Stored value is the 2nd, mandatory operand.
}
/// Generate the wide load/store.
@@ -1220,21 +1365,16 @@ public:
/// A Recipe for widening the canonical induction variable of the vector loop.
class VPWidenCanonicalIVRecipe : public VPRecipeBase {
- /// A VPValue representing the canonical vector IV.
- VPValue Val;
-
public:
- VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) {}
- ~VPWidenCanonicalIVRecipe() override = default;
+ VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) {
+ new VPValue(nullptr, this);
+ }
- /// Return the VPValue representing the canonical vector induction variable of
- /// the vector loop.
- const VPValue *getVPValue() const { return &Val; }
- VPValue *getVPValue() { return &Val; }
+ ~VPWidenCanonicalIVRecipe() override = default;
/// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPRecipeBase *V) {
- return V->getVPRecipeID() == VPRecipeBase::VPWidenCanonicalIVSC;
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC;
}
/// Generate a canonical vector induction variable of the vector loop, with
@@ -1321,6 +1461,11 @@ public:
/// this VPBasicBlock, thereby "executing" the VPlan.
void execute(struct VPTransformState *State) override;
+ /// Return the position of the first non-phi node recipe in the block.
+ iterator getFirstNonPhi();
+
+ void dropAllReferences(VPValue *NewValue) override;
+
private:
/// Create an IR BasicBlock to hold the output instructions generated by this
/// VPBasicBlock, and return it. Update the CFGState accordingly.
@@ -1361,8 +1506,11 @@ public:
IsReplicator(IsReplicator) {}
~VPRegionBlock() override {
- if (Entry)
+ if (Entry) {
+ VPValue DummyValue;
+ Entry->dropAllReferences(&DummyValue);
deleteCFG(Entry);
+ }
}
/// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -1407,6 +1555,8 @@ public:
/// The method which generates the output IR instructions that correspond to
/// this VPRegionBlock, thereby "executing" the VPlan.
void execute(struct VPTransformState *State) override;
+
+ void dropAllReferences(VPValue *NewValue) override;
};
//===----------------------------------------------------------------------===//
@@ -1544,7 +1694,7 @@ class VPlan {
VPBlockBase *Entry;
/// Holds the VFs applicable to this VPlan.
- SmallSet<unsigned, 2> VFs;
+ SmallSetVector<ElementCount, 2> VFs;
/// Holds the name of the VPlan, for printing.
std::string Name;
@@ -1564,6 +1714,10 @@ class VPlan {
/// VPlan.
Value2VPValueTy Value2VPValue;
+ /// Contains all VPValues that been allocated by addVPValue directly and need
+ /// to be free when the plan's destructor is called.
+ SmallVector<VPValue *, 16> VPValuesToFree;
+
/// Holds the VPLoopInfo analysis for this VPlan.
VPLoopInfo VPLInfo;
@@ -1577,10 +1731,15 @@ public:
}
~VPlan() {
- if (Entry)
+ if (Entry) {
+ VPValue DummyValue;
+ for (VPBlockBase *Block : depth_first(Entry))
+ Block->dropAllReferences(&DummyValue);
+
VPBlockBase::deleteCFG(Entry);
- for (auto &MapEntry : Value2VPValue)
- delete MapEntry.second;
+ }
+ for (VPValue *VPV : VPValuesToFree)
+ delete VPV;
if (BackedgeTakenCount)
delete BackedgeTakenCount;
for (VPValue *Def : VPExternalDefs)
@@ -1608,9 +1767,9 @@ public:
return BackedgeTakenCount;
}
- void addVF(unsigned VF) { VFs.insert(VF); }
+ void addVF(ElementCount VF) { VFs.insert(VF); }
- bool hasVF(unsigned VF) { return VFs.count(VF); }
+ bool hasVF(ElementCount VF) { return VFs.count(VF); }
const std::string &getName() const { return Name; }
@@ -1630,7 +1789,15 @@ public:
void addVPValue(Value *V) {
assert(V && "Trying to add a null Value to VPlan");
assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
- Value2VPValue[V] = new VPValue(V);
+ VPValue *VPV = new VPValue(V);
+ Value2VPValue[V] = VPV;
+ VPValuesToFree.push_back(VPV);
+ }
+
+ void addVPValue(Value *V, VPValue *VPV) {
+ assert(V && "Trying to add a null Value to VPlan");
+ assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
+ Value2VPValue[V] = VPV;
}
VPValue *getVPValue(Value *V) {
@@ -1646,6 +1813,8 @@ public:
return getVPValue(V);
}
+ void removeVPValueFor(Value *V) { Value2VPValue.erase(V); }
+
/// Return the VPLoopInfo analysis for this VPlan.
VPLoopInfo &getVPLoopInfo() { return VPLInfo; }
const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; }
@@ -1723,13 +1892,13 @@ private:
void dump();
- static void printAsIngredient(raw_ostream &O, Value *V);
+ static void printAsIngredient(raw_ostream &O, const Value *V);
};
struct VPlanIngredient {
- Value *V;
+ const Value *V;
- VPlanIngredient(Value *V) : V(V) {}
+ VPlanIngredient(const Value *V) : V(V) {}
};
inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
@@ -1879,9 +2048,7 @@ public:
/// \returns nullptr if doesn't have such group.
InterleaveGroup<VPInstruction> *
getInterleaveGroup(VPInstruction *Instr) const {
- if (InterleaveGroupMap.count(Instr))
- return InterleaveGroupMap.find(Instr)->second;
- return nullptr;
+ return InterleaveGroupMap.lookup(Instr);
}
};
@@ -1965,10 +2132,7 @@ class VPlanSlp {
public:
VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}
- ~VPlanSlp() {
- for (auto &KV : BundleToCombined)
- delete KV.second;
- }
+ ~VPlanSlp() = default;
/// Tries to build an SLP tree rooted at \p Operands and returns a
/// VPInstruction combining \p Operands, if they can be combined.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 7a80f3ff80a5..ac3b3505dc34 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -191,7 +191,7 @@ void VPlanPredicator::predicateRegionRec(VPRegionBlock *Region) {
// Generate edge predicates and append them to the block predicate. RPO is
// necessary since the predecessor blocks' block predicate needs to be set
// before the current block's block predicate can be computed.
- for (VPBlockBase *Block : make_range(RPOT.begin(), RPOT.end())) {
+ for (VPBlockBase *Block : RPOT) {
// TODO: Handle nested regions once we start generating the same.
assert(!isa<VPRegionBlock>(Block) && "Nested region not expected");
createOrPropagatePredicates(Block, Region);
@@ -208,7 +208,7 @@ void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) {
ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
VPBlockBase *PrevBlock = nullptr;
- for (VPBlockBase *CurrBlock : make_range(RPOT.begin(), RPOT.end())) {
+ for (VPBlockBase *CurrBlock : RPOT) {
// TODO: Handle nested regions once we start generating the same.
assert(!isa<VPRegionBlock>(CurrBlock) && "Nested region not expected");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
index 9019ed15ec5f..6f21bf44291a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -124,7 +124,7 @@ bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
for (auto &I : *Parent) {
auto *VPI = cast<VPInstruction>(&I);
if (VPI->getOpcode() == Instruction::Load &&
- std::find(Operands.begin(), Operands.end(), VPI) != Operands.end())
+ llvm::is_contained(Operands, VPI))
LoadsSeen++;
if (LoadsSeen == Operands.size())
@@ -161,7 +161,8 @@ static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
unsigned OperandIndex) {
SmallVector<VPValue *, 4> Operands;
for (VPValue *V : Values) {
- auto *U = cast<VPUser>(V);
+ // Currently we only support VPInstructions.
+ auto *U = cast<VPInstruction>(V);
Operands.push_back(U->getOperand(OperandIndex));
}
return Operands;
@@ -222,18 +223,20 @@ static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
/// Traverses and compares operands of V1 and V2 to MaxLevel.
static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
VPInterleavedAccessInfo &IAI) {
- if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2))
+ auto *I1 = dyn_cast<VPInstruction>(V1);
+ auto *I2 = dyn_cast<VPInstruction>(V2);
+ // Currently we only support VPInstructions.
+ if (!I1 || !I2)
return 0;
if (MaxLevel == 0)
- return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1),
- cast<VPInstruction>(V2), IAI);
+ return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);
unsigned Score = 0;
- for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I)
- for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J)
- Score += getLAScore(cast<VPUser>(V1)->getOperand(I),
- cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI);
+ for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
+ for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
+ Score +=
+ getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
return Score;
}
@@ -463,8 +466,8 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
auto *VPI = new VPInstruction(Opcode, CombinedOperands);
VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
- LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs());
- cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
+ << *cast<VPInstruction>(Values[0]) << "\n");
addCombined(Values, VPI);
return VPI;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3a4872a72122..1a54603faf22 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -48,6 +48,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
VPInstruction *VPInst = cast<VPInstruction>(Ingredient);
Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
if (DeadInstructions.count(Inst)) {
+ VPValue DummyValue;
+ VPInst->replaceAllUsesWith(&DummyValue);
Ingredient->eraseFromParent();
continue;
}
@@ -66,7 +68,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
InductionDescriptor II = Inductions.lookup(Phi);
if (II.getKind() == InductionDescriptor::IK_IntInduction ||
II.getKind() == InductionDescriptor::IK_FpInduction) {
- NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi);
+ VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
+ NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start);
} else
NewRecipe = new VPWidenPHIRecipe(Phi);
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
@@ -77,6 +80,11 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands()));
NewRecipe->insertBefore(Ingredient);
+ if (NewRecipe->getNumDefinedValues() == 1)
+ VPInst->replaceAllUsesWith(NewRecipe->getVPValue());
+ else
+ assert(NewRecipe->getNumDefinedValues() == 0 &&
+ "Only recipes with zero or one defined values expected");
Ingredient->eraseFromParent();
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index f73505d0279a..ed572ca36627 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -10,9 +10,9 @@
/// This file contains the declarations of the entities induced by Vectorization
/// Plans, e.g. the instructions the VPlan intends to generate if executed.
/// VPlan models the following entities:
-/// VPValue
-/// |-- VPUser
-/// | |-- VPInstruction
+/// VPValue VPUser VPDef
+/// | |
+/// VPInstruction
/// These are documented in docs/VectorizationPlan.rst.
///
//===----------------------------------------------------------------------===//
@@ -21,7 +21,9 @@
#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/ADT/iterator_range.h"
namespace llvm {
@@ -29,8 +31,11 @@ namespace llvm {
// Forward declarations.
class raw_ostream;
class Value;
+class VPDef;
class VPSlotTracker;
class VPUser;
+class VPRecipeBase;
+class VPWidenMemoryInstructionRecipe;
// This is the base class of the VPlan Def/Use graph, used for modeling the data
// flow into, within and out of the VPlan. VPValues can stand for live-ins
@@ -38,10 +43,14 @@ class VPUser;
// and live-outs which the VPlan will need to fix accordingly.
class VPValue {
friend class VPBuilder;
+ friend class VPDef;
+ friend class VPInstruction;
friend struct VPlanTransforms;
friend class VPBasicBlock;
friend class VPInterleavedAccessInfo;
friend class VPSlotTracker;
+ friend class VPRecipeBase;
+ friend class VPWidenMemoryInstructionRecipe;
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
@@ -51,8 +60,11 @@ protected:
// Hold the underlying Value, if any, attached to this VPValue.
Value *UnderlyingVal;
- VPValue(const unsigned char SC, Value *UV = nullptr)
- : SubclassID(SC), UnderlyingVal(UV) {}
+ /// Pointer to the VPDef that defines this VPValue. If it is nullptr, the
+ /// VPValue is not defined by any recipe modeled in VPlan.
+ VPDef *Def;
+
+ VPValue(const unsigned char SC, Value *UV = nullptr, VPDef *Def = nullptr);
// DESIGN PRINCIPLE: Access to the underlying IR must be strictly limited to
// the front-end and back-end of VPlan so that the middle-end is as
@@ -61,10 +73,6 @@ protected:
// for multiple underlying IRs (Polly?) by providing a new VPlan front-end,
// back-end and analysis information for the new IR.
- /// Return the underlying Value attached to this VPValue.
- Value *getUnderlyingValue() { return UnderlyingVal; }
- const Value *getUnderlyingValue() const { return UnderlyingVal; }
-
// Set \p Val as the underlying Value of this VPValue.
void setUnderlyingValue(Value *Val) {
assert(!UnderlyingVal && "Underlying Value is already set.");
@@ -72,16 +80,33 @@ protected:
}
public:
+ /// Return the underlying Value attached to this VPValue.
+ Value *getUnderlyingValue() { return UnderlyingVal; }
+ const Value *getUnderlyingValue() const { return UnderlyingVal; }
+
/// An enumeration for keeping track of the concrete subclass of VPValue that
/// are actually instantiated. Values of this enumeration are kept in the
/// SubclassID field of the VPValue objects. They are used for concrete
/// type identification.
- enum { VPValueSC, VPUserSC, VPInstructionSC };
+ enum {
+ VPValueSC,
+ VPVInstructionSC,
+ VPVMemoryInstructionSC,
+ VPVReductionSC,
+ VPVReplicateSC,
+ VPVWidenSC,
+ VPVWidenCallSC,
+ VPVWidenGEPSC,
+ VPVWidenSelectSC,
+ };
- VPValue(Value *UV = nullptr) : VPValue(VPValueSC, UV) {}
+ VPValue(Value *UV = nullptr, VPDef *Def = nullptr)
+ : VPValue(VPValueSC, UV, Def) {}
VPValue(const VPValue &) = delete;
VPValue &operator=(const VPValue &) = delete;
+ virtual ~VPValue();
+
/// \return an ID for the concrete type of this object.
/// This is used to implement the classof checks. This should not be used
/// for any other purpose, as the values may change as LLVM evolves.
@@ -90,9 +115,28 @@ public:
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const;
void print(raw_ostream &OS, VPSlotTracker &Tracker) const;
+ /// Dump the value to stderr (for debugging).
+ void dump() const;
+
unsigned getNumUsers() const { return Users.size(); }
void addUser(VPUser &User) { Users.push_back(&User); }
+ /// Remove a single \p User from the list of users.
+ void removeUser(VPUser &User) {
+ bool Found = false;
+ // The same user can be added multiple times, e.g. because the same VPValue
+ // is used twice by the same VPUser. Remove a single one.
+ erase_if(Users, [&User, &Found](VPUser *Other) {
+ if (Found)
+ return false;
+ if (Other == &User) {
+ Found = true;
+ return true;
+ }
+ return false;
+ });
+ }
+
typedef SmallVectorImpl<VPUser *>::iterator user_iterator;
typedef SmallVectorImpl<VPUser *>::const_iterator const_user_iterator;
typedef iterator_range<user_iterator> user_range;
@@ -120,6 +164,17 @@ public:
}
void replaceAllUsesWith(VPValue *New);
+
+ VPDef *getDef() { return Def; }
+
+ /// Returns the underlying IR value, if this VPValue is defined outside the
+ /// scope of VPlan. Must not be called on a VPValue defined by a VPDef
+ /// inside a VPlan (enforced by an assertion).
+ Value *getLiveInIRValue() {
+ assert(!getDef() &&
+ "VPValue is not a live-in; it is defined by a VPDef inside a VPlan");
+ return getUnderlyingValue();
+ }
};
typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
@@ -129,34 +184,32 @@ raw_ostream &operator<<(raw_ostream &OS, const VPValue &V);
/// This class augments VPValue with operands which provide the inverse def-use
/// edges from VPValue's users to their defs.
-class VPUser : public VPValue {
+class VPUser {
SmallVector<VPValue *, 2> Operands;
protected:
- VPUser(const unsigned char SC) : VPValue(SC) {}
- VPUser(const unsigned char SC, ArrayRef<VPValue *> Operands) : VPValue(SC) {
+ /// Print the operands to \p O.
+ void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const;
+
+public:
+ VPUser() {}
+ VPUser(ArrayRef<VPValue *> Operands) {
for (VPValue *Operand : Operands)
addOperand(Operand);
}
-public:
- VPUser() : VPValue(VPValue::VPUserSC) {}
- VPUser(ArrayRef<VPValue *> Operands) : VPUser(VPValue::VPUserSC, Operands) {}
VPUser(std::initializer_list<VPValue *> Operands)
: VPUser(ArrayRef<VPValue *>(Operands)) {}
- template <typename IterT>
- VPUser(iterator_range<IterT> Operands) : VPValue(VPValue::VPUserSC) {
+ template <typename IterT> VPUser(iterator_range<IterT> Operands) {
for (VPValue *Operand : Operands)
addOperand(Operand);
}
VPUser(const VPUser &) = delete;
VPUser &operator=(const VPUser &) = delete;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPValue *V) {
- return V->getVPValueID() >= VPUserSC &&
- V->getVPValueID() <= VPInstructionSC;
+ virtual ~VPUser() {
+ for (VPValue *Op : operands())
+ Op->removeUser(*this);
}
void addOperand(VPValue *Operand) {
@@ -170,7 +223,11 @@ public:
return Operands[N];
}
- void setOperand(unsigned I, VPValue *New) { Operands[I] = New; }
+ void setOperand(unsigned I, VPValue *New) {
+ Operands[I]->removeUser(*this);
+ Operands[I] = New;
+ New->addUser(*this);
+ }
typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
@@ -185,7 +242,110 @@ public:
const_operand_range operands() const {
return const_operand_range(op_begin(), op_end());
}
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPDef *Recipe);
};
+
+/// This class augments a recipe with a set of VPValues defined by the recipe.
+/// It allows recipes to define zero, one or multiple VPValues. A VPDef owns
+/// the VPValues it defines and is responsible for deleting its defined values.
+/// Single-value VPDefs that also inherit from VPValue must make sure to inherit
+/// from VPDef before VPValue.
+class VPDef {
+ friend class VPValue;
+
+ /// Subclass identifier (for isa/dyn_cast).
+ const unsigned char SubclassID;
+
+ /// The VPValues defined by this VPDef.
+ TinyPtrVector<VPValue *> DefinedValues;
+
+ /// Add \p V as a defined value by this VPDef.
+ void addDefinedValue(VPValue *V) {
+ assert(V->getDef() == this &&
+ "can only add VPValue already linked with this VPDef");
+ DefinedValues.push_back(V);
+ }
+
+ /// Remove \p V from the values defined by this VPDef. \p V must be a defined
+ /// value of this VPDef.
+ void removeDefinedValue(VPValue *V) {
+ assert(V->getDef() == this &&
+ "can only remove VPValue linked with this VPDef");
+ assert(is_contained(DefinedValues, V) &&
+ "VPValue to remove must be in DefinedValues");
+ erase_value(DefinedValues, V);
+ V->Def = nullptr;
+ }
+
+public:
+ /// An enumeration for keeping track of the concrete subclass of VPRecipeBase
+ /// that is actually instantiated. Values of this enumeration are kept in the
+ /// SubclassID field of the VPRecipeBase objects. They are used for concrete
+ /// type identification.
+ using VPRecipeTy = enum {
+ VPBlendSC,
+ VPBranchOnMaskSC,
+ VPInstructionSC,
+ VPInterleaveSC,
+ VPPredInstPHISC,
+ VPReductionSC,
+ VPReplicateSC,
+ VPWidenCallSC,
+ VPWidenCanonicalIVSC,
+ VPWidenGEPSC,
+ VPWidenIntOrFpInductionSC,
+ VPWidenMemoryInstructionSC,
+ VPWidenPHISC,
+ VPWidenSC,
+ VPWidenSelectSC
+ };
+
+ VPDef(const unsigned char SC) : SubclassID(SC) {}
+
+ virtual ~VPDef() {
+ for (VPValue *D : make_early_inc_range(DefinedValues)) {
+ assert(D->Def == this &&
+ "all defined VPValues should point to the containing VPDef");
+ assert(D->getNumUsers() == 0 &&
+ "all defined VPValues should have no more users");
+ D->Def = nullptr;
+ delete D;
+ }
+ }
+
+ /// Returns the VPValue with index \p I defined by the VPDef.
+ VPValue *getVPValue(unsigned I = 0) {
+ assert(DefinedValues[I] && "defined value must be non-null");
+ return DefinedValues[I];
+ }
+ const VPValue *getVPValue(unsigned I = 0) const {
+ assert(DefinedValues[I] && "defined value must be non-null");
+ return DefinedValues[I];
+ }
+
+ /// Returns an ArrayRef of the values defined by the VPDef.
+ ArrayRef<VPValue *> definedValues() { return DefinedValues; }
+ /// Returns an ArrayRef of the values defined by the VPDef.
+ ArrayRef<VPValue *> definedValues() const { return DefinedValues; }
+
+ /// Returns the number of values defined by the VPDef.
+ unsigned getNumDefinedValues() const { return DefinedValues.size(); }
+
+ /// \return an ID for the concrete type of this object.
+ /// This is used to implement the classof checks. This should not be used
+ /// for any other purpose, as the values may change as LLVM evolves.
+ unsigned getVPDefID() const { return SubclassID; }
+
+ /// Dump the VPDef to stderr (for debugging).
+ void dump() const;
+
+ /// Each concrete VPDef prints itself.
+ virtual void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const = 0;
+};
+
class VPlan;
class VPBasicBlock;
class VPRegionBlock;
@@ -205,7 +365,7 @@ class VPSlotTracker {
void assignSlots(const VPlan &Plan);
public:
- VPSlotTracker(const VPlan *Plan) {
+ VPSlotTracker(const VPlan *Plan = nullptr) {
if (Plan)
assignSlots(*Plan);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index b384c94121e9..6eec8d14de4a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -65,9 +65,7 @@ static void verifyBlocksInRegion(const VPRegionBlock *Region) {
for (const VPBlockBase *Succ : Successors) {
// There must be a bi-directional link between block and successor.
const auto &SuccPreds = Succ->getPredecessors();
- assert(std::find(SuccPreds.begin(), SuccPreds.end(), VPB) !=
- SuccPreds.end() &&
- "Missing predecessor link.");
+ assert(llvm::is_contained(SuccPreds, VPB) && "Missing predecessor link.");
(void)SuccPreds;
}
@@ -86,9 +84,7 @@ static void verifyBlocksInRegion(const VPRegionBlock *Region) {
// There must be a bi-directional link between block and predecessor.
const auto &PredSuccs = Pred->getSuccessors();
- assert(std::find(PredSuccs.begin(), PredSuccs.end(), VPB) !=
- PredSuccs.end() &&
- "Missing successor link.");
+ assert(llvm::is_contained(PredSuccs, VPB) && "Missing successor link.");
(void)PredSuccs;
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 64b41bf9cefa..787f146bdddc 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
@@ -33,6 +34,7 @@ using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "vector-combine"
+STATISTIC(NumVecLoad, "Number of vector loads formed");
STATISTIC(NumVecCmp, "Number of vector compares formed");
STATISTIC(NumVecBO, "Number of vector binops formed");
STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
@@ -65,6 +67,7 @@ private:
const TargetTransformInfo &TTI;
const DominatorTree &DT;
+ bool vectorizeLoadInsert(Instruction &I);
ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
ExtractElementInst *Ext1,
unsigned PreferredExtractIndex) const;
@@ -88,6 +91,138 @@ static void replaceValue(Value &Old, Value &New) {
New.takeName(&Old);
}
+bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
+ // Match insert into fixed vector of scalar value.
+ // TODO: Handle non-zero insert index.
+ auto *Ty = dyn_cast<FixedVectorType>(I.getType());
+ Value *Scalar;
+ if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
+ !Scalar->hasOneUse())
+ return false;
+
+ // Optionally match an extract from another vector.
+ Value *X;
+ bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
+ if (!HasExtract)
+ X = Scalar;
+
+ // Match source value as load of scalar or vector.
+ // Do not vectorize scalar load (widening) if atomic/volatile or under
+ // asan/hwasan/memtag/tsan. The widened load may load data from dirty regions
+ // or create data races non-existent in the source.
+ auto *Load = dyn_cast<LoadInst>(X);
+ if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
+ Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
+ mustSuppressSpeculation(*Load))
+ return false;
+
+ const DataLayout &DL = I.getModule()->getDataLayout();
+ Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
+ assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
+
+ // If original AS != Load's AS, we can't bitcast the original pointer and have
+ // to use Load's operand instead. Ideally we would want to strip pointer casts
+ // without changing AS, but there's no API to do that ATM.
+ unsigned AS = Load->getPointerAddressSpace();
+ if (AS != SrcPtr->getType()->getPointerAddressSpace())
+ SrcPtr = Load->getPointerOperand();
+
+ // We are potentially transforming byte-sized (8-bit) memory accesses, so make
+ // sure we have all of our type-based constraints in place for this target.
+ Type *ScalarTy = Scalar->getType();
+ uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
+ unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
+ if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
+ ScalarSize % 8 != 0)
+ return false;
+
+ // Check safety of replacing the scalar load with a larger vector load.
+ // We use minimal alignment (maximum flexibility) because we only care about
+ // the dereferenceable region. When calculating cost and creating a new op,
+ // we may use a larger value based on alignment attributes.
+ unsigned MinVecNumElts = MinVectorSize / ScalarSize;
+ auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
+ unsigned OffsetEltIndex = 0;
+ Align Alignment = Load->getAlign();
+ if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) {
+ // It is not safe to load directly from the pointer, but we can still peek
+ // through gep offsets and check if it safe to load from a base address with
+ // updated alignment. If it is, we can shuffle the element(s) into place
+ // after loading.
+ unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType());
+ APInt Offset(OffsetBitWidth, 0);
+ SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+
+ // We want to shuffle the result down from a high element of a vector, so
+ // the offset must be positive.
+ if (Offset.isNegative())
+ return false;
+
+ // The offset must be a multiple of the scalar element to shuffle cleanly
+ // in the element's size.
+ uint64_t ScalarSizeInBytes = ScalarSize / 8;
+ if (Offset.urem(ScalarSizeInBytes) != 0)
+ return false;
+
+ // If we load MinVecNumElts, will our target element still be loaded?
+ OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
+ if (OffsetEltIndex >= MinVecNumElts)
+ return false;
+
+ if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT))
+ return false;
+
+ // Update alignment with offset value. Note that the offset could be negated
+ // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
+ // negation does not change the result of the alignment calculation.
+ Alignment = commonAlignment(Alignment, Offset.getZExtValue());
+ }
+
+ // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
+ // Use the greater of the alignment on the load or its source pointer.
+ Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
+ Type *LoadTy = Load->getType();
+ InstructionCost OldCost =
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
+ APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
+ OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
+ /* Insert */ true, HasExtract);
+
+ // New pattern: load VecPtr
+ InstructionCost NewCost =
+ TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
+ // Optionally, we are shuffling the loaded vector element(s) into place.
+ if (OffsetEltIndex)
+ NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy);
+
+ // We can aggressively convert to the vector form because the backend can
+ // invert this transform if it does not result in a performance win.
+ if (OldCost < NewCost || !NewCost.isValid())
+ return false;
+
+ // It is safe and potentially profitable to load a vector directly:
+ // inselt undef, load Scalar, 0 --> load VecPtr
+ IRBuilder<> Builder(Load);
+ Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS));
+ Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+
+ // Set everything but element 0 to undef to prevent poison from propagating
+ // from the extra loaded memory. This will also optionally shrink/grow the
+ // vector from the loaded size to the output size.
+ // We assume this operation has no cost in codegen if there was no offset.
+ // Note that we could use freeze to avoid poison problems, but then we might
+ // still need a shuffle to change the vector size.
+ unsigned OutputNumElts = Ty->getNumElements();
+ SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
+ assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
+ Mask[0] = OffsetEltIndex;
+ VecLd = Builder.CreateShuffleVector(VecLd, Mask);
+
+ replaceValue(I, *VecLd);
+ ++NumVecLoad;
+ return true;
+}
+
/// Determine which, if any, of the inputs should be replaced by a shuffle
/// followed by extract from a different index.
ExtractElementInst *VectorCombine::getShuffleExtract(
@@ -106,8 +241,14 @@ ExtractElementInst *VectorCombine::getShuffleExtract(
Type *VecTy = Ext0->getVectorOperand()->getType();
assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
- int Cost0 = TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
- int Cost1 = TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+ InstructionCost Cost0 =
+ TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
+ InstructionCost Cost1 =
+ TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+
+ // If both costs are invalid, no shuffle is needed.
+ if (!Cost0.isValid() && !Cost1.isValid())
+ return nullptr;
// We are extracting from 2 different indexes, so one operand must be shuffled
// before performing a vector operation and/or extract. The more expensive
@@ -143,7 +284,7 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
"Expected constant extract indexes");
Type *ScalarTy = Ext0->getType();
auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
- int ScalarOpCost, VectorOpCost;
+ InstructionCost ScalarOpCost, VectorOpCost;
// Get cost estimates for scalar and vector versions of the operation.
bool IsBinOp = Instruction::isBinaryOp(Opcode);
@@ -164,9 +305,9 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
- int Extract0Cost =
+ InstructionCost Extract0Cost =
TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
- int Extract1Cost =
+ InstructionCost Extract1Cost =
TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index);
// A more expensive extract will always be replaced by a splat shuffle.
@@ -176,11 +317,11 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
// TODO: Evaluate whether that always results in lowest cost. Alternatively,
// check the cost of creating a broadcast shuffle and shuffling both
// operands to element 0.
- int CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
+ InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
// Extra uses of the extracts mean that we include those costs in the
// vector total because those instructions will not be eliminated.
- int OldCost, NewCost;
+ InstructionCost OldCost, NewCost;
if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
// Handle a special case. If the 2 extracts are identical, adjust the
// formulas to account for that. The extra use charge allows for either the
@@ -231,8 +372,7 @@ static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
auto *VecTy = cast<FixedVectorType>(Vec->getType());
SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
ShufMask[NewIndex] = OldIndex;
- Value *Undef = UndefValue::get(VecTy);
- return Builder.CreateShuffleVector(Vec, Undef, ShufMask, "shift");
+ return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
}
/// Given an extract element instruction with constant index operand, shuffle
@@ -366,17 +506,23 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask))))))
return false;
- // Disallow non-vector casts and length-changing shuffles.
+ // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
+ // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
+ // mask for scalable type is a splat or not.
+ // 2) Disallow non-vector casts and length-changing shuffles.
// TODO: We could allow any shuffle.
- auto *DestTy = dyn_cast<VectorType>(I.getType());
- auto *SrcTy = cast<VectorType>(V->getType());
- if (!DestTy || I.getOperand(0)->getType() != SrcTy)
+ auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
+ auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
+ if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy)
return false;
// The new shuffle must not cost more than the old shuffle. The bitcast is
// moved ahead of the shuffle, so assume that it has the same cost as before.
- if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy) >
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy))
+ InstructionCost DestCost =
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy);
+ InstructionCost SrcCost =
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy);
+ if (DestCost > SrcCost || !DestCost.isValid())
return false;
unsigned DestNumElts = DestTy->getNumElements();
@@ -399,8 +545,7 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
// bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
++NumShufOfBitcast;
Value *CastV = Builder.CreateBitCast(V, DestTy);
- Value *Shuf =
- Builder.CreateShuffleVector(CastV, UndefValue::get(DestTy), NewMask);
+ Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
replaceValue(I, *Shuf);
return true;
}
@@ -467,7 +612,7 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
"Unexpected types for insert element into binop or cmp");
unsigned Opcode = I.getOpcode();
- int ScalarOpCost, VectorOpCost;
+ InstructionCost ScalarOpCost, VectorOpCost;
if (IsCmp) {
ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy);
VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy);
@@ -478,16 +623,16 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
// Get cost estimate for the insert element. This cost will factor into
// both sequences.
- int InsertCost =
+ InstructionCost InsertCost =
TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
- int OldCost = (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) +
- VectorOpCost;
- int NewCost = ScalarOpCost + InsertCost +
- (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) +
- (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost);
+ InstructionCost OldCost =
+ (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
+ InstructionCost NewCost = ScalarOpCost + InsertCost +
+ (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) +
+ (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost);
// We want to scalarize unless the vector variant actually has lower cost.
- if (OldCost < NewCost)
+ if (OldCost < NewCost || !NewCost.isValid())
return false;
// vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
@@ -567,7 +712,8 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
if (!VecTy)
return false;
- int OldCost = TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
+ InstructionCost OldCost =
+ TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2;
OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());
@@ -578,7 +724,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
- int NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType());
+ InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType());
NewCost +=
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy);
NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
@@ -587,7 +733,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
// Aggressively form vector ops if the cost is equal because the transform
// may enable further optimization.
// Codegen can reverse this transform (scalarize) if it was not profitable.
- if (OldCost < NewCost)
+ if (OldCost < NewCost || !NewCost.isValid())
return false;
// Create a vector constant from the 2 scalar constants.
@@ -612,6 +758,10 @@ bool VectorCombine::run() {
if (DisableVectorCombine)
return false;
+ // Don't attempt vectorization if the target does not support vectors.
+ if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
+ return false;
+
bool MadeChange = false;
for (BasicBlock &BB : F) {
// Ignore unreachable basic blocks.
@@ -625,6 +775,7 @@ bool VectorCombine::run() {
if (isa<DbgInfoIntrinsic>(I))
continue;
Builder.SetInsertPoint(&I);
+ MadeChange |= vectorizeLoadInsert(I);
MadeChange |= foldExtractExtract(I);
MadeChange |= foldBitcastShuf(I);
MadeChange |= scalarizeBinopOrCmp(I);