path: root/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
author    Dimitry Andric <dim@FreeBSD.org>    2024-07-27 23:34:35 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2024-10-23 18:26:01 +0000
commit    0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583 (patch)
tree      6cf5ab1f05330c6773b1f3f64799d56a9c7a1faa /contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
parent    6b9f7133aba44189d9625c352bc2c2a59baf18ef (diff)
parent    ac9a064cb179f3425b310fa2847f8764ac970a4d (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2905
1 files changed, 1369 insertions, 1536 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index dd596c567cd4..6d28b8fabe42 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -59,7 +59,9 @@
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHCFGBuilder.h"
+#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
+#include "VPlanVerifier.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -123,6 +125,7 @@
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/VectorBuilder.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -202,6 +205,11 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
"vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum allowed number of runtime memory checks"));
+static cl::opt<bool> UseLegacyCostModel(
+ "vectorize-use-legacy-cost-model", cl::init(false), cl::Hidden,
+ cl::desc("Use the legacy cost model instead of the VPlan-based cost model. "
+ "This option will be removed in the future."));
+
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -247,10 +255,12 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
"Create lane mask using active.lane.mask intrinsic, and use "
"it for both data and control flow"),
- clEnumValN(
- TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
- "data-and-control-without-rt-check",
- "Similar to data-and-control, but remove the runtime check")));
+ clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
+ "data-and-control-without-rt-check",
+ "Similar to data-and-control, but remove the runtime check"),
+ clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
+ "Use predicated EVL instructions for tail folding. If EVL "
+ "is unsupported, fallback to data-without-lane-mask.")));
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
@@ -267,11 +277,6 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
"enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
-static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
- "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
- cl::desc("We don't interleave loops with a estimated constant trip count "
- "below this number"));
-
static cl::opt<unsigned> ForceTargetNumScalarRegs(
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of scalar registers."));
@@ -290,7 +295,7 @@ static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
cl::desc("A flag that overrides the target's max interleave factor for "
"vectorized loops."));
-static cl::opt<unsigned> ForceTargetInstructionCost(
+cl::opt<unsigned> ForceTargetInstructionCost(
"force-target-instruction-cost", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's expected cost for "
"an instruction to a single constant value. Mostly "
@@ -319,12 +324,6 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
cl::desc(
"Enable runtime interleaving until load/store ports are saturated"));
-/// Interleave small loops with scalar reductions.
-static cl::opt<bool> InterleaveSmallLoopScalarReduction(
- "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
- cl::desc("Enable interleaving for loops with small iteration counts that "
- "contain scalar reductions to expose ILP."));
-
/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
"vectorize-num-stores-pred", cl::init(1), cl::Hidden,
@@ -418,14 +417,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
-/// A helper function that returns the reciprocal of the block probability of
-/// predicated blocks. If we return X, we are assuming the predicated block
-/// will execute once for every X iterations of the loop header.
-///
-/// TODO: We should use actual block probability here, if available. Currently,
-/// we always assume predicated blocks have a 50% chance of executing.
-static unsigned getReciprocalPredBlockProb() { return 2; }
-
/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
/// 1) Returns exact trip count if it is known.
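The helper removed in the hunk above encodes the assumption, noted in its TODO, that a predicated block executes on roughly half of the header iterations, so its cost is discounted by the reciprocal probability 2. A small sketch of that arithmetic, with made-up cost numbers:

#include <cassert>

// Assumption (from the removed comment): predicated blocks run with 50%
// probability, so the reciprocal of the block probability is 2.
static unsigned getReciprocalPredBlockProbModel() { return 2; }

int main() {
  unsigned ScalarizedBlockCost = 20; // hypothetical cost of the block
  unsigned Discounted =
      ScalarizedBlockCost / getReciprocalPredBlockProbModel(); // 20 / 2
  assert(Discounted == 10);
  return 0;
}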
@@ -450,37 +441,6 @@ static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
return std::nullopt;
}
-/// Return a vector containing interleaved elements from multiple
-/// smaller input vectors.
-static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
- const Twine &Name) {
- unsigned Factor = Vals.size();
- assert(Factor > 1 && "Tried to interleave invalid number of vectors");
-
- VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
-#ifndef NDEBUG
- for (Value *Val : Vals)
- assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
-#endif
-
- // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
- // must use intrinsics to interleave.
- if (VecTy->isScalableTy()) {
- VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
- return Builder.CreateIntrinsic(
- WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
- /*FMFSource=*/nullptr, Name);
- }
-
- // Fixed length. Start by concatenating all vectors into a wide vector.
- Value *WideVec = concatenateVectors(Builder, Vals);
-
- // Interleave the elements into the wide vector.
- const unsigned NumElts = VecTy->getElementCount().getFixedValue();
- return Builder.CreateShuffleVector(
- WideVec, createInterleaveMask(NumElts, Factor), Name);
-}
-
namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;
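The removed interleaveVectors helper above relies on createInterleaveMask for fixed-width vectors. The standalone sketch below reproduces that mask shape (lane layout only; it is a model, not the LLVM utility itself) to make the shuffle concrete.

#include <cassert>
#include <vector>

// For VF lanes per input and Factor inputs concatenated into one wide
// vector, the interleave mask picks lane i of every input in turn:
// VF = 4, Factor = 2 -> <0, 4, 1, 5, 2, 6, 3, 7>.
std::vector<int> interleaveMaskModel(unsigned VF, unsigned Factor) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < VF; ++I)
    for (unsigned J = 0; J < Factor; ++J)
      Mask.push_back(static_cast<int>(J * VF + I));
  return Mask;
}

int main() {
  std::vector<int> Expected = {0, 4, 1, 5, 2, 6, 3, 7};
  assert(interleaveMaskModel(4, 2) == Expected);
  return 0;
}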
@@ -552,11 +512,6 @@ public:
// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
- /// A type for vectorized values in the new loop. Each value from the
- /// original loop, when vectorized, is represented by UF vector values in the
- /// new unrolled loop, where UF is the unroll factor.
- using VectorParts = SmallVector<Value *, 2>;
-
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane
/// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
@@ -567,23 +522,9 @@ public:
const VPIteration &Instance,
VPTransformState &State);
- /// Try to vectorize interleaved access group \p Group with the base address
- /// given in \p Addr, optionally masking the vector operations if \p
- /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
- /// values in the vectorized loop.
- void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
- ArrayRef<VPValue *> VPDefs,
- VPTransformState &State, VPValue *Addr,
- ArrayRef<VPValue *> StoredValues,
- VPValue *BlockInMask, bool NeedsMaskForGaps);
-
/// Fix the non-induction PHIs in \p Plan.
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
- /// Returns true if the reordering of FP operations is not allowed, but we are
- /// able to vectorize with strict in-order reductions for the given RdxDesc.
- bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
-
/// Create a new phi node for the induction variable \p OrigPhi to resume
/// iteration count in the scalar epilogue, from where the vectorized loop
/// left off. \p Step is the SCEV-expanded induction step to use. In cases
@@ -622,14 +563,6 @@ protected:
BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
VPlan &Plan, VPTransformState &State);
- /// Create the exit value of first order recurrences in the middle block and
- /// update their users.
- void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
- VPTransformState &State);
-
- /// Create code for the loop exit value of the reduction.
- void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
-
/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
@@ -637,11 +570,6 @@ protected:
/// Returns (and creates if needed) the trip count of the widened loop.
Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
- /// Returns a bitcasted value to the requested vector type.
- /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
- Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
- const DataLayout &DL);
-
/// Emit a bypass check to see if the vector trip count is zero, including if
/// it overflows.
void emitIterationCountCheck(BasicBlock *Bypass);
@@ -675,17 +603,6 @@ protected:
/// running the verifier. Return the preheader of the completed vector loop.
BasicBlock *completeLoopSkeleton();
- /// Collect poison-generating recipes that may generate a poison value that is
- /// used after vectorization, even when their operands are not poison. Those
- /// recipes meet the following conditions:
- /// * Contribute to the address computation of a recipe generating a widen
- /// memory load/store (VPWidenMemoryInstructionRecipe or
- /// VPInterleaveRecipe).
- /// * Such a widen memory load/store has at least one underlying Instruction
- /// that is in a basic block that needs predication and after vectorization
- /// the generated instruction won't be predicated.
- void collectPoisonGeneratingRecipes(VPTransformState &State);
-
/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
virtual void printDebugTracesAtStart(){};
@@ -1028,9 +945,12 @@ void reportVectorizationFailure(const StringRef DebugMsg,
<< "loop not vectorized: " << OREMsg);
}
-void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
+/// Reports an informative message: print \p Msg for debugging purposes as well
+/// as an optimization remark. Uses either \p I as location of the remark, or
+/// otherwise \p TheLoop.
+static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop,
- Instruction *I) {
+ Instruction *I = nullptr) {
LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
ORE->emit(
@@ -1057,108 +977,6 @@ static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
} // end namespace llvm
-#ifndef NDEBUG
-/// \return string containing a file name and a line # for the given loop.
-static std::string getDebugLocString(const Loop *L) {
- std::string Result;
- if (L) {
- raw_string_ostream OS(Result);
- if (const DebugLoc LoopDbgLoc = L->getStartLoc())
- LoopDbgLoc.print(OS);
- else
- // Just print the module name.
- OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
- OS.flush();
- }
- return Result;
-}
-#endif
-
-void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
- VPTransformState &State) {
-
- // Collect recipes in the backward slice of `Root` that may generate a poison
- // value that is used after vectorization.
- SmallPtrSet<VPRecipeBase *, 16> Visited;
- auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
- SmallVector<VPRecipeBase *, 16> Worklist;
- Worklist.push_back(Root);
-
- // Traverse the backward slice of Root through its use-def chain.
- while (!Worklist.empty()) {
- VPRecipeBase *CurRec = Worklist.back();
- Worklist.pop_back();
-
- if (!Visited.insert(CurRec).second)
- continue;
-
- // Prune search if we find another recipe generating a widen memory
- // instruction. Widen memory instructions involved in address computation
- // will lead to gather/scatter instructions, which don't need to be
- // handled.
- if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
- isa<VPInterleaveRecipe>(CurRec) ||
- isa<VPScalarIVStepsRecipe>(CurRec) ||
- isa<VPCanonicalIVPHIRecipe>(CurRec) ||
- isa<VPActiveLaneMaskPHIRecipe>(CurRec))
- continue;
-
- // This recipe contributes to the address computation of a widen
- // load/store. If the underlying instruction has poison-generating flags,
- // drop them directly.
- if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
- RecWithFlags->dropPoisonGeneratingFlags();
- } else {
- Instruction *Instr = dyn_cast_or_null<Instruction>(
- CurRec->getVPSingleValue()->getUnderlyingValue());
- (void)Instr;
- assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
- "found instruction with poison generating flags not covered by "
- "VPRecipeWithIRFlags");
- }
-
- // Add new definitions to the worklist.
- for (VPValue *operand : CurRec->operands())
- if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
- Worklist.push_back(OpDef);
- }
- });
-
- // Traverse all the recipes in the VPlan and collect the poison-generating
- // recipes in the backward slice starting at the address of a VPWidenRecipe or
- // VPInterleaveRecipe.
- auto Iter = vp_depth_first_deep(State.Plan->getEntry());
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
- for (VPRecipeBase &Recipe : *VPBB) {
- if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
- Instruction &UnderlyingInstr = WidenRec->getIngredient();
- VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
- if (AddrDef && WidenRec->isConsecutive() &&
- Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
- collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
- } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
- VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
- if (AddrDef) {
- // Check if any member of the interleave group needs predication.
- const InterleaveGroup<Instruction> *InterGroup =
- InterleaveRec->getInterleaveGroup();
- bool NeedPredication = false;
- for (int I = 0, NumMembers = InterGroup->getNumMembers();
- I < NumMembers; ++I) {
- Instruction *Member = InterGroup->getMember(I);
- if (Member)
- NeedPredication |=
- Legal->blockNeedsPredication(Member->getParent());
- }
-
- if (NeedPredication)
- collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
- }
- }
- }
- }
-}
-
namespace llvm {
// Loop vectorization cost-model hints how the scalar epilogue loop should be
@@ -1222,7 +1040,7 @@ public:
bool selectUserVectorizationFactor(ElementCount UserVF) {
collectUniformsAndScalars(UserVF);
collectInstsToScalarize(UserVF);
- return expectedCost(UserVF).first.isValid();
+ return expectedCost(UserVF).isValid();
}
/// \return The size (in bits) of the smallest and widest types in the code
@@ -1298,11 +1116,9 @@ public:
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
assert(VF.isVector() &&
"Profitable to scalarize relevant only for VF > 1.");
-
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return false;
+ assert(
+ TheLoop->isInnermost() &&
+ "cost-model should not be used for outer loops (in VPlan-native path)");
auto Scalars = InstsToScalarize.find(VF);
assert(Scalars != InstsToScalarize.end() &&
@@ -1312,6 +1128,9 @@ public:
/// Returns true if \p I is known to be uniform after vectorization.
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
+ assert(
+ TheLoop->isInnermost() &&
+ "cost-model should not be used for outer loops (in VPlan-native path)");
// Pseudo probe needs to be duplicated for each unrolled iteration and
// vector lane so that profiled loop trip count can be accurately
// accumulated instead of being under counted.
@@ -1321,11 +1140,6 @@ public:
if (VF.isScalar())
return true;
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return false;
-
auto UniformsPerVF = Uniforms.find(VF);
assert(UniformsPerVF != Uniforms.end() &&
"VF not yet analyzed for uniformity");
@@ -1334,14 +1148,12 @@ public:
/// Returns true if \p I is known to be scalar after vectorization.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
+ assert(
+ TheLoop->isInnermost() &&
+ "cost-model should not be used for outer loops (in VPlan-native path)");
if (VF.isScalar())
return true;
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return false;
-
auto ScalarsPerVF = Scalars.find(VF);
assert(ScalarsPerVF != Scalars.end() &&
"Scalar values are not calculated for VF");
@@ -1399,10 +1211,9 @@ public:
/// through the cost modeling.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
assert(VF.isVector() && "Expected VF to be a vector VF");
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return CM_GatherScatter;
+ assert(
+ TheLoop->isInnermost() &&
+ "cost-model should not be used for outer loops (in VPlan-native path)");
std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
auto Itr = WideningDecisions.find(InstOnVF);
@@ -1570,29 +1381,40 @@ public:
/// Returns true if \p I is a memory instruction in an interleaved-group
/// of memory accesses that can be vectorized with wide vector loads/stores
/// and shuffles.
- bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
+ bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
/// Check if \p Instr belongs to any interleaved access group.
- bool isAccessInterleaved(Instruction *Instr) {
+ bool isAccessInterleaved(Instruction *Instr) const {
return InterleaveInfo.isInterleaved(Instr);
}
/// Get the interleaved access group that \p Instr belongs to.
const InterleaveGroup<Instruction> *
- getInterleavedAccessGroup(Instruction *Instr) {
+ getInterleavedAccessGroup(Instruction *Instr) const {
return InterleaveInfo.getInterleaveGroup(Instr);
}
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
bool requiresScalarEpilogue(bool IsVectorizing) const {
- if (!isScalarEpilogueAllowed())
+ if (!isScalarEpilogueAllowed()) {
+ LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
return false;
+ }
// If we might exit from anywhere but the latch, must run the exiting
// iteration in scalar form.
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
+ return true;
+ }
+ if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
+ LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
+ "interleaved group requires scalar epilogue\n");
return true;
- return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
+ }
+ LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
+ return false;
}
/// Returns true if we're required to use a scalar epilogue for at least
@@ -1617,19 +1439,67 @@ public:
}
/// Returns the TailFoldingStyle that is best for the current loop.
- TailFoldingStyle
- getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
- if (!CanFoldTailByMasking)
+ TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
+ if (!ChosenTailFoldingStyle)
return TailFoldingStyle::None;
+ return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
+ : ChosenTailFoldingStyle->second;
+ }
+
+ /// Selects and saves TailFoldingStyle for 2 options - if IV update may
+ /// overflow or not.
+ /// \param IsScalableVF true if scalable vector factors enabled.
+ /// \param UserIC User specific interleave count.
+ void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+ assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
+ if (!Legal->canFoldTailByMasking()) {
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+ return;
+ }
- if (ForceTailFoldingStyle.getNumOccurrences())
- return ForceTailFoldingStyle;
+ if (!ForceTailFoldingStyle.getNumOccurrences()) {
+ ChosenTailFoldingStyle = std::make_pair(
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
+ return;
+ }
- return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
+ // Set styles when forced.
+ ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
+ ForceTailFoldingStyle.getValue());
+ if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
+ return;
+ // Override forced styles if needed.
+ // FIXME: use actual opcode/data type for analysis here.
+ // FIXME: Investigate opportunity for fixed vector factor.
+ bool EVLIsLegal =
+ IsScalableVF && UserIC <= 1 &&
+ TTI.hasActiveVectorLength(0, nullptr, Align()) &&
+ !EnableVPlanNativePath &&
+ // FIXME: implement support for max safe dependency distance.
+ Legal->isSafeForAnyVectorWidth();
+ if (!EVLIsLegal) {
+ // If for some reason EVL mode is unsupported, fallback to
+ // DataWithoutLaneMask to try to vectorize the loop with folded tail
+ // in a generic way.
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
+ TailFoldingStyle::DataWithoutLaneMask);
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Preference for VP intrinsics indicated. Will "
+ "not try to generate VP Intrinsics "
+ << (UserIC > 1
+ ? "since interleave count specified is greater than 1.\n"
+ : "due to non-interleaving reasons.\n"));
+ }
}
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const {
+ // TODO: check if it is possible to check for None style independent of
+ // IVUpdateMayOverflow flag in getTailFoldingStyle.
return getTailFoldingStyle() != TailFoldingStyle::None;
}
@@ -1640,6 +1510,12 @@ public:
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}
+ /// Returns true if VP intrinsics with explicit vector length support should
+ /// be generated in the tail folded loop.
+ bool foldTailWithEVL() const {
+ return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
+ }
+
/// Returns true if the Phi is part of an inloop reduction.
bool isInLoopReduction(PHINode *Phi) const {
return InLoopReductions.contains(Phi);
@@ -1663,20 +1539,13 @@ public:
Scalars.clear();
}
- /// The vectorization cost is a combination of the cost itself and a boolean
- /// indicating whether any of the contributing operations will actually
- /// operate on vector values after type legalization in the backend. If this
- /// latter value is false, then all operations will be scalarized (i.e. no
- /// vectorization has actually taken place).
- using VectorizationCostTy = std::pair<InstructionCost, bool>;
-
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width. If \p Invalid is not nullptr, this function
/// will add a pair(Instruction*, ElementCount) to \p Invalid for
/// each instruction that has an Invalid cost for the given VF.
- VectorizationCostTy
+ InstructionCost
expectedCost(ElementCount VF,
SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
@@ -1687,6 +1556,16 @@ public:
/// \p VF is the vectorization factor chosen for the original loop.
bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
+ /// Returns the execution time cost of an instruction for a given vector
+ /// width. Vector width of one means scalar.
+ InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
+
+ /// Return the cost of instructions in an inloop reduction pattern, if I is
+ /// part of that pattern.
+ std::optional<InstructionCost>
+ getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
+ TTI::TargetCostKind CostKind) const;
+
private:
unsigned NumPredStores = 0;
@@ -1708,25 +1587,14 @@ private:
ElementCount MaxSafeVF,
bool FoldTailByMasking);
+ /// Checks if scalable vectorization is supported and enabled. Caches the
+ /// result to avoid repeated debug dumps for repeated queries.
+ bool isScalableVectorizationAllowed();
+
/// \return the maximum legal scalable VF, based on the safe max number
/// of elements.
ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
- /// Returns the execution time cost of an instruction for a given vector
- /// width. Vector width of one means scalar.
- VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
- /// The cost-computation logic from getInstructionCost which provides
- /// the vector type as an output parameter.
- InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
- Type *&VectorTy);
-
- /// Return the cost of instructions in an inloop reduction pattern, if I is
- /// part of that pattern.
- std::optional<InstructionCost>
- getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
- TTI::TargetCostKind CostKind) const;
-
/// Calculate vectorization cost of memory instruction \p I.
InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
@@ -1782,8 +1650,13 @@ private:
/// iterations to execute in the scalar loop.
ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
- /// All blocks of loop are to be masked to fold tail of scalar iterations.
- bool CanFoldTailByMasking = false;
+ /// Control finally chosen tail folding style. The first element is used if
+ /// the IV update may overflow, the second element - if it does not.
+ std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
+ ChosenTailFoldingStyle;
+
+ /// true if scalable vectorization is supported and enabled.
+ std::optional<bool> IsScalableVectorizationAllowed;
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
@@ -2118,16 +1991,18 @@ public:
BestTripCount = *EstimatedTC;
}
+ BestTripCount = std::max(BestTripCount, 1U);
InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
// Let's ensure the cost is always at least 1.
NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
(InstructionCost::CostType)1);
- LLVM_DEBUG(dbgs()
- << "We expect runtime memory checks to be hoisted "
- << "out of the outer loop. Cost reduced from "
- << MemCheckCost << " to " << NewMemCheckCost << '\n');
+ if (BestTripCount > 1)
+ LLVM_DEBUG(dbgs()
+ << "We expect runtime memory checks to be hoisted "
+ << "out of the outer loop. Cost reduced from "
+ << MemCheckCost << " to " << NewMemCheckCost << '\n');
MemCheckCost = NewMemCheckCost;
}
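A small worked example of the adjusted memory-check cost logic above (numbers are hypothetical): the new std::max(BestTripCount, 1U) clamp guards against a zero divisor when no trip count estimate is available.

#include <algorithm>
#include <cassert>

// Model of the amortisation above: the one-off memory-check cost is
// spread over the loop's estimated trip count, clamped to >= 1 both for
// the divisor and for the resulting cost.
unsigned amortisedMemCheckCost(unsigned MemCheckCost, unsigned BestTripCount) {
  BestTripCount = std::max(BestTripCount, 1u); // new clamp
  unsigned NewCost = MemCheckCost / BestTripCount;
  return std::max(NewCost, 1u);                // cost is always at least 1
}

int main() {
  assert(amortisedMemCheckCost(8, 4) == 2); // hoisted check amortised
  assert(amortisedMemCheckCost(8, 0) == 8); // clamp avoids divide-by-zero
  assert(amortisedMemCheckCost(3, 8) == 1); // never drops below 1
  return 0;
}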
@@ -2207,7 +2082,7 @@ public:
BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
if (AddBranchWeights)
- setBranchWeights(BI, SCEVCheckBypassWeights);
+ setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
return SCEVCheckBlock;
}
@@ -2235,7 +2110,7 @@ public:
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
if (AddBranchWeights) {
- setBranchWeights(BI, MemCheckBypassWeights);
+ setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
}
ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
MemCheckBlock->getTerminator()->setDebugLoc(
@@ -2472,276 +2347,6 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
return TTI.enableMaskedInterleavedAccessVectorization();
}
-// Try to vectorize the interleave group that \p Instr belongs to.
-//
-// E.g. Translate following interleaved load group (factor = 3):
-// for (i = 0; i < N; i+=3) {
-// R = Pic[i]; // Member of index 0
-// G = Pic[i+1]; // Member of index 1
-// B = Pic[i+2]; // Member of index 2
-// ... // do something to R, G, B
-// }
-// To:
-// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
-// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
-// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
-// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
-//
-// Or translate following interleaved store group (factor = 3):
-// for (i = 0; i < N; i+=3) {
-// ... do something to R, G, B
-// Pic[i] = R; // Member of index 0
-// Pic[i+1] = G; // Member of index 1
-// Pic[i+2] = B; // Member of index 2
-// }
-// To:
-// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
-// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
-// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
-// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
-// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
-void InnerLoopVectorizer::vectorizeInterleaveGroup(
- const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
- VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
- VPValue *BlockInMask, bool NeedsMaskForGaps) {
- Instruction *Instr = Group->getInsertPos();
- const DataLayout &DL = Instr->getModule()->getDataLayout();
-
- // Prepare for the vector type of the interleaved load/store.
- Type *ScalarTy = getLoadStoreType(Instr);
- unsigned InterleaveFactor = Group->getFactor();
- auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
-
- // Prepare for the new pointers.
- SmallVector<Value *, 2> AddrParts;
- unsigned Index = Group->getIndex(Instr);
-
- // TODO: extend the masked interleaved-group support to reversed access.
- assert((!BlockInMask || !Group->isReverse()) &&
- "Reversed masked interleave-group not supported.");
-
- Value *Idx;
- // If the group is reverse, adjust the index to refer to the last vector lane
- // instead of the first. We adjust the index from the first vector lane,
- // rather than directly getting the pointer for lane VF - 1, because the
- // pointer operand of the interleaved access is supposed to be uniform. For
- // uniform instructions, we're only required to generate a value for the
- // first vector lane in each unroll iteration.
- if (Group->isReverse()) {
- Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
- Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
- Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
- Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
- Idx = Builder.CreateNeg(Idx);
- } else
- Idx = Builder.getInt32(-Index);
-
- for (unsigned Part = 0; Part < UF; Part++) {
- Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
- if (auto *I = dyn_cast<Instruction>(AddrPart))
- State.setDebugLocFrom(I->getDebugLoc());
-
- // Notice current instruction could be any index. Need to adjust the address
- // to the member of index 0.
- //
- // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
- // b = A[i]; // Member of index 0
- // Current pointer is pointed to A[i+1], adjust it to A[i].
- //
- // E.g. A[i+1] = a; // Member of index 1
- // A[i] = b; // Member of index 0
- // A[i+2] = c; // Member of index 2 (Current instruction)
- // Current pointer is pointed to A[i+2], adjust it to A[i].
-
- bool InBounds = false;
- if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
- InBounds = gep->isInBounds();
- AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
- AddrParts.push_back(AddrPart);
- }
-
- State.setDebugLocFrom(Instr->getDebugLoc());
- Value *PoisonVec = PoisonValue::get(VecTy);
-
- auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
- unsigned Part, Value *MaskForGaps) -> Value * {
- if (VF.isScalable()) {
- assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
- assert(InterleaveFactor == 2 &&
- "Unsupported deinterleave factor for scalable vectors");
- auto *BlockInMaskPart = State.get(BlockInMask, Part);
- SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
- auto *MaskTy =
- VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
- return Builder.CreateIntrinsic(
- MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
- /*FMFSource=*/nullptr, "interleaved.mask");
- }
-
- if (!BlockInMask)
- return MaskForGaps;
-
- Value *BlockInMaskPart = State.get(BlockInMask, Part);
- Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart,
- createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
- "interleaved.mask");
- return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
- MaskForGaps)
- : ShuffledMask;
- };
-
- // Vectorize the interleaved load group.
- if (isa<LoadInst>(Instr)) {
- Value *MaskForGaps = nullptr;
- if (NeedsMaskForGaps) {
- MaskForGaps =
- createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
- assert(MaskForGaps && "Mask for Gaps is required but it is null");
- }
-
- // For each unroll part, create a wide load for the group.
- SmallVector<Value *, 2> NewLoads;
- for (unsigned Part = 0; Part < UF; Part++) {
- Instruction *NewLoad;
- if (BlockInMask || MaskForGaps) {
- assert(useMaskedInterleavedAccesses(*TTI) &&
- "masked interleaved groups are not allowed.");
- Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
- NewLoad =
- Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
- GroupMask, PoisonVec, "wide.masked.vec");
- }
- else
- NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
- Group->getAlign(), "wide.vec");
- Group->addMetadata(NewLoad);
- NewLoads.push_back(NewLoad);
- }
-
- if (VecTy->isScalableTy()) {
- assert(InterleaveFactor == 2 &&
- "Unsupported deinterleave factor for scalable vectors");
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- // Scalable vectors cannot use arbitrary shufflevectors (only splats),
- // so must use intrinsics to deinterleave.
- Value *DI = Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
- /*FMFSource=*/nullptr, "strided.vec");
- unsigned J = 0;
- for (unsigned I = 0; I < InterleaveFactor; ++I) {
- Instruction *Member = Group->getMember(I);
-
- if (!Member)
- continue;
-
- Value *StridedVec = Builder.CreateExtractValue(DI, I);
- // If this member has different type, cast the result type.
- if (Member->getType() != ScalarTy) {
- VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
- StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
- }
-
- if (Group->isReverse())
- StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
-
- State.set(VPDefs[J], StridedVec, Part);
- ++J;
- }
- }
-
- return;
- }
-
- // For each member in the group, shuffle out the appropriate data from the
- // wide loads.
- unsigned J = 0;
- for (unsigned I = 0; I < InterleaveFactor; ++I) {
- Instruction *Member = Group->getMember(I);
-
- // Skip the gaps in the group.
- if (!Member)
- continue;
-
- auto StrideMask =
- createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
- for (unsigned Part = 0; Part < UF; Part++) {
- Value *StridedVec = Builder.CreateShuffleVector(
- NewLoads[Part], StrideMask, "strided.vec");
-
- // If this member has different type, cast the result type.
- if (Member->getType() != ScalarTy) {
- assert(!VF.isScalable() && "VF is assumed to be non scalable.");
- VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
- StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
- }
-
- if (Group->isReverse())
- StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
-
- State.set(VPDefs[J], StridedVec, Part);
- }
- ++J;
- }
- return;
- }
-
- // The sub vector type for current instruction.
- auto *SubVT = VectorType::get(ScalarTy, VF);
-
- // Vectorize the interleaved store group.
- Value *MaskForGaps =
- createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
- assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
- "masked interleaved groups are not allowed.");
- assert((!MaskForGaps || !VF.isScalable()) &&
- "masking gaps for scalable vectors is not yet supported.");
- for (unsigned Part = 0; Part < UF; Part++) {
- // Collect the stored vector from each member.
- SmallVector<Value *, 4> StoredVecs;
- unsigned StoredIdx = 0;
- for (unsigned i = 0; i < InterleaveFactor; i++) {
- assert((Group->getMember(i) || MaskForGaps) &&
- "Fail to get a member from an interleaved store group");
- Instruction *Member = Group->getMember(i);
-
- // Skip the gaps in the group.
- if (!Member) {
- Value *Undef = PoisonValue::get(SubVT);
- StoredVecs.push_back(Undef);
- continue;
- }
-
- Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
- ++StoredIdx;
-
- if (Group->isReverse())
- StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
-
- // If this member has different type, cast it to a unified type.
-
- if (StoredVec->getType() != SubVT)
- StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
-
- StoredVecs.push_back(StoredVec);
- }
-
- // Interleave all the smaller vectors into one wider vector.
- Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
- Instruction *NewStoreInstr;
- if (BlockInMask || MaskForGaps) {
- Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
- NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
- Group->getAlign(), GroupMask);
- } else
- NewStoreInstr =
- Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
-
- Group->addMetadata(NewStoreInstr);
- }
-}
-
void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
VPReplicateRecipe *RepRecipe,
const VPIteration &Instance,
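To make the RGB example in the removed comment above concrete, here is a standalone sketch of the stride masks used to de-interleave a factor-3 group at VF = 4 (mirroring the createStrideMask shape; the helper name here is illustrative).

#include <cassert>
#include <vector>

// Member I of an interleave group with factor F occupies lanes
// I, I + F, I + 2F, ... of the wide load; VF consecutive tuples are read.
// VF = 4, F = 3: R -> <0, 3, 6, 9>, G -> <1, 4, 7, 10>, B -> <2, 5, 8, 11>.
std::vector<int> strideMaskModel(unsigned Member, unsigned Factor,
                                 unsigned VF) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < VF; ++I)
    Mask.push_back(static_cast<int>(Member + I * Factor));
  return Mask;
}

int main() {
  std::vector<int> RMask = {0, 3, 6, 9};
  std::vector<int> GMask = {1, 4, 7, 10};
  assert(strideMaskModel(0, 3, 4) == RMask);
  assert(strideMaskModel(1, 3, 4) == GMask);
  return 0;
}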
@@ -2822,9 +2427,8 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
if (Cost->foldTailByMasking()) {
assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
"VF*UF must be a power of 2 when folding tail by masking");
- Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
- TC = Builder.CreateAdd(
- TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
+ TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
+ "n.rnd.up");
}
// Now we need to generate the expression for the part of the loop that the
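A worked example of the rounded-up trip count computed above when folding the tail by masking (values are hypothetical): with TC = 10 and Step = VF * UF = 4, n.rnd.up = 10 + (4 - 1) = 13 and the vector trip count becomes 13 - 13 % 4 = 12, i.e. three masked vector iterations cover all ten elements. A minimal sketch:

#include <cassert>

// Model of getOrCreateVectorTripCount when the tail is folded:
// round the trip count up to the next multiple of Step = VF * UF.
unsigned vectorTripCountModel(unsigned TC, unsigned Step) {
  unsigned Rounded = TC + (Step - 1); // "n.rnd.up"
  return Rounded - Rounded % Step;    // N - N % Step
}

int main() {
  assert(vectorTripCountModel(10, 4) == 12); // 3 masked vector iterations
  assert(vectorTripCountModel(12, 4) == 12); // already a multiple of Step
  return 0;
}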
@@ -2850,37 +2454,6 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
return VectorTripCount;
}
-Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
- const DataLayout &DL) {
- // Verify that V is a vector type with same number of elements as DstVTy.
- auto *DstFVTy = cast<VectorType>(DstVTy);
- auto VF = DstFVTy->getElementCount();
- auto *SrcVecTy = cast<VectorType>(V->getType());
- assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
- Type *SrcElemTy = SrcVecTy->getElementType();
- Type *DstElemTy = DstFVTy->getElementType();
- assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
- "Vector elements must have same size");
-
- // Do a direct cast if element types are castable.
- if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
- return Builder.CreateBitOrPointerCast(V, DstFVTy);
- }
- // V cannot be directly casted to desired vector type.
- // May happen when V is a floating point vector but DstVTy is a vector of
- // pointers or vice-versa. Handle this using a two-step bitcast using an
- // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
- assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
- "Only one type should be a pointer type");
- assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
- "Only one type should be a floating point type");
- Type *IntTy =
- IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
- auto *VecIntTy = VectorType::get(IntTy, VF);
- Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
- return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
-}
-
void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
Value *Count = getTripCount();
// Reuse existing vector loop preheader for TC checks.
@@ -2943,16 +2516,10 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
// Update dominator for Bypass & LoopExit (if needed).
DT->changeImmediateDominator(Bypass, TCCheckBlock);
- if (!Cost->requiresScalarEpilogue(VF.isVector()))
- // If there is an epilogue which must run, there's no edge from the
- // middle block to exit blocks and thus no need to update the immediate
- // dominator of the exit blocks.
- DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
-
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
- setBranchWeights(BI, MinItersBypassWeights);
+ setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
LoopBypassBlocks.push_back(TCCheckBlock);
}
@@ -3034,33 +2601,6 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopScalarPreHeader =
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
nullptr, Twine(Prefix) + "scalar.ph");
-
- auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
-
- // Set up the middle block terminator. Two cases:
- // 1) If we know that we must execute the scalar epilogue, emit an
- // unconditional branch.
- // 2) Otherwise, we must have a single unique exit block (due to how we
- // implement the multiple exit case). In this case, set up a conditional
- // branch from the middle block to the loop scalar preheader, and the
- // exit block. completeLoopSkeleton will update the condition to use an
- // iteration check, if required to decide whether to execute the remainder.
- BranchInst *BrInst =
- Cost->requiresScalarEpilogue(VF.isVector())
- ? BranchInst::Create(LoopScalarPreHeader)
- : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
- Builder.getTrue());
- BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
- ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
-
- // Update dominator for loop exit. During skeleton creation, only the vector
- // pre-header and the middle block are created. The vector loop is entirely
- // created during VPlan exection.
- if (!Cost->requiresScalarEpilogue(VF.isVector()))
- // If there is an epilogue which must run, there's no edge from the
- // middle block to exit blocks and thus no need to update the immediate
- // dominator of the exit blocks.
- DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
}
PHINode *InnerLoopVectorizer::createInductionResumeValue(
@@ -3100,7 +2640,7 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
// Create phi nodes to merge from the backedge-taken check block.
PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
- LoopScalarPreHeader->getTerminator());
+ LoopScalarPreHeader->getFirstNonPHI());
// Copy original phi DL over to the new one.
BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
@@ -3157,51 +2697,6 @@ void InnerLoopVectorizer::createInductionResumeValues(
}
}
-BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
- // The trip counts should be cached by now.
- Value *Count = getTripCount();
- Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
-
- auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
-
- // Add a check in the middle block to see if we have completed
- // all of the iterations in the first vector loop. Three cases:
- // 1) If we require a scalar epilogue, there is no conditional branch as
- // we unconditionally branch to the scalar preheader. Do nothing.
- // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
- // Thus if tail is to be folded, we know we don't need to run the
- // remainder and we can use the previous value for the condition (true).
- // 3) Otherwise, construct a runtime check.
- if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
- !Cost->foldTailByMasking()) {
- // Here we use the same DebugLoc as the scalar loop latch terminator instead
- // of the corresponding compare because they may have ended up with
- // different line numbers and we want to avoid awkward line stepping while
- // debugging. Eg. if the compare has got a line number inside the loop.
- // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
- // operands. Perform simplification directly on VPlan once the branch is
- // modeled there.
- IRBuilder<> B(LoopMiddleBlock->getTerminator());
- B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
- Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
- BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
- BI.setCondition(CmpN);
- if (hasBranchWeightMD(*ScalarLatchTerm)) {
- // Assume that `Count % VectorTripCount` is equally distributed.
- unsigned TripCount = UF * VF.getKnownMinValue();
- assert(TripCount > 0 && "trip count should not be zero");
- const uint32_t Weights[] = {1, TripCount - 1};
- setBranchWeights(BI, Weights);
- }
- }
-
-#ifdef EXPENSIVE_CHECKS
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
-#endif
-
- return LoopVectorPreHeader;
-}
-
std::pair<BasicBlock *, Value *>
InnerLoopVectorizer::createVectorizedLoopSkeleton(
const SCEV2ValueTy &ExpandedSCEVs) {
@@ -3224,17 +2719,18 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
| [ ]_| <-- vector loop (created during VPlan execution).
| |
| v
- \ -[ ] <--- middle-block.
+ \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
+ | | successors created during VPlan execution)
\/ |
/\ v
- | ->[ ] <--- new preheader.
+ | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
| |
(opt) v <-- edge from middle to exit iff epilogue is not required.
| [ ] \
| [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
\ |
\ v
- >[ ] <-- exit block(s).
+ >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
...
*/
@@ -3261,7 +2757,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
// Emit phis for the new starting index of the scalar loop.
createInductionResumeValues(ExpandedSCEVs);
- return {completeLoopSkeleton(), nullptr};
+ return {LoopVectorPreHeader, nullptr};
}
// Fix up external users of the induction variable. At this point, we are
@@ -3447,37 +2943,12 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
TargetTransformInfo::TCK_RecipThroughput);
}
-static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
- auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
- return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
-}
-
-static Type *largestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
- auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
- return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
-}
-
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VPlan &Plan) {
// Fix widened non-induction PHIs by setting up the PHI operands.
if (EnableVPlanNativePath)
fixNonInductionPHIs(Plan, State);
- // At this point every instruction in the original loop is widened to a
- // vector form. Now we need to fix the recurrences in the loop. These PHI
- // nodes are currently empty because we did not want to introduce cycles.
- // This is the second stage of vectorizing recurrences. Note that fixing
- // reduction phis are already modeled in VPlan.
- // TODO: Also model fixing fixed-order recurrence phis in VPlan.
- VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
- VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
- for (VPRecipeBase &R : HeaderVPBB->phis()) {
- if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
- fixFixedOrderRecurrence(FOR, State);
- }
-
// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);
PSE.getSE()->forgetBlockAndLoopDispositions();
@@ -3491,6 +2962,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
for (PHINode &PN : Exit->phis())
PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
+ VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
if (Cost->requiresScalarEpilogue(VF.isVector())) {
@@ -3513,10 +2985,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VectorLoop->getHeader(), Plan, State);
}
- // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
- // in the exit block, so update the builder.
- State.Builder.SetInsertPoint(State.CFG.ExitBB,
- State.CFG.ExitBB->getFirstNonPHIIt());
+ // Fix live-out phis not already fixed earlier.
for (const auto &KV : Plan.getLiveOuts())
KV.second->fixPhi(Plan, State);
@@ -3544,125 +3013,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VF.getKnownMinValue() * UF);
}
-void InnerLoopVectorizer::fixFixedOrderRecurrence(
- VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
- // This is the second phase of vectorizing first-order recurrences. An
- // overview of the transformation is described below. Suppose we have the
- // following loop.
- //
- // for (int i = 0; i < n; ++i)
- // b[i] = a[i] - a[i - 1];
- //
- // There is a first-order recurrence on "a". For this loop, the shorthand
- // scalar IR looks like:
- //
- // scalar.ph:
- // s_init = a[-1]
- // br scalar.body
- //
- // scalar.body:
- // i = phi [0, scalar.ph], [i+1, scalar.body]
- // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
- // s2 = a[i]
- // b[i] = s2 - s1
- // br cond, scalar.body, ...
- //
- // In this example, s1 is a recurrence because it's value depends on the
- // previous iteration. In the first phase of vectorization, we created a
- // vector phi v1 for s1. We now complete the vectorization and produce the
- // shorthand vector IR shown below (for VF = 4, UF = 1).
- //
- // vector.ph:
- // v_init = vector(..., ..., ..., a[-1])
- // br vector.body
- //
- // vector.body
- // i = phi [0, vector.ph], [i+4, vector.body]
- // v1 = phi [v_init, vector.ph], [v2, vector.body]
- // v2 = a[i, i+1, i+2, i+3];
- // v3 = vector(v1(3), v2(0, 1, 2))
- // b[i, i+1, i+2, i+3] = v2 - v3
- // br cond, vector.body, middle.block
- //
- // middle.block:
- // x = v2(3)
- // br scalar.ph
- //
- // scalar.ph:
- // s_init = phi [x, middle.block], [a[-1], otherwise]
- // br scalar.body
- //
- // After execution completes the vector loop, we extract the next value of
- // the recurrence (x) to use as the initial value in the scalar loop.
-
- // Extract the last vector element in the middle block. This will be the
- // initial value for the recurrence when jumping to the scalar loop.
- VPValue *PreviousDef = PhiR->getBackedgeValue();
- Value *Incoming = State.get(PreviousDef, UF - 1);
- auto *ExtractForScalar = Incoming;
- auto *IdxTy = Builder.getInt32Ty();
- Value *RuntimeVF = nullptr;
- if (VF.isVector()) {
- auto *One = ConstantInt::get(IdxTy, 1);
- Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
- auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
- ExtractForScalar =
- Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
- }
-
- auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
- assert(PhiR->getNumUsers() == 1 &&
- RecurSplice->getOpcode() ==
- VPInstruction::FirstOrderRecurrenceSplice &&
- "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
- SmallVector<VPLiveOut *> LiveOuts;
- for (VPUser *U : RecurSplice->users())
- if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
- LiveOuts.push_back(LiveOut);
-
- if (!LiveOuts.empty()) {
- // Extract the second last element in the middle block if the
- // Phi is used outside the loop. We need to extract the phi itself
- // and not the last element (the phi update in the current iteration). This
- // will be the value when jumping to the exit block from the
- // LoopMiddleBlock, when the scalar loop is not run at all.
- Value *ExtractForPhiUsedOutsideLoop = nullptr;
- if (VF.isVector()) {
- auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
- ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
- Incoming, Idx, "vector.recur.extract.for.phi");
- } else {
- assert(UF > 1 && "VF and UF cannot both be 1");
- // When loop is unrolled without vectorizing, initialize
- // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled
- // value of `Incoming`. This is analogous to the vectorized case above:
- // extracting the second last element when VF > 1.
- ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
- }
-
- for (VPLiveOut *LiveOut : LiveOuts) {
- assert(!Cost->requiresScalarEpilogue(VF.isVector()));
- PHINode *LCSSAPhi = LiveOut->getPhi();
- LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
- State.Plan->removeLiveOut(LCSSAPhi);
- }
- }
-
- // Fix the initial value of the original recurrence in the scalar loop.
- Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
- PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
- auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
- auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
- for (auto *BB : predecessors(LoopScalarPreHeader)) {
- auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
- Start->addIncoming(Incoming, BB);
- }
-
- Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
- Phi->setName("scalar.recur");
-}
-
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
// The basic block and loop containing the predicated instruction.
auto *PredBB = PredInst->getParent();
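The long comment removed above describes the second phase of vectorizing a first-order recurrence; the removal suggests this is now modeled in VPlan (via FirstOrderRecurrenceSplice and the live-out handling). As a hedged illustration of the splice semantics for b[i] = a[i] - a[i-1] with VF = 4:

#include <array>
#include <cassert>

// Model of the splice for VF = 4: lane 0 of the result is the last lane
// of the previous vector iteration, lanes 1..3 are lanes 0..2 of the
// current one, i.e. v3 = vector(v1(3), v2(0, 1, 2)) in the removed
// comment's shorthand.
std::array<float, 4> spliceModel(const std::array<float, 4> &Prev,
                                 const std::array<float, 4> &Cur) {
  return {Prev[3], Cur[0], Cur[1], Cur[2]};
}

int main() {
  std::array<float, 4> Prev = {0, 1, 2, 3}; // a[i-4..i-1] from last iteration
  std::array<float, 4> Cur = {4, 5, 6, 7};  // a[i..i+3] for this iteration
  std::array<float, 4> Shifted = spliceModel(Prev, Cur);
  // b[i..i+3] = a[i..i+3] - a[i-1..i+2]
  for (unsigned L = 0; L < 4; ++L)
    assert(Cur[L] - Shifted[L] == 1.0f);
  return 0;
}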
@@ -3758,11 +3108,6 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
}
}
-bool InnerLoopVectorizer::useOrderedReductions(
- const RecurrenceDescriptor &RdxDesc) {
- return Cost->useOrderedReductions(RdxDesc);
-}
-
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
@@ -3928,6 +3273,13 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
if (!ScalarInd)
continue;
+ // If the induction variable update is a fixed-order recurrence, neither the
+ // induction variable or its update should be marked scalar after
+ // vectorization.
+ auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
+ if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
+ continue;
+
// Determine if all users of the induction variable update instruction are
// scalar after vectorization.
auto ScalarIndUpdate =
@@ -4100,7 +3452,7 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
}
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
- Instruction *I, ElementCount VF) {
+ Instruction *I, ElementCount VF) const {
assert(isAccessInterleaved(I) && "Expecting interleaved access.");
assert(getWideningDecision(I, VF) == CM_Unknown &&
"Decision should not be set yet.");
@@ -4109,7 +3461,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
// If the instruction's allocated size doesn't equal it's type size, it
// requires padding and will be scalarized.
- auto &DL = I->getModule()->getDataLayout();
+ auto &DL = I->getDataLayout();
auto *ScalarTy = getLoadStoreType(I);
if (hasIrregularType(ScalarTy, DL))
return false;
@@ -4187,7 +3539,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
// If the instruction's allocated size doesn't equal it's type size, it
// requires padding and will be scalarized.
- auto &DL = I->getModule()->getDataLayout();
+ auto &DL = I->getDataLayout();
if (hasIrregularType(ScalarTy, DL))
return false;
@@ -4220,34 +3572,37 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// Worklist containing uniform instructions demanding lane 0.
SetVector<Instruction *> Worklist;
- BasicBlock *Latch = TheLoop->getLoopLatch();
// Add uniform instructions demanding lane 0 to the worklist. Instructions
- // that are scalar with predication must not be considered uniform after
+ // that require predication must not be considered uniform after
// vectorization, because that would create an erroneous replicating region
// where only a single instance out of VF should be formed.
- // TODO: optimize such seldom cases if found important, see PR40816.
auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
if (isOutOfScope(I)) {
LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
<< *I << "\n");
return;
}
- if (isScalarWithPredication(I, VF)) {
- LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
- << *I << "\n");
+ if (isPredicatedInst(I)) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Found not uniform due to requiring predication: " << *I
+ << "\n");
return;
}
LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
Worklist.insert(I);
};
- // Start with the conditional branch. If the branch condition is an
- // instruction contained in the loop that is only used by the branch, it is
- // uniform.
- auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
- if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
- addToWorklistIfAllowed(Cmp);
+ // Start with the conditional branches exiting the loop. If the branch
+ // condition is an instruction contained in the loop that is only used by the
+ // branch, it is uniform.
+ SmallVector<BasicBlock *> Exiting;
+ TheLoop->getExitingBlocks(Exiting);
+ for (BasicBlock *E : Exiting) {
+ auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
+ if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+ addToWorklistIfAllowed(Cmp);
+ }
auto PrevVF = VF.divideCoefficientBy(2);
// Return true if all lanes perform the same memory operation, and we can
@@ -4388,6 +3743,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// nodes separately. An induction variable will remain uniform if all users
// of the induction variable and induction variable update remain uniform.
// The code below handles both pointer and non-pointer induction variables.
+ BasicBlock *Latch = TheLoop->getLoopLatch();
for (const auto &Induction : Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -4454,15 +3810,18 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
return false;
}
-ElementCount
-LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
+bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
+ if (IsScalableVectorizationAllowed)
+ return *IsScalableVectorizationAllowed;
+
+ IsScalableVectorizationAllowed = false;
if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
- return ElementCount::getScalable(0);
+ return false;
if (Hints->isScalableVectorizationDisabled()) {
reportVectorizationInfo("Scalable vectorization is explicitly disabled",
"ScalableVectorizationDisabled", ORE, TheLoop);
- return ElementCount::getScalable(0);
+ return false;
}
LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
@@ -4482,7 +3841,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
"Scalable vectorization not supported for the reduction "
"operations found in this loop.",
"ScalableVFUnfeasible", ORE, TheLoop);
- return ElementCount::getScalable(0);
+ return false;
}
// Disable scalable vectorization if the loop contains any instructions
@@ -4494,17 +3853,33 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
reportVectorizationInfo("Scalable vectorization is not supported "
"for all element types found in this loop.",
"ScalableVFUnfeasible", ORE, TheLoop);
- return ElementCount::getScalable(0);
+ return false;
+ }
+
+ if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
+ reportVectorizationInfo("The target does not provide maximum vscale value "
+ "for safe distance analysis.",
+ "ScalableVFUnfeasible", ORE, TheLoop);
+ return false;
}
+ IsScalableVectorizationAllowed = true;
+ return true;
+}
+
+ElementCount
+LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
+ if (!isScalableVectorizationAllowed())
+ return ElementCount::getScalable(0);
+
+ auto MaxScalableVF = ElementCount::getScalable(
+ std::numeric_limits<ElementCount::ScalarTy>::max());
if (Legal->isSafeForAnyVectorWidth())
return MaxScalableVF;
+ std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
// Limit MaxScalableVF by the maximum safe dependence distance.
- if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
- MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
- else
- MaxScalableVF = ElementCount::getScalable(0);
+ MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
if (!MaxScalableVF)
reportVectorizationInfo(
@@ -4738,8 +4113,22 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- if (Legal->prepareToFoldTailByMasking()) {
- CanFoldTailByMasking = true;
+ setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
+ if (foldTailByMasking()) {
+ if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
+ "try to generate VP Intrinsics with scalable vector "
+ "factors only.\n");
+ // Tail folded loop using VP intrinsics restricts the VF to be scalable
+ // for now.
+ // TODO: extend it for fixed vectors, if required.
+ assert(MaxFactors.ScalableVF.isScalable() &&
+ "Expected scalable vector factor.");
+
+ MaxFactors.FixedVF = ElementCount::getFixed(1);
+ }
return MaxFactors;
}
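
A scalar sketch of what EVL-based tail folding amounts to conceptually: each vector iteration processes an explicit vector length of min(remaining, VLMAX) elements, so no mask or scalar remainder loop is needed. The saxpyEVL helper and VLMax parameter below are hypothetical; a real target would obtain the per-iteration length via something like RISC-V's vsetvli and execute the body with VP intrinsics.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Conceptual model only: the inner loop stands in for a single predicated
// vector operation whose active length is EVL.
static void saxpyEVL(std::vector<float> &Y, const std::vector<float> &X,
                     float A, std::size_t VLMax) {
  std::size_t N = Y.size();
  for (std::size_t I = 0; I < N;) {
    std::size_t EVL = std::min(N - I, VLMax); // explicit vector length
    for (std::size_t L = 0; L < EVL; ++L)
      Y[I + L] += A * X[I + L];
    I += EVL; // one loop handles the tail, no scalar epilogue
  }
}

int main() {
  std::vector<float> Y(10, 1.0f), X(10, 2.0f);
  saxpyEVL(Y, X, 3.0f, /*VLMax=*/4); // 10 elements = 4 + 4 + 2
  for (float V : Y)
    assert(V == 7.0f);
  return 0;
}
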
@@ -4860,15 +4249,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
// Select the largest VF which doesn't require more registers than existing
// ones.
- for (int i = RUs.size() - 1; i >= 0; --i) {
- bool Selected = true;
- for (auto &pair : RUs[i].MaxLocalUsers) {
- unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
- if (pair.second > TargetNumRegisters)
- Selected = false;
- }
- if (Selected) {
- MaxVF = VFs[i];
+ for (int I = RUs.size() - 1; I >= 0; --I) {
+ const auto &MLU = RUs[I].MaxLocalUsers;
+ if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
+ return LU.second <= TTI.getNumberOfRegisters(LU.first);
+ })) {
+ MaxVF = VFs[I];
break;
}
}
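
A toy version of the register-pressure scan above: walk the candidate VFs from widest to narrowest and keep the first one whose peak register usage fits the target. The VFs, per-VF usage numbers, and register-file size below are made up; the real values come from calculateRegisterUsage and TTI.getNumberOfRegisters.

#include <cassert>
#include <vector>

int main() {
  // Hypothetical candidates and their peak vector-register demand.
  std::vector<unsigned> VFs = {2, 4, 8};
  std::vector<unsigned> PeakRegs = {5, 9, 18};
  unsigned TargetNumRegisters = 16; // assumed register-file size

  unsigned MaxVF = 1;
  for (int I = static_cast<int>(VFs.size()) - 1; I >= 0; --I) {
    if (PeakRegs[I] <= TargetNumRegisters) {
      MaxVF = VFs[I]; // VF=8 would need 18 > 16 registers; VF=4 fits
      break;
    }
  }
  assert(MaxVF == 4);
  return 0;
}
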
@@ -4913,28 +4299,6 @@ bool LoopVectorizationPlanner::isMoreProfitable(
unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
- if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
- // If the trip count is a known (possibly small) constant, the trip count
- // will be rounded up to an integer number of iterations under
- // FoldTailByMasking. The total cost in that case will be
- // VecCost*ceil(TripCount/VF). When not folding the tail, the total
- // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
- // some extra overheads, but for the purpose of comparing the costs of
- // different VFs we can use this to compare the total loop-body cost
- // expected after vectorization.
- auto GetCostForTC = [MaxTripCount, this](unsigned VF,
- InstructionCost VectorCost,
- InstructionCost ScalarCost) {
- return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
- : VectorCost * (MaxTripCount / VF) +
- ScalarCost * (MaxTripCount % VF);
- };
- auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
- auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
-
- return RTCostA < RTCostB;
- }
-
// Improve estimate for the vector width if it is scalable.
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
@@ -4948,13 +4312,39 @@ bool LoopVectorizationPlanner::isMoreProfitable(
// Assume vscale may be larger than 1 (or the value being tuned for),
// so that scalable vectorization is slightly favorable over fixed-width
// vectorization.
- if (A.Width.isScalable() && !B.Width.isScalable())
- return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
+ bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
+ A.Width.isScalable() && !B.Width.isScalable();
+
+ auto CmpFn = [PreferScalable](const InstructionCost &LHS,
+ const InstructionCost &RHS) {
+ return PreferScalable ? LHS <= RHS : LHS < RHS;
+ };
// To avoid the need for FP division:
- // (CostA / A.Width) < (CostB / B.Width)
- // <=> (CostA * B.Width) < (CostB * A.Width)
- return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
+ // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
+ // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
+ if (!MaxTripCount)
+ return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
+
+ auto GetCostForTC = [MaxTripCount, this](unsigned VF,
+ InstructionCost VectorCost,
+ InstructionCost ScalarCost) {
+ // If the trip count is a known (possibly small) constant, the trip count
+ // will be rounded up to an integer number of iterations under
+ // FoldTailByMasking. The total cost in that case will be
+ // VecCost*ceil(TripCount/VF). When not folding the tail, the total
+ // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
+ // some extra overheads, but for the purpose of comparing the costs of
+ // different VFs we can use this to compare the total loop-body cost
+ // expected after vectorization.
+ if (CM.foldTailByMasking())
+ return VectorCost * divideCeil(MaxTripCount, VF);
+ return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
+ };
+
+ auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
+ auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
+ return CmpFn(RTCostA, RTCostB);
}
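
A rough illustration of the two comparisons above, with made-up costs and widths and plain integers in place of InstructionCost/ElementCount. Candidate A is assumed to be scalable with an estimated width of 8 (known min width 4, assumed vscale 2) and B fixed with width 8.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t CostA = 10, WidthA = 8; // scalable candidate, estimated width
  uint64_t CostB = 18, WidthB = 8; // fixed-width candidate

  // No known max trip count: compare CostA/WidthA vs CostB/WidthB without
  // FP division by cross-multiplying; with PreferScalable the comparison
  // uses <= so that a tie favors the scalable candidate.
  assert(CostA * WidthB <= CostB * WidthA); // 80 <= 144, A wins

  // Known max trip count of 37, tail not folded:
  // cost = VecCost * floor(TC/VF) + ScalarCost * (TC % VF).
  uint64_t TC = 37, ScalarCost = 3;
  uint64_t RTCostA = CostA * (TC / WidthA) + ScalarCost * (TC % WidthA);
  uint64_t RTCostB = CostB * (TC / WidthB) + ScalarCost * (TC % WidthB);
  assert(RTCostA == 55 && RTCostB == 87); // A still wins
  return 0;
}
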
static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
@@ -4977,8 +4367,10 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
if (Numbering[A.first] != Numbering[B.first])
return Numbering[A.first] < Numbering[B.first];
- ElementCountComparator ECC;
- return ECC(A.second, B.second);
+ const auto &LHS = A.second;
+ const auto &RHS = B.second;
+ return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
+ std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
});
// For a list of ordered instruction-vf pairs:
@@ -5021,13 +4413,111 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
} while (!Tail.empty());
}
-VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
- const ElementCountSet &VFCandidates) {
- InstructionCost ExpectedCost =
- CM.expectedCost(ElementCount::getFixed(1)).first;
+/// Check if any recipe of \p Plan will generate a vector value, which will be
+/// assigned a vector register.
+static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
+ const TargetTransformInfo &TTI) {
+ assert(VF.isVector() && "Checking a scalar VF?");
+ VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
+ Plan.getCanonicalIV()->getScalarType()->getContext());
+ DenseSet<VPRecipeBase *> EphemeralRecipes;
+ collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
+ // Set of already visited types.
+ DenseSet<Type *> Visited;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+ for (VPRecipeBase &R : *VPBB) {
+ if (EphemeralRecipes.contains(&R))
+ continue;
+ // Continue early if the recipe is considered to not produce a vector
+ // result. Note that this includes VPInstruction where some opcodes may
+ // produce a vector, to preserve existing behavior as VPInstructions model
+ // aspects not directly mapped to existing IR instructions.
+ switch (R.getVPDefID()) {
+ case VPDef::VPDerivedIVSC:
+ case VPDef::VPScalarIVStepsSC:
+ case VPDef::VPScalarCastSC:
+ case VPDef::VPReplicateSC:
+ case VPDef::VPInstructionSC:
+ case VPDef::VPCanonicalIVPHISC:
+ case VPDef::VPVectorPointerSC:
+ case VPDef::VPExpandSCEVSC:
+ case VPDef::VPEVLBasedIVPHISC:
+ case VPDef::VPPredInstPHISC:
+ case VPDef::VPBranchOnMaskSC:
+ continue;
+ case VPDef::VPReductionSC:
+ case VPDef::VPActiveLaneMaskPHISC:
+ case VPDef::VPWidenCallSC:
+ case VPDef::VPWidenCanonicalIVSC:
+ case VPDef::VPWidenCastSC:
+ case VPDef::VPWidenGEPSC:
+ case VPDef::VPWidenSC:
+ case VPDef::VPWidenSelectSC:
+ case VPDef::VPBlendSC:
+ case VPDef::VPFirstOrderRecurrencePHISC:
+ case VPDef::VPWidenPHISC:
+ case VPDef::VPWidenIntOrFpInductionSC:
+ case VPDef::VPWidenPointerInductionSC:
+ case VPDef::VPReductionPHISC:
+ case VPDef::VPInterleaveSC:
+ case VPDef::VPWidenLoadEVLSC:
+ case VPDef::VPWidenLoadSC:
+ case VPDef::VPWidenStoreEVLSC:
+ case VPDef::VPWidenStoreSC:
+ break;
+ default:
+ llvm_unreachable("unhandled recipe");
+ }
+
+ auto WillWiden = [&TTI, VF](Type *ScalarTy) {
+ Type *VectorTy = ToVectorTy(ScalarTy, VF);
+ unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
+ if (!NumLegalParts)
+ return false;
+ if (VF.isScalable()) {
+ // <vscale x 1 x iN> is assumed to be profitable over iN because
+ // scalable registers are a distinct register class from scalar
+ // ones. If we ever find a target which wants to lower scalable
+ // vectors back to scalars, we'll need to update this code to
+ // explicitly ask TTI about the register class uses for each part.
+ return NumLegalParts <= VF.getKnownMinValue();
+ }
+ // For fixed VFs, the type is vectorized if two or more elements end up
+ // sharing a register, i.e. there are fewer legal parts than elements.
+ return NumLegalParts < VF.getKnownMinValue();
+ };
+
+ // If the recipe defines no values and is not a store (e.g., a branch),
+ // there is no value to check; continue.
+ if (R.getNumDefinedValues() == 0 &&
+ !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
+ &R))
+ continue;
+ // For multi-def recipes (currently only interleaved loads), it suffices
+ // to check the first def only. For stores, check the stored value; for
+ // interleaved stores, checking the first stored value suffices. In all
+ // cases this is the second operand.
+ VPValue *ToCheck =
+ R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
+ Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
+ if (!Visited.insert({ScalarTy}).second)
+ continue;
+ if (WillWiden(ScalarTy))
+ return true;
+ }
+ }
+
+ return false;
+}
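
A minimal sketch of the WillWiden test above, with the TTI.getNumberOfParts query replaced by hypothetical legalization results; only the decision logic is reproduced.

#include <cassert>

// NumLegalParts is stubbed with made-up legalization results; a real query
// would come from TargetTransformInfo::getNumberOfParts.
static bool willWiden(unsigned NumLegalParts, unsigned KnownMinVF,
                      bool Scalable) {
  if (!NumLegalParts)
    return false; // type cannot be legalized at all
  if (Scalable)
    // <vscale x 1 x iN> still occupies a scalable vector register class.
    return NumLegalParts <= KnownMinVF;
  // Fixed VF: widened only if two or more elements share a register.
  return NumLegalParts < KnownMinVF;
}

int main() {
  assert(willWiden(/*Parts=*/2, /*VF=*/8, /*Scalable=*/false));  // 8 elts in 2 regs
  assert(!willWiden(/*Parts=*/4, /*VF=*/4, /*Scalable=*/false)); // one elt per part
  assert(willWiden(/*Parts=*/1, /*VF=*/1, /*Scalable=*/true));   // <vscale x 1 x iN>
  return 0;
}
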
+
+VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
+ InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
- assert(VFCandidates.count(ElementCount::getFixed(1)) &&
+ assert(any_of(VPlans,
+ [](std::unique_ptr<VPlan> &P) {
+ return P->hasVF(ElementCount::getFixed(1));
+ }) &&
"Expected Scalar VF to be a candidate");
const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
@@ -5035,7 +4525,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
VectorizationFactor ChosenFactor = ScalarCost;
bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
- if (ForceVectorization && VFCandidates.size() > 1) {
+ if (ForceVectorization &&
+ (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
// Ignore scalar width, because the user explicitly wants vectorization.
// Initialize cost to max so that VF = 2 is, at least, chosen during cost
// evaluation.
@@ -5043,43 +4534,45 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
}
SmallVector<InstructionVFPair> InvalidCosts;
- for (const auto &i : VFCandidates) {
- // The cost for scalar VF=1 is already calculated, so ignore it.
- if (i.isScalar())
- continue;
+ for (auto &P : VPlans) {
+ for (ElementCount VF : P->vectorFactors()) {
+ // The cost for scalar VF=1 is already calculated, so ignore it.
+ if (VF.isScalar())
+ continue;
- LoopVectorizationCostModel::VectorizationCostTy C =
- CM.expectedCost(i, &InvalidCosts);
- VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
+ InstructionCost C = CM.expectedCost(VF, &InvalidCosts);
+ VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
#ifndef NDEBUG
- unsigned AssumedMinimumVscale =
- getVScaleForTuning(OrigLoop, TTI).value_or(1);
- unsigned Width =
- Candidate.Width.isScalable()
- ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
- : Candidate.Width.getFixedValue();
- LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
- << " costs: " << (Candidate.Cost / Width));
- if (i.isScalable())
- LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
- << AssumedMinimumVscale << ")");
- LLVM_DEBUG(dbgs() << ".\n");
+ unsigned AssumedMinimumVscale =
+ getVScaleForTuning(OrigLoop, TTI).value_or(1);
+ unsigned Width =
+ Candidate.Width.isScalable()
+ ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
+ : Candidate.Width.getFixedValue();
+ LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
+ << " costs: " << (Candidate.Cost / Width));
+ if (VF.isScalable())
+ LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
+ << AssumedMinimumVscale << ")");
+ LLVM_DEBUG(dbgs() << ".\n");
#endif
- if (!C.second && !ForceVectorization) {
- LLVM_DEBUG(
- dbgs() << "LV: Not considering vector loop of width " << i
- << " because it will not generate any vector instructions.\n");
- continue;
- }
+ if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Not considering vector loop of width " << VF
+ << " because it will not generate any vector instructions.\n");
+ continue;
+ }
- // If profitable add it to ProfitableVF list.
- if (isMoreProfitable(Candidate, ScalarCost))
- ProfitableVFs.push_back(Candidate);
+ // If profitable add it to ProfitableVF list.
+ if (isMoreProfitable(Candidate, ScalarCost))
+ ProfitableVFs.push_back(Candidate);
- if (isMoreProfitable(Candidate, ChosenFactor))
- ChosenFactor = Candidate;
+ if (isMoreProfitable(Candidate, ChosenFactor))
+ ChosenFactor = Candidate;
+ }
}
emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
@@ -5258,7 +4751,7 @@ std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
unsigned MinWidth = -1U;
unsigned MaxWidth = 8;
- const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+ const DataLayout &DL = TheFunction->getDataLayout();
// For in-loop reductions, no element types are added to ElementTypesInLoop
// if there are no loads/stores in the loop. In this case, check through the
// reduction variables to determine the maximum width.
@@ -5349,25 +4842,24 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
if (!isScalarEpilogueAllowed())
return 1;
+ // Do not interleave if EVL is preferred and no User IC is specified.
+ if (foldTailWithEVL()) {
+ LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
+ "Unroll factor forced to be 1.\n");
+ return 1;
+ }
+
// We used the distance for the interleave count.
if (!Legal->isSafeForAnyVectorWidth())
return 1;
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
const bool HasReductions = !Legal->getReductionVars().empty();
- // Do not interleave loops with a relatively small known or estimated trip
- // count. But we will interleave when InterleaveSmallLoopScalarReduction is
- // enabled, and the code has scalar reductions(HasReductions && VF = 1),
- // because with the above conditions interleaving can expose ILP and break
- // cross iteration dependences for reductions.
- if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
- !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
- return 1;
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
- LoopCost = expectedCost(VF).first;
+ LoopCost = expectedCost(VF);
assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
// Loop body is free and there is no need for interleaving.
@@ -5443,7 +4935,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- if (KnownTC) {
+ if (KnownTC > 0) {
+ // If a scalar epilogue is required, at least one iteration must remain
+ // scalar, so the maximum number of iterations available for interleaving
+ // is one less.
+ unsigned AvailableTC =
+ requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
+
// If trip count is known we select between two prospective ICs, where
// 1) the aggressive IC is capped by the trip count divided by VF
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
@@ -5453,27 +4950,35 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// we run the vector loop at least twice.
unsigned InterleaveCountUB = bit_floor(
- std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
+ std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
unsigned InterleaveCountLB = bit_floor(std::max(
- 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+ 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
MaxInterleaveCount = InterleaveCountLB;
if (InterleaveCountUB != InterleaveCountLB) {
- unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
- unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
+ unsigned TailTripCountUB =
+ (AvailableTC % (EstimatedVF * InterleaveCountUB));
+ unsigned TailTripCountLB =
+ (AvailableTC % (EstimatedVF * InterleaveCountLB));
// If both produce same scalar tail, maximize the IC to do the same work
// in fewer vector loop iterations
if (TailTripCountUB == TailTripCountLB)
MaxInterleaveCount = InterleaveCountUB;
}
- } else if (BestKnownTC) {
+ } else if (BestKnownTC && *BestKnownTC > 0) {
+ // If a scalar epilogue is required, at least one iteration must remain
+ // scalar, so the maximum number of iterations available for interleaving
+ // is one less.
+ unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
+ ? (*BestKnownTC) - 1
+ : *BestKnownTC;
+
// If trip count is an estimated compile time constant, limit the
// IC to be capped by the trip count divided by VF * 2, such that the vector
// loop runs at least twice to make interleaving seem profitable when there
// is an epilogue loop present. Since exact Trip count is not known we
// choose to be conservative in our IC estimate.
MaxInterleaveCount = bit_floor(std::max(
- 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+ 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
}
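
A small worked example of the clamping above, using a hypothetical trip count and VF (requires C++20 for std::bit_floor).

#include <algorithm>
#include <bit>
#include <cassert>

int main() {
  // Hypothetical loop: known trip count 67, estimated VF 8, scalar epilogue
  // required, target max interleave count of 8.
  unsigned KnownTC = 67, EstimatedVF = 8, MaxIC = 8;
  unsigned AvailableTC = KnownTC - 1; // one iteration reserved for the epilogue

  unsigned ICUB = std::bit_floor(
      std::max(1u, std::min(AvailableTC / EstimatedVF, MaxIC)));
  unsigned ICLB = std::bit_floor(
      std::max(1u, std::min(AvailableTC / (EstimatedVF * 2), MaxIC)));
  assert(ICUB == 8 && ICLB == 4); // 66/8 = 8 -> 8, 66/16 = 4 -> 4

  // Both choices leave the same scalar tail, so the larger IC is taken.
  unsigned TailUB = AvailableTC % (EstimatedVF * ICUB); // 66 % 64 = 2
  unsigned TailLB = AvailableTC % (EstimatedVF * ICLB); // 66 % 32 = 2
  unsigned MaxInterleaveCount = (TailUB == TailLB) ? ICUB : ICLB;
  assert(MaxInterleaveCount == 8);
  return 0;
}
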
assert(MaxInterleaveCount > 0 &&
@@ -5577,8 +5082,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// If there are scalar reductions and TTI has enabled aggressive
// interleaving for reductions, we will interleave to expose ILP.
- if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
- AggressivelyInterleaveReductions) {
+ if (VF.isScalar() && AggressivelyInterleaveReductions) {
LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
// Interleave no less than SmallIC but not as aggressive as the normal IC
// to satisfy the rare situation when resources are too limited.
@@ -5845,15 +5349,21 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
for (Instruction &I : *BB)
if (isScalarWithPredication(&I, VF)) {
ScalarCostsTy ScalarCosts;
- // Do not apply discount if scalable, because that would lead to
- // invalid scalarization costs.
- // Do not apply discount logic if hacked cost is needed
- // for emulated masked memrefs.
- if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
+ // Do not apply discount logic for:
+ // 1. Scalars after vectorization, as there will only be a single copy
+ // of the instruction.
+ // 2. Scalable VF, as that would lead to invalid scalarization costs.
+ // 3. Emulated masked memrefs, if a hacked cost is needed.
+ if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
+ !useEmulatedMaskMemRefHack(&I, VF) &&
computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
// Remember that BB will remain after vectorization.
PredicatedBBsAfterVectorization[VF].insert(BB);
+ for (auto *Pred : predecessors(BB)) {
+ if (Pred->getSingleSuccessor() == BB)
+ PredicatedBBsAfterVectorization[VF].insert(Pred);
+ }
}
}
}
@@ -5920,15 +5430,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
// Compute the cost of the vector instruction. Note that this cost already
// includes the scalarization overhead of the predicated instruction.
- InstructionCost VectorCost = getInstructionCost(I, VF).first;
+ InstructionCost VectorCost = getInstructionCost(I, VF);
// Compute the cost of the scalarized instruction. This cost is the cost of
// the instruction as if it wasn't if-converted and instead remained in the
// predicated block. We will scale this cost by block probability after
// computing the scalarization overhead.
InstructionCost ScalarCost =
- VF.getFixedValue() *
- getInstructionCost(I, ElementCount::getFixed(1)).first;
+ VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
@@ -5972,14 +5481,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
return Discount;
}
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::expectedCost(
+InstructionCost LoopVectorizationCostModel::expectedCost(
ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
- VectorizationCostTy Cost;
+ InstructionCost Cost;
// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
- VectorizationCostTy BlockCost;
+ InstructionCost BlockCost;
// For each instruction in the old loop.
for (Instruction &I : BB->instructionsWithoutDebug()) {
@@ -5988,22 +5496,19 @@ LoopVectorizationCostModel::expectedCost(
(VF.isVector() && VecValuesToIgnore.count(&I)))
continue;
- VectorizationCostTy C = getInstructionCost(&I, VF);
+ InstructionCost C = getInstructionCost(&I, VF);
// Check if we should override the cost.
- if (C.first.isValid() &&
- ForceTargetInstructionCost.getNumOccurrences() > 0)
- C.first = InstructionCost(ForceTargetInstructionCost);
+ if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
+ C = InstructionCost(ForceTargetInstructionCost);
// Keep a list of instructions with invalid costs.
- if (Invalid && !C.first.isValid())
+ if (Invalid && !C.isValid())
Invalid->emplace_back(&I, VF);
- BlockCost.first += C.first;
- BlockCost.second |= C.second;
- LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
- << " for VF " << VF << " For instruction: " << I
- << '\n');
+ BlockCost += C;
+ LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
+ << VF << " For instruction: " << I << '\n');
}
// If we are vectorizing a predicated block, it will have been
@@ -6014,10 +5519,9 @@ LoopVectorizationCostModel::expectedCost(
// cost by the probability of executing it. blockNeedsPredication from
// Legal is used so as to not include all blocks in tail folded loops.
if (VF.isScalar() && Legal->blockNeedsPredication(BB))
- BlockCost.first /= getReciprocalPredBlockProb();
+ BlockCost /= getReciprocalPredBlockProb();
- Cost.first += BlockCost.first;
- Cost.second |= BlockCost.second;
+ Cost += BlockCost;
}
return Cost;
@@ -6273,12 +5777,20 @@ LoopVectorizationCostModel::getReductionPatternCost(
const RecurrenceDescriptor &RdxDesc =
Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
- InstructionCost BaseCost = TTI.getArithmeticReductionCost(
- RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+ InstructionCost BaseCost;
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
+ Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
+ BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
+ RdxDesc.getFastMathFlags(), CostKind);
+ } else {
+ BaseCost = TTI.getArithmeticReductionCost(
+ RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+ }
// For a call to the llvm.fmuladd intrinsic we need to add the cost of a
// normal fmul instruction to the cost of the fadd reduction.
- if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
+ if (RK == RecurKind::FMulAdd)
BaseCost +=
TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
@@ -6416,49 +5928,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
return getWideningCost(I, VF);
}
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::getInstructionCost(Instruction *I,
- ElementCount VF) {
- // If we know that this instruction will remain uniform, check the cost of
- // the scalar version.
- if (isUniformAfterVectorization(I, VF))
- VF = ElementCount::getFixed(1);
-
- if (VF.isVector() && isProfitableToScalarize(I, VF))
- return VectorizationCostTy(InstsToScalarize[VF][I], false);
-
- // Forced scalars do not have any scalarization overhead.
- auto ForcedScalar = ForcedScalars.find(VF);
- if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
- auto InstSet = ForcedScalar->second;
- if (InstSet.count(I))
- return VectorizationCostTy(
- (getInstructionCost(I, ElementCount::getFixed(1)).first *
- VF.getKnownMinValue()),
- false);
- }
-
- Type *VectorTy;
- InstructionCost C = getInstructionCost(I, VF, VectorTy);
-
- bool TypeNotScalarized = false;
- if (VF.isVector() && VectorTy->isVectorTy()) {
- if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
- if (VF.isScalable())
- // <vscale x 1 x iN> is assumed to be profitable over iN because
- // scalable registers are a distinct register class from scalar ones.
- // If we ever find a target which wants to lower scalable vectors
- // back to scalars, we'll need to update this code to explicitly
- // ask TTI about the register class uses for each part.
- TypeNotScalarized = NumParts <= VF.getKnownMinValue();
- else
- TypeNotScalarized = NumParts < VF.getKnownMinValue();
- } else
- C = InstructionCost::getInvalid();
- }
- return VectorizationCostTy(C, TypeNotScalarized);
-}
-
InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
@@ -6849,8 +6318,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
}
InstructionCost
-LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
- Type *&VectorTy) {
+LoopVectorizationCostModel::getInstructionCost(Instruction *I,
+ ElementCount VF) {
+ // If we know that this instruction will remain uniform, check the cost of
+ // the scalar version.
+ if (isUniformAfterVectorization(I, VF))
+ VF = ElementCount::getFixed(1);
+
+ if (VF.isVector() && isProfitableToScalarize(I, VF))
+ return InstsToScalarize[VF][I];
+
+ // Forced scalars do not have any scalarization overhead.
+ auto ForcedScalar = ForcedScalars.find(VF);
+ if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
+ auto InstSet = ForcedScalar->second;
+ if (InstSet.count(I))
+ return getInstructionCost(I, ElementCount::getFixed(1)) *
+ VF.getKnownMinValue();
+ }
+
Type *RetTy = I->getType();
if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -6873,6 +6359,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
};
(void) hasSingleCopyAfterVectorization;
+ Type *VectorTy;
if (isScalarAfterVectorization(I, VF)) {
// With the exception of GEPs and PHIs, after scalarization there should
// only be one copy of the instruction generated in the loop. This is
@@ -6888,6 +6375,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
} else
VectorTy = ToVectorTy(RetTy, VF);
+ if (VF.isVector() && VectorTy->isVectorTy() &&
+ !TTI.getNumberOfParts(VectorTy))
+ return InstructionCost::getInvalid();
+
// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
@@ -6900,11 +6391,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// In cases of scalarized and predicated instructions, there will be VF
// predicated blocks in the vectorized loop. Each branch around these
// blocks requires also an extract of its vector compare i1 element.
+ // Note that the conditional branch from the loop latch will be replaced by
+ // a single branch controlling the loop, so there is no extra overhead from
+ // scalarization.
bool ScalarPredicatedBB = false;
BranchInst *BI = cast<BranchInst>(I);
if (VF.isVector() && BI->isConditional() &&
(PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
- PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
+ PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
+ BI->getParent() != TheLoop->getLoopLatch())
ScalarPredicatedBB = true;
if (ScalarPredicatedBB) {
@@ -6934,6 +6429,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// First-order recurrences are replaced by vector shuffles inside the loop.
if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
+ // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
+ // penultimate value of the recurrence.
+ // TODO: Consider vscale_range info.
+ if (VF.isScalable() && VF.getKnownMinValue() == 1)
+ return InstructionCost::getInvalid();
SmallVector<int> Mask(VF.getKnownMinValue());
std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
@@ -6999,25 +6499,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
SmallVector<const Value *, 4> Operands(I->operand_values());
- auto InstrCost = TTI.getArithmeticInstrCost(
+ return TTI.getArithmeticInstrCost(
I->getOpcode(), VectorTy, CostKind,
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
- Op2Info, Operands, I);
-
- // Some targets can replace frem with vector library calls.
- InstructionCost VecCallCost = InstructionCost::getInvalid();
- if (I->getOpcode() == Instruction::FRem) {
- LibFunc Func;
- if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func) &&
- TLI->isFunctionVectorizable(TLI->getName(Func), VF)) {
- SmallVector<Type *, 4> OpTypes;
- for (auto &Op : I->operands())
- OpTypes.push_back(Op->getType());
- VecCallCost =
- TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
- }
- }
- return std::min(InstrCost, VecCallCost);
+ Op2Info, Operands, I, TLI);
}
case Instruction::FNeg: {
return TTI.getArithmeticInstrCost(
@@ -7157,25 +6642,20 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return *RedCost;
Type *SrcScalarTy = I->getOperand(0)->getType();
+ Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
+ if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
+ SrcScalarTy =
+ IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
Type *SrcVecTy =
VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
+
if (canTruncateToMinimalBitwidth(I, VF)) {
- // This cast is going to be shrunk. This may remove the cast or it might
- // turn it into slightly different cast. For example, if MinBW == 16,
- // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
- //
- // Calculate the modified src and dest types.
- Type *MinVecTy = VectorTy;
- if (Opcode == Instruction::Trunc) {
- SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
- VectorTy =
- largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
- } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
- // Leave SrcVecTy unchanged - we only shrink the destination element
- // type.
- VectorTy =
- smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
- }
+ // If the result type is <= the source type, there will be no extend
+ // after truncating the users to the minimal required bitwidth.
+ if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
+ (I->getOpcode() == Instruction::ZExt ||
+ I->getOpcode() == Instruction::SExt))
+ return 0;
}
return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
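
A toy model of the new rule above: once the destination has been narrowed to its minimal bitwidth and is no wider than the (possibly narrowed) source, a zext/sext costs nothing. The castCost helper below is a stand-in for TTI.getCastInstrCost, and the bit widths are chosen purely for illustration.

#include <cassert>

static unsigned castCost(unsigned DstBits, unsigned SrcBits, bool IsExt,
                         bool CanTruncateDst) {
  if (CanTruncateDst && IsExt && DstBits <= SrcBits)
    return 0; // the extend folds away after narrowing the users
  return 1;   // stand-in for a real TTI cast cost
}

int main() {
  // "zext i8 %x to i32" whose users only need 8 bits (MinBW == 8): the
  // destination shrinks to i8 and no extend remains.
  assert(castCost(/*DstBits=*/8, /*SrcBits=*/8, /*IsExt=*/true,
                  /*CanTruncateDst=*/true) == 0);
  // With MinBW == 16 a real "zext i8 to i16" remains and still costs.
  assert(castCost(/*DstBits=*/16, /*SrcBits=*/8, /*IsExt=*/true,
                  /*CanTruncateDst=*/true) == 1);
  return 0;
}
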
@@ -7200,16 +6680,43 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
- // Find all stores to invariant variables. Since they are going to sink
- // outside the loop we do not need calculate cost for them.
+ SmallVector<Value *, 4> DeadInterleavePointerOps;
for (BasicBlock *BB : TheLoop->blocks())
for (Instruction &I : *BB) {
+ // Find all stores to invariant variables. Since they are going to sink
+ // outside the loop, we do not need to calculate their cost.
StoreInst *SI;
if ((SI = dyn_cast<StoreInst>(&I)) &&
Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
ValuesToIgnore.insert(&I);
+
+ // For interleave groups, we only create a pointer for the start of the
+ // interleave group. Queue up addresses of group members except the insert
+ // position for further processing.
+ if (isAccessInterleaved(&I)) {
+ auto *Group = getInterleavedAccessGroup(&I);
+ if (Group->getInsertPos() == &I)
+ continue;
+ Value *PointerOp = getLoadStorePointerOperand(&I);
+ DeadInterleavePointerOps.push_back(PointerOp);
+ }
}
+ // Mark ops feeding interleave group members as free, if they are only used
+ // by other dead computations.
+ for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
+ auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
+ if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
+ Instruction *UI = cast<Instruction>(U);
+ return !VecValuesToIgnore.contains(U) &&
+ (!isAccessInterleaved(UI) ||
+ getInterleavedAccessGroup(UI)->getInsertPos() == UI);
+ }))
+ continue;
+ VecValuesToIgnore.insert(Op);
+ DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
+ }
+
// Ignore type-promoting instructions we identified during reduction
// detection.
for (const auto &Reduction : Legal->getReductionVars()) {
@@ -7370,6 +6877,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.invalidateCostModelingDecisions();
}
+ if (CM.foldTailByMasking())
+ Legal->prepareToFoldTailByMasking();
+
ElementCount MaxUserVF =
UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
@@ -7395,14 +6905,14 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
"InvalidCost", ORE, OrigLoop);
}
- // Populate the set of Vectorization Factor Candidates.
- ElementCountSet VFCandidates;
+ // Collect the Vectorization Factor Candidates.
+ SmallVector<ElementCount> VFCandidates;
for (auto VF = ElementCount::getFixed(1);
ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
- VFCandidates.insert(VF);
+ VFCandidates.push_back(VF);
for (auto VF = ElementCount::getScalable(1);
ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
- VFCandidates.insert(VF);
+ VFCandidates.push_back(VF);
CM.collectInLoopReductions();
for (const auto &VF : VFCandidates) {
@@ -7419,11 +6929,17 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
LLVM_DEBUG(printPlans(dbgs()));
- if (!MaxFactors.hasVector())
+ if (VPlans.empty())
+ return std::nullopt;
+ if (all_of(VPlans,
+ [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }))
return VectorizationFactor::Disabled();
- // Select the optimal vectorization factor.
- VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
+ // Select the optimal vectorization factor according to the legacy cost-model.
+ // This is now only used to verify the decisions by the new VPlan-based
+ // cost-model and will be retired once the VPlan-based cost-model is
+ // stabilized.
+ VectorizationFactor VF = selectVectorizationFactor();
assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
if (!hasPlanWithVF(VF.Width)) {
LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
@@ -7433,6 +6949,212 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
return VF;
}
+InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
+ ElementCount VF) const {
+ return CM.getInstructionCost(UI, VF);
+}
+
+bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
+ return CM.ValuesToIgnore.contains(UI) ||
+ (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
+ SkipCostComputation.contains(UI);
+}
+
+InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
+ ElementCount VF) const {
+ InstructionCost Cost = 0;
+ LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
+ VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
+
+ // Cost modeling for inductions is inaccurate in the legacy cost model
+ // compared to the recipes that are generated. To match the legacy model
+ // initially, during VPlan cost-model bring-up, directly use the induction
+ // costs from the legacy
+ // cost model. Note that we do this as pre-processing; the VPlan may not have
+ // any recipes associated with the original induction increment instruction
+ // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
+ // the cost of induction phis and increments (both that are represented by
+ // recipes and those that are not), to avoid distinguishing between them here,
+ // and skip all recipes that represent induction phis and increments (the
+ // former case) later on, if they exist, to avoid counting them twice.
+ // Similarly we pre-compute the cost of any optimized truncates.
+ // TODO: Switch to more accurate costing based on VPlan.
+ for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
+ Instruction *IVInc = cast<Instruction>(
+ IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+ SmallVector<Instruction *> IVInsts = {IV, IVInc};
+ for (User *U : IV->users()) {
+ auto *CI = cast<Instruction>(U);
+ if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
+ continue;
+ IVInsts.push_back(CI);
+ }
+ for (Instruction *IVInst : IVInsts) {
+ if (!CostCtx.SkipCostComputation.insert(IVInst).second)
+ continue;
+ InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
+ LLVM_DEBUG({
+ dbgs() << "Cost of " << InductionCost << " for VF " << VF
+ << ": induction instruction " << *IVInst << "\n";
+ });
+ Cost += InductionCost;
+ }
+ }
+
+ // Compute the cost of all exiting conditions of the loop using the legacy
+ // cost model. This is to match the legacy behavior, which adds the cost of
+ // all exit conditions. Note that this over-estimates the cost, as there will
+ // be a single condition to control the vector loop.
+ SmallVector<BasicBlock *> Exiting;
+ CM.TheLoop->getExitingBlocks(Exiting);
+ SetVector<Instruction *> ExitInstrs;
+ // Collect all exit conditions.
+ for (BasicBlock *EB : Exiting) {
+ auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
+ if (!Term)
+ continue;
+ if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
+ ExitInstrs.insert(CondI);
+ }
+ }
+ // Compute the cost of all instructions only feeding the exit conditions.
+ for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
+ Instruction *CondI = ExitInstrs[I];
+ if (!OrigLoop->contains(CondI) ||
+ !CostCtx.SkipCostComputation.insert(CondI).second)
+ continue;
+ Cost += CostCtx.getLegacyCost(CondI, VF);
+ for (Value *Op : CondI->operands()) {
+ auto *OpI = dyn_cast<Instruction>(Op);
+ if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
+ return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
+ !ExitInstrs.contains(cast<Instruction>(U));
+ }))
+ continue;
+ ExitInstrs.insert(OpI);
+ }
+ }
+
+ // The legacy cost model has special logic to compute the cost of in-loop
+ // reductions, which may be smaller than the sum of all instructions involved
+ // in the reduction. For AnyOf reductions, VPlan codegen may remove the select
+ // which the legacy cost model uses to assign cost. Pre-compute their costs
+ // for now.
+ // TODO: Switch to costing based on VPlan once the logic has been ported.
+ for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
+ if (!CM.isInLoopReduction(RedPhi) &&
+ !RecurrenceDescriptor::isAnyOfRecurrenceKind(
+ RdxDesc.getRecurrenceKind()))
+ continue;
+
+ // AnyOf reduction codegen may remove the select. To match the legacy cost
+ // model, pre-compute the cost for AnyOf reductions here.
+ if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ auto *Select = cast<SelectInst>(*find_if(
+ RedPhi->users(), [](User *U) { return isa<SelectInst>(U); }));
+ assert(!CostCtx.SkipCostComputation.contains(Select) &&
+ "reduction op visited multiple times");
+ CostCtx.SkipCostComputation.insert(Select);
+ auto ReductionCost = CostCtx.getLegacyCost(Select, VF);
+ LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
+ << ":\n any-of reduction " << *Select << "\n");
+ Cost += ReductionCost;
+ continue;
+ }
+
+ const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
+ SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
+ ChainOps.end());
+ // Also include the operands of instructions in the chain, as the cost-model
+ // may mark extends as free.
+ for (auto *ChainOp : ChainOps) {
+ for (Value *Op : ChainOp->operands()) {
+ if (auto *I = dyn_cast<Instruction>(Op))
+ ChainOpsAndOperands.insert(I);
+ }
+ }
+
+ // Pre-compute the cost for I, if it has a reduction pattern cost.
+ for (Instruction *I : ChainOpsAndOperands) {
+ auto ReductionCost = CM.getReductionPatternCost(
+ I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
+ if (!ReductionCost)
+ continue;
+
+ assert(!CostCtx.SkipCostComputation.contains(I) &&
+ "reduction op visited multiple times");
+ CostCtx.SkipCostComputation.insert(I);
+ LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
+ << ":\n in-loop reduction " << *I << "\n");
+ Cost += *ReductionCost;
+ }
+ }
+
+ // Pre-compute the costs for branches except for the backedge, as the number
+ // of replicate regions in a VPlan may not directly match the number of
+ // branches, which would lead to different decisions.
+ // TODO: Compute cost of branches for each replicate region in the VPlan,
+ // which is more accurate than the legacy cost model.
+ for (BasicBlock *BB : OrigLoop->blocks()) {
+ if (BB == OrigLoop->getLoopLatch())
+ continue;
+ CostCtx.SkipCostComputation.insert(BB->getTerminator());
+ auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
+ Cost += BranchCost;
+ }
+ // Now compute and add the VPlan-based cost.
+ Cost += Plan.cost(VF, CostCtx);
+ LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
+ return Cost;
+}
+
+VPlan &LoopVectorizationPlanner::getBestPlan() const {
+ // If there is a single VPlan with a single VF, return it directly.
+ VPlan &FirstPlan = *VPlans[0];
+ if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
+ return FirstPlan;
+
+ VPlan *BestPlan = &FirstPlan;
+ ElementCount ScalarVF = ElementCount::getFixed(1);
+ assert(hasPlanWithVF(ScalarVF) &&
+ "More than a single plan/VF w/o any plan having scalar VF");
+
+ // TODO: Compute scalar cost using VPlan-based cost model.
+ InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
+ VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
+
+ bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+ if (ForceVectorization) {
+ // Ignore scalar width, because the user explicitly wants vectorization.
+ // Initialize cost to max so that VF = 2 is, at least, chosen during cost
+ // evaluation.
+ BestFactor.Cost = InstructionCost::getMax();
+ }
+
+ for (auto &P : VPlans) {
+ for (ElementCount VF : P->vectorFactors()) {
+ if (VF.isScalar())
+ continue;
+ if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Not considering vector loop of width " << VF
+ << " because it will not generate any vector instructions.\n");
+ continue;
+ }
+
+ InstructionCost Cost = cost(*P, VF);
+ VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
+ if (isMoreProfitable(CurrentFactor, BestFactor)) {
+ BestFactor = CurrentFactor;
+ BestPlan = &*P;
+ }
+ }
+ }
+ BestPlan->setVF(BestFactor.Width);
+ return *BestPlan;
+}
+
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
assert(count_if(VPlans,
[VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
@@ -7485,7 +7207,8 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
static void createAndCollectMergePhiForReduction(
VPInstruction *RedResult,
DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
- VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
+ VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
+ bool VectorizingEpilogue) {
if (!RedResult ||
RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
return;
@@ -7493,19 +7216,29 @@ static void createAndCollectMergePhiForReduction(
auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
- TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
Value *FinalValue =
State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
auto *ResumePhi =
dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
+ if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
+ assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
+ assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
+ ResumePhi = cast<PHINode>(Cmp->getOperand(0));
+ }
+ assert((!VectorizingEpilogue || ResumePhi) &&
+ "when vectorizing the epilogue loop, we need a resume phi from main "
+ "vector loop");
// TODO: bc.merge.rdx should not be created here, instead it should be
// modeled in VPlan.
BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
// Create a phi node that merges control-flow from the backedge-taken check
// block and the middle block.
- auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
- LoopScalarPreHeader->getTerminator());
+ auto *BCBlockPhi =
+ PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator()->getIterator());
// If we are fixing reductions in the epilogue loop then we should already
// have created a bc.merge.rdx Phi after the main vector body. Ensure that
@@ -7517,7 +7250,7 @@ static void createAndCollectMergePhiForReduction(
BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
Incoming);
else
- BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
+ BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
}
auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
@@ -7549,12 +7282,14 @@ LoopVectorizationPlanner::executePlan(
assert(
(IsEpilogueVectorization || !ExpandedSCEVs) &&
"expanded SCEVs to reuse can only be used during epilogue vectorization");
+ (void)IsEpilogueVectorization;
- LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
- << '\n');
+ VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
- if (!IsEpilogueVectorization)
- VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+ LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
+ << ", UF=" << BestUF << '\n');
+ BestVPlan.setName("Final VPlan");
+ LLVM_DEBUG(BestVPlan.dump());
// Perform the actual loop transformation.
VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
@@ -7579,6 +7314,9 @@ LoopVectorizationPlanner::executePlan(
std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
: State.ExpandedSCEVs);
+#ifdef EXPENSIVE_CHECKS
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+#endif
// Only use noalias metadata when using memory checks guaranteeing no overlap
// across all iterations.
@@ -7598,8 +7336,6 @@ LoopVectorizationPlanner::executePlan(
State.LVer->prepareNoAliasMetadata();
}
- ILV.collectPoisonGeneratingRecipes(State);
-
ILV.printDebugTracesAtStart();
//===------------------------------------------------===//
@@ -7622,9 +7358,9 @@ LoopVectorizationPlanner::executePlan(
auto *ExitVPBB =
cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
for (VPRecipeBase &R : *ExitVPBB) {
- createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
- ReductionResumeValues, State, OrigLoop,
- State.CFG.VPBB2IRBB[ExitVPBB]);
+ createAndCollectMergePhiForReduction(
+ dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop,
+ State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
}
// 2.6. Maintain Loop Hints
@@ -7661,6 +7397,18 @@ LoopVectorizationPlanner::executePlan(
ILV.printDebugTracesAtEnd();
+ // 4. Adjust branch weight of the branch in the middle block.
+ auto *MiddleTerm =
+ cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
+ if (MiddleTerm->isConditional() &&
+ hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+ // Assume that `Count % VectorTripCount` is equally distributed.
+ unsigned TripCount = State.UF * State.VF.getKnownMinValue();
+ assert(TripCount > 0 && "trip count should not be zero");
+ const uint32_t Weights[] = {1, TripCount - 1};
+ setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+ }
+
return {State.ExpandedSCEVs, ReductionResumeValues};
}
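
A tiny numeric illustration of the middle-block branch weights set above, with hypothetical VF and UF; the assumption that the first successor is the one skipping the scalar remainder is mine, not taken from the source.

#include <cassert>
#include <cstdint>

int main() {
  // Assume VF = 4 and UF = 2, so each vector-loop trip covers 8 iterations.
  unsigned VF = 4, UF = 2;
  unsigned TripCount = UF * VF;
  assert(TripCount > 0 && "trip count should not be zero");

  // If Count % VectorTripCount is equally distributed, roughly 1 in 8 runs
  // has no remainder and skips the scalar loop; the other 7 fall through.
  const uint32_t Weights[] = {1, TripCount - 1};
  assert(Weights[0] == 1 && Weights[1] == 7);
  return 0;
}
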
@@ -7717,7 +7465,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
// inductions in the epilogue loop are created before executing the plan for
// the epilogue loop.
- return {completeLoopSkeleton(), nullptr};
+ return {LoopVectorPreHeader, nullptr};
}
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -7772,14 +7520,8 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
DT->getNode(Bypass)->getIDom()) &&
"TC check is expected to dominate Bypass");
- // Update dominator for Bypass & LoopExit.
+ // Update dominator for Bypass.
DT->changeImmediateDominator(Bypass, TCCheckBlock);
- if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
- // For loops with multiple exits, there's no edge from the middle block
- // to exit blocks (as the epilogue must run) and thus no need to update
- // the immediate dominator of the exit blocks.
- DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
-
LoopBypassBlocks.push_back(TCCheckBlock);
// Save the trip count so we don't have to regenerate it in the
@@ -7791,7 +7533,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
- setBranchWeights(BI, MinItersBypassWeights);
+ setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
return TCCheckBlock;
@@ -7810,11 +7552,10 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
// Now, compare the remaining count and if there aren't enough iterations to
// execute the vectorized epilogue skip to the scalar part.
- BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
- VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
- LoopVectorPreHeader =
- SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
- LI, nullptr, "vec.epilog.ph");
+ LoopVectorPreHeader->setName("vec.epilog.ph");
+ BasicBlock *VecEpilogueIterationCountCheck =
+ SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
+ nullptr, "vec.epilog.iter.check", true);
emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
VecEpilogueIterationCountCheck);
@@ -7907,7 +7648,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
{VecEpilogueIterationCountCheck,
EPI.VectorTripCount} /* AdditionalBypass */);
- return {completeLoopSkeleton(), EPResumeVal};
+ return {LoopVectorPreHeader, EPResumeVal};
}
BasicBlock *
@@ -7949,10 +7690,9 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
const uint32_t Weights[] = {EstimatedSkipCount,
MainLoopStep - EstimatedSkipCount};
- setBranchWeights(BI, Weights);
+ setBranchWeights(BI, Weights, /*IsExpected=*/false);
}
ReplaceInstWithInst(Insert->getTerminator(), &BI);
-
LoopBypassBlocks.push_back(Insert);
return Insert;
}
@@ -8000,8 +7740,19 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
}
}
-VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
- VPlan &Plan) {
+iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
+VPRecipeBuilder::mapToVPValues(User::op_range Operands) {
+ std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
+ if (auto *I = dyn_cast<Instruction>(Op)) {
+ if (auto *R = Ingredient2Recipe.lookup(I))
+ return R->getVPSingleValue();
+ }
+ return Plan.getOrAddLiveIn(Op);
+ };
+ return map_range(Operands, Fn);
+}
+
+VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
// Look for cached value.
@@ -8025,27 +7776,34 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
if (OrigLoop->isLoopExiting(Src))
return EdgeMaskCache[Edge] = SrcMask;
- VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition());
+ VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition(), Plan);
assert(EdgeMask && "No Edge Mask found for condition");
if (BI->getSuccessor(0) != Dst)
EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
- // The condition is 'SrcMask && EdgeMask', which is equivalent to
- // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
- // The select version does not introduce new UB if SrcMask is false and
- // EdgeMask is poison. Using 'and' here introduces undefined behavior.
- VPValue *False = Plan.getVPValueOrAddLiveIn(
- ConstantInt::getFalse(BI->getCondition()->getType()));
- EdgeMask =
- Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
+ // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
+ // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
+ // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
+ EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
}
return EdgeMaskCache[Edge] = EdgeMask;
}
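A toy model (not LLVM code) of the masking rule described in the comment above, using std::optional<bool> as a stand-in for a possibly-poison i1: a bitwise 'and' inspects both operands, so a poison EdgeMask poisons the result even when SrcMask is false, while the select form ('select i1 SrcMask, i1 EdgeMask, i1 false') never looks at EdgeMask in that case.

#include <cstdio>
#include <optional>

using MaybeBool = std::optional<bool>;          // nullopt stands in for poison

MaybeBool bitwiseAnd(MaybeBool A, MaybeBool B) {
  if (!A || !B)
    return std::nullopt;                        // poison propagates unconditionally
  return *A && *B;
}

MaybeBool logicalAnd(MaybeBool Src, MaybeBool Edge) {
  if (Src && !*Src)
    return false;                               // SrcMask == false hides a poison EdgeMask
  if (!Src || !Edge)
    return std::nullopt;
  return *Src && *Edge;
}

int main() {
  MaybeBool R1 = bitwiseAnd(false, std::nullopt);   // poison
  MaybeBool R2 = logicalAnd(false, std::nullopt);   // false
  std::printf("and: %s, select: %s\n", R1 ? "defined" : "poison",
              R2 ? "defined" : "poison");
  return 0;
}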
-void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
+VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
+ assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
+
+ // Look for cached value.
+ std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
+ EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
+ assert(ECEntryIt != EdgeMaskCache.end() &&
+ "looking up mask for edge which has not been created");
+ return ECEntryIt->second;
+}
+
+void VPRecipeBuilder::createHeaderMask() {
BasicBlock *Header = OrigLoop->getHeader();
// When not folding the tail, use nullptr to model all-true mask.
@@ -8080,7 +7838,7 @@ VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
return BCEntryIt->second;
}
-void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
+void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
assert(OrigLoop->getHeader() != BB &&
@@ -8091,7 +7849,7 @@ void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
VPValue *BlockMask = nullptr;
// This is the block mask. We OR all incoming edges.
for (auto *Predecessor : predecessors(BB)) {
- VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
+ VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
BlockMaskCache[BB] = EdgeMask;
return;
@@ -8108,10 +7866,9 @@ void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
BlockMaskCache[BB] = BlockMask;
}
-VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
- ArrayRef<VPValue *> Operands,
- VFRange &Range,
- VPlanPtr &Plan) {
+VPWidenMemoryRecipe *
+VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
+ VFRange &Range) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
"Must be called with either a load or store");
@@ -8154,12 +7911,12 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
Ptr = VectorPtr;
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
- return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
- Reverse);
+ return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
+ I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
- return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
- Consecutive, Reverse);
+ return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
+ Reverse, I->getDebugLoc());
}
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
@@ -8167,8 +7924,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
static VPWidenIntOrFpInductionRecipe *
createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
VPValue *Start, const InductionDescriptor &IndDesc,
- VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
- VFRange &Range) {
+ VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
assert(IndDesc.getStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
@@ -8183,14 +7939,14 @@ createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
}
-VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
- PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
+VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
+ PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
- *PSE.getSE(), *OrigLoop, Range);
+ *PSE.getSE(), *OrigLoop);
// Check if this is pointer induction. If so, build the recipe for it.
if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
@@ -8208,7 +7964,7 @@ VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
}
VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
- TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
+ TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
// Optimize the special case where the source is a constant integer
// induction variable. Notice that we can only optimize the 'trunc' case
// because (a) FP conversions lose precision, (b) sext/zext may wrap, and
@@ -8228,62 +7984,46 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
auto *Phi = cast<PHINode>(I->getOperand(0));
const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
- VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
+ VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
- *OrigLoop, Range);
+ *OrigLoop);
}
return nullptr;
}
-VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
- ArrayRef<VPValue *> Operands,
- VPlanPtr &Plan) {
- // If all incoming values are equal, the incoming VPValue can be used directly
- // instead of creating a new VPBlendRecipe.
- if (llvm::all_equal(Operands))
- return Operands[0];
-
+VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
+ ArrayRef<VPValue *> Operands) {
unsigned NumIncoming = Phi->getNumIncomingValues();
- // For in-loop reductions, we do not need to create an additional select.
- VPValue *InLoopVal = nullptr;
- for (unsigned In = 0; In < NumIncoming; In++) {
- PHINode *PhiOp =
- dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
- if (PhiOp && CM.isInLoopReduction(PhiOp)) {
- assert(!InLoopVal && "Found more than one in-loop reduction!");
- InLoopVal = Operands[In];
- }
- }
-
- assert((!InLoopVal || NumIncoming == 2) &&
- "Found an in-loop reduction for PHI with unexpected number of "
- "incoming values");
- if (InLoopVal)
- return Operands[Operands[0] == InLoopVal ? 1 : 0];
// We know that all PHIs in non-header blocks are converted into selects, so
// we don't have to worry about the insertion order and we can just use the
// builder. At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.
+ // TODO: At the moment the first mask is always skipped, but it would be
+ // better to skip the most expensive mask.
SmallVector<VPValue *, 2> OperandsWithMask;
for (unsigned In = 0; In < NumIncoming; In++) {
- VPValue *EdgeMask =
- createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
- assert((EdgeMask || NumIncoming == 1) &&
- "Multiple predecessors with one having a full mask");
OperandsWithMask.push_back(Operands[In]);
- if (EdgeMask)
- OperandsWithMask.push_back(EdgeMask);
+ VPValue *EdgeMask =
+ getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
+ if (!EdgeMask) {
+ assert(In == 0 && "Both null and non-null edge masks found");
+ assert(all_equal(Operands) &&
+ "Distinct incoming values with one having a full mask");
+ break;
+ }
+ if (In == 0)
+ continue;
+ OperandsWithMask.push_back(EdgeMask);
}
- return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
+ return new VPBlendRecipe(Phi, OperandsWithMask);
}
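A scalar sketch of the select chain a blend with three incoming values lowers to, under the operand layout built above ({V0, V1, Mask1, V2, Mask2}, the first edge mask being skipped); the function and names are illustrative only.

// Later incoming values win when their edge mask is set; V0 acts as the
// fall-through, which is why its mask can be omitted.
int blendOfThree(int V0, int V1, bool Mask1, int V2, bool Mask2) {
  int Result = V0;
  Result = Mask1 ? V1 : Result;
  Result = Mask2 ? V2 : Result;
  return Result;
}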
VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ArrayRef<VPValue *> Operands,
- VFRange &Range,
- VPlanPtr &Plan) {
+ VFRange &Range) {
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[this, CI](ElementCount VF) {
return CM.isScalarWithPredication(CI, VF);
@@ -8301,6 +8041,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
return nullptr;
SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
+ Ops.push_back(Operands.back());
// Is it beneficial to perform intrinsic call compared to lib call?
bool ShouldUseVectorIntrinsic =
@@ -8311,7 +8052,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
},
Range);
if (ShouldUseVectorIntrinsic)
- return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
+ return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), ID,
CI->getDebugLoc());
Function *Variant = nullptr;
@@ -8358,13 +8099,13 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
if (Legal->isMaskRequired(CI))
Mask = getBlockInMask(CI->getParent());
else
- Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
+ Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue(
IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
Ops.insert(Ops.begin() + *MaskPos, Mask);
}
- return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
+ return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()),
Intrinsic::not_intrinsic, CI->getDebugLoc(),
Variant);
}
@@ -8386,9 +8127,9 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
Range);
}
-VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
- ArrayRef<VPValue *> Operands,
- VPBasicBlock *VPBB, VPlanPtr &Plan) {
+VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
+ ArrayRef<VPValue *> Operands,
+ VPBasicBlock *VPBB) {
switch (I->getOpcode()) {
default:
return nullptr;
@@ -8401,12 +8142,9 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
if (CM.isPredicatedInst(I)) {
SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
VPValue *Mask = getBlockInMask(I->getParent());
- VPValue *One = Plan->getVPValueOrAddLiveIn(
- ConstantInt::get(I->getType(), 1u, false));
- auto *SafeRHS =
- new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
- I->getDebugLoc());
- VPBB->appendRecipe(SafeRHS);
+ VPValue *One =
+ Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
+ auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
Ops[1] = SafeRHS;
return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
}
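A scalar model of the safe-divisor transformation above (illustrative, not the recipe's actual codegen): in masked-off lanes the divisor is replaced by 1 so the speculated division cannot trap, and that lane's result is discarded by the surrounding mask anyway.

int safeDivLane(int Numerator, int Divisor, bool LaneActive) {
  int SafeDivisor = LaneActive ? Divisor : 1;   // select(Mask, Ops[1], One)
  return Numerator / SafeDivisor;               // never divides by a masked-off lane's value
}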
@@ -8445,9 +8183,8 @@ void VPRecipeBuilder::fixHeaderPhis() {
}
}
-VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
- VFRange &Range,
- VPlan &Plan) {
+VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
+ VFRange &Range) {
bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
@@ -8497,29 +8234,30 @@ VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
BlockInMask = getBlockInMask(I->getParent());
}
- auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
+ // Note that there is some custom logic to mark some intrinsics as uniform
+ // manually above for scalable vectors, which this assert needs to account for
+ // as well.
+ assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
+ (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
+ "Should not predicate a uniform recipe");
+ auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
IsUniform, BlockInMask);
- return toVPRecipeResult(Recipe);
+ return Recipe;
}
-VPRecipeOrVPValueTy
+VPRecipeBase *
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
ArrayRef<VPValue *> Operands,
- VFRange &Range, VPBasicBlock *VPBB,
- VPlanPtr &Plan) {
+ VFRange &Range, VPBasicBlock *VPBB) {
// First, check for specific widening recipes that deal with inductions, Phi
// nodes, calls and memory operations.
VPRecipeBase *Recipe;
if (auto Phi = dyn_cast<PHINode>(Instr)) {
if (Phi->getParent() != OrigLoop->getHeader())
- return tryToBlend(Phi, Operands, Plan);
+ return tryToBlend(Phi, Operands);
- // Always record recipes for header phis. Later first-order recurrence phis
- // can have earlier phis as incoming values.
- recordRecipeOf(Phi);
-
- if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
- return toVPRecipeResult(Recipe);
+ if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
+ return Recipe;
VPHeaderPHIRecipe *PhiRecipe = nullptr;
assert((Legal->isReductionVariable(Phi) ||
@@ -8542,22 +8280,13 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
}
- // Record the incoming value from the backedge, so we can add the incoming
- // value from the backedge after all recipes have been created.
- auto *Inc = cast<Instruction>(
- Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
- auto RecipeIter = Ingredient2Recipe.find(Inc);
- if (RecipeIter == Ingredient2Recipe.end())
- recordRecipeOf(Inc);
-
PhisToFix.push_back(PhiRecipe);
- return toVPRecipeResult(PhiRecipe);
+ return PhiRecipe;
}
- if (isa<TruncInst>(Instr) &&
- (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
- Range, *Plan)))
- return toVPRecipeResult(Recipe);
+ if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
+ cast<TruncInst>(Instr), Operands, Range)))
+ return Recipe;
// All widen recipes below deal only with VF > 1.
if (LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8565,29 +8294,29 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return nullptr;
if (auto *CI = dyn_cast<CallInst>(Instr))
- return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
+ return tryToWidenCall(CI, Operands, Range);
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
- return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
+ return tryToWidenMemory(Instr, Operands, Range);
if (!shouldWiden(Instr, Range))
return nullptr;
if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
- return toVPRecipeResult(new VPWidenGEPRecipe(
- GEP, make_range(Operands.begin(), Operands.end())));
+ return new VPWidenGEPRecipe(GEP,
+ make_range(Operands.begin(), Operands.end()));
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
- return toVPRecipeResult(new VPWidenSelectRecipe(
- *SI, make_range(Operands.begin(), Operands.end())));
+ return new VPWidenSelectRecipe(
+ *SI, make_range(Operands.begin(), Operands.end()));
}
if (auto *CI = dyn_cast<CastInst>(Instr)) {
- return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0],
- CI->getType(), *CI));
+ return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
+ *CI);
}
- return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
+ return tryToWiden(Instr, Operands, VPBB);
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -8603,7 +8332,12 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VPlanTransforms::truncateToMinimalBitwidths(
*Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
VPlanTransforms::optimize(*Plan, *PSE.getSE());
- assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
+ // TODO: try to put it close to addActiveLaneMask().
+ // Discard the plan if it is not EVL-compatible
+ if (CM.foldTailWithEVL() &&
+ !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
+ break;
+ assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
VF = SubRange.End;
@@ -8615,7 +8349,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
DebugLoc DL) {
Value *StartIdx = ConstantInt::get(IdxTy, 0);
- auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
+ auto *StartV = Plan.getOrAddLiveIn(StartIdx);
// Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
@@ -8623,27 +8357,22 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
Header->insert(CanonicalIVPHI, Header->begin());
- // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
- // IV by VF * UF.
- auto *CanonicalIVIncrement =
- new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()},
- {HasNUW, false}, DL, "index.next");
+ VPBuilder Builder(TopRegion->getExitingBasicBlock());
+ // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
+ auto *CanonicalIVIncrement = Builder.createOverflowingOp(
+ Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
+ "index.next");
CanonicalIVPHI->addOperand(CanonicalIVIncrement);
- VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
- EB->appendRecipe(CanonicalIVIncrement);
-
// Add the BranchOnCount VPInstruction to the latch.
- VPInstruction *BranchBack =
- new VPInstruction(VPInstruction::BranchOnCount,
- {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
- EB->appendRecipe(BranchBack);
+ Builder.createNaryOp(VPInstruction::BranchOnCount,
+ {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
}
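A rough scalar rendering of the control skeleton these canonical-IV recipes describe, assuming illustrative VF = 4 and UF = 2 and a vector trip count that is already a multiple of VF * UF.

void vectorLoopSkeleton(unsigned VectorTripCount) {
  const unsigned VFxUF = 4 * 2;                 // Plan.getVFxUF()
  unsigned Index = 0;                           // canonical IV, starts at 0
  do {
    // ... widened body for scalar iterations [Index, Index + VFxUF) ...
    Index += VFxUF;                             // index.next (CanonicalIVIncrement)
  } while (Index != VectorTripCount);           // BranchOnCount
}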
// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
// original exit block.
static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
- VPlan &Plan) {
+ VPRecipeBuilder &Builder, VPlan &Plan) {
BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
// Only handle single-exit loops with unique exit blocks for now.
@@ -8654,17 +8383,115 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
for (PHINode &ExitPhi : ExitBB->phis()) {
Value *IncomingValue =
ExitPhi.getIncomingValueForBlock(ExitingBB);
- VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue);
+ VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
+ // Exit values for inductions are computed and updated outside of VPlan and
+ // independent of induction recipes.
+ // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
+ // live-outs.
+ if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
+ !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
+ isa<VPWidenPointerInductionRecipe>(V))
+ continue;
Plan.addLiveOut(&ExitPhi, V);
}
}
+/// If the middle block branches to the scalar preheader, feed a resume value
+/// for every first-order recurrence (FOR) from the vector loop to the scalar
+/// loop: introduce an ExtractFromEnd recipe in the middle block and a
+/// ResumePhi recipe in the scalar preheader, plus a VPLiveOut that uses the
+/// latter and corresponds to the scalar header phi.
+static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
+ VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+
+ // Start by finding out whether the middle block branches to the scalar
+ // preheader which, unlike the exit block (the other possible successor of
+ // the middle block), is not a VPIRBasicBlock.
+ // TODO: Should be replaced by
+ // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
+ // scalar region is modeled as well.
+ VPBasicBlock *ScalarPHVPBB = nullptr;
+ auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
+ for (VPBlockBase *Succ : MiddleVPBB->getSuccessors()) {
+ if (isa<VPIRBasicBlock>(Succ))
+ continue;
+ assert(!ScalarPHVPBB && "Two candidates for ScalarPHVPBB?");
+ ScalarPHVPBB = cast<VPBasicBlock>(Succ);
+ }
+ if (!ScalarPHVPBB)
+ return;
+
+ VPBuilder ScalarPHBuilder(ScalarPHVPBB);
+ VPBuilder MiddleBuilder(MiddleVPBB);
+ // Reset the insert point so that new recipes are inserted before the
+ // terminator and, when the branch condition is defined in the middle block,
+ // before that condition as well.
+ if (auto *Terminator = MiddleVPBB->getTerminator()) {
+ auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
+ assert((!Condition || Condition->getParent() == MiddleVPBB) &&
+ "Condition expected in MiddleVPBB");
+ MiddleBuilder.setInsertPoint(Condition ? Condition : Terminator);
+ }
+ VPValue *OneVPV = Plan.getOrAddLiveIn(
+ ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+
+ for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
+ auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
+ if (!FOR)
+ continue;
+
+ // Extract the resume value and create a new VPLiveOut for it.
+ auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd,
+ {FOR->getBackedgeValue(), OneVPV},
+ {}, "vector.recur.extract");
+ auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
+ VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
+ "scalar.recur.init");
+ Plan.addLiveOut(cast<PHINode>(FOR->getUnderlyingInstr()), ResumePhiRecipe);
+ }
+}
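For context, a source loop containing a first-order recurrence; the last value the vector loop produces for `Prev` is what ExtractFromEnd pulls out ("vector.recur.extract") and what ResumePhi feeds into the scalar remainder loop ("scalar.recur.init"). The function below is only an illustration.

void firstOrderRecurrence(const int *A, int *B, int N) {
  int Prev = 0;                  // FOR start value
  for (int I = 0; I < N; ++I) {
    B[I] = A[I] + Prev;          // uses the value from the previous iteration
    Prev = A[I];                 // backedge value of the recurrence
  }
}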
+
VPlanPtr
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
- VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
+ // ---------------------------------------------------------------------------
+ // Build initial VPlan: Scan the body of the loop in a topological order to
+ // visit each basic block after having visited its predecessor basic blocks.
+ // ---------------------------------------------------------------------------
+
+ // Create initial VPlan skeleton, having a basic block for the pre-header
+ // which contains SCEV expansions that need to happen before the CFG is
+ // modified; a basic block for the vector pre-header, followed by a region for
+ // the vector loop, followed by the middle basic block. The skeleton vector
+ // loop region contains a header and latch basic blocks.
+
+ bool RequiresScalarEpilogueCheck =
+ LoopVectorizationPlanner::getDecisionAndClampRange(
+ [this](ElementCount VF) {
+ return !CM.requiresScalarEpilogue(VF.isVector());
+ },
+ Range);
+ VPlanPtr Plan = VPlan::createInitialVPlan(
+ createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
+ *PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(),
+ OrigLoop);
+
+ // Don't use getDecisionAndClampRange here, because we don't know the UF, so
+ // it is better to be conservative here rather than to split this up into
+ // different VPlans.
+ // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
+ bool IVUpdateMayOverflow = false;
+ for (ElementCount VF : Range)
+ IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
+
+ DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
+ TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
+ // When not folding the tail, we know that the induction increment will not
+ // overflow.
+ bool HasNUW = Style == TailFoldingStyle::None;
+ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
+
+ VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
// ---------------------------------------------------------------------------
// Pre-construction: record ingredients whose recipes we'll need to further
@@ -8690,55 +8517,26 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
if (!getDecisionAndClampRange(applyIG, Range))
continue;
InterleaveGroups.insert(IG);
- for (unsigned i = 0; i < IG->getFactor(); i++)
- if (Instruction *Member = IG->getMember(i))
- RecipeBuilder.recordRecipeOf(Member);
};
// ---------------------------------------------------------------------------
- // Build initial VPlan: Scan the body of the loop in a topological order to
- // visit each basic block after having visited its predecessor basic blocks.
+ // Construct recipes for the instructions in the loop
// ---------------------------------------------------------------------------
- // Create initial VPlan skeleton, having a basic block for the pre-header
- // which contains SCEV expansions that need to happen before the CFG is
- // modified; a basic block for the vector pre-header, followed by a region for
- // the vector loop, followed by the middle basic block. The skeleton vector
- // loop region contains a header and latch basic blocks.
- VPlanPtr Plan = VPlan::createInitialVPlan(
- createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
- *PSE.getSE());
- VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
- VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
- VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
- Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
- Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
-
- // Don't use getDecisionAndClampRange here, because we don't know the UF
- // so this function is better to be conservative, rather than to split
- // it up into different VPlans.
- // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
- bool IVUpdateMayOverflow = false;
- for (ElementCount VF : Range)
- IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
-
- DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
- TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
- // When not folding the tail, we know that the induction increment will not
- // overflow.
- bool HasNUW = Style == TailFoldingStyle::None;
- addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
-
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);
+ VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
VPBasicBlock *VPBB = HeaderVPBB;
- bool NeedsMasks = CM.foldTailByMasking() ||
- any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
- return Legal->blockNeedsPredication(BB);
- });
+ BasicBlock *HeaderBB = OrigLoop->getHeader();
+ bool NeedsMasks =
+ CM.foldTailByMasking() ||
+ any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
+ bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
+ return Legal->blockNeedsPredication(BB) || NeedsBlends;
+ });
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
@@ -8747,9 +8545,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
Builder.setInsertPoint(VPBB);
if (VPBB == HeaderVPBB)
- RecipeBuilder.createHeaderMask(*Plan);
+ RecipeBuilder.createHeaderMask();
else if (NeedsMasks)
- RecipeBuilder.createBlockInMask(BB, *Plan);
+ RecipeBuilder.createBlockInMask(BB);
// Introduce each ingredient into VPlan.
// TODO: Model and preserve debug intrinsics in VPlan.
@@ -8757,11 +8555,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
Instruction *Instr = &I;
SmallVector<VPValue *, 4> Operands;
auto *Phi = dyn_cast<PHINode>(Instr);
- if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
- Operands.push_back(Plan->getVPValueOrAddLiveIn(
+ if (Phi && Phi->getParent() == HeaderBB) {
+ Operands.push_back(Plan->getOrAddLiveIn(
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
} else {
- auto OpRange = Plan->mapToVPValues(Instr->operands());
+ auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
Operands = {OpRange.begin(), OpRange.end()};
}
@@ -8772,26 +8570,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
continue;
- auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
- Instr, Operands, Range, VPBB, Plan);
- if (!RecipeOrValue)
- RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan);
- // If Instr can be simplified to an existing VPValue, use it.
- if (isa<VPValue *>(RecipeOrValue)) {
- auto *VPV = cast<VPValue *>(RecipeOrValue);
- Plan->addVPValue(Instr, VPV);
- // If the re-used value is a recipe, register the recipe for the
- // instruction, in case the recipe for Instr needs to be recorded.
- if (VPRecipeBase *R = VPV->getDefiningRecipe())
- RecipeBuilder.setRecipe(Instr, R);
- continue;
- }
- // Otherwise, add the new recipe.
- VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue);
- for (auto *Def : Recipe->definedValues()) {
- auto *UV = Def->getUnderlyingValue();
- Plan->addVPValue(UV, Def);
- }
+ VPRecipeBase *Recipe =
+ RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
+ if (!Recipe)
+ Recipe = RecipeBuilder.handleReplication(Instr, Range);
RecipeBuilder.setRecipe(Instr, Recipe);
if (isa<VPHeaderPHIRecipe>(Recipe)) {
@@ -8823,7 +8605,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// and there is nothing to fix from vector loop; phis should have incoming
// from scalar loop only.
} else
- addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan);
+ addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan);
assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
!Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
@@ -8831,30 +8613,33 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();
+ addLiveOutsForFirstOrderRecurrences(*Plan);
+
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
// bring the VPlan to its final state.
// ---------------------------------------------------------------------------
// Adjust the recipes for any inloop reductions.
- adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
+ adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
for (const auto *IG : InterleaveGroups) {
- auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
- RecipeBuilder.getRecipe(IG->getInsertPos()));
+ auto *Recipe =
+ cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
SmallVector<VPValue *, 4> StoredValues;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
- auto *StoreR =
- cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
+ auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
StoredValues.push_back(StoreR->getStoredValue());
}
bool NeedsMaskForGaps =
IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
+ assert((!NeedsMaskForGaps || useMaskedInterleavedAccesses(CM.TTI)) &&
+ "masked interleaved groups are not allowed.");
auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
Recipe->getMask(), NeedsMaskForGaps);
VPIG->insertBefore(Recipe);
@@ -8883,17 +8668,31 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// Only handle constant strides for now.
if (!ScevStride)
continue;
- Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
- auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI);
- // The versioned value may not be used in the loop directly, so just add a
- // new live-in in those cases.
- Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
+ auto *CI = Plan->getOrAddLiveIn(
+ ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
+ if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
+ StrideVPV->replaceAllUsesWith(CI);
+
+ // The versioned value may not be used in the loop directly but through a
+ // sext/zext. Add new live-ins in those cases.
+ for (Value *U : StrideV->users()) {
+ if (!isa<SExtInst, ZExtInst>(U))
+ continue;
+ VPValue *StrideVPV = Plan->getLiveIn(U);
+ if (!StrideVPV)
+ continue;
+ unsigned BW = U->getType()->getScalarSizeInBits();
+ APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
+ : ScevStride->getAPInt().zext(BW);
+ VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
+ StrideVPV->replaceAllUsesWith(CI);
+ }
}
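An illustrative loop for the sext/zext handling above: after versioning proves the stride is 1, both the 32-bit stride live-in and the 64-bit extended form through which it reaches the address computation can be folded to constants. Names below are made up.

void strideOne(int *A, long N, int Stride /* versioned: known to be 1 */) {
  for (long I = 0; I < N; ++I)
    A[I * (long)Stride] = 0;     // Stride only reaches the GEP via sext i32 -> i64
}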
- // From this point onwards, VPlan-to-VPlan transformations may change the plan
- // in ways that accessing values using original IR values is incorrect.
- Plan->disableValue2VPValue();
+ VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) {
+ return Legal->blockNeedsPredication(BB);
+ });
// Sink users of fixed-order recurrence past the recipe defining the previous
// value and introduce FirstOrderRecurrenceSplice VPInstructions.
@@ -8923,7 +8722,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
// Create new empty VPlan
auto Plan = VPlan::createInitialVPlan(
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
- *PSE.getSE());
+ *PSE.getSE(), true, false, OrigLoop);
// Build hierarchical CFG
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
@@ -8948,6 +8747,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
bool HasNUW = true;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
DebugLoc());
+ assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
}
@@ -8960,9 +8760,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
// A ComputeReductionResult recipe is added to the middle block, also for
// in-loop reductions which compute their result in-loop, because generating
// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
+//
+// Adjust AnyOf reductions; replace the reduction phi for the selected value
+// with a boolean reduction phi node to check if the condition is true in any
+// iteration. The final value is selected by the final ComputeReductionResult.
void LoopVectorizationPlanner::adjustRecipesForReductions(
- VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
- ElementCount MinVF) {
+ VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
// Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
@@ -9034,7 +8837,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// the phi until LoopExitValue. We keep track of the previous item
// (PreviousLink) to tell which of the two operands of a Link will remain
// scalar and which will be reduced. For minmax by select(cmp), Link will be
- // the select instructions.
+ // the select instructions. Blend recipes of in-loop reduction phis will
+ // get folded to their non-phi operand, as the reduction recipe handles the
+ // condition directly.
VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
@@ -9065,6 +8870,20 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
VecOp = FMulRecipe;
} else {
+ auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
+ if (PhiR->isInLoop() && Blend) {
+ assert(Blend->getNumIncomingValues() == 2 &&
+ "Blend must have 2 incoming values");
+ if (Blend->getIncomingValue(0) == PhiR)
+ Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
+ else {
+ assert(Blend->getIncomingValue(1) == PhiR &&
+ "PhiR must be an operand of the blend");
+ Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
+ }
+ continue;
+ }
+
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
if (isa<VPWidenRecipe>(CurrentLink)) {
assert(isa<CmpInst>(CurrentLinkI) &&
@@ -9095,14 +8914,12 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
BasicBlock *BB = CurrentLinkI->getParent();
VPValue *CondOp = nullptr;
- if (CM.blockNeedsPredicationForAnyReason(BB)) {
- VPBuilder::InsertPointGuard Guard(Builder);
- Builder.setInsertPoint(CurrentLink);
+ if (CM.blockNeedsPredicationForAnyReason(BB))
CondOp = RecipeBuilder.getBlockInMask(BB);
- }
- VPReductionRecipe *RedRecipe = new VPReductionRecipe(
- RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp);
+ VPReductionRecipe *RedRecipe =
+ new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
+ CondOp, CM.useOrderedReductions(RdxDesc));
// Append the recipe to the end of the VPBasicBlock because we need to
// ensure that it comes after all of its inputs, including CondOp.
// Note that this transformation may leave over dead recipes (including
@@ -9112,7 +8929,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
PreviousLink = RedRecipe;
}
}
+ VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
Builder.setInsertPoint(&*LatchVPBB->begin());
+ VPBasicBlock *MiddleVPBB =
+ cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
+ VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
for (VPRecipeBase &R :
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
@@ -9120,6 +8941,41 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
continue;
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+ // Adjust AnyOf reductions; replace the reduction phi for the selected value
+ // with a boolean reduction phi node to check if the condition is true in
+ // any iteration. The final value is selected by the final
+ // ComputeReductionResult.
+ if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
+ return isa<VPWidenSelectRecipe>(U) ||
+ (isa<VPReplicateRecipe>(U) &&
+ cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
+ Instruction::Select);
+ }));
+ VPValue *Cmp = Select->getOperand(0);
+ // If the compare is checking the reduction PHI node, adjust it to check
+ // the start value.
+ if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
+ for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
+ if (CmpR->getOperand(I) == PhiR)
+ CmpR->setOperand(I, PhiR->getStartValue());
+ }
+ VPBuilder::InsertPointGuard Guard(Builder);
+ Builder.setInsertPoint(Select);
+
+ // If the true value of the select is the reduction phi, the new value is
+ // selected if the negated condition is true in any iteration.
+ if (Select->getOperand(1) == PhiR)
+ Cmp = Builder.createNot(Cmp);
+ VPValue *Or = Builder.createOr(PhiR, Cmp);
+ Select->getVPSingleValue()->replaceAllUsesWith(Or);
+
+ // Convert the reduction phi to operate on bools.
+ PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
+ OrigLoop->getHeader()->getContext())));
+ }
+
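A source-level picture of the AnyOf rewrite above (illustrative only): instead of carrying the selected value through the vector loop, the loop reduces the per-iteration conditions with 'or', and the final ComputeReductionResult picks between the start value and the invariant selected value once at the end.

int anyOfReduction(const int *A, int N) {
  int R = -1;                        // reduction start value
  for (int I = 0; I < N; ++I)
    if (A[I] > 42)
      R = 3;                         // AnyOf: the same invariant value every time
  return R;
  // Vectorized form: 'or' the conditions across all iterations, then select
  // once after the loop: R == (anyConditionHeld ? 3 : -1).
}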
// If tail is folded by masking, introduce selects between the phi
// and the live-out instruction of each reduction, at the beginning of the
// dedicated latch block.
@@ -9152,7 +9008,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
- if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
+ if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
+ !RecurrenceDescriptor::isAnyOfRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
Type *RdxTy = RdxDesc.getRecurrenceType();
auto *Trunc =
@@ -9184,8 +9042,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// also modeled in VPlan.
auto *FinalReductionResult = new VPInstruction(
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
- cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
- ->appendRecipe(FinalReductionResult);
+ FinalReductionResult->insertBefore(*MiddleVPBB, IP);
OrigExitingVPV->replaceUsesWithIf(
FinalReductionResult,
[](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
@@ -9194,91 +9051,29 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VPlanTransforms::clearReductionWrapFlags(*Plan);
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
- IG->getInsertPos()->printAsOperand(O, false);
- O << ", ";
- getAddr()->printAsOperand(O, SlotTracker);
- VPValue *Mask = getMask();
- if (Mask) {
- O << ", ";
- Mask->printAsOperand(O, SlotTracker);
- }
-
- unsigned OpIdx = 0;
- for (unsigned i = 0; i < IG->getFactor(); ++i) {
- if (!IG->getMember(i))
- continue;
- if (getNumStoreOperands() > 0) {
- O << "\n" << Indent << " store ";
- getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
- O << " to index " << i;
- } else {
- O << "\n" << Indent << " ";
- getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
- O << " = load from index " << i;
- }
- ++OpIdx;
- }
-}
-#endif
-
void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
"Not a pointer induction according to InductionDescriptor!");
assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
"Unexpected type.");
+ assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
+ "Recipe should have been replaced");
auto *IVR = getParent()->getPlan()->getCanonicalIV();
- PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
-
- if (onlyScalarsGenerated(State.VF)) {
- // This is the normalized GEP that starts counting at zero.
- Value *PtrInd = State.Builder.CreateSExtOrTrunc(
- CanonicalIV, IndDesc.getStep()->getType());
- // Determine the number of scalars we need to generate for each unroll
- // iteration. If the instruction is uniform, we only need to generate the
- // first lane. Otherwise, we generate all VF values.
- bool IsUniform = vputils::onlyFirstLaneUsed(this);
- assert((IsUniform || !State.VF.isScalable()) &&
- "Cannot scalarize a scalable VF");
- unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
-
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *PartStart =
- createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
-
- for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
- Value *Idx = State.Builder.CreateAdd(
- PartStart, ConstantInt::get(PtrInd->getType(), Lane));
- Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
-
- Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
- Value *SclrGep = emitTransformedIndex(
- State.Builder, GlobalIdx, IndDesc.getStartValue(), Step,
- IndDesc.getKind(), IndDesc.getInductionBinOp());
- SclrGep->setName("next.gep");
- State.set(this, SclrGep, VPIteration(Part, Lane));
- }
- }
- return;
- }
-
+ PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
Type *PhiType = IndDesc.getStep()->getType();
// Build a pointer phi
Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
Type *ScStValueType = ScalarStartValue->getType();
- PHINode *NewPointerPhi =
- PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
+ PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
+ CanonicalIV->getIterator());
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
// A pointer induction, performed by using a gep
- Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
+ BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
@@ -9329,84 +9124,21 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
Value *Step = State.get(getStepValue(), VPIteration(0, 0));
- Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
+ Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
Value *DerivedIV = emitTransformedIndex(
State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
Kind, cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName("offset.idx");
- if (TruncResultTy) {
- assert(TruncResultTy != DerivedIV->getType() &&
- Step->getType()->isIntegerTy() &&
- "Truncation requires an integer step");
- DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy);
- }
assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
State.set(this, DerivedIV, VPIteration(0, 0));
}
-void VPInterleaveRecipe::execute(VPTransformState &State) {
- assert(!State.Instance && "Interleave group being replicated.");
- State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
- getStoredValues(), getMask(),
- NeedsMaskForGaps);
-}
-
-void VPReductionRecipe::execute(VPTransformState &State) {
- assert(!State.Instance && "Reduction being replicated.");
- Value *PrevInChain = State.get(getChainOp(), 0);
- RecurKind Kind = RdxDesc.getRecurrenceKind();
- bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
- // Propagate the fast-math flags carried by the underlying instruction.
- IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
- State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *NewVecOp = State.get(getVecOp(), Part);
- if (VPValue *Cond = getCondOp()) {
- Value *NewCond = State.VF.isVector() ? State.get(Cond, Part)
- : State.get(Cond, {Part, 0});
- VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
- Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
- Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
- RdxDesc.getFastMathFlags());
- if (State.VF.isVector()) {
- Iden =
- State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
- }
-
- Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
- NewVecOp = Select;
- }
- Value *NewRed;
- Value *NextInChain;
- if (IsOrdered) {
- if (State.VF.isVector())
- NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
- PrevInChain);
- else
- NewRed = State.Builder.CreateBinOp(
- (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
- NewVecOp);
- PrevInChain = NewRed;
- } else {
- PrevInChain = State.get(getChainOp(), Part);
- NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
- }
- if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
- NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
- NewRed, PrevInChain);
- } else if (IsOrdered)
- NextInChain = NewRed;
- else
- NextInChain = State.Builder.CreateBinOp(
- (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
- State.set(this, NextInChain, Part);
- }
-}
-
void VPReplicateRecipe::execute(VPTransformState &State) {
Instruction *UI = getUnderlyingInstr();
if (State.Instance) { // Generate a single instance.
+ assert((State.VF.isScalar() || !isUniform()) &&
+ "uniform recipe shouldn't be predicated");
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
// Insert scalar instance packing it into a vector.
@@ -9464,98 +9196,180 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
}
-void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
- VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
-
- // Attempt to issue a wide load.
- LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
- StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
-
- assert((LI || SI) && "Invalid Load/Store instruction");
- assert((!SI || StoredValue) && "No stored value provided for widened store");
- assert((!LI || !StoredValue) && "Stored value provided for widened load");
+void VPWidenLoadRecipe::execute(VPTransformState &State) {
+ auto *LI = cast<LoadInst>(&Ingredient);
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
-
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
- bool CreateGatherScatter = !isConsecutive();
+ bool CreateGather = !isConsecutive();
auto &Builder = State.Builder;
- InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
- bool isMaskRequired = getMask();
- if (isMaskRequired) {
- // Mask reversal is only needed for non-all-one (null) masks, as reverse of
- // a null all-one mask is a null mask.
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *Mask = State.get(getMask(), Part);
+ State.setDebugLocFrom(getDebugLoc());
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *NewLI;
+ Value *Mask = nullptr;
+ if (auto *VPMask = getMask()) {
+ // Mask reversal is only needed for non-all-one (null) masks, as reverse
+ // of a null all-one mask is a null mask.
+ Mask = State.get(VPMask, Part);
if (isReverse())
Mask = Builder.CreateVectorReverse(Mask, "reverse");
- BlockInMaskParts[Part] = Mask;
}
+
+ Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
+ if (CreateGather) {
+ NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
+ "wide.masked.gather");
+ } else if (Mask) {
+ NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+ PoisonValue::get(DataTy),
+ "wide.masked.load");
+ } else {
+ NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
+ }
+ // Add metadata to the load, but setVectorValue to the reverse shuffle.
+ State.addMetadata(NewLI, LI);
+ if (Reverse)
+ NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
+ State.set(this, NewLI, Part);
}
+}
- // Handle Stores:
- if (SI) {
- State.setDebugLocFrom(SI->getDebugLoc());
+/// Use an all-true mask for the reverse rather than the actual mask, as this
+/// avoids a dependence without affecting the result.
+static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
+ Value *EVL, const Twine &Name) {
+ VectorType *ValTy = cast<VectorType>(Operand->getType());
+ Value *AllTrueMask =
+ Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
+ return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
+ {Operand, AllTrueMask, EVL}, nullptr, Name);
+}
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Instruction *NewSI = nullptr;
- Value *StoredVal = State.get(StoredValue, Part);
- if (CreateGatherScatter) {
- Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
- Value *VectorGep = State.get(getAddr(), Part);
- NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
- MaskPart);
- } else {
- if (isReverse()) {
- // If we store to reverse consecutive memory locations, then we need
- // to reverse the order of elements in the stored value.
- StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
- // We don't want to update the value in the map as it might be used in
- // another expression. So don't call resetVectorValue(StoredVal).
- }
- auto *VecPtr = State.get(getAddr(), Part);
- if (isMaskRequired)
- NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
- BlockInMaskParts[Part]);
- else
- NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
- }
- State.addMetadata(NewSI, SI);
- }
- return;
+void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
+ assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+ "explicit vector length.");
+ auto *LI = cast<LoadInst>(&Ingredient);
+
+ Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+ auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+ bool CreateGather = !isConsecutive();
+
+ auto &Builder = State.Builder;
+ State.setDebugLocFrom(getDebugLoc());
+ CallInst *NewLI;
+ Value *EVL = State.get(getEVL(), VPIteration(0, 0));
+ Value *Addr = State.get(getAddr(), 0, !CreateGather);
+ Value *Mask = nullptr;
+ if (VPValue *VPMask = getMask()) {
+ Mask = State.get(VPMask, 0);
+ if (isReverse())
+ Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
+ } else {
+ Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+ }
+
+ if (CreateGather) {
+ NewLI =
+ Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
+ nullptr, "wide.masked.gather");
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Load, DataTy, Addr, "vp.op.load"));
}
+ NewLI->addParamAttr(
+ 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+ State.addMetadata(NewLI, LI);
+ Instruction *Res = NewLI;
+ if (isReverse())
+ Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
+ State.set(this, Res, 0);
+}
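A scalar model (not LLVM code) of the EVL-predicated load generated above: only lanes below EVL whose mask bit is set are read. This model simply skips the other lanes; what the real intrinsic produces for them is not specified here.

void vpLoadModel(const int *Addr, const bool *Mask, unsigned EVL, unsigned VF,
                 int *Result) {
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    if (Lane < EVL && Mask[Lane])
      Result[Lane] = Addr[Lane];
}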
+
+void VPWidenStoreRecipe::execute(VPTransformState &State) {
+ auto *SI = cast<StoreInst>(&Ingredient);
+
+ VPValue *StoredVPValue = getStoredValue();
+ bool CreateScatter = !isConsecutive();
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+ auto &Builder = State.Builder;
+ State.setDebugLocFrom(getDebugLoc());
- // Handle loads.
- assert(LI && "Must have a load instruction");
- State.setDebugLocFrom(LI->getDebugLoc());
for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *NewLI;
- if (CreateGatherScatter) {
- Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
- Value *VectorGep = State.get(getAddr(), Part);
- NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
- nullptr, "wide.masked.gather");
- State.addMetadata(NewLI, LI);
- } else {
- auto *VecPtr = State.get(getAddr(), Part);
- if (isMaskRequired)
- NewLI = Builder.CreateMaskedLoad(
- DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
- PoisonValue::get(DataTy), "wide.masked.load");
- else
- NewLI =
- Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
+ Instruction *NewSI = nullptr;
+ Value *Mask = nullptr;
+ if (auto *VPMask = getMask()) {
+ // Mask reversal is only needed for non-all-one (null) masks, as reverse
+ // of a null all-one mask is a null mask.
+ Mask = State.get(VPMask, Part);
+ if (isReverse())
+ Mask = Builder.CreateVectorReverse(Mask, "reverse");
+ }
- // Add metadata to the load, but setVectorValue to the reverse shuffle.
- State.addMetadata(NewLI, LI);
- if (Reverse)
- NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
+ Value *StoredVal = State.get(StoredVPValue, Part);
+ if (isReverse()) {
+ // If we store to reverse consecutive memory locations, then we need
+ // to reverse the order of elements in the stored value.
+ StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
+ // We don't want to update the value in the map as it might be used in
+ // another expression. So don't call resetVectorValue(StoredVal).
}
+ Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
+ if (CreateScatter)
+ NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
+ else if (Mask)
+ NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+ else
+ NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
+ State.addMetadata(NewSI, SI);
+ }
+}
- State.set(getVPSingleValue(), NewLI, Part);
+void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
+ assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+ "explicit vector length.");
+ auto *SI = cast<StoreInst>(&Ingredient);
+
+ VPValue *StoredValue = getStoredValue();
+ bool CreateScatter = !isConsecutive();
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+ auto &Builder = State.Builder;
+ State.setDebugLocFrom(getDebugLoc());
+
+ CallInst *NewSI = nullptr;
+ Value *StoredVal = State.get(StoredValue, 0);
+ Value *EVL = State.get(getEVL(), VPIteration(0, 0));
+ if (isReverse())
+ StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
+ Value *Mask = nullptr;
+ if (VPValue *VPMask = getMask()) {
+ Mask = State.get(VPMask, 0);
+ if (isReverse())
+ Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
+ } else {
+ Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
}
+ Value *Addr = State.get(getAddr(), 0, !CreateScatter);
+ if (CreateScatter) {
+ NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
+ Intrinsic::vp_scatter,
+ {StoredVal, Addr, Mask, EVL});
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Store, Type::getVoidTy(EVL->getContext()),
+ {StoredVal, Addr}));
+ }
+ NewSI->addParamAttr(
+ 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
+ State.addMetadata(NewSI, SI);
}
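The same lane rule applies to the EVL store above; a matching sketch follows, again with an invented helper name and no claim to be the actual lowering.

#include <algorithm>
#include <cstdint>
#include <vector>

// Scalar model of the unit-stride llvm.vp.store emitted above: only lanes
// below EVL whose mask bit is set write memory; all other destination
// elements are left untouched.
void modelVPStore(int32_t *Addr, const std::vector<int32_t> &StoredVal,
                  const std::vector<bool> &Mask, unsigned EVL) {
  unsigned VF = static_cast<unsigned>(StoredVal.size());
  for (unsigned Lane = 0; Lane < std::min(VF, EVL); ++Lane)
    if (Mask[Lane])
      Addr[Lane] = StoredVal[Lane];
}
// Reverse accesses first reverse the stored value and the mask with
// vp.reverse, exactly as the recipe does before issuing the store.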
// Determine how to lower the scalar epilogue, which depends on 1) optimising
@@ -9658,7 +9472,7 @@ static bool processLoopInVPlanNativePath(
bool AddBranchWeights =
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
- F->getParent()->getDataLayout(), AddBranchWeights);
+ F->getDataLayout(), AddBranchWeights);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -9741,7 +9555,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
}
  // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
- double ScalarC = *VF.ScalarCost.getValue();
+ uint64_t ScalarC = *VF.ScalarCost.getValue();
if (ScalarC == 0)
return true;
@@ -9768,7 +9582,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
// RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
//
// Now we can compute the minimum required trip count TC as
- // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
+ // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
//
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
// the computations are performed on doubles, not integers and the result
@@ -9780,9 +9594,9 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
AssumedMinimumVscale = *VScale;
IntVF *= AssumedMinimumVscale;
}
- double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
- double RtC = *CheckCost.getValue();
- double MinTC1 = RtC / (ScalarC - VecCOverVF);
+ uint64_t RtC = *CheckCost.getValue();
+ uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
+ uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
// Second, compute a minimum iteration count so that the cost of the
// runtime checks is only a fraction of the total scalar loop cost. This
@@ -9791,12 +9605,12 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
// * TC. To bound the runtime check to be a fraction 1/X of the scalar
// cost, compute
// RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
- double MinTC2 = RtC * 10 / ScalarC;
+ uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
// Now pick the larger minimum. If it is not a multiple of VF and a scalar
// epilogue is allowed, choose the next closest multiple of VF. This should
// partly compensate for ignoring the epilogue cost.
- uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
+ uint64_t MinTC = std::max(MinTC1, MinTC2);
if (SEL == CM_ScalarEpilogueAllowed)
MinTC = alignTo(MinTC, IntVF);
VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
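To make the integer arithmetic above concrete, here is a small standalone walk-through with invented cost numbers; divCeil mirrors llvm::divideCeil and none of the figures correspond to a real target.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Rounded-up division, equivalent to llvm::divideCeil for positive operands.
static uint64_t divCeil(uint64_t Num, uint64_t Den) {
  return (Num + Den - 1) / Den;
}

int main() {
  // Assumed costs: scalar iteration cost 8, vector body cost 20 at IntVF = 4,
  // runtime-check cost 30, and the 1/10 bound on check overhead used above.
  uint64_t ScalarC = 8, VecC = 20, IntVF = 4, RtC = 30;

  uint64_t Div = ScalarC * IntVF - VecC;                      // 32 - 20 = 12
  uint64_t MinTC1 = Div == 0 ? 0 : divCeil(RtC * IntVF, Div); // ceil(120/12) = 10
  uint64_t MinTC2 = divCeil(RtC * 10, ScalarC);               // ceil(300/8)  = 38
  uint64_t MinTC = std::max(MinTC1, MinTC2);                  // 38
  assert(MinTC == 38);

  // With a scalar epilogue allowed, alignTo(38, IntVF) rounds this up to 40,
  // which becomes VF.MinProfitableTripCount.
  return 0;
}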
@@ -9831,13 +9645,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
assert((EnableVPlanNativePath || L->isInnermost()) &&
"VPlan-native path is not enabled. Only process inner loops.");
-#ifndef NDEBUG
- const std::string DebugLocStr = getDebugLocString(L);
-#endif /* NDEBUG */
-
LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
<< L->getHeader()->getParent()->getName() << "' from "
- << DebugLocStr << "\n");
+ << L->getLocStr() << "\n");
LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
@@ -10006,7 +9816,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool AddBranchWeights =
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
- F->getParent()->getDataLayout(), AddBranchWeights);
+ F->getDataLayout(), AddBranchWeights);
if (MaybeVF) {
VF = *MaybeVF;
// Select the interleave count.
@@ -10107,7 +9917,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
});
} else if (VectorizeLoop && !InterleaveLoop) {
LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
- << ") in " << DebugLocStr << '\n');
+ << ") in " << L->getLocStr() << '\n');
ORE->emit([&]() {
return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
L->getStartLoc(), L->getHeader())
@@ -10115,7 +9925,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
});
} else if (VectorizeLoop && InterleaveLoop) {
LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
- << ") in " << DebugLocStr << '\n');
+ << ") in " << L->getLocStr() << '\n');
LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
@@ -10130,7 +9940,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM, BFI, PSI, Checks);
- VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
+ VPlan &BestPlan =
+ UseLegacyCostModel ? LVP.getBestPlanFor(VF.Width) : LVP.getBestPlan();
+ assert((UseLegacyCostModel || BestPlan.hasScalarVFOnly()) &&
+ "VPlan cost model and legacy cost model disagreed");
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
@@ -10154,9 +9967,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI, &LVL, &CM, BFI, PSI, Checks);
- VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
+ std::unique_ptr<VPlan> BestMainPlan(
+ LVP.getBestPlanFor(EPI.MainLoopVF).duplicate());
const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
- EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
+ EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
++LoopsVectorized;
// Second pass vectorizes the epilogue and adjusts the control flow
@@ -10181,9 +9995,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
EpilogILV.setTripCount(MainILV.getTripCount());
for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
- auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
+ auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
ExpandedSCEVs.find(ExpandR->getSCEV())->second);
ExpandR->replaceAllUsesWith(ExpandedVal);
+ if (BestEpiPlan.getTripCount() == ExpandR)
+ BestEpiPlan.resetTripCount(ExpandedVal);
ExpandR->eraseFromParent();
}
@@ -10197,9 +10013,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Value *ResumeV = nullptr;
// TODO: Move setting of resume values to prepareToExecute.
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
- ResumeV = ReductionResumeValues
- .find(&ReductionPhi->getRecurrenceDescriptor())
- ->second;
+ const RecurrenceDescriptor &RdxDesc =
+ ReductionPhi->getRecurrenceDescriptor();
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ ResumeV = ReductionResumeValues.find(&RdxDesc)->second;
+ if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
+ // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
+ // start value; compare the final value from the main vector loop
+ // to the start value.
+ IRBuilder<> Builder(
+ cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
+ ResumeV = Builder.CreateICmpNE(ResumeV,
+ RdxDesc.getRecurrenceStartValue());
+ }
} else {
// Create induction resume values for both widened pointer and
// integer/fp inductions and update the start value of the induction
@@ -10220,10 +10046,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
{EPI.MainLoopIterationCountCheck});
}
assert(ResumeV && "Must have a resume value");
- VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
+ VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
}
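A brief note on the AnyOf fixup in the loop above: the epilogue's reduction phi starts from a boolean "did any lane match" flag rather than the selected value, which is why the resume value is rewritten with an ICmpNE against the recurrence start value. A minimal model with invented names:

// The main loop's AnyOf result equals the original start value only if no
// lane matched; comparing the two therefore reconstructs the boolean flag
// the epilogue's VPReductionPHIRecipe expects as its start value.
static bool anyOfResumeFlag(int MainLoopResult, int RecurrenceStartValue) {
  return MainLoopResult != RecurrenceStartValue;
}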
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
+ "DT not preserved correctly");
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
DT, true, &ExpandedSCEVs);
++LoopsEpilogueVectorized;
@@ -10231,12 +10059,22 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (!MainILV.areSafetyChecksAdded())
DisableRuntimeUnroll = true;
} else {
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
+ ElementCount Width = VF.Width;
+ VPlan &BestPlan =
+ UseLegacyCostModel ? LVP.getBestPlanFor(Width) : LVP.getBestPlan();
+ if (!UseLegacyCostModel) {
+ assert(size(BestPlan.vectorFactors()) == 1 &&
+ "Plan should have a single VF");
+ Width = *BestPlan.vectorFactors().begin();
+ LLVM_DEBUG(dbgs()
+ << "VF picked by VPlan cost model: " << Width << "\n");
+ assert(VF.Width == Width &&
+ "VPlan cost model and legacy cost model disagreed");
+ }
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width,
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
PSI, Checks);
-
- VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
- LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
+ LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there
@@ -10376,15 +10214,10 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
RemoveRedundantDbgInstrs(&BB);
}
- // We currently do not preserve loopinfo/dominator analyses with outer loop
- // vectorization. Until this is addressed, mark these analyses as preserved
- // only for non-VPlan-native path.
- // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
- if (!EnableVPlanNativePath) {
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<ScalarEvolutionAnalysis>();
- }
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<LoopAccessAnalysis>();
if (Result.MadeCFGChange) {
// Making CFG changes likely means a loop got vectorized. Indicate that