aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp3587
1 files changed, 2598 insertions, 989 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 35af8e425778..ea0d7673edf6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -130,6 +130,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -157,18 +158,37 @@ using namespace llvm;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
+#ifndef NDEBUG
+const char VerboseDebug[] = DEBUG_TYPE "-verbose";
+#endif
+
/// @{
/// Metadata attribute names
-static const char *const LLVMLoopVectorizeFollowupAll =
- "llvm.loop.vectorize.followup_all";
-static const char *const LLVMLoopVectorizeFollowupVectorized =
+const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
+const char LLVMLoopVectorizeFollowupVectorized[] =
"llvm.loop.vectorize.followup_vectorized";
-static const char *const LLVMLoopVectorizeFollowupEpilogue =
+const char LLVMLoopVectorizeFollowupEpilogue[] =
"llvm.loop.vectorize.followup_epilogue";
/// @}
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
+STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
+
+static cl::opt<bool> EnableEpilogueVectorization(
+ "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
+ cl::desc("Enable vectorization of epilogue loops."));
+
+static cl::opt<unsigned> EpilogueVectorizationForceVF(
+ "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
+ cl::desc("When epilogue vectorization is enabled, and a value greater than "
+ "1 is specified, forces the given VF for all applicable epilogue "
+ "loops."));
+
+static cl::opt<unsigned> EpilogueVectorizationMinVF(
+ "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
+ cl::desc("Only loops with vectorization factor equal to or larger than "
+ "the specified value are considered for epilogue vectorization."));
/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
@@ -178,13 +198,36 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
"value are vectorized only if no scalar iteration overheads "
"are incurred."));
-// Indicates that an epilogue is undesired, predication is preferred.
-// This means that the vectorizer will try to fold the loop-tail (epilogue)
-// into the loop and predicate the loop body accordingly.
-static cl::opt<bool> PreferPredicateOverEpilog(
- "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
- cl::desc("Indicate that an epilogue is undesired, predication should be "
- "used instead."));
+// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
+// that predication is preferred, and this lists all options. I.e., the
+// vectorizer will try to fold the tail-loop (epilogue) into the vector body
+// and predicate the instructions accordingly. If tail-folding fails, there are
+// different fallback strategies depending on these values:
+namespace PreferPredicateTy {
+ enum Option {
+ ScalarEpilogue = 0,
+ PredicateElseScalarEpilogue,
+ PredicateOrDontVectorize
+ };
+} // namespace PreferPredicateTy
+
+static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
+ "prefer-predicate-over-epilogue",
+ cl::init(PreferPredicateTy::ScalarEpilogue),
+ cl::Hidden,
+ cl::desc("Tail-folding and predication preferences over creating a scalar "
+ "epilogue loop."),
+ cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
+ "scalar-epilogue",
+ "Don't tail-predicate loops, create scalar epilogue"),
+ clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
+ "predicate-else-scalar-epilogue",
+ "prefer tail-folding, create scalar epilogue if tail "
+ "folding fails."),
+ clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
+ "predicate-dont-vectorize",
+ "prefers tail-folding, don't attempt vectorization if "
+ "tail-folding fails.")));
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
@@ -196,7 +239,7 @@ static cl::opt<bool> EnableInterleavedMemAccesses(
cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
/// An interleave-group may need masking if it resides in a block that needs
-/// predication, or in order to mask away gaps.
+/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
"enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
@@ -230,6 +273,12 @@ static cl::opt<unsigned> ForceTargetInstructionCost(
"an instruction to a single constant value. Mostly "
"useful for getting consistent testing."));
+static cl::opt<bool> ForceTargetSupportsScalableVectors(
+ "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Pretend that scalable vectors are supported, even if the target does "
+ "not support them. This flag should only be used for testing."));
+
static cl::opt<unsigned> SmallLoopCost(
"small-loop-cost", cl::init(20), cl::Hidden,
cl::desc(
@@ -247,6 +296,12 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
cl::desc(
"Enable runtime interleaving until load/store ports are saturated"));
+/// Interleave small loops with scalar reductions.
+static cl::opt<bool> InterleaveSmallLoopScalarReduction(
+ "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
+ cl::desc("Enable interleaving for loops with small iteration counts that "
+ "contain scalar reductions to expose ILP."));
+
/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
"vectorize-num-stores-pred", cl::init(1), cl::Hidden,
@@ -265,6 +320,17 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC(
cl::desc("The maximum interleave count to use when interleaving a scalar "
"reduction in a nested loop."));
+static cl::opt<bool>
+ PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
+ cl::Hidden,
+ cl::desc("Prefer in-loop vector reductions, "
+ "overriding the targets preference."));
+
+static cl::opt<bool> PreferPredicatedReductionSelect(
+ "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Prefer predicating a reduction operation over an after loop select."));
+
cl::opt<bool> EnableVPlanNativePath(
"enable-vplan-native-path", cl::init(false), cl::Hidden,
cl::desc("Enable VPlan-native vectorization path with "
@@ -307,12 +373,14 @@ static Type *getMemInstValueType(Value *I) {
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
-static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
+static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
// Determine if an array of VF elements of type Ty is "bitcast compatible"
// with a <VF x Ty> vector.
- if (VF > 1) {
- auto *VectorTy = FixedVectorType::get(Ty, VF);
- return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
+ if (VF.isVector()) {
+ auto *VectorTy = VectorType::get(Ty, VF);
+ return TypeSize::get(VF.getKnownMinValue() *
+ DL.getTypeAllocSize(Ty).getFixedValue(),
+ VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
}
// If the vectorization factor is one, we just check if an array of type Ty
@@ -393,29 +461,42 @@ public:
LoopInfo *LI, DominatorTree *DT,
const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI, AssumptionCache *AC,
- OptimizationRemarkEmitter *ORE, unsigned VecWidth,
+ OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
- LoopVectorizationCostModel *CM)
+ LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()),
- VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
+ VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
+ BFI(BFI), PSI(PSI) {
+ // Query this against the original loop and save it here because the profile
+ // of the original loop header may change as the transformation happens.
+ OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
+ OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
+ }
+
virtual ~InnerLoopVectorizer() = default;
- /// Create a new empty loop. Unlink the old loop and connect the new one.
- /// Return the pre-header block of the new loop.
- BasicBlock *createVectorizedLoopSkeleton();
+ /// Create a new empty loop that will contain vectorized instructions later
+ /// on, while the old loop will be used as the scalar remainder. Control flow
+ /// is generated around the vectorized (and scalar epilogue) loops consisting
+ /// of various checks and bypasses. Return the pre-header block of the new
+ /// loop.
+ /// In the case of epilogue vectorization, this function is overriden to
+ /// handle the more complex control flow around the loops.
+ virtual BasicBlock *createVectorizedLoopSkeleton();
/// Widen a single instruction within the innermost loop.
- void widenInstruction(Instruction &I, VPUser &Operands,
+ void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
VPTransformState &State);
/// Widen a single call instruction within the innermost loop.
- void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
+ void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
VPTransformState &State);
/// Widen a single select instruction within the innermost loop.
- void widenSelectInstruction(SelectInst &I, VPUser &Operands,
+ void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
bool InvariantCond, VPTransformState &State);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
@@ -431,14 +512,15 @@ public:
/// Vectorize a single GetElementPtrInst based on information gathered and
/// decisions taken during planning.
- void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
- unsigned VF, bool IsPtrLoopInvariant,
+ void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
+ unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
- void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
+ void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
+ Value *StartV, unsigned UF, ElementCount VF);
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane
@@ -452,7 +534,8 @@ public:
/// Widen an integer or floating-point induction variable \p IV. If \p Trunc
/// is provided, the integer induction variable will first be truncated to
/// the corresponding type.
- void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
+ void widenIntOrFpInduction(PHINode *IV, Value *Start,
+ TruncInst *Trunc = nullptr);
/// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
/// vector or scalar value on-demand if one is not yet available. When
@@ -477,6 +560,10 @@ public:
/// value into a vector.
Value *getOrCreateVectorValue(Value *V, unsigned Part);
+ void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
+ VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
+ }
+
/// Return a value in the new loop corresponding to \p V from the original
/// loop at unroll and vector indices \p Instance. If the value has been
/// vectorized but not scalarized, the necessary extractelement instruction
@@ -491,7 +578,9 @@ public:
/// BlockInMask is non-null. Use \p State to translate given VPValues to IR
/// values in the vectorized loop.
void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
+ ArrayRef<VPValue *> VPDefs,
VPTransformState &State, VPValue *Addr,
+ ArrayRef<VPValue *> StoredValues,
VPValue *BlockInMask = nullptr);
/// Vectorize Load and Store instructions with the base address given in \p
@@ -499,8 +588,8 @@ public:
/// non-null. Use \p State to translate given VPValues to IR values in the
/// vectorized loop.
void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
- VPValue *Addr, VPValue *StoredValue,
- VPValue *BlockInMask);
+ VPValue *Def, VPValue *Addr,
+ VPValue *StoredValue, VPValue *BlockInMask);
/// Set the debug location in the builder using the debug location in
/// the instruction.
@@ -544,10 +633,11 @@ protected:
/// Clear NSW/NUW flags from reduction instructions if necessary.
void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
- /// The Loop exit block may have single value PHI nodes with some
- /// incoming value. While vectorizing we only handled real values
- /// that were defined inside the loop and we should have one value for
- /// each predecessor of its parent basic block. See PR14725.
+ /// Fixup the LCSSA phi nodes in the unique exit block. This simply
+ /// means we need to add the appropriate incoming value from the middle
+ /// block as exiting edges from the scalar epilogue loop (if present) are
+ /// already in place, and we exit the vector loop exclusively to the middle
+ /// block.
void fixLCSSAPHIs();
/// Iteratively sink the scalarized operands of a predicated instruction into
@@ -586,7 +676,8 @@ protected:
/// truncate instruction, instead of widening the original IV, we widen a
/// version of the IV truncated to \p EntryVal's type.
void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
- Value *Step, Instruction *EntryVal);
+ Value *Step, Value *Start,
+ Instruction *EntryVal);
/// Returns true if an instruction \p I should be scalarized instead of
/// vectorized for the chosen vectorization factor.
@@ -654,6 +745,28 @@ protected:
const DataLayout &DL,
const InductionDescriptor &ID) const;
+ /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
+ /// vector loop preheader, middle block and scalar preheader. Also
+ /// allocate a loop object for the new vector loop and return it.
+ Loop *createVectorLoopSkeleton(StringRef Prefix);
+
+ /// Create new phi nodes for the induction variables to resume iteration count
+ /// in the scalar epilogue, from where the vectorized loop left off (given by
+ /// \p VectorTripCount).
+ /// In cases where the loop skeleton is more complicated (eg. epilogue
+ /// vectorization) and the resume values can come from an additional bypass
+ /// block, the \p AdditionalBypass pair provides information about the bypass
+ /// block and the end value on the edge from bypass to this loop.
+ void createInductionResumeValues(
+ Loop *L, Value *VectorTripCount,
+ std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
+
+ /// Complete the loop skeleton by adding debug MDs, creating appropriate
+ /// conditional branches in the middle block, preparing the builder and
+ /// running the verifier. Take in the vector loop \p L as argument, and return
+ /// the preheader of the completed vector loop.
+ BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
+
/// Add additional metadata to \p To that was not present on \p Orig.
///
/// Currently this is used to add the noalias annotations based on the
@@ -672,6 +785,11 @@ protected:
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
+ /// Allow subclasses to override and print debug traces before/after vplan
+ /// execution, when trace information is requested.
+ virtual void printDebugTracesAtStart(){};
+ virtual void printDebugTracesAtEnd(){};
+
/// The original loop.
Loop *OrigLoop;
@@ -710,7 +828,7 @@ protected:
/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
- unsigned VF;
+ ElementCount VF;
/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
@@ -730,7 +848,8 @@ protected:
/// Middle Block between the vector and the scalar.
BasicBlock *LoopMiddleBlock;
- /// The ExitBlock of the scalar loop.
+ /// The (unique) ExitBlock of the scalar loop. Note that
+ /// there can be multiple exiting edges reaching this block.
BasicBlock *LoopExitBlock;
/// The vector loop body.
@@ -779,6 +898,14 @@ protected:
// Vector of original scalar PHIs whose corresponding widened PHIs need to be
// fixed up at the end of vector code generation.
SmallVector<PHINode *, 8> OrigPHIsToFix;
+
+ /// BFI and PSI are used to check for profile guided size optimizations.
+ BlockFrequencyInfo *BFI;
+ ProfileSummaryInfo *PSI;
+
+ // Whether this loop should be optimized for size based on profile guided size
+ // optimizatios.
+ bool OptForSizeBasedOnProfile;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -789,9 +916,11 @@ public:
const TargetTransformInfo *TTI, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
LoopVectorizationLegality *LVL,
- LoopVectorizationCostModel *CM)
- : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
- UnrollFactor, LVL, CM) {}
+ LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI)
+ : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+ ElementCount::getFixed(1), UnrollFactor, LVL, CM,
+ BFI, PSI) {}
private:
Value *getBroadcastInstrs(Value *V) override;
@@ -801,6 +930,128 @@ private:
Value *reverseVector(Value *Vec) override;
};
+/// Encapsulate information regarding vectorization of a loop and its epilogue.
+/// This information is meant to be updated and used across two stages of
+/// epilogue vectorization.
+struct EpilogueLoopVectorizationInfo {
+ ElementCount MainLoopVF = ElementCount::getFixed(0);
+ unsigned MainLoopUF = 0;
+ ElementCount EpilogueVF = ElementCount::getFixed(0);
+ unsigned EpilogueUF = 0;
+ BasicBlock *MainLoopIterationCountCheck = nullptr;
+ BasicBlock *EpilogueIterationCountCheck = nullptr;
+ BasicBlock *SCEVSafetyCheck = nullptr;
+ BasicBlock *MemSafetyCheck = nullptr;
+ Value *TripCount = nullptr;
+ Value *VectorTripCount = nullptr;
+
+ EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
+ unsigned EUF)
+ : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
+ EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
+ assert(EUF == 1 &&
+ "A high UF for the epilogue loop is likely not beneficial.");
+ }
+};
+
+/// An extension of the inner loop vectorizer that creates a skeleton for a
+/// vectorized loop that has its epilogue (residual) also vectorized.
+/// The idea is to run the vplan on a given loop twice, firstly to setup the
+/// skeleton and vectorize the main loop, and secondly to complete the skeleton
+/// from the first step and vectorize the epilogue. This is achieved by
+/// deriving two concrete strategy classes from this base class and invoking
+/// them in succession from the loop vectorizer planner.
+class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
+public:
+ InnerLoopAndEpilogueVectorizer(
+ Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
+ DominatorTree *DT, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
+ LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
+ : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+ EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
+ EPI(EPI) {}
+
+ // Override this function to handle the more complex control flow around the
+ // three loops.
+ BasicBlock *createVectorizedLoopSkeleton() final override {
+ return createEpilogueVectorizedLoopSkeleton();
+ }
+
+ /// The interface for creating a vectorized skeleton using one of two
+ /// different strategies, each corresponding to one execution of the vplan
+ /// as described above.
+ virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
+
+ /// Holds and updates state information required to vectorize the main loop
+ /// and its epilogue in two separate passes. This setup helps us avoid
+ /// regenerating and recomputing runtime safety checks. It also helps us to
+ /// shorten the iteration-count-check path length for the cases where the
+ /// iteration count of the loop is so small that the main vector loop is
+ /// completely skipped.
+ EpilogueLoopVectorizationInfo &EPI;
+};
+
+/// A specialized derived class of inner loop vectorizer that performs
+/// vectorization of *main* loops in the process of vectorizing loops and their
+/// epilogues.
+class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
+public:
+ EpilogueVectorizerMainLoop(
+ Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
+ DominatorTree *DT, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
+ LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
+ : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+ EPI, LVL, CM, BFI, PSI) {}
+ /// Implements the interface for creating a vectorized skeleton using the
+ /// *main loop* strategy (ie the first pass of vplan execution).
+ BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
+
+protected:
+ /// Emits an iteration count bypass check once for the main loop (when \p
+ /// ForEpilogue is false) and once for the epilogue loop (when \p
+ /// ForEpilogue is true).
+ BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
+ bool ForEpilogue);
+ void printDebugTracesAtStart() override;
+ void printDebugTracesAtEnd() override;
+};
+
+// A specialized derived class of inner loop vectorizer that performs
+// vectorization of *epilogue* loops in the process of vectorizing loops and
+// their epilogues.
+class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
+public:
+ EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE,
+ EpilogueLoopVectorizationInfo &EPI,
+ LoopVectorizationLegality *LVL,
+ llvm::LoopVectorizationCostModel *CM,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
+ : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+ EPI, LVL, CM, BFI, PSI) {}
+ /// Implements the interface for creating a vectorized skeleton using the
+ /// *epilogue loop* strategy (ie the second pass of vplan execution).
+ BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
+
+protected:
+ /// Emits an iteration count bypass check after the main vector loop has
+ /// finished to see if there are any iterations left to execute by either
+ /// the vector epilogue or the scalar epilogue.
+ BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
+ BasicBlock *Bypass,
+ BasicBlock *Insert);
+ void printDebugTracesAtStart() override;
+ void printDebugTracesAtEnd() override;
+};
} // end namespace llvm
/// Look for a meaningful debug location on the instruction or it's
@@ -827,7 +1078,9 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr)
const DILocation *DIL = Inst->getDebugLoc();
if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
!isa<DbgInfoIntrinsic>(Inst)) {
- auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ auto NewDIL =
+ DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
if (NewDIL)
B.SetCurrentDebugLocation(NewDIL.getValue());
else
@@ -881,6 +1134,15 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
return R;
}
+/// Return a value for Step multiplied by VF.
+static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
+ assert(isa<ConstantInt>(Step) && "Expected an integer step");
+ Constant *StepVal = ConstantInt::get(
+ Step->getType(),
+ cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
+ return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
+}
+
namespace llvm {
void reportVectorizationFailure(const StringRef DebugMsg,
@@ -952,7 +1214,10 @@ enum ScalarEpilogueLowering {
CM_ScalarEpilogueNotAllowedLowTripLoop,
// Loop hint predicate indicating an epilogue is undesired.
- CM_ScalarEpilogueNotNeededUsePredicate
+ CM_ScalarEpilogueNotNeededUsePredicate,
+
+ // Directive indicating we must either tail fold or not vectorize
+ CM_ScalarEpilogueNotAllowedUsePredicate
};
/// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -979,7 +1244,7 @@ public:
/// \return An upper bound for the vectorization factor, or None if
/// vectorization and interleaving should be avoided up front.
- Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);
+ Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
/// \return True if runtime checks are required for vectorization, and false
/// otherwise.
@@ -989,10 +1254,13 @@ public:
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
/// possible.
- VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
+ VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
+ VectorizationFactor
+ selectEpilogueVectorizationFactor(const ElementCount MaxVF,
+ const LoopVectorizationPlanner &LVP);
/// Setup cost-based decisions for user vectorization factor.
- void selectUserVectorizationFactor(unsigned UserVF) {
+ void selectUserVectorizationFactor(ElementCount UserVF) {
collectUniformsAndScalars(UserVF);
collectInstsToScalarize(UserVF);
}
@@ -1006,7 +1274,7 @@ public:
/// If interleave count has been specified by metadata it will be returned.
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
/// are the selected vectorization factor and the cost of the selected VF.
- unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
+ unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
/// Memory access instruction may be vectorized in more than one way.
/// Form of instruction after vectorization depends on cost.
@@ -1015,7 +1283,7 @@ public:
/// the lists of loop-uniform and loop-scalar instructions.
/// The calculated cost is saved with widening decision in order to
/// avoid redundant calculations.
- void setCostBasedWideningDecision(unsigned VF);
+ void setCostBasedWideningDecision(ElementCount VF);
/// A struct that represents some properties of the register usage
/// of a loop.
@@ -1030,11 +1298,16 @@ public:
/// \return Returns information about the register usages of the loop for the
/// given vectorization factors.
- SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
+ SmallVector<RegisterUsage, 8>
+ calculateRegisterUsage(ArrayRef<ElementCount> VFs);
/// Collect values we want to ignore in the cost model.
void collectValuesToIgnore();
+ /// Split reductions into those that happen in the loop, and those that happen
+ /// outside. In loop reductions are collected into InLoopReductionChains.
+ void collectInLoopReductions();
+
/// \returns The smallest bitwidth each instruction can be represented with.
/// The vector equivalents of these instructions should be truncated to this
/// type.
@@ -1044,8 +1317,9 @@ public:
/// \returns True if it is more profitable to scalarize instruction \p I for
/// vectorization factor \p VF.
- bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
- assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
+ bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
+ assert(VF.isVector() &&
+ "Profitable to scalarize relevant only for VF > 1.");
// Cost model is not run in the VPlan-native path - return conservative
// result until this changes.
@@ -1059,8 +1333,8 @@ public:
}
/// Returns true if \p I is known to be uniform after vectorization.
- bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
- if (VF == 1)
+ bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
+ if (VF.isScalar())
return true;
// Cost model is not run in the VPlan-native path - return conservative
@@ -1075,8 +1349,8 @@ public:
}
/// Returns true if \p I is known to be scalar after vectorization.
- bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
- if (VF == 1)
+ bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
+ if (VF.isScalar())
return true;
// Cost model is not run in the VPlan-native path - return conservative
@@ -1092,8 +1366,8 @@ public:
/// \returns True if instruction \p I can be truncated to a smaller bitwidth
/// for vectorization factor \p VF.
- bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
- return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
+ bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
+ return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
!isProfitableToScalarize(I, VF) &&
!isScalarAfterVectorization(I, VF);
}
@@ -1110,17 +1384,18 @@ public:
/// Save vectorization decision \p W and \p Cost taken by the cost model for
/// instruction \p I and vector width \p VF.
- void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
- unsigned Cost) {
- assert(VF >= 2 && "Expected VF >=2");
+ void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
+ InstructionCost Cost) {
+ assert(VF.isVector() && "Expected VF >=2");
WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
}
/// Save vectorization decision \p W and \p Cost taken by the cost model for
/// interleaving group \p Grp and vector width \p VF.
- void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
- InstWidening W, unsigned Cost) {
- assert(VF >= 2 && "Expected VF >=2");
+ void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
+ ElementCount VF, InstWidening W,
+ InstructionCost Cost) {
+ assert(VF.isVector() && "Expected VF >=2");
/// Broadcast this decicion to all instructions inside the group.
/// But the cost will be assigned to one instruction only.
for (unsigned i = 0; i < Grp->getFactor(); ++i) {
@@ -1136,15 +1411,14 @@ public:
/// Return the cost model decision for the given instruction \p I and vector
/// width \p VF. Return CM_Unknown if this instruction did not pass
/// through the cost modeling.
- InstWidening getWideningDecision(Instruction *I, unsigned VF) {
- assert(VF >= 2 && "Expected VF >=2");
-
+ InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
+ assert(VF.isVector() && "Expected VF to be a vector VF");
// Cost model is not run in the VPlan-native path - return conservative
// result until this changes.
if (EnableVPlanNativePath)
return CM_GatherScatter;
- std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+ std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
auto Itr = WideningDecisions.find(InstOnVF);
if (Itr == WideningDecisions.end())
return CM_Unknown;
@@ -1153,9 +1427,9 @@ public:
/// Return the vectorization cost for the given instruction \p I and vector
/// width \p VF.
- unsigned getWideningCost(Instruction *I, unsigned VF) {
- assert(VF >= 2 && "Expected VF >=2");
- std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+ InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
+ assert(VF.isVector() && "Expected VF >=2");
+ std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
"The cost is not calculated");
return WideningDecisions[InstOnVF].second;
@@ -1164,7 +1438,7 @@ public:
/// Return True if instruction \p I is an optimizable truncate whose operand
/// is an induction variable. Such a truncate will be removed by adding a new
/// induction variable with the destination type.
- bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
+ bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
// If the instruction is not a truncate, return false.
auto *Trunc = dyn_cast<TruncInst>(I);
if (!Trunc)
@@ -1189,14 +1463,14 @@ public:
/// Collects the instructions to scalarize for each predicated instruction in
/// the loop.
- void collectInstsToScalarize(unsigned VF);
+ void collectInstsToScalarize(ElementCount VF);
/// Collect Uniform and Scalar values for the given \p VF.
/// The sets depend on CM decision for Load/Store instructions
/// that may be vectorized as interleave, gather-scatter or scalarized.
- void collectUniformsAndScalars(unsigned VF) {
+ void collectUniformsAndScalars(ElementCount VF) {
// Do the analysis once.
- if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
+ if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
return;
setCostBasedWideningDecision(VF);
collectLoopUniforms(VF);
@@ -1247,7 +1521,8 @@ public:
/// instructions that may divide by zero.
/// If a non-zero VF has been calculated, we check if I will be scalarized
/// predication for that VF.
- bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
+ bool isScalarWithPredication(Instruction *I,
+ ElementCount VF = ElementCount::getFixed(1));
// Returns true if \p I is an instruction that will be predicated either
// through scalar predication or masked load/store or masked gather/scatter.
@@ -1264,12 +1539,16 @@ public:
/// Returns true if \p I is a memory instruction with consecutive memory
/// access that can be widened.
- bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+ bool
+ memoryInstructionCanBeWidened(Instruction *I,
+ ElementCount VF = ElementCount::getFixed(1));
/// Returns true if \p I is a memory instruction in an interleaved-group
/// of memory accesses that can be vectorized with wide vector loads/stores
/// and shuffles.
- bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
+ bool
+ interleavedAccessCanBeWidened(Instruction *I,
+ ElementCount VF = ElementCount::getFixed(1));
/// Check if \p Instr belongs to any interleaved access group.
bool isAccessInterleaved(Instruction *Instr) {
@@ -1282,11 +1561,16 @@ public:
return InterleaveInfo.getInterleaveGroup(Instr);
}
- /// Returns true if an interleaved group requires a scalar iteration
- /// to handle accesses with gaps, and there is nothing preventing us from
- /// creating a scalar epilogue.
+ /// Returns true if we're required to use a scalar epilogue for at least
+ /// the final iteration of the original loop.
bool requiresScalarEpilogue() const {
- return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
+ if (!isScalarEpilogueAllowed())
+ return false;
+ // If we might exit from anywhere but the latch, must run the exiting
+ // iteration in scalar form.
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
+ return true;
+ return InterleaveInfo.requiresScalarEpilogue();
}
/// Returns true if a scalar epilogue is not allowed due to optsize or a
@@ -1302,17 +1586,34 @@ public:
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}
+ /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
+ /// nodes to the chain of instructions representing the reductions. Uses a
+ /// MapVector to ensure deterministic iteration order.
+ using ReductionChainMap =
+ SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
+
+ /// Return the chain of instructions representing an inloop reduction.
+ const ReductionChainMap &getInLoopReductionChains() const {
+ return InLoopReductionChains;
+ }
+
+ /// Returns true if the Phi is part of an inloop reduction.
+ bool isInLoopReduction(PHINode *Phi) const {
+ return InLoopReductionChains.count(Phi);
+ }
+
/// Estimate cost of an intrinsic call instruction CI if it were vectorized
/// with factor VF. Return the cost of the instruction, including
/// scalarization overhead if it's needed.
- unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
+ InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
/// Estimate cost of a call instruction CI if it were vectorized with factor
/// VF. Return the cost of the instruction, including scalarization overhead
/// if it's needed. The flag NeedToScalarize shows if the call needs to be
/// scalarized -
/// i.e. either vector version isn't available, or is too expensive.
- unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
+ InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
+ bool &NeedToScalarize);
/// Invalidates decisions already taken by the cost model.
void invalidateCostModelingDecisions() {
@@ -1327,7 +1628,8 @@ private:
/// \return An upper bound for the vectorization factor, a power-of-2 larger
/// than zero. One is returned if vectorization should best be avoided due
/// to cost.
- unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
+ ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
+ ElementCount UserVF);
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
@@ -1336,47 +1638,54 @@ private:
/// is
/// false, then all operations will be scalarized (i.e. no vectorization has
/// actually taken place).
- using VectorizationCostTy = std::pair<unsigned, bool>;
+ using VectorizationCostTy = std::pair<InstructionCost, bool>;
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width.
- VectorizationCostTy expectedCost(unsigned VF);
+ VectorizationCostTy expectedCost(ElementCount VF);
/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
- VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
+ VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
/// The cost-computation logic from getInstructionCost which provides
/// the vector type as an output parameter.
- unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
+ InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
+ Type *&VectorTy);
+
+ /// Return the cost of instructions in an inloop reduction pattern, if I is
+ /// part of that pattern.
+ InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
+ Type *VectorTy,
+ TTI::TargetCostKind CostKind);
/// Calculate vectorization cost of memory instruction \p I.
- unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
+ InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
/// The cost computation for scalarized memory instruction.
- unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
+ InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
/// The cost computation for interleaving group of memory instructions.
- unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
+ InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
/// The cost computation for Gather/Scatter instruction.
- unsigned getGatherScatterCost(Instruction *I, unsigned VF);
+ InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
/// The cost computation for widening instruction \p I with consecutive
/// memory access.
- unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
+ InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
/// The cost calculation for Load/Store instruction \p I with uniform pointer -
/// Load: scalar load + broadcast.
/// Store: scalar store + (loop invariant value stored? 0 : extract of last
/// element)
- unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
+ InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
- unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
+ InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);
/// Returns whether the instruction is a load or store and will be a emitted
/// as a vector operation.
@@ -1394,7 +1703,7 @@ private:
/// A type representing the costs for instructions if they were to be
/// scalarized rather than vectorized. The entries are Instruction-Cost
/// pairs.
- using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
+ using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
/// A set containing all BasicBlocks that are known to present after
/// vectorization as a predicated block.
@@ -1416,19 +1725,30 @@ private:
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
/// vectorization factor. The entries are VF-ScalarCostTy pairs.
- DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
+ DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
/// Holds the instructions known to be uniform after vectorization.
/// The data is collected per VF.
- DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
+ DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
/// Holds the instructions known to be scalar after vectorization.
/// The data is collected per VF.
- DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
+ DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
/// Holds the instructions (address computations) that are forced to be
/// scalarized.
- DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
+ DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
+
+ /// PHINodes of the reductions that should be expanded in-loop along with
+ /// their associated chains of reduction operations, in program order from top
+ /// (PHI) to bottom
+ ReductionChainMap InLoopReductionChains;
+
+ /// A Map of inloop reduction operations and their immediate chain operand.
+ /// FIXME: This can be removed once reductions can be costed correctly in
+ /// vplan. This was added to allow quick lookup to the inloop operations,
+ /// without having to loop through InLoopReductionChains.
+ DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
/// Returns the expected difference in cost from scalarizing the expression
/// feeding a predicated instruction \p PredInst. The instructions to
@@ -1436,7 +1756,7 @@ private:
/// non-negative return value implies the expression will be scalarized.
/// Currently, only single-use chains are considered for scalarization.
int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
- unsigned VF);
+ ElementCount VF);
/// Collect the instructions that are uniform after vectorization. An
/// instruction is uniform if we represent it with a single scalar value in
@@ -1447,27 +1767,28 @@ private:
/// scalarized instruction will be represented by VF scalar values in the
/// vectorized loop, each corresponding to an iteration of the original
/// scalar loop.
- void collectLoopUniforms(unsigned VF);
+ void collectLoopUniforms(ElementCount VF);
/// Collect the instructions that are scalar after vectorization. An
/// instruction is scalar if it is known to be uniform or will be scalarized
/// during vectorization. Non-uniform scalarized instructions will be
/// represented by VF values in the vectorized loop, each corresponding to an
/// iteration of the original scalar loop.
- void collectLoopScalars(unsigned VF);
+ void collectLoopScalars(ElementCount VF);
/// Keeps cost model vectorization decision and cost for instructions.
/// Right now it is used for memory instructions only.
- using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
- std::pair<InstWidening, unsigned>>;
+ using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
+ std::pair<InstWidening, InstructionCost>>;
DecisionList WideningDecisions;
/// Returns true if \p V is expected to be vectorized and it needs to be
/// extracted.
- bool needsExtract(Value *V, unsigned VF) const {
+ bool needsExtract(Value *V, ElementCount VF) const {
Instruction *I = dyn_cast<Instruction>(V);
- if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
+ if (VF.isScalar() || !I || !TheLoop->contains(I) ||
+ TheLoop->isLoopInvariant(I))
return false;
// Assume we can vectorize V (and hence we need extraction) if the
@@ -1482,11 +1803,21 @@ private:
/// Returns a range containing only operands needing to be extracted.
SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
- unsigned VF) {
+ ElementCount VF) {
return SmallVector<Value *, 4>(make_filter_range(
Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
}
+ /// Determines if we have the infrastructure to vectorize loop \p L and its
+ /// epilogue, assuming the main loop is vectorized by \p VF.
+ bool isCandidateForEpilogueVectorization(const Loop &L,
+ const ElementCount VF) const;
+
+ /// Returns true if epilogue vectorization is considered profitable, and
+ /// false otherwise.
+ /// \p VF is the vectorization factor chosen for the original loop.
+ bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
+
public:
/// The loop that we evaluate.
Loop *TheLoop;
@@ -1529,6 +1860,9 @@ public:
/// Values to ignore in the cost model when VF > 1.
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+
+ /// Profitable vector factors.
+ SmallVector<VectorizationFactor, 8> ProfitableVFs;
};
} // end namespace llvm
@@ -1549,7 +1883,7 @@ public:
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
OptimizationRemarkEmitter *ORE) {
- assert(!OuterLp->empty() && "This is not an outer loop");
+ assert(!OuterLp->isInnermost() && "This is not an outer loop");
LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
// Only outer loops with an explicit vectorization hint are supported.
@@ -1582,7 +1916,7 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI,
// now, only collect outer loops that have explicit vectorization hints. If we
// are stress testing the VPlan H-CFG construction, we collect the outermost
// loop of every loop nest.
- if (L.empty() || VPlanBuildStressTest ||
+ if (L.isInnermost() || VPlanBuildStressTest ||
(EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
LoopBlocksRPO RPOT(&L);
RPOT.perform(LI);
@@ -1696,10 +2030,10 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
}
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
- const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
+ const InductionDescriptor &II, Value *Step, Value *Start,
+ Instruction *EntryVal) {
assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
"Expected either an induction phi-node or a truncate of it!");
- Value *Start = II.getStartValue();
// Construct the initial value of the vector IV in the vector loop preheader
auto CurrIP = Builder.saveIP();
@@ -1729,7 +2063,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// Multiply the vectorization factor by the step using integer or
// floating-point arithmetic as appropriate.
- Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
+ Value *ConstVF =
+ getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
// Create a vector splat to use in the induction update.
@@ -1737,10 +2072,10 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// FIXME: If the step is non-constant, we create the vector splat with
// IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
// handle a constant vector splat.
- Value *SplatVF =
- isa<Constant>(Mul)
- ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
- : Builder.CreateVectorSplat(VF, Mul);
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ Value *SplatVF = isa<Constant>(Mul)
+ ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
+ : Builder.CreateVectorSplat(VF, Mul);
Builder.restoreIP(CurrIP);
// We may need to add the step a number of times, depending on the unroll
@@ -1816,7 +2151,8 @@ void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
}
-void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
+void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
+ TruncInst *Trunc) {
assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
"Primary induction variable must have an integer type");
@@ -1874,8 +2210,10 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part) {
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
Value *EntryPart =
- getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
+ getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
+ ID.getInductionOpcode());
VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
if (Trunc)
addMetadata(EntryPart, Trunc);
@@ -1885,7 +2223,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
// Now do the actual transformations, and start with creating the step value.
Value *Step = CreateStepValue(ID.getStep());
- if (VF <= 1) {
+ if (VF.isZero() || VF.isScalar()) {
Value *ScalarIV = CreateScalarIV(Step);
CreateSplatIV(ScalarIV, Step);
return;
@@ -1896,7 +2234,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
// least one user in the loop that is not widened.
auto NeedsScalarIV = needsScalarInduction(EntryVal);
if (!NeedsScalarIV) {
- createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
+ createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
return;
}
@@ -1904,7 +2242,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
if (!shouldScalarizeInstruction(EntryVal)) {
- createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
+ createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
Value *ScalarIV = CreateScalarIV(Step);
// Create scalar steps that can be used by instructions we will later
// scalarize. Note that the addition of the scalar steps will not increase
@@ -1926,7 +2264,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
Instruction::BinaryOps BinOp) {
// Create and check the types.
- auto *ValVTy = cast<VectorType>(Val->getType());
+ auto *ValVTy = cast<FixedVectorType>(Val->getType());
int VLen = ValVTy->getNumElements();
Type *STy = Val->getType()->getScalarType();
@@ -1983,8 +2321,7 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
Instruction *EntryVal,
const InductionDescriptor &ID) {
// We shouldn't have to build scalar steps if we aren't vectorizing.
- assert(VF > 1 && "VF should be greater than one");
-
+ assert(VF.isVector() && "VF should be greater than one");
// Get the value type and ensure it and the step have the same integer type.
Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
assert(ScalarIVTy == Step->getType() &&
@@ -2006,12 +2343,27 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
// iteration. If EntryVal is uniform, we only need to generate the first
// lane. Otherwise, we generate all VF values.
unsigned Lanes =
- Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
- : VF;
+ Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
+ ? 1
+ : VF.getKnownMinValue();
+ assert((!VF.isScalable() || Lanes == 1) &&
+ "Should never scalarize a scalable vector");
// Compute the scalar steps and save the results in VectorLoopValueMap.
for (unsigned Part = 0; Part < UF; ++Part) {
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
- auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
+ auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
+ ScalarIVTy->getScalarSizeInBits());
+ Value *StartIdx =
+ createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+ if (ScalarIVTy->isFloatingPointTy())
+ StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
+ StartIdx = addFastMathFlag(Builder.CreateBinOp(
+ AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
+ // The step returned by `createStepForVF` is a runtime-evaluated value
+ // when VF is scalable. Otherwise, it should be folded into a Constant.
+ assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
+ "Expected StartIdx to be folded to a constant when VF is not "
+ "scalable");
auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
@@ -2045,7 +2397,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
// If we aren't vectorizing, we can just copy the scalar map values over to
// the vector map.
- if (VF == 1) {
+ if (VF.isScalar()) {
VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
return ScalarValue;
}
@@ -2054,7 +2406,11 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
// is known to be uniform after vectorization, this corresponds to lane zero
// of the Part unroll iteration. Otherwise, the last instruction is the one
// we created for the last vector lane of the Part unroll iteration.
- unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
+ unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
+ ? 0
+ : VF.getKnownMinValue() - 1;
+ assert((!VF.isScalable() || LastLane == 0) &&
+ "Scalable vectorization can't lead to any scalarized values.");
auto *LastInst = cast<Instruction>(
VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
@@ -2075,10 +2431,11 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
VectorValue = getBroadcastInstrs(ScalarValue);
VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
} else {
- // Initialize packing with insertelements to start from undef.
- Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF));
- VectorLoopValueMap.setVectorValue(V, Part, Undef);
- for (unsigned Lane = 0; Lane < VF; ++Lane)
+ // Initialize packing with insertelements to start from poison.
+ assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+ Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF));
+ VectorLoopValueMap.setVectorValue(V, Part, Poison);
+ for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
packScalarIntoVectorValue(V, {Part, Lane});
VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
}
@@ -2117,7 +2474,7 @@ InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
// extractelement instruction.
auto *U = getOrCreateVectorValue(V, Instance.Part);
if (!U->getType()->isVectorTy()) {
- assert(VF == 1 && "Value not scalarized has non-vector type");
+ assert(VF.isScalar() && "Value not scalarized has non-vector type");
return U;
}
@@ -2142,12 +2499,12 @@ void InnerLoopVectorizer::packScalarIntoVectorValue(
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
assert(Vec->getType()->isVectorTy() && "Invalid type");
+ assert(!VF.isScalable() && "Cannot reverse scalable vectors");
SmallVector<int, 8> ShuffleMask;
- for (unsigned i = 0; i < VF; ++i)
- ShuffleMask.push_back(VF - i - 1);
+ for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
+ ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
- return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
- ShuffleMask, "reverse");
+ return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
}
// Return whether we allow using masked interleave-groups (for dealing with
@@ -2172,9 +2529,9 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
// }
// To:
// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
-// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
-// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
-// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
+// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
+// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
+// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
//
// Or translate following interleaved store group (factor = 3):
// for (i = 0; i < N; i+=3) {
@@ -2185,20 +2542,22 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
// }
// To:
// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
-// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
+// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(
- const InterleaveGroup<Instruction> *Group, VPTransformState &State,
- VPValue *Addr, VPValue *BlockInMask) {
+ const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
+ VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
+ VPValue *BlockInMask) {
Instruction *Instr = Group->getInsertPos();
const DataLayout &DL = Instr->getModule()->getDataLayout();
// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = getMemInstValueType(Instr);
unsigned InterleaveFactor = Group->getFactor();
- auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF);
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
// Prepare for the new pointers.
SmallVector<Value *, 2> AddrParts;
@@ -2214,8 +2573,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
// pointer operand of the interleaved access is supposed to be uniform. For
// uniform instructions, we're only required to generate a value for the
// first vector lane in each unroll iteration.
+ assert(!VF.isScalable() &&
+ "scalable vector reverse operation is not implemented");
if (Group->isReverse())
- Index += (VF - 1) * Group->getFactor();
+ Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
for (unsigned Part = 0; Part < UF; Part++) {
Value *AddrPart = State.get(Addr, {Part, 0});
@@ -2246,11 +2607,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
setDebugLocFromInst(Builder, Instr);
- Value *UndefVec = UndefValue::get(VecTy);
+ Value *PoisonVec = PoisonValue::get(VecTy);
Value *MaskForGaps = nullptr;
if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
- MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
assert(MaskForGaps && "Mask for Gaps is required but it is null");
}
@@ -2266,10 +2628,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
Value *GroupMask = MaskForGaps;
if (BlockInMask) {
Value *BlockInMaskPart = State.get(BlockInMask, Part);
- auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart, Undefs,
- createReplicatedMask(InterleaveFactor, VF), "interleaved.mask");
+ BlockInMaskPart,
+ createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
+ "interleaved.mask");
GroupMask = MaskForGaps
? Builder.CreateBinOp(Instruction::And, ShuffledMask,
MaskForGaps)
@@ -2277,7 +2640,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
NewLoad =
Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
- GroupMask, UndefVec, "wide.masked.vec");
+ GroupMask, PoisonVec, "wide.masked.vec");
}
else
NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
@@ -2288,6 +2651,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
// For each member in the group, shuffle out the appropriate data from the
// wide loads.
+ unsigned J = 0;
for (unsigned I = 0; I < InterleaveFactor; ++I) {
Instruction *Member = Group->getMember(I);
@@ -2295,28 +2659,33 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
if (!Member)
continue;
- auto StrideMask = createStrideMask(I, InterleaveFactor, VF);
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ auto StrideMask =
+ createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
for (unsigned Part = 0; Part < UF; Part++) {
Value *StridedVec = Builder.CreateShuffleVector(
- NewLoads[Part], UndefVec, StrideMask, "strided.vec");
+ NewLoads[Part], StrideMask, "strided.vec");
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
- VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF);
+ assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+ VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
}
if (Group->isReverse())
StridedVec = reverseVector(StridedVec);
- VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
+ State.set(VPDefs[J], Member, StridedVec, Part);
}
+ ++J;
}
return;
}
// The sub vector type for current instruction.
- auto *SubVT = FixedVectorType::get(ScalarTy, VF);
+ assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+ auto *SubVT = VectorType::get(ScalarTy, VF);
// Vectorize the interleaved store group.
for (unsigned Part = 0; Part < UF; Part++) {
@@ -2324,11 +2693,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
SmallVector<Value *, 4> StoredVecs;
for (unsigned i = 0; i < InterleaveFactor; i++) {
// Interleaved store group doesn't allow a gap, so each index has a member
- Instruction *Member = Group->getMember(i);
- assert(Member && "Fail to get a member from an interleaved store group");
+ assert(Group->getMember(i) && "Fail to get a member from an interleaved store group");
+
+ Value *StoredVec = State.get(StoredValues[i], Part);
- Value *StoredVec = getOrCreateVectorValue(
- cast<StoreInst>(Member)->getValueOperand(), Part);
if (Group->isReverse())
StoredVec = reverseVector(StoredVec);
@@ -2344,16 +2712,17 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
Value *WideVec = concatenateVectors(Builder, StoredVecs);
// Interleave the elements in the wide vector.
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
Value *IVec = Builder.CreateShuffleVector(
- WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor),
+ WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
"interleaved.vec");
Instruction *NewStoreInstr;
if (BlockInMask) {
Value *BlockInMaskPart = State.get(BlockInMask, Part);
- auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF),
+ BlockInMaskPart,
+ createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
"interleaved.mask");
NewStoreInstr = Builder.CreateMaskedStore(
IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
@@ -2366,11 +2735,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
}
-void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
- VPTransformState &State,
- VPValue *Addr,
- VPValue *StoredValue,
- VPValue *BlockInMask) {
+void InnerLoopVectorizer::vectorizeMemoryInstruction(
+ Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
+ VPValue *StoredValue, VPValue *BlockInMask) {
// Attempt to issue a wide load.
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);
@@ -2387,7 +2754,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
"CM decision is not to widen the memory instruction");
Type *ScalarDataTy = getMemInstValueType(Instr);
- auto *DataTy = FixedVectorType::get(ScalarDataTy, VF);
+
+ auto *DataTy = VectorType::get(ScalarDataTy, VF);
const Align Alignment = getLoadStoreAlignment(Instr);
// Determine if the pointer operand of the access is either consecutive or
@@ -2419,19 +2787,23 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
InBounds = gep->isInBounds();
if (Reverse) {
+ assert(!VF.isScalable() &&
+ "Reversing vectors is not yet supported for scalable vectors.");
+
// If the address is consecutive but reversed, then the
// wide store needs to start at the last vector element.
- PartPtr = cast<GetElementPtrInst>(
- Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
+ PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
+ ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
PartPtr->setIsInBounds(InBounds);
- PartPtr = cast<GetElementPtrInst>(
- Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
+ PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
+ ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
PartPtr->setIsInBounds(InBounds);
if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
} else {
+ Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
PartPtr = cast<GetElementPtrInst>(
- Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
+ Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
PartPtr->setIsInBounds(InBounds);
}
@@ -2486,7 +2858,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
if (isMaskRequired)
NewLI = Builder.CreateMaskedLoad(
- VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
+ VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
"wide.masked.load");
else
NewLI =
@@ -2497,7 +2869,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
if (Reverse)
NewLI = reverseVector(NewLI);
}
- VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
+
+ State.set(Def, Instr, NewLI, Part);
}
}
@@ -2507,6 +2880,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
VPTransformState &State) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+ // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
+ // the first lane and part.
+ if (isa<NoAliasScopeDeclInst>(Instr))
+ if (Instance.Lane != 0 || Instance.Part != 0)
+ return;
+
setDebugLocFromInst(Builder, Instr);
// Does this instruction return a value ?
@@ -2519,7 +2898,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
- auto *NewOp = State.get(User.getOperand(op), Instance);
+ auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
+ auto InputInstance = Instance;
+ if (!Operand || !OrigLoop->contains(Operand) ||
+ (Cost->isUniformAfterVectorization(Operand, State.VF)))
+ InputInstance.Lane = 0;
+ auto *NewOp = State.get(User.getOperand(op), InputInstance);
Cloned->setOperand(op, NewOp);
}
addNewMetadata(Cloned, Instr);
@@ -2527,7 +2911,9 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
// Place the cloned scalar in the new loop.
Builder.Insert(Cloned);
- // Add the cloned scalar to the scalar map entry.
+ // TODO: Set result for VPValue of VPReciplicateRecipe. This requires
+ // representing scalar values in VPTransformState. Add the cloned scalar to
+ // the scalar map entry.
VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
// If we just cloned a new assumption, add it the assumption cache.
@@ -2564,7 +2950,7 @@ PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
Induction->addIncoming(Next, Latch);
// Create the compare.
Value *ICmp = Builder.CreateICmpEQ(Next, End);
- Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
+ Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
// Now we have two terminators. Remove the old one from the block.
Latch->getTerminator()->eraseFromParent();
@@ -2581,7 +2967,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
// Find the loop boundaries.
ScalarEvolution *SE = PSE.getSE();
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
- assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
+ assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
"Invalid loop count");
Type *IdxTy = Legal->getWidestInductionType();
@@ -2627,7 +3013,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
Type *Ty = TC->getType();
- Constant *Step = ConstantInt::get(Ty, VF * UF);
+ // This is where we can make the step a runtime constant.
+ Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
// If the tail is to be folded by masking, round the number of iterations N
// up to a multiple of Step instead of rounding down. This is done by first
@@ -2636,9 +3023,12 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
// that it starts at zero and its Step is a power of two; the loop will then
// exit, with the last early-exit vector comparison also producing all-true.
if (Cost->foldTailByMasking()) {
- assert(isPowerOf2_32(VF * UF) &&
+ assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
"VF*UF must be a power of 2 when folding tail by masking");
- TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
+ assert(!VF.isScalable() &&
+ "Tail folding not yet supported for scalable vectors");
+ TC = Builder.CreateAdd(
+ TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
}
// Now we need to generate the expression for the part of the loop that the
@@ -2648,14 +3038,18 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
// unroll factor (number of SIMD instructions).
Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
- // If there is a non-reversed interleaved group that may speculatively access
- // memory out-of-bounds, we need to ensure that there will be at least one
- // iteration of the scalar epilogue loop. Thus, if the step evenly divides
+ // There are two cases where we need to ensure (at least) the last iteration
+ // runs in the scalar remainder loop. Thus, if the step evenly divides
// the trip count, we set the remainder to be equal to the step. If the step
// does not evenly divide the trip count, no adjustment is necessary since
// there will already be scalar iterations. Note that the minimum iterations
- // check ensures that N >= Step.
- if (VF > 1 && Cost->requiresScalarEpilogue()) {
+ // check ensures that N >= Step. The cases are:
+ // 1) If there is a non-reversed interleaved group that may speculatively
+ // access memory out-of-bounds.
+ // 2) If any instruction may follow a conditionally taken exit. That is, if
+ // the loop contains multiple exiting blocks, or a single exiting block
+ // which is not the latch.
+ if (VF.isVector() && Cost->requiresScalarEpilogue()) {
auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
R = Builder.CreateSelect(IsZero, Step, R);
}
@@ -2668,17 +3062,18 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
const DataLayout &DL) {
// Verify that V is a vector type with same number of elements as DstVTy.
- unsigned VF = DstVTy->getNumElements();
- VectorType *SrcVecTy = cast<VectorType>(V->getType());
+ auto *DstFVTy = cast<FixedVectorType>(DstVTy);
+ unsigned VF = DstFVTy->getNumElements();
+ auto *SrcVecTy = cast<FixedVectorType>(V->getType());
assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
Type *SrcElemTy = SrcVecTy->getElementType();
- Type *DstElemTy = DstVTy->getElementType();
+ Type *DstElemTy = DstFVTy->getElementType();
assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
"Vector elements must have same size");
// Do a direct cast if element types are castable.
if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
- return Builder.CreateBitOrPointerCast(V, DstVTy);
+ return Builder.CreateBitOrPointerCast(V, DstFVTy);
}
// V cannot be directly casted to desired vector type.
// May happen when V is a floating point vector but DstVTy is a vector of
@@ -2692,7 +3087,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
auto *VecIntTy = FixedVectorType::get(IntTy, VF);
Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
- return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
+ return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
}
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
@@ -2713,11 +3108,11 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
// If tail is to be folded, vector loop takes care of all iterations.
Value *CheckMinIters = Builder.getFalse();
- if (!Cost->foldTailByMasking())
- CheckMinIters = Builder.CreateICmp(
- P, Count, ConstantInt::get(Count->getType(), VF * UF),
- "min.iters.check");
-
+ if (!Cost->foldTailByMasking()) {
+ Value *Step =
+ createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
+ CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
+ }
// Create new preheader for vector loop.
LoopVectorPreHeader =
SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
@@ -2754,7 +3149,9 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
if (C->isZero())
return;
- assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
+ assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
+ (OptForSizeBasedOnProfile &&
+ Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
"Cannot SCEV check stride or overflow when optimizing for size");
SCEVCheckBlock->setName("vector.scevcheck");
@@ -2792,15 +3189,8 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
if (!RtPtrChecking.Need)
return;
- Instruction *FirstCheckInst;
- Instruction *MemRuntimeCheck;
- std::tie(FirstCheckInst, MemRuntimeCheck) =
- addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
- RtPtrChecking.getChecks(), RtPtrChecking.getSE());
- assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
- "claimed checks are required");
- if (MemCheckBlock->getParent()->hasOptSize()) {
+ if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
"Cannot emit memory checks when optimizing for size, unless forced "
"to vectorize.");
@@ -2820,22 +3210,33 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
"vector.ph");
+ auto *CondBranch = cast<BranchInst>(
+ Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
+ ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
+ LoopBypassBlocks.push_back(MemCheckBlock);
+ AddedSafetyChecks = true;
+
// Update dominator only if this is first RT check.
if (LoopBypassBlocks.empty()) {
DT->changeImmediateDominator(Bypass, MemCheckBlock);
DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
}
- ReplaceInstWithInst(
- MemCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
- LoopBypassBlocks.push_back(MemCheckBlock);
- AddedSafetyChecks = true;
+ Instruction *FirstCheckInst;
+ Instruction *MemRuntimeCheck;
+ std::tie(FirstCheckInst, MemRuntimeCheck) =
+ addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
+ RtPtrChecking.getChecks(), RtPtrChecking.getSE());
+ assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
+ "claimed checks are required");
+ CondBranch->setCondition(MemRuntimeCheck);
// We currently don't use LoopVersioning for the actual loop cloning but we
// still use it to add the noalias metadata.
- LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
- PSE.getSE());
+ LVer = std::make_unique<LoopVersioning>(
+ *Legal->getLAI(),
+ Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
+ DT, PSE.getSE());
LVer->prepareNoAliasMetadata();
}
@@ -2939,74 +3340,35 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
llvm_unreachable("invalid enum");
}
-BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
- /*
- In this function we generate a new loop. The new loop will contain
- the vectorized instructions while the old loop will continue to run the
- scalar remainder.
-
- [ ] <-- loop iteration number check.
- / |
- / v
- | [ ] <-- vector loop bypass (may consist of multiple blocks).
- | / |
- | / v
- || [ ] <-- vector pre header.
- |/ |
- | v
- | [ ] \
- | [ ]_| <-- vector loop.
- | |
- | v
- | -[ ] <--- middle-block.
- | / |
- | / v
- -|- >[ ] <--- new preheader.
- | |
- | v
- | [ ] \
- | [ ]_| <-- old scalar loop to handle remainder.
- \ |
- \ v
- >[ ] <-- exit block.
- ...
- */
-
- MDNode *OrigLoopID = OrigLoop->getLoopID();
-
- // Some loops have a single integer induction variable, while other loops
- // don't. One example is c++ iterators that often have multiple pointer
- // induction variables. In the code below we also support a case where we
- // don't have a single induction variable.
- //
- // We try to obtain an induction variable from the original loop as hard
- // as possible. However if we don't find one that:
- // - is an integer
- // - counts from zero, stepping by one
- // - is the size of the widest induction variable type
- // then we create a new one.
- OldInduction = Legal->getPrimaryInduction();
- Type *IdxTy = Legal->getWidestInductionType();
-
- // Split the single block loop into the two loop structure described above.
+Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopScalarBody = OrigLoop->getHeader();
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
- LoopExitBlock = OrigLoop->getExitBlock();
+ LoopExitBlock = OrigLoop->getUniqueExitBlock();
assert(LoopExitBlock && "Must have an exit block");
assert(LoopVectorPreHeader && "Invalid loop structure");
LoopMiddleBlock =
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
- LI, nullptr, "middle.block");
+ LI, nullptr, Twine(Prefix) + "middle.block");
LoopScalarPreHeader =
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
- nullptr, "scalar.ph");
+ nullptr, Twine(Prefix) + "scalar.ph");
+
+ // Set up branch from middle block to the exit and scalar preheader blocks.
+ // completeLoopSkeleton will update the condition to use an iteration check,
+ // if required to decide whether to execute the remainder.
+ BranchInst *BrInst =
+ BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
+ auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
+ BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
+ ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
+
// We intentionally don't let SplitBlock to update LoopInfo since
// LoopVectorBody should belong to another loop than LoopVectorPreHeader.
// LoopVectorBody is explicitly added to the correct place few lines later.
LoopVectorBody =
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
- nullptr, nullptr, "vector.body");
+ nullptr, nullptr, Twine(Prefix) + "vector.body");
// Update dominator for loop exit.
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
@@ -3023,37 +3385,16 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
LI->addTopLevelLoop(Lp);
}
Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
+ return Lp;
+}
- // Find the loop boundaries.
- Value *Count = getOrCreateTripCount(Lp);
-
- Value *StartIdx = ConstantInt::get(IdxTy, 0);
-
- // Now, compare the new count to zero. If it is zero skip the vector loop and
- // jump to the scalar loop. This check also covers the case where the
- // backedge-taken count is uint##_max: adding one to it will overflow leading
- // to an incorrect trip count of zero. In this (rare) case we will also jump
- // to the scalar loop.
- emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
-
- // Generate the code to check any assumptions that we've made for SCEV
- // expressions.
- emitSCEVChecks(Lp, LoopScalarPreHeader);
-
- // Generate the code that checks in runtime if arrays overlap. We put the
- // checks into a separate block to make the more common case of few elements
- // faster.
- emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
-
- // Generate the induction variable.
- // The loop step is equal to the vectorization factor (num of SIMD elements)
- // times the unroll factor (num of SIMD instructions).
- Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
- Constant *Step = ConstantInt::get(IdxTy, VF * UF);
- Induction =
- createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
- getDebugLocFromInstOrOperands(OldInduction));
-
+void InnerLoopVectorizer::createInductionResumeValues(
+ Loop *L, Value *VectorTripCount,
+ std::pair<BasicBlock *, Value *> AdditionalBypass) {
+ assert(VectorTripCount && L && "Expected valid arguments");
+ assert(((AdditionalBypass.first && AdditionalBypass.second) ||
+ (!AdditionalBypass.first && !AdditionalBypass.second)) &&
+ "Inconsistent information about additional bypass.");
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
// PHIs that are left in the scalar version of the loop.
@@ -3061,10 +3402,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// iteration in the vectorized loop.
// If we come from a bypass edge then we need to start from the original
// start value.
-
- // This variable saves the new starting index for the scalar loop. It is used
- // to test if there are any tail iterations left once the vector loop has
- // completed.
for (auto &InductionEntry : Legal->getInductionVars()) {
PHINode *OrigPhi = InductionEntry.first;
InductionDescriptor II = InductionEntry.second;
@@ -3076,20 +3413,32 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// Copy original phi DL over to the new one.
BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
Value *&EndValue = IVEndValues[OrigPhi];
+ Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
if (OrigPhi == OldInduction) {
// We know what the end value is.
- EndValue = CountRoundDown;
+ EndValue = VectorTripCount;
} else {
- IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
+ IRBuilder<> B(L->getLoopPreheader()->getTerminator());
Type *StepType = II.getStep()->getType();
Instruction::CastOps CastOp =
- CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
- Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
+ CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
+ Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
EndValue->setName("ind.end");
- }
+ // Compute the end value for the additional bypass (if applicable).
+ if (AdditionalBypass.first) {
+ B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
+ CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
+ StepType, true);
+ CRD =
+ B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
+ EndValueFromAdditionalBypass =
+ emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
+ EndValueFromAdditionalBypass->setName("ind.end");
+ }
+ }
// The new PHI merges the original incoming value, in case of a bypass,
// or the value at the end of the vectorized loop.
BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
@@ -3099,42 +3448,44 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// value.
for (BasicBlock *BB : LoopBypassBlocks)
BCResumeVal->addIncoming(II.getStartValue(), BB);
+
+ if (AdditionalBypass.first)
+ BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
+ EndValueFromAdditionalBypass);
+
OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
}
+}
+
+BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
+ MDNode *OrigLoopID) {
+ assert(L && "Expected valid loop.");
- // We need the OrigLoop (scalar loop part) latch terminator to help
- // produce correct debug info for the middle block BB instructions.
- // The legality check stage guarantees that the loop will have a single
- // latch.
- assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
- "Scalar loop latch terminator isn't a branch");
- BranchInst *ScalarLatchBr =
- cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
+ // The trip counts should be cached by now.
+ Value *Count = getOrCreateTripCount(L);
+ Value *VectorTripCount = getOrCreateVectorTripCount(L);
+
+ auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
// Add a check in the middle block to see if we have completed
// all of the iterations in the first vector loop.
// If (N - N%VF) == N, then we *don't* need to run the remainder.
// If tail is to be folded, we know we don't need to run the remainder.
- Value *CmpN = Builder.getTrue();
if (!Cost->foldTailByMasking()) {
- CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
- CountRoundDown, "cmp.n",
- LoopMiddleBlock->getTerminator());
+ Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+ Count, VectorTripCount, "cmp.n",
+ LoopMiddleBlock->getTerminator());
- // Here we use the same DebugLoc as the scalar loop latch branch instead
+ // Here we use the same DebugLoc as the scalar loop latch terminator instead
// of the corresponding compare because they may have ended up with
// different line numbers and we want to avoid awkward line stepping while
// debugging. Eg. if the compare has got a line number inside the loop.
- cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
+ CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
+ cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
}
- BranchInst *BrInst =
- BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
- BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
- ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
-
// Get ready to start creating new instructions into the vectorized body.
- assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
+ assert(LoopVectorPreHeader == L->getLoopPreheader() &&
"Inconsistent vector loop preheader");
Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
@@ -3142,7 +3493,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupVectorized});
if (VectorizedLoopID.hasValue()) {
- Lp->setLoopID(VectorizedLoopID.getValue());
+ L->setLoopID(VectorizedLoopID.getValue());
// Do not setAlreadyVectorized if loop attributes have been defined
// explicitly.
@@ -3152,9 +3503,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
if (MDNode *LID = OrigLoop->getLoopID())
- Lp->setLoopID(LID);
+ L->setLoopID(LID);
- LoopVectorizeHints Hints(Lp, true, *ORE);
+ LoopVectorizeHints Hints(L, true, *ORE);
Hints.setAlreadyVectorized();
#ifdef EXPENSIVE_CHECKS
@@ -3165,6 +3516,91 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
return LoopVectorPreHeader;
}
+BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
+ /*
+ In this function we generate a new loop. The new loop will contain
+ the vectorized instructions while the old loop will continue to run the
+ scalar remainder.
+
+ [ ] <-- loop iteration number check.
+ / |
+ / v
+ | [ ] <-- vector loop bypass (may consist of multiple blocks).
+ | / |
+ | / v
+ || [ ] <-- vector pre header.
+ |/ |
+ | v
+ | [ ] \
+ | [ ]_| <-- vector loop.
+ | |
+ | v
+ | -[ ] <--- middle-block.
+ | / |
+ | / v
+ -|- >[ ] <--- new preheader.
+ | |
+ | v
+ | [ ] \
+ | [ ]_| <-- old scalar loop to handle remainder.
+ \ |
+ \ v
+ >[ ] <-- exit block.
+ ...
+ */
+
+ // Get the metadata of the original loop before it gets modified.
+ MDNode *OrigLoopID = OrigLoop->getLoopID();
+
+ // Create an empty vector loop, and prepare basic blocks for the runtime
+ // checks.
+ Loop *Lp = createVectorLoopSkeleton("");
+
+ // Now, compare the new count to zero. If it is zero skip the vector loop and
+ // jump to the scalar loop. This check also covers the case where the
+ // backedge-taken count is uint##_max: adding one to it will overflow leading
+ // to an incorrect trip count of zero. In this (rare) case we will also jump
+ // to the scalar loop.
+ emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
+
+ // Generate the code to check any assumptions that we've made for SCEV
+ // expressions.
+ emitSCEVChecks(Lp, LoopScalarPreHeader);
+
+ // Generate the code that checks in runtime if arrays overlap. We put the
+ // checks into a separate block to make the more common case of few elements
+ // faster.
+ emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
+
+ // Some loops have a single integer induction variable, while other loops
+ // don't. One example is c++ iterators that often have multiple pointer
+ // induction variables. In the code below we also support a case where we
+ // don't have a single induction variable.
+ //
+ // We try to obtain an induction variable from the original loop as hard
+ // as possible. However if we don't find one that:
+ // - is an integer
+ // - counts from zero, stepping by one
+ // - is the size of the widest induction variable type
+ // then we create a new one.
+ OldInduction = Legal->getPrimaryInduction();
+ Type *IdxTy = Legal->getWidestInductionType();
+ Value *StartIdx = ConstantInt::get(IdxTy, 0);
+ // The loop step is equal to the vectorization factor (num of SIMD elements)
+ // times the unroll factor (num of SIMD instructions).
+ Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
+ Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
+ Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
+ Induction =
+ createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+ getDebugLocFromInstOrOperands(OldInduction));
+
+ // Emit phis for the new starting index of the scalar loop.
+ createInductionResumeValues(Lp, CountRoundDown);
+
+ return completeLoopSkeleton(Lp, OrigLoopID);
+}
+
// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
@@ -3178,7 +3614,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
// value (the value that feeds into the phi from the loop latch).
// We allow both, but they, obviously, have different values.
- assert(OrigLoop->getExitBlock() && "Expected a single exit block");
+ assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
DenseMap<Value *, Value *> MissingVals;
@@ -3284,9 +3720,10 @@ static void cse(BasicBlock *BB) {
}
}
-unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
- unsigned VF,
- bool &NeedToScalarize) {
+InstructionCost
+LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
+ bool &NeedToScalarize) {
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
Function *F = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
@@ -3297,9 +3734,9 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
// to be vectors, so we need to extract individual elements from there,
// execute VF scalar calls, and then gather the result into the vector return
// value.
- unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
- TTI::TCK_RecipThroughput);
- if (VF == 1)
+ InstructionCost ScalarCallCost =
+ TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
+ if (VF.isScalar())
return ScalarCallCost;
// Compute corresponding vector type for return value and arguments.
@@ -3309,31 +3746,33 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
- unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
+ InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
- unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
+ InstructionCost Cost =
+ ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
// If we can't emit a vector call for this function, then the currently found
// cost is the cost we need to return.
NeedToScalarize = true;
- VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
+ VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
if (!TLI || CI->isNoBuiltin() || !VecFunc)
return Cost;
// If the corresponding vector cost is cheaper, return its cost.
- unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
- TTI::TCK_RecipThroughput);
+ InstructionCost VectorCallCost =
+ TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
if (VectorCallCost < Cost) {
NeedToScalarize = false;
- return VectorCallCost;
+ Cost = VectorCallCost;
}
return Cost;
}
-unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
- unsigned VF) {
+InstructionCost
+LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
+ ElementCount VF) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
@@ -3373,7 +3812,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
Type *ScalarTruncatedTy =
IntegerType::get(OriginalTy->getContext(), KV.second);
auto *TruncatedTy = FixedVectorType::get(
- ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
+ ScalarTruncatedTy,
+ cast<FixedVectorType>(OriginalTy)->getNumElements());
if (TruncatedTy == OriginalTy)
continue;
@@ -3423,13 +3863,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
break;
}
} else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
- auto Elements0 =
- cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
+ auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
+ ->getNumElements();
auto *O0 = B.CreateZExtOrTrunc(
SI->getOperand(0),
FixedVectorType::get(ScalarTruncatedTy, Elements0));
- auto Elements1 =
- cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
+ auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
+ ->getNumElements();
auto *O1 = B.CreateZExtOrTrunc(
SI->getOperand(1),
FixedVectorType::get(ScalarTruncatedTy, Elements1));
@@ -3439,16 +3879,16 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
// Don't do anything with the operands, just extend the result.
continue;
} else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
- auto Elements =
- cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
+ auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
+ ->getNumElements();
auto *O0 = B.CreateZExtOrTrunc(
IE->getOperand(0),
FixedVectorType::get(ScalarTruncatedTy, Elements));
auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
} else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
- auto Elements =
- cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
+ auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
+ ->getNumElements();
auto *O0 = B.CreateZExtOrTrunc(
EE->getOperand(0),
FixedVectorType::get(ScalarTruncatedTy, Elements));
@@ -3490,7 +3930,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
void InnerLoopVectorizer::fixVectorizedLoop() {
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
- if (VF > 1)
+ if (VF.isVector())
truncateToMinimalBitwidths();
// Fix widened non-induction PHIs by setting up the PHI operands.
@@ -3531,9 +3971,13 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
// profile is not inherently precise anyway. Note also possible bypass of
// vector code caused by legality checks is ignored, assigning all the weight
// to the vector loop, optimistically.
- setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
- LI->getLoopFor(LoopVectorBody),
- LI->getLoopFor(LoopScalarBody), VF * UF);
+ //
+ // For scalable vectorization we can't know at compile time how many iterations
+ // of the loop are handled in one vector iteration, so instead assume a pessimistic
+ // vscale of '1'.
+ setProfileInfoAfterUnrolling(
+ LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
+ LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}
void InnerLoopVectorizer::fixCrossIterationPHIs() {
@@ -3612,11 +4056,12 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Create a vector from the initial value.
auto *VectorInit = ScalarInit;
- if (VF > 1) {
+ if (VF.isVector()) {
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ assert(!VF.isScalable() && "VF is assumed to be non scalable.");
VectorInit = Builder.CreateInsertElement(
- UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)),
- VectorInit, Builder.getInt32(VF - 1), "vector.recur.init");
+ PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
+ Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
}
// We constructed a temporary phi node in the first phase of vectorization.
@@ -3657,10 +4102,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// We will construct a vector for the recurrence by combining the values for
// the current and previous iterations. This is the required shuffle mask.
- SmallVector<int, 8> ShuffleMask(VF);
- ShuffleMask[0] = VF - 1;
- for (unsigned I = 1; I < VF; ++I)
- ShuffleMask[I] = I + VF - 1;
+ assert(!VF.isScalable());
+ SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
+ ShuffleMask[0] = VF.getKnownMinValue() - 1;
+ for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
+ ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
// The vector from which to take the initial value for the current iteration
// (actual or unrolled). Initially, this is the vector phi node.
@@ -3670,9 +4116,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
for (unsigned Part = 0; Part < UF; ++Part) {
Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
- auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
- ShuffleMask)
- : Incoming;
+ auto *Shuffle =
+ VF.isVector()
+ ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
+ : Incoming;
PhiPart->replaceAllUsesWith(Shuffle);
cast<Instruction>(PhiPart)->eraseFromParent();
VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
@@ -3685,10 +4132,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Extract the last vector element in the middle block. This will be the
// initial value for the recurrence when jumping to the scalar loop.
auto *ExtractForScalar = Incoming;
- if (VF > 1) {
+ if (VF.isVector()) {
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
ExtractForScalar = Builder.CreateExtractElement(
- ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
+ ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
+ "vector.recur.extract");
}
// Extract the second last element in the middle block if the
// Phi is used outside the loop. We need to extract the phi itself
@@ -3696,9 +4144,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// will be the value when jumping to the exit block from the LoopMiddleBlock,
// when the scalar loop is not run at all.
Value *ExtractForPhiUsedOutsideLoop = nullptr;
- if (VF > 1)
+ if (VF.isVector())
ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
- Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
+ Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
+ "vector.recur.extract.for.phi");
// When loop is unrolled without vectorizing, initialize
// ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
// `Incoming`. This is analogous to the vectorized case above: extracting the
@@ -3722,69 +4171,31 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// vector recurrence we extracted in the middle block. Since the loop is in
// LCSSA form, we just need to find all the phi nodes for the original scalar
// recurrence in the exit block, and then add an edge for the middle block.
- for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
- if (LCSSAPhi.getIncomingValue(0) == Phi) {
+ // Note that LCSSA does not imply single entry when the original scalar loop
+ // had multiple exiting edges (as we always run the last iteration in the
+ // scalar epilogue); in that case, the exiting path through middle will be
+ // dynamically dead and the value picked for the phi doesn't matter.
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis())
+ if (any_of(LCSSAPhi.incoming_values(),
+ [Phi](Value *V) { return V == Phi; }))
LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
- }
- }
}
void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
- Constant *Zero = Builder.getInt32(0);
-
// Get it's reduction variable descriptor.
assert(Legal->isReductionVariable(Phi) &&
"Unable to find the reduction variable");
RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
- RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
+ RecurKind RK = RdxDesc.getRecurrenceKind();
TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
- RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
- RdxDesc.getMinMaxRecurrenceKind();
setDebugLocFromInst(Builder, ReductionStartValue);
-
- // We need to generate a reduction vector from the incoming scalar.
- // To do so, we need to generate the 'identity' vector and override
- // one of the elements with the incoming scalar reduction. We need
- // to do it in the vector-loop preheader.
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
// This is the vector-clone of the value that leaves the loop.
Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
- // Find the reduction identity variable. Zero for addition, or, xor,
- // one for multiplication, -1 for And.
- Value *Identity;
- Value *VectorStart;
- if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
- RK == RecurrenceDescriptor::RK_FloatMinMax) {
- // MinMax reduction have the start value as their identify.
- if (VF == 1) {
- VectorStart = Identity = ReductionStartValue;
- } else {
- VectorStart = Identity =
- Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
- }
- } else {
- // Handle other reduction kinds:
- Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
- RK, VecTy->getScalarType());
- if (VF == 1) {
- Identity = Iden;
- // This vector is the Identity vector where the first element is the
- // incoming scalar reduction.
- VectorStart = ReductionStartValue;
- } else {
- Identity = ConstantVector::getSplat({VF, false}, Iden);
-
- // This vector is the Identity vector where the first element is the
- // incoming scalar reduction.
- VectorStart =
- Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
- }
- }
-
// Wrap flags are in general invalid after vectorization, clear them.
clearReductionWrapFlags(RdxDesc);
@@ -3798,10 +4209,6 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
for (unsigned Part = 0; Part < UF; ++Part) {
Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
Value *Val = getOrCreateVectorValue(LoopVal, Part);
- // Make sure to add the reduction start value only to the
- // first unroll part.
- Value *StartVal = (Part == 0) ? VectorStart : Identity;
- cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
cast<PHINode>(VecRdxPhi)
->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
}
@@ -3816,8 +4223,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// If tail is folded by masking, the vector value to leave the loop should be
// a Select choosing between the vectorized LoopExitInst and vectorized Phi,
- // instead of the former.
- if (Cost->foldTailByMasking()) {
+ // instead of the former. For an inloop reduction the reduction will already
+ // be predicated, and does not need to be handled here.
+ if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
for (unsigned Part = 0; Part < UF; ++Part) {
Value *VecLoopExitInst =
VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
@@ -3831,14 +4239,31 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
}
assert(Sel && "Reduction exit feeds no select");
VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
+
+ // If the target can create a predicated operator for the reduction at no
+ // extra cost in the loop (for example a predicated vadd), it can be
+ // cheaper for the select to remain in the loop than be sunk out of it,
+ // and so use the select value for the phi instead of the old
+ // LoopExitValue.
+ RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
+ if (PreferPredicatedReductionSelect ||
+ TTI->preferPredicatedReductionSelect(
+ RdxDesc.getOpcode(), Phi->getType(),
+ TargetTransformInfo::ReductionFlags())) {
+ auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
+ VecRdxPhi->setIncomingValueForBlock(
+ LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
+ }
}
}
// If the vector reduction can be performed in a smaller type, we truncate
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
- if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
- Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF);
+ if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
+ assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
Builder.SetInsertPoint(
LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
VectorParts RdxParts(UF);
@@ -3865,7 +4290,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// Reduce all of the unrolled parts into a single vector.
Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
- unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
+ unsigned Op = RecurrenceDescriptor::getOpcode(RK);
// The middle block terminator has already been assigned a DebugLoc here (the
// OrigLoop's single latch terminator). We want the whole middle block to
@@ -3884,14 +4309,14 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
ReducedPartRdx, "bin.rdx"),
RdxDesc.getFastMathFlags());
else
- ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
- RdxPart);
+ ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
}
- if (VF > 1) {
- bool NoNaN = Legal->hasFunNoNaNAttr();
+ // Create the reduction after the loop. Note that inloop reductions create the
+ // target reduction in the loop using a Reduction recipe.
+ if (VF.isVector() && !IsInLoopReductionPhi) {
ReducedPartRdx =
- createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
+ createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
// If the reduction can be performed in a smaller type, we need to extend
// the reduction to the wider type before we branch to the original loop.
if (Phi->getType() != RdxDesc.getRecurrenceType())
@@ -3911,21 +4336,17 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
- // We know that the loop is in LCSSA form. We need to update the
- // PHI nodes in the exit blocks.
- for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
- // All PHINodes need to have a single entry edge, or two if
- // we already fixed them.
- assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
- // We found a reduction value exit-PHI. Update it with the
- // incoming bypass edge.
- if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
+ // We know that the loop is in LCSSA form. We need to update the PHI nodes
+ // in the exit blocks. See comment on analogous loop in
+ // fixFirstOrderRecurrence for a more complete explaination of the logic.
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis())
+ if (any_of(LCSSAPhi.incoming_values(),
+ [LoopExitInst](Value *V) { return V == LoopExitInst; }))
LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
- } // end of the LCSSA phi scan.
- // Fix the scalar loop reduction variable with the incoming reduction sum
- // from the vector body and from the backedge value.
+ // Fix the scalar loop reduction variable with the incoming reduction sum
+ // from the vector body and from the backedge value.
int IncomingEdgeBlockIdx =
Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
@@ -3937,9 +4358,8 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
void InnerLoopVectorizer::clearReductionWrapFlags(
RecurrenceDescriptor &RdxDesc) {
- RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
- if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
- RK != RecurrenceDescriptor::RK_IntegerMult)
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ if (RK != RecurKind::Add && RK != RecurKind::Mul)
return;
Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
@@ -3968,22 +4388,27 @@ void InnerLoopVectorizer::clearReductionWrapFlags(
void InnerLoopVectorizer::fixLCSSAPHIs() {
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
- if (LCSSAPhi.getNumIncomingValues() == 1) {
- auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
- // Non-instruction incoming values will have only one value.
- unsigned LastLane = 0;
- if (isa<Instruction>(IncomingValue))
- LastLane = Cost->isUniformAfterVectorization(
- cast<Instruction>(IncomingValue), VF)
- ? 0
- : VF - 1;
- // Can be a loop invariant incoming value or the last scalar value to be
- // extracted from the vectorized loop.
- Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- Value *lastIncomingValue =
- getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
- LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
- }
+ if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
+ // Some phis were already hand updated by the reduction and recurrence
+ // code above, leave them alone.
+ continue;
+
+ auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
+ // Non-instruction incoming values will have only one value.
+ unsigned LastLane = 0;
+ if (isa<Instruction>(IncomingValue))
+ LastLane = Cost->isUniformAfterVectorization(
+ cast<Instruction>(IncomingValue), VF)
+ ? 0
+ : VF.getKnownMinValue() - 1;
+ assert((!VF.isScalable() || LastLane == 0) &&
+ "scalable vectors dont support non-uniform scalars yet");
+ // Can be a loop invariant incoming value or the last scalar value to be
+ // extracted from the vectorized loop.
+ Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
+ Value *lastIncomingValue =
+ getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
+ LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
}
}
@@ -4087,9 +4512,9 @@ void InnerLoopVectorizer::fixNonInductionPHIs() {
}
}
-void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
- unsigned UF, unsigned VF,
- bool IsPtrLoopInvariant,
+void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
+ VPUser &Operands, unsigned UF,
+ ElementCount VF, bool IsPtrLoopInvariant,
SmallBitVector &IsIndexLoopInvariant,
VPTransformState &State) {
// Construct a vector GEP by widening the operands of the scalar GEP as
@@ -4098,7 +4523,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
// is vector-typed. Thus, to keep the representation compact, we only use
// vector-typed operands for loop-varying values.
- if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
+ if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
// If we are vectorizing, but the GEP has only loop-invariant operands,
// the GEP we build (by only using vector-typed operands for
// loop-varying values) would be a scalar pointer. Thus, to ensure we
@@ -4114,7 +4539,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
auto *Clone = Builder.Insert(GEP->clone());
for (unsigned Part = 0; Part < UF; ++Part) {
Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
- VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
+ State.set(VPDef, GEP, EntryPart, Part);
addMetadata(EntryPart, GEP);
}
} else {
@@ -4149,16 +4574,19 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
Indices)
: Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
- assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+ assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
"NewGEP is not a pointer vector");
- VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
+ State.set(VPDef, GEP, NewGEP, Part);
addMetadata(NewGEP, GEP);
}
}
}
-void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
- unsigned VF) {
+void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
+ RecurrenceDescriptor *RdxDesc,
+ Value *StartV, unsigned UF,
+ ElementCount VF) {
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
PHINode *P = cast<PHINode>(PN);
if (EnableVPlanNativePath) {
// Currently we enter here in the VPlan-native path for non-induction
@@ -4166,7 +4594,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
// Create a vector phi with no operands - the vector phi operands will be
// set at the end of vector code generation.
Type *VecTy =
- (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
+ (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
OrigPHIsToFix.push_back(P);
@@ -4181,18 +4609,60 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
// this value when we vectorize all of the instructions that use the PHI.
- if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
+ if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
+ Value *Iden = nullptr;
+ bool ScalarPHI =
+ (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
+ Type *VecTy =
+ ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
+
+ if (RdxDesc) {
+ assert(Legal->isReductionVariable(P) && StartV &&
+ "RdxDesc should only be set for reduction variables; in that case "
+ "a StartV is also required");
+ RecurKind RK = RdxDesc->getRecurrenceKind();
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
+ // MinMax reduction have the start value as their identify.
+ if (ScalarPHI) {
+ Iden = StartV;
+ } else {
+ IRBuilderBase::InsertPointGuard IPBuilder(Builder);
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident");
+ }
+ } else {
+ Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
+ RK, VecTy->getScalarType());
+ Iden = IdenC;
+
+ if (!ScalarPHI) {
+ Iden = ConstantVector::getSplat(VF, IdenC);
+ IRBuilderBase::InsertPointGuard IPBuilder(Builder);
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ Constant *Zero = Builder.getInt32(0);
+ StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
+ }
+ }
+ }
+
for (unsigned Part = 0; Part < UF; ++Part) {
// This is phase one of vectorizing PHIs.
- Type *VecTy =
- (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
Value *EntryPart = PHINode::Create(
VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
+ if (StartV) {
+ // Make sure to add the reduction start value only to the
+ // first unroll part.
+ Value *StartVal = (Part == 0) ? StartV : Iden;
+ cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
+ }
}
return;
}
+ assert(!Legal->isReductionVariable(P) &&
+ "reductions should be handled above");
+
setDebugLocFromInst(Builder, P);
// This PHINode must be an induction variable.
@@ -4213,26 +4683,74 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
case InductionDescriptor::IK_PtrInduction: {
// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");
- // This is the normalized GEP that starts counting at zero.
- Value *PtrInd = Induction;
- PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
- // Determine the number of scalars we need to generate for each unroll
- // iteration. If the instruction is uniform, we only need to generate the
- // first lane. Otherwise, we generate all VF values.
- unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
- // These are the scalar results. Notice that we don't generate vector GEPs
- // because scalar GEPs result in better code.
- for (unsigned Part = 0; Part < UF; ++Part) {
- for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
- Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
- Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
- Value *SclrGep =
- emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
- SclrGep->setName("next.gep");
- VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
+
+ if (Cost->isScalarAfterVectorization(P, VF)) {
+ // This is the normalized GEP that starts counting at zero.
+ Value *PtrInd =
+ Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If the instruction is uniform, we only need to generate the
+ // first lane. Otherwise, we generate all VF values.
+ unsigned Lanes =
+ Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ Constant *Idx = ConstantInt::get(PtrInd->getType(),
+ Lane + Part * VF.getKnownMinValue());
+ Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
+ Value *SclrGep =
+ emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
+ SclrGep->setName("next.gep");
+ VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
+ }
}
+ return;
+ }
+ assert(isa<SCEVConstant>(II.getStep()) &&
+ "Induction step not a SCEV constant!");
+ Type *PhiType = II.getStep()->getType();
+
+ // Build a pointer phi
+ Value *ScalarStartValue = II.getStartValue();
+ Type *ScStValueType = ScalarStartValue->getType();
+ PHINode *NewPointerPhi =
+ PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
+ NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
+
+ // A pointer induction, performed by using a gep
+ BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
+ Instruction *InductionLoc = LoopLatch->getTerminator();
+ const SCEV *ScalarStep = II.getStep();
+ SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+ Value *ScalarStepValue =
+ Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
+ Value *InductionGEP = GetElementPtrInst::Create(
+ ScStValueType->getPointerElementType(), NewPointerPhi,
+ Builder.CreateMul(
+ ScalarStepValue,
+ ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
+ "ptr.ind", InductionLoc);
+ NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
+
+ // Create UF many actual address geps that use the pointer
+ // phi as base and a vectorized version of the step value
+ // (<step*0, ..., step*N>) as offset.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Constant *, 8> Indices;
+ // Create a vector of consecutive numbers from zero to VF.
+ for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
+ Indices.push_back(
+ ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
+ Constant *StartOffset = ConstantVector::get(Indices);
+
+ Value *GEP = Builder.CreateGEP(
+ ScStValueType->getPointerElementType(), NewPointerPhi,
+ Builder.CreateMul(
+ StartOffset,
+ Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
+ "vector.gep"));
+ VectorLoopValueMap.setVectorValue(P, Part, GEP);
}
- return;
}
}
}
@@ -4255,7 +4773,8 @@ static bool mayDivideByZero(Instruction &I) {
return !CInt || CInt->isZero();
}
-void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
+void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
+ VPUser &User,
VPTransformState &State) {
switch (I.getOpcode()) {
case Instruction::Call:
@@ -4297,7 +4816,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
VecOp->copyIRFlags(&I);
// Use this vector value for all users of the original instruction.
- VectorLoopValueMap.setVectorValue(&I, Part, V);
+ State.set(Def, &I, V, Part);
addMetadata(V, &I);
}
@@ -4321,7 +4840,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
} else {
C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
}
- VectorLoopValueMap.setVectorValue(&I, Part, C);
+ State.set(Def, &I, C, Part);
addMetadata(C, &I);
}
@@ -4345,12 +4864,12 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
/// Vectorize casts.
Type *DestTy =
- (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF);
+ (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
for (unsigned Part = 0; Part < UF; ++Part) {
Value *A = State.get(User.getOperand(0), Part);
Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
- VectorLoopValueMap.setVectorValue(&I, Part, Cast);
+ State.set(Def, &I, Cast, Part);
addMetadata(Cast, &I);
}
break;
@@ -4362,7 +4881,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
} // end of switch.
}
-void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
+void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
+ VPUser &ArgOperands,
VPTransformState &State) {
assert(!isa<DbgInfoIntrinsic>(I) &&
"DbgInfoIntrinsic should have been dropped during VPlan construction");
@@ -4373,7 +4893,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
SmallVector<Type *, 4> Tys;
for (Value *ArgOperand : CI->arg_operands())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+ Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
@@ -4381,11 +4901,13 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
// version of the instruction.
// Is it beneficial to perform intrinsic call compared to lib call?
bool NeedToScalarize = false;
- unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
- bool UseVectorIntrinsic =
- ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
+ InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
+ InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
+ bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
assert((UseVectorIntrinsic || !NeedToScalarize) &&
"Instruction should be scalarized elsewhere.");
+ assert(IntrinsicCost.isValid() && CallCost.isValid() &&
+ "Cannot have invalid costs while widening");
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 4> Args;
@@ -4404,15 +4926,15 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
if (UseVectorIntrinsic) {
// Use vector version of the intrinsic.
Type *TysForDecl[] = {CI->getType()};
- if (VF > 1)
- TysForDecl[0] =
- FixedVectorType::get(CI->getType()->getScalarType(), VF);
+ if (VF.isVector()) {
+ assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+ TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+ }
VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
assert(VectorF && "Can't retrieve vector intrinsic.");
} else {
// Use vector version of the function call.
- const VFShape Shape =
- VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
+ const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
#ifndef NDEBUG
assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
"Can't create vector function.");
@@ -4426,12 +4948,12 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
if (isa<FPMathOperator>(V))
V->copyFastMathFlags(CI);
- VectorLoopValueMap.setVectorValue(&I, Part, V);
+ State.set(Def, &I, V, Part);
addMetadata(V, &I);
}
}
-void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
+void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
VPUser &Operands,
bool InvariantCond,
VPTransformState &State) {
@@ -4450,16 +4972,16 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
Value *Op0 = State.get(Operands.getOperand(1), Part);
Value *Op1 = State.get(Operands.getOperand(2), Part);
Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
- VectorLoopValueMap.setVectorValue(&I, Part, Sel);
+ State.set(VPDef, &I, Sel, Part);
addMetadata(Sel, &I);
}
}
-void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
+void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
// this check. Collecting Scalars for VF=1 does not make any sense.
- assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
+ assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
"This function should not be visited twice for the same VF");
SmallSetVector<Instruction *, 8> Worklist;
@@ -4468,6 +4990,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// accesses that will remain scalar.
SmallSetVector<Instruction *, 8> ScalarPtrs;
SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
+ auto *Latch = TheLoop->getLoopLatch();
// A helper that returns true if the use of Ptr by MemAccess will be scalar.
// The pointer operands of loads and stores will be scalar as long as the
@@ -4493,11 +5016,33 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
!TheLoop->isLoopInvariant(V);
};
- // A helper that evaluates a memory access's use of a pointer. If the use
- // will be a scalar use, and the pointer is only used by memory accesses, we
- // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
- // PossibleNonScalarPtrs.
+ auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
+ if (!isa<PHINode>(Ptr) ||
+ !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
+ return false;
+ auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
+ if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
+ return false;
+ return isScalarUse(MemAccess, Ptr);
+ };
+
+ // A helper that evaluates a memory access's use of a pointer. If the
+ // pointer is actually the pointer induction of a loop, it is being
+ // inserted into Worklist. If the use will be a scalar use, and the
+ // pointer is only used by memory accesses, we place the pointer in
+ // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
+ if (isScalarPtrInduction(MemAccess, Ptr)) {
+ Worklist.insert(cast<Instruction>(Ptr));
+ Instruction *Update = cast<Instruction>(
+ cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
+ Worklist.insert(Update);
+ LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
+ << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
+ << "\n");
+ return;
+ }
// We only care about bitcast and getelementptr instructions contained in
// the loop.
if (!isLoopVaryingBitCastOrGEP(Ptr))
@@ -4521,10 +5066,9 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
};
// We seed the scalars analysis with three classes of instructions: (1)
- // instructions marked uniform-after-vectorization, (2) bitcast and
- // getelementptr instructions used by memory accesses requiring a scalar use,
- // and (3) pointer induction variables and their update instructions (we
- // currently only scalarize these).
+ // instructions marked uniform-after-vectorization and (2) bitcast,
+ // getelementptr and (pointer) phi instructions used by memory accesses
+ // requiring a scalar use.
//
// (1) Add to the worklist all instructions that have been identified as
// uniform-after-vectorization.
@@ -4550,24 +5094,6 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
Worklist.insert(I);
}
- // (3) Add to the worklist all pointer induction variables and their update
- // instructions.
- //
- // TODO: Once we are able to vectorize pointer induction variables we should
- // no longer insert them into the worklist here.
- auto *Latch = TheLoop->getLoopLatch();
- for (auto &Induction : Legal->getInductionVars()) {
- auto *Ind = Induction.first;
- auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
- if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
- continue;
- Worklist.insert(Ind);
- Worklist.insert(IndUpdate);
- LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
- LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
- << "\n");
- }
-
// Insert the forced scalars.
// FIXME: Currently widenPHIInstruction() often creates a dead vector
// induction variable when the PHI user is scalarized.
@@ -4603,14 +5129,6 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
- // We already considered pointer induction variables, so there's no reason
- // to look at their users again.
- //
- // TODO: Once we are able to vectorize pointer induction variables we
- // should no longer skip over them here.
- if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
- continue;
-
// If tail-folding is applied, the primary induction variable will be used
// to feed a vector compare.
if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
@@ -4646,7 +5164,8 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
-bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
+bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
+ ElementCount VF) {
if (!blockNeedsPredication(I->getParent()))
return false;
switch(I->getOpcode()) {
@@ -4660,7 +5179,7 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
auto *Ty = getMemInstValueType(I);
// We have already decided how to vectorize this instruction, get that
// result.
- if (VF > 1) {
+ if (VF.isVector()) {
InstWidening WideningDecision = getWideningDecision(I, VF);
assert(WideningDecision != CM_Unknown &&
"Widening decision should be ready at this moment");
@@ -4681,8 +5200,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
return false;
}
-bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
- unsigned VF) {
+bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
+ Instruction *I, ElementCount VF) {
assert(isAccessInterleaved(I) && "Expecting interleaved access.");
assert(getWideningDecision(I, VF) == CM_Unknown &&
"Decision should not be set yet.");
@@ -4718,8 +5237,8 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
: TTI.isLegalMaskedStore(Ty, Alignment);
}
-bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
- unsigned VF) {
+bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
+ Instruction *I, ElementCount VF) {
// Get and ensure we have a valid memory instruction.
LoadInst *LI = dyn_cast<LoadInst>(I);
StoreInst *SI = dyn_cast<StoreInst>(I);
@@ -4746,13 +5265,13 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
return true;
}
-void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
+void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// We should not collect Uniforms more than once per VF. Right now,
// this function is called from collectUniformsAndScalars(), which
// already does this check. Collecting Uniforms for VF=1 does not make any
// sense.
- assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
+ assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
"This function should not be visited twice for the same VF");
// Visit the list of Uniforms. If we'll not find any uniform value, we'll
@@ -4778,6 +5297,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// replicating region where only a single instance out of VF should be formed.
// TODO: optimize such seldom cases if found important, see PR40816.
auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
+ if (isOutOfScope(I)) {
+ LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
+ << *I << "\n");
+ return;
+ }
if (isScalarWithPredication(I, VF)) {
LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
<< *I << "\n");
@@ -4794,65 +5318,71 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
addToWorklistIfAllowed(Cmp);
- // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
- // are pointers that are treated like consecutive pointers during
- // vectorization. The pointer operands of interleaved accesses are an
- // example.
- SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
-
- // Holds pointer operands of instructions that are possibly non-uniform.
- SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
-
- auto isUniformDecision = [&](Instruction *I, unsigned VF) {
+ auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
InstWidening WideningDecision = getWideningDecision(I, VF);
assert(WideningDecision != CM_Unknown &&
"Widening decision should be ready at this moment");
+ // A uniform memory op is itself uniform. We exclude uniform stores
+ // here as they demand the last lane, not the first one.
+ if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
+ assert(WideningDecision == CM_Scalarize);
+ return true;
+ }
+
return (WideningDecision == CM_Widen ||
WideningDecision == CM_Widen_Reverse ||
WideningDecision == CM_Interleave);
};
- // Iterate over the instructions in the loop, and collect all
- // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
- // that a consecutive-like pointer operand will be scalarized, we collect it
- // in PossibleNonUniformPtrs instead. We use two sets here because a single
- // getelementptr instruction can be used by both vectorized and scalarized
- // memory instructions. For example, if a loop loads and stores from the same
- // location, but the store is conditional, the store will be scalarized, and
- // the getelementptr won't remain uniform.
+
+
+ // Returns true if Ptr is the pointer operand of a memory access instruction
+ // I, and I is known to not require scalarization.
+ auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
+ return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
+ };
+
+ // Holds a list of values which are known to have at least one uniform use.
+ // Note that there may be other uses which aren't uniform. A "uniform use"
+ // here is something which only demands lane 0 of the unrolled iterations;
+ // it does not imply that all lanes produce the same value (e.g. this is not
+ // the usual meaning of uniform)
+ SmallPtrSet<Value *, 8> HasUniformUse;
+
+ // Scan the loop for instructions which are either a) known to have only
+ // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
// If there's no pointer operand, there's nothing to do.
- auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
+ auto *Ptr = getLoadStorePointerOperand(&I);
if (!Ptr)
continue;
- // True if all users of Ptr are memory accesses that have Ptr as their
- // pointer operand.
- auto UsersAreMemAccesses =
- llvm::all_of(Ptr->users(), [&](User *U) -> bool {
- return getLoadStorePointerOperand(U) == Ptr;
- });
-
- // Ensure the memory instruction will not be scalarized or used by
- // gather/scatter, making its pointer operand non-uniform. If the pointer
- // operand is used by any instruction other than a memory access, we
- // conservatively assume the pointer operand may be non-uniform.
- if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
- PossibleNonUniformPtrs.insert(Ptr);
-
- // If the memory instruction will be vectorized and its pointer operand
- // is consecutive-like, or interleaving - the pointer operand should
- // remain uniform.
- else
- ConsecutiveLikePtrs.insert(Ptr);
+ // A uniform memory op is itself uniform. We exclude uniform stores
+ // here as they demand the last lane, not the first one.
+ if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
+ addToWorklistIfAllowed(&I);
+
+ if (isUniformDecision(&I, VF)) {
+ assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
+ HasUniformUse.insert(Ptr);
+ }
}
- // Add to the Worklist all consecutive and consecutive-like pointers that
- // aren't also identified as possibly non-uniform.
- for (auto *V : ConsecutiveLikePtrs)
- if (!PossibleNonUniformPtrs.count(V))
- addToWorklistIfAllowed(V);
+ // Add to the worklist any operands which have *only* uniform (e.g. lane 0
+ // demanding) users. Since loops are assumed to be in LCSSA form, this
+ // disallows uses outside the loop as well.
+ for (auto *V : HasUniformUse) {
+ if (isOutOfScope(V))
+ continue;
+ auto *I = cast<Instruction>(V);
+ auto UsersAreMemAccesses =
+ llvm::all_of(I->users(), [&](User *U) -> bool {
+ return isVectorizedMemAccessUse(cast<Instruction>(U), V);
+ });
+ if (UsersAreMemAccesses)
+ addToWorklistIfAllowed(I);
+ }
// Expand Worklist in topological order: whenever a new instruction
// is added , its users should be already inside Worklist. It ensures
@@ -4875,20 +5405,12 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
auto *OI = cast<Instruction>(OV);
if (llvm::all_of(OI->users(), [&](User *U) -> bool {
auto *J = cast<Instruction>(U);
- return Worklist.count(J) ||
- (OI == getLoadStorePointerOperand(J) &&
- isUniformDecision(J, VF));
+ return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
}))
addToWorklistIfAllowed(OI);
}
}
- // Returns true if Ptr is the pointer operand of a memory access instruction
- // I, and I is known to not require scalarization.
- auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
- return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
- };
-
// For an instruction to be added into Worklist above, all its users inside
// the loop should also be in Worklist. However, this condition cannot be
// true for phi nodes that form a cyclic dependence. We must process phi
@@ -4961,8 +5483,8 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
return false;
}
-Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
- unsigned UserIC) {
+Optional<ElementCount>
+LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
// TODO: It may by useful to do since it's still likely to be dynamically
// uniform if the target can skip.
@@ -4982,9 +5504,13 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
return None;
}
+ ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
+
switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed:
- return UserVF ? UserVF : computeFeasibleMaxVF(TC);
+ return MaxVF;
+ case CM_ScalarEpilogueNotAllowedUsePredicate:
+ LLVM_FALLTHROUGH;
case CM_ScalarEpilogueNotNeededUsePredicate:
LLVM_DEBUG(
dbgs() << "LV: vector predicate hint/switch found.\n"
@@ -5005,9 +5531,26 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
// for size.
if (runtimeChecksRequired())
return None;
+
break;
}
+ // The only loops we can vectorize without a scalar epilogue, are loops with
+ // a bottom-test and a single exiting block. We'd have to handle the fact
+ // that not every instruction executes on the last iteration. This will
+ // require a lane mask which varies through the vector loop body. (TODO)
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ // If there was a tail-folding hint/switch, but we can't fold the tail by
+ // masking, fallback to a vectorization with a scalar epilogue.
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+ LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
+ "scalar epilogue instead.\n");
+ ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+ return MaxVF;
+ }
+ return None;
+ }
+
// Now try the tail folding
// Invalidate interleave groups that require an epilogue if we can't mask
@@ -5020,10 +5563,21 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
- unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
- assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
- unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
- if (TC > 0 && TC % MaxVFtimesIC == 0) {
+ assert(!MaxVF.isScalable() &&
+ "Scalable vectors do not yet support tail folding");
+ assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
+ "MaxVF must be a power of 2");
+ unsigned MaxVFtimesIC =
+ UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
+ // Avoid tail folding if the trip count is known to be a multiple of any VF we
+ // chose.
+ ScalarEvolution *SE = PSE.getSE();
+ const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
+ const SCEV *ExitCount = SE->getAddExpr(
+ BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
+ const SCEV *Rem = SE->getURemExpr(
+ ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
+ if (Rem->isZero()) {
// Accept MaxVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
return MaxVF;
@@ -5038,6 +5592,20 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
return MaxVF;
}
+ // If there was a tail-folding hint/switch, but we can't fold the tail by
+ // masking, fallback to a vectorization with a scalar epilogue.
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+ LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
+ "scalar epilogue instead.\n");
+ ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+ return MaxVF;
+ }
+
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
+ LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
+ return None;
+ }
+
if (TC == 0) {
reportVectorizationFailure(
"Unable to calculate the loop count due to complex control flow",
@@ -5055,8 +5623,33 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
return None;
}
-unsigned
-LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
+ElementCount
+LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
+ ElementCount UserVF) {
+ bool IgnoreScalableUserVF = UserVF.isScalable() &&
+ !TTI.supportsScalableVectors() &&
+ !ForceTargetSupportsScalableVectors;
+ if (IgnoreScalableUserVF) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Ignoring VF=" << UserVF
+ << " because target does not support scalable vectors.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "Ignoring VF=" << ore::NV("UserVF", UserVF)
+ << " because target does not support scalable vectors.";
+ });
+ }
+
+ // Beyond this point two scenarios are handled. If UserVF isn't specified
+ // then a suitable VF is chosen. If UserVF is specified and there are
+ // dependencies, check if it's legal. However, if a UserVF is specified and
+ // there are no dependencies, then there's nothing to do.
+ if (UserVF.isNonZero() && !IgnoreScalableUserVF &&
+ Legal->isSafeForAnyVectorWidth())
+ return UserVF;
+
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -5066,9 +5659,62 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
// the memory accesses that is most restrictive (involved in the smallest
// dependence distance).
- unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
+ unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
+
+ // If the user vectorization factor is legally unsafe, clamp it to a safe
+ // value. Otherwise, return as is.
+ if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
+ unsigned MaxSafeElements =
+ PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
+ ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
+
+ if (UserVF.isScalable()) {
+ Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+
+ // Scale VF by vscale before checking if it's safe.
+ MaxSafeVF = ElementCount::getScalable(
+ MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
+
+ if (MaxSafeVF.isZero()) {
+ // The dependence distance is too small to use scalable vectors,
+ // fallback on fixed.
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Max legal vector width too small, scalable vectorization "
+ "unfeasible. Using fixed-width vectorization instead.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "Max legal vector width too small, scalable vectorization "
+ << "unfeasible. Using fixed-width vectorization instead.";
+ });
+ return computeFeasibleMaxVF(
+ ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
+
+ if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
+ return UserVF;
+
+ LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+ << " is unsafe, clamping to max safe VF=" << MaxSafeVF
+ << ".\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "User-specified vectorization factor "
+ << ore::NV("UserVectorizationFactor", UserVF)
+ << " is unsafe, clamping to maximum safe vectorization factor "
+ << ore::NV("VectorizationFactor", MaxSafeVF);
+ });
+ return MaxSafeVF;
+ }
- WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
+ WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
// Ensure MaxVF is a power of 2; the dependence distance bound may not be.
// Note that both WidestRegister and WidestType may not be a powers of 2.
@@ -5079,12 +5725,13 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
<< WidestRegister << " bits.\n");
- assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
- " into one vector!");
+ assert(MaxVectorSize <= WidestRegister &&
+ "Did not expect to pack so many elements"
+ " into one vector!");
if (MaxVectorSize == 0) {
LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
MaxVectorSize = 1;
- return MaxVectorSize;
+ return ElementCount::getFixed(MaxVectorSize);
} else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
isPowerOf2_32(ConstTripCount)) {
// We need to clamp the VF to be the ConstTripCount. There is no point in
@@ -5092,7 +5739,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
<< ConstTripCount << "\n");
MaxVectorSize = ConstTripCount;
- return MaxVectorSize;
+ return ElementCount::getFixed(MaxVectorSize);
}
unsigned MaxVF = MaxVectorSize;
@@ -5100,10 +5747,10 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
(MaximizeBandwidth && isScalarEpilogueAllowed())) {
// Collect all viable vectorization factors larger than the default MaxVF
// (i.e. MaxVectorSize).
- SmallVector<unsigned, 8> VFs;
+ SmallVector<ElementCount, 8> VFs;
unsigned NewMaxVectorSize = WidestRegister / SmallestType;
for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
- VFs.push_back(VS);
+ VFs.push_back(ElementCount::getFixed(VS));
// For each VF calculate its register usage.
auto RUs = calculateRegisterUsage(VFs);
@@ -5118,7 +5765,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
Selected = false;
}
if (Selected) {
- MaxVF = VFs[i];
+ MaxVF = VFs[i].getKnownMinValue();
break;
}
}
@@ -5130,30 +5777,39 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
}
}
}
- return MaxVF;
+ return ElementCount::getFixed(MaxVF);
}
VectorizationFactor
-LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
- float Cost = expectedCost(1).first;
- const float ScalarCost = Cost;
+LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
+ // FIXME: This can be fixed for scalable vectors later, because at this stage
+ // the LoopVectorizer will only consider vectorizing a loop with scalable
+ // vectors when the loop has a hint to enable vectorization for a given VF.
+ assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
+
+ InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
+ LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
+ assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
+
unsigned Width = 1;
- LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
+ const float ScalarCost = *ExpectedCost.getValue();
+ float Cost = ScalarCost;
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
- if (ForceVectorization && MaxVF > 1) {
+ if (ForceVectorization && MaxVF.isVector()) {
// Ignore scalar width, because the user explicitly wants vectorization.
// Initialize cost to max so that VF = 2 is, at least, chosen during cost
// evaluation.
Cost = std::numeric_limits<float>::max();
}
- for (unsigned i = 2; i <= MaxVF; i *= 2) {
+ for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
// Notice that the vector loop needs to be executed less times, so
// we need to divide the cost of the vector loops by the width of
// the vector elements.
- VectorizationCostTy C = expectedCost(i);
- float VectorCost = C.first / (float)i;
+ VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
+ assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
+ float VectorCost = *C.first.getValue() / (float)i;
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
<< " costs: " << (int)VectorCost << ".\n");
if (!C.second && !ForceVectorization) {
@@ -5162,6 +5818,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
<< " because it will not generate any vector instructions.\n");
continue;
}
+
+ // If profitable add it to ProfitableVF list.
+ if (VectorCost < ScalarCost) {
+ ProfitableVFs.push_back(VectorizationFactor(
+ {ElementCount::getFixed(i), (unsigned)VectorCost}));
+ }
+
if (VectorCost < Cost) {
Cost = VectorCost;
Width = i;
@@ -5180,10 +5843,131 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
- VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
+ VectorizationFactor Factor = {ElementCount::getFixed(Width),
+ (unsigned)(Width * Cost)};
return Factor;
}
+bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
+ const Loop &L, ElementCount VF) const {
+ // Cross iteration phis such as reductions need special handling and are
+ // currently unsupported.
+ if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
+ return Legal->isFirstOrderRecurrence(&Phi) ||
+ Legal->isReductionVariable(&Phi);
+ }))
+ return false;
+
+ // Phis with uses outside of the loop require special handling and are
+ // currently unsupported.
+ for (auto &Entry : Legal->getInductionVars()) {
+ // Look for uses of the value of the induction at the last iteration.
+ Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
+ for (User *U : PostInc->users())
+ if (!L.contains(cast<Instruction>(U)))
+ return false;
+ // Look for uses of penultimate value of the induction.
+ for (User *U : Entry.first->users())
+ if (!L.contains(cast<Instruction>(U)))
+ return false;
+ }
+
+ // Induction variables that are widened require special handling that is
+ // currently not supported.
+ if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
+ return !(this->isScalarAfterVectorization(Entry.first, VF) ||
+ this->isProfitableToScalarize(Entry.first, VF));
+ }))
+ return false;
+
+ return true;
+}
+
+bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
+ const ElementCount VF) const {
+ // FIXME: We need a much better cost-model to take different parameters such
+ // as register pressure, code size increase and cost of extra branches into
+ // account. For now we apply a very crude heuristic and only consider loops
+ // with vectorization factors larger than a certain value.
+ // We also consider epilogue vectorization unprofitable for targets that don't
+ // consider interleaving beneficial (eg. MVE).
+ if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
+ return false;
+ if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
+ return true;
+ return false;
+}
+
+VectorizationFactor
+LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
+ const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
+ VectorizationFactor Result = VectorizationFactor::Disabled();
+ if (!EnableEpilogueVectorization) {
+ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
+ return Result;
+ }
+
+ if (!isScalarEpilogueAllowed()) {
+ LLVM_DEBUG(
+ dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
+ "allowed.\n";);
+ return Result;
+ }
+
+ // FIXME: This can be fixed for scalable vectors later, because at this stage
+ // the LoopVectorizer will only consider vectorizing a loop with scalable
+ // vectors when the loop has a hint to enable vectorization for a given VF.
+ if (MainLoopVF.isScalable()) {
+ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
+ "yet supported.\n");
+ return Result;
+ }
+
+ // Not really a cost consideration, but check for unsupported cases here to
+ // simplify the logic.
+ if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
+ LLVM_DEBUG(
+ dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
+ "not a supported candidate.\n";);
+ return Result;
+ }
+
+ if (EpilogueVectorizationForceVF > 1) {
+ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
+ if (LVP.hasPlanWithVFs(
+ {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
+ return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
+ else {
+ LLVM_DEBUG(
+ dbgs()
+ << "LEV: Epilogue vectorization forced factor is not viable.\n";);
+ return Result;
+ }
+ }
+
+ if (TheLoop->getHeader()->getParent()->hasOptSize() ||
+ TheLoop->getHeader()->getParent()->hasMinSize()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
+ return Result;
+ }
+
+ if (!isEpilogueVectorizationProfitable(MainLoopVF))
+ return Result;
+
+ for (auto &NextVF : ProfitableVFs)
+ if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
+ (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
+ LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
+ Result = NextVF;
+
+ if (Result != VectorizationFactor::Disabled())
+ LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
+ << Result.Width.getFixedValue() << "\n";);
+ return Result;
+}
+
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
unsigned MinWidth = -1U;
@@ -5210,6 +5994,11 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
if (!Legal->isReductionVariable(PN))
continue;
RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
+ if (PreferInLoopReductions ||
+ TTI.preferInLoopReduction(RdxDesc.getOpcode(),
+ RdxDesc.getRecurrenceType(),
+ TargetTransformInfo::ReductionFlags()))
+ continue;
T = RdxDesc.getRecurrenceType();
}
@@ -5240,7 +6029,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
return {MinWidth, MaxWidth};
}
-unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
+unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
unsigned LoopCost) {
// -- The interleave heuristics --
// We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -5263,10 +6052,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
if (Legal->getMaxSafeDepDistBytes() != -1U)
return 1;
- // Do not interleave loops with a relatively small known or estimated trip
- // count.
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
- if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
+ const bool HasReductions = !Legal->getReductionVars().empty();
+ // Do not interleave loops with a relatively small known or estimated trip
+ // count. But we will interleave when InterleaveSmallLoopScalarReduction is
+ // enabled, and the code has scalar reductions(HasReductions && VF = 1),
+ // because with the above conditions interleaving can expose ILP and break
+ // cross iteration dependences for reductions.
+ if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
+ !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
return 1;
RegisterUsage R = calculateRegisterUsage({VF})[0];
@@ -5294,7 +6088,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
<< " registers of "
<< TTI.getRegisterClassName(pair.first) << " register class\n");
- if (VF == 1) {
+ if (VF.isScalar()) {
if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
TargetNumRegisters = ForceTargetNumScalarRegs;
} else {
@@ -5318,10 +6112,11 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
}
// Clamp the interleave ranges to reasonable counts.
- unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
+ unsigned MaxInterleaveCount =
+ TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
// Check if the user has overridden the max.
- if (VF == 1) {
+ if (VF.isScalar()) {
if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
} else {
@@ -5330,28 +6125,47 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
}
// If trip count is known or estimated compile time constant, limit the
- // interleave count to be less than the trip count divided by VF.
+ // interleave count to be less than the trip count divided by VF, provided it
+ // is at least 1.
+ //
+ // For scalable vectors we can't know if interleaving is beneficial. It may
+ // not be beneficial for small loops if none of the lanes in the second vector
+ // iterations is enabled. However, for larger loops, there is likely to be a
+ // similar benefit as for fixed-width vectors. For now, we choose to leave
+ // the InterleaveCount as if vscale is '1', although if some information about
+ // the vector is known (e.g. min vector size), we can make a better decision.
if (BestKnownTC) {
- MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
+ MaxInterleaveCount =
+ std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
+ // Make sure MaxInterleaveCount is greater than 0.
+ MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
}
- // If we did not calculate the cost for VF (because the user selected the VF)
- // then we calculate the cost of VF here.
- if (LoopCost == 0)
- LoopCost = expectedCost(VF).first;
-
- assert(LoopCost && "Non-zero loop cost expected");
+ assert(MaxInterleaveCount > 0 &&
+ "Maximum interleave count must be greater than 0");
// Clamp the calculated IC to be between the 1 and the max interleave count
// that the target and trip count allows.
if (IC > MaxInterleaveCount)
IC = MaxInterleaveCount;
- else if (IC < 1)
- IC = 1;
+ else
+ // Make sure IC is greater than 0.
+ IC = std::max(1u, IC);
+
+ assert(IC > 0 && "Interleave count must be greater than 0.");
+
+ // If we did not calculate the cost for VF (because the user selected the VF)
+ // then we calculate the cost of VF here.
+ if (LoopCost == 0) {
+ assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
+ LoopCost = *expectedCost(VF).first.getValue();
+ }
+
+ assert(LoopCost && "Non-zero loop cost expected");
// Interleave if we vectorized this loop and there is a reduction that could
// benefit from interleaving.
- if (VF > 1 && !Legal->getReductionVars().empty()) {
+ if (VF.isVector() && HasReductions) {
LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
return IC;
}
@@ -5359,11 +6173,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
// Note that if we've already vectorized the loop we will have done the
// runtime check and so interleaving won't require further checks.
bool InterleavingRequiresRuntimePointerCheck =
- (VF == 1 && Legal->getRuntimePointerChecking()->Need);
+ (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
// We want to interleave small loops in order to reduce the loop overhead and
// potentially expose ILP opportunities.
- LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
+ << "LV: IC is " << IC << '\n'
+ << "LV: VF is " << VF << '\n');
+ const bool AggressivelyInterleaveReductions =
+ TTI.enableAggressiveInterleaving(HasReductions);
if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
// We assume that the cost overhead is 1 and we use the cost model
// to estimate the cost of the loop and interleave until the cost of the
@@ -5382,7 +6200,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
// by this point), we can increase the critical path length if the loop
// we're interleaving is inside another loop. Limit, by default to 2, so the
// critical path only gets increased by one reduction operation.
- if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
+ if (HasReductions && TheLoop->getLoopDepth() > 1) {
unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
SmallIC = std::min(SmallIC, F);
StoresIC = std::min(StoresIC, F);
@@ -5396,14 +6214,23 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
return std::max(StoresIC, LoadsIC);
}
- LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
- return SmallIC;
+ // If there are scalar reductions and TTI has enabled aggressive
+ // interleaving for reductions, we will interleave to expose ILP.
+ if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
+ AggressivelyInterleaveReductions) {
+ LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+ // Interleave no less than SmallIC but not as aggressive as the normal IC
+ // to satisfy the rare situation when resources are too limited.
+ return std::max(IC / 2, SmallIC);
+ } else {
+ LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+ return SmallIC;
+ }
}
// Interleave if this is a large loop (small loops are already dealt with by
// this point) that could benefit from interleaving.
- bool HasReductions = !Legal->getReductionVars().empty();
- if (TTI.enableAggressiveInterleaving(HasReductions)) {
+ if (AggressivelyInterleaveReductions) {
LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
return IC;
}
@@ -5413,7 +6240,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
}
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
-LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
+LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
// This function calculates the register usage by measuring the highest number
// of values that are alive at a single location. Obviously, this is a very
// rough estimation. We scan the loop in a topological order in order and
@@ -5485,26 +6312,17 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
TransposeEnds[Interval.second].push_back(Interval.first);
SmallPtrSet<Instruction *, 8> OpenIntervals;
-
- // Get the size of the widest register.
- unsigned MaxSafeDepDist = -1U;
- if (Legal->getMaxSafeDepDistBytes() != -1U)
- MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
- unsigned WidestRegister =
- std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
- const DataLayout &DL = TheFunction->getParent()->getDataLayout();
-
SmallVector<RegisterUsage, 8> RUs(VFs.size());
SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
// A lambda that gets the register usage for the given type and VF.
- auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
- if (Ty->isTokenTy())
+ const auto &TTICapture = TTI;
+ auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
+ if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
return 0U;
- unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
- return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
+ return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
};
for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
@@ -5528,7 +6346,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
// Count the number of live intervals.
SmallMapVector<unsigned, unsigned, 4> RegUsage;
- if (VFs[j] == 1) {
+ if (VFs[j].isScalar()) {
for (auto Inst : OpenIntervals) {
unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
if (RegUsage.find(ClassID) == RegUsage.end())
@@ -5557,7 +6375,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
}
}
}
-
+
for (auto& pair : RegUsage) {
if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
@@ -5575,10 +6393,12 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
SmallMapVector<unsigned, unsigned, 4> Invariant;
-
+
for (auto Inst : LoopInvariants) {
- unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
- unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
+ unsigned Usage =
+ VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
+ unsigned ClassID =
+ TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
if (Invariant.find(ClassID) == Invariant.end())
Invariant[ClassID] = Usage;
else
@@ -5626,12 +6446,13 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
NumPredStores > NumberOfStoresToPredicate);
}
-void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
+void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
// If we aren't vectorizing the loop, or if we've already collected the
// instructions to scalarize, there's nothing to do. Collection may already
// have occurred if we have a user-selected VF and are now computing the
// expected cost for interleaving.
- if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
+ if (VF.isScalar() || VF.isZero() ||
+ InstsToScalarize.find(VF) != InstsToScalarize.end())
return;
// Initialize a mapping for VF in InstsToScalalarize. If we find that it's
@@ -5660,14 +6481,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
}
int LoopVectorizationCostModel::computePredInstDiscount(
- Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
- unsigned VF) {
+ Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
assert(!isUniformAfterVectorization(PredInst, VF) &&
"Instruction marked uniform-after-vectorization will be predicated");
// Initialize the discount to zero, meaning that the scalar version and the
// vector version cost the same.
- int Discount = 0;
+ InstructionCost Discount = 0;
// Holds instructions to analyze. The instructions we visit are mapped in
// ScalarCosts. Those instructions are the ones that would be scalarized if
@@ -5722,22 +6542,27 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the cost of the vector instruction. Note that this cost already
// includes the scalarization overhead of the predicated instruction.
- unsigned VectorCost = getInstructionCost(I, VF).first;
+ InstructionCost VectorCost = getInstructionCost(I, VF).first;
// Compute the cost of the scalarized instruction. This cost is the cost of
// the instruction as if it wasn't if-converted and instead remained in the
// predicated block. We will scale this cost by block probability after
// computing the scalarization overhead.
- unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ InstructionCost ScalarCost =
+ VF.getKnownMinValue() *
+ getInstructionCost(I, ElementCount::getFixed(1)).first;
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(I->getType(), VF)),
- APInt::getAllOnesValue(VF), true, false);
- ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI,
- TTI::TCK_RecipThroughput);
+ APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ ScalarCost +=
+ VF.getKnownMinValue() *
+ TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
}
// Compute the scalarization overhead of needed extractelement
@@ -5750,10 +6575,12 @@ int LoopVectorizationCostModel::computePredInstDiscount(
"Instruction has non-scalar type");
if (canBeScalarized(J))
Worklist.push_back(J);
- else if (needsExtract(J, VF))
+ else if (needsExtract(J, VF)) {
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(J->getType(), VF)),
- APInt::getAllOnesValue(VF), false, true);
+ APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
+ }
}
// Scale the total scalar cost by block probability.
@@ -5765,11 +6592,11 @@ int LoopVectorizationCostModel::computePredInstDiscount(
ScalarCosts[I] = ScalarCost;
}
- return Discount;
+ return *Discount.getValue();
}
LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::expectedCost(unsigned VF) {
+LoopVectorizationCostModel::expectedCost(ElementCount VF) {
VectorizationCostTy Cost;
// For each block.
@@ -5779,14 +6606,15 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
// For each instruction in the old loop.
for (Instruction &I : BB->instructionsWithoutDebug()) {
// Skip ignored values.
- if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I)))
+ if (ValuesToIgnore.count(&I) ||
+ (VF.isVector() && VecValuesToIgnore.count(&I)))
continue;
VectorizationCostTy C = getInstructionCost(&I, VF);
// Check if we should override the cost.
if (ForceTargetInstructionCost.getNumOccurrences() > 0)
- C.first = ForceTargetInstructionCost;
+ C.first = InstructionCost(ForceTargetInstructionCost);
BlockCost.first += C.first;
BlockCost.second |= C.second;
@@ -5799,9 +6627,10 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
// if-converted. This means that the block's instructions (aside from
// stores and instructions that may divide by zero) will now be
// unconditionally executed. For the scalar case, we may not always execute
- // the predicated block. Thus, scale the block's cost by the probability of
- // executing it.
- if (VF == 1 && blockNeedsPredication(BB))
+ // the predicated block, if it is an if-else block. Thus, scale the block's
+ // cost by the probability of executing it. blockNeedsPredication from
+ // Legal is used so as to not include all blocks in tail folded loops.
+ if (VF.isScalar() && Legal->blockNeedsPredication(BB))
BlockCost.first /= getReciprocalPredBlockProb();
Cost.first += BlockCost.first;
@@ -5846,9 +6675,12 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
Legal->hasStride(I->getOperand(1));
}
-unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
- unsigned VF) {
- assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
+InstructionCost
+LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
+ ElementCount VF) {
+ assert(VF.isVector() &&
+ "Scalarization cost of instruction implies vectorization.");
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
Type *ValTy = getMemInstValueType(I);
auto SE = PSE.getSE();
@@ -5861,14 +6693,15 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
// Get the cost of the scalar memory instruction and address computation.
- unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+ InstructionCost Cost =
+ VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
const Align Alignment = getLoadStoreAlignment(I);
- Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
- Alignment, AS,
- TTI::TCK_RecipThroughput);
+ Cost += VF.getKnownMinValue() *
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
+ AS, TTI::TCK_RecipThroughput);
// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
@@ -5889,8 +6722,9 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
return Cost;
}
-unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
- unsigned VF) {
+InstructionCost
+LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
+ ElementCount VF) {
Type *ValTy = getMemInstValueType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
Value *Ptr = getLoadStorePointerOperand(I);
@@ -5901,7 +6735,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Stride should be 1 or -1 for consecutive memory access");
const Align Alignment = getLoadStoreAlignment(I);
- unsigned Cost = 0;
+ InstructionCost Cost = 0;
if (Legal->isMaskRequired(I))
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
CostKind);
@@ -5915,8 +6749,11 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
return Cost;
}
-unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
- unsigned VF) {
+InstructionCost
+LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
+ ElementCount VF) {
+ assert(Legal->isUniformMemOp(*I));
+
Type *ValTy = getMemInstValueType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
@@ -5937,11 +6774,12 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
(isLoopInvariantStoreValue
? 0
: TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
- VF - 1));
+ VF.getKnownMinValue() - 1));
}
-unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
- unsigned VF) {
+InstructionCost
+LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
+ ElementCount VF) {
Type *ValTy = getMemInstValueType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
@@ -5953,8 +6791,9 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
TargetTransformInfo::TCK_RecipThroughput, I);
}
-unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
- unsigned VF) {
+InstructionCost
+LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
+ ElementCount VF) {
Type *ValTy = getMemInstValueType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(I);
@@ -5963,7 +6802,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
assert(Group && "Fail to get an interleaved access group.");
unsigned InterleaveFactor = Group->getFactor();
- auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor);
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
// Holds the indices of existing members in an interleaved load group.
// An interleaved store group doesn't need this as it doesn't allow gaps.
@@ -5977,7 +6817,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
// Calculate the cost of the whole interleaved group.
bool UseMaskForGaps =
Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
- unsigned Cost = TTI.getInterleavedMemoryOpCost(
+ InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
@@ -5991,11 +6831,122 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
return Cost;
}
-unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
- unsigned VF) {
+InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
+ Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
+ // Early exit for no inloop reductions
+ if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
+ return InstructionCost::getInvalid();
+ auto *VectorTy = cast<VectorType>(Ty);
+
+ // We are looking for a pattern of, and finding the minimal acceptable cost:
+ // reduce(mul(ext(A), ext(B))) or
+ // reduce(mul(A, B)) or
+ // reduce(ext(A)) or
+ // reduce(A).
+ // The basic idea is that we walk down the tree to do that, finding the root
+ // reduction instruction in InLoopReductionImmediateChains. From there we find
+ // the pattern of mul/ext and test the cost of the entire pattern vs the cost
+ // of the components. If the reduction cost is lower then we return it for the
+ // reduction instruction and 0 for the other instructions in the pattern. If
+ // it is not we return an invalid cost specifying the orignal cost method
+ // should be used.
+ Instruction *RetI = I;
+ if ((RetI->getOpcode() == Instruction::SExt ||
+ RetI->getOpcode() == Instruction::ZExt)) {
+ if (!RetI->hasOneUser())
+ return InstructionCost::getInvalid();
+ RetI = RetI->user_back();
+ }
+ if (RetI->getOpcode() == Instruction::Mul &&
+ RetI->user_back()->getOpcode() == Instruction::Add) {
+ if (!RetI->hasOneUser())
+ return InstructionCost::getInvalid();
+ RetI = RetI->user_back();
+ }
+
+ // Test if the found instruction is a reduction, and if not return an invalid
+ // cost specifying the parent to use the original cost modelling.
+ if (!InLoopReductionImmediateChains.count(RetI))
+ return InstructionCost::getInvalid();
+
+ // Find the reduction this chain is a part of and calculate the basic cost of
+ // the reduction on its own.
+ Instruction *LastChain = InLoopReductionImmediateChains[RetI];
+ Instruction *ReductionPhi = LastChain;
+ while (!isa<PHINode>(ReductionPhi))
+ ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
+
+ RecurrenceDescriptor RdxDesc =
+ Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
+ unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(),
+ VectorTy, false, CostKind);
+
+ // Get the operand that was not the reduction chain and match it to one of the
+ // patterns, returning the better cost if it is found.
+ Instruction *RedOp = RetI->getOperand(1) == LastChain
+ ? dyn_cast<Instruction>(RetI->getOperand(0))
+ : dyn_cast<Instruction>(RetI->getOperand(1));
+
+ VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
+
+ if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) &&
+ !TheLoop->isLoopInvariant(RedOp)) {
+ bool IsUnsigned = isa<ZExtInst>(RedOp);
+ auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
+ InstructionCost RedCost = TTI.getExtendedAddReductionCost(
+ /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
+ CostKind);
+
+ unsigned ExtCost =
+ TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
+ TTI::CastContextHint::None, CostKind, RedOp);
+ if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
+ return I == RetI ? *RedCost.getValue() : 0;
+ } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) {
+ Instruction *Mul = RedOp;
+ Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0));
+ Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1));
+ if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) &&
+ Op0->getOpcode() == Op1->getOpcode() &&
+ Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
+ !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
+ bool IsUnsigned = isa<ZExtInst>(Op0);
+ auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
+ // reduce(mul(ext, ext))
+ unsigned ExtCost =
+ TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
+ TTI::CastContextHint::None, CostKind, Op0);
+ unsigned MulCost =
+ TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
+
+ InstructionCost RedCost = TTI.getExtendedAddReductionCost(
+ /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
+ CostKind);
+
+ if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
+ return I == RetI ? *RedCost.getValue() : 0;
+ } else {
+ unsigned MulCost =
+ TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
+
+ InstructionCost RedCost = TTI.getExtendedAddReductionCost(
+ /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
+ CostKind);
+
+ if (RedCost.isValid() && RedCost < MulCost + BaseCost)
+ return I == RetI ? *RedCost.getValue() : 0;
+ }
+ }
+
+ return I == RetI ? BaseCost : InstructionCost::getInvalid();
+}
+
+InstructionCost
+LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
+ ElementCount VF) {
// Calculate scalar cost only. Vectorization cost should be ready at this
// moment.
- if (VF == 1) {
+ if (VF.isScalar()) {
Type *ValTy = getMemInstValueType(I);
const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
@@ -6008,43 +6959,52 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
}
LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
+LoopVectorizationCostModel::getInstructionCost(Instruction *I,
+ ElementCount VF) {
// If we know that this instruction will remain uniform, check the cost of
// the scalar version.
if (isUniformAfterVectorization(I, VF))
- VF = 1;
+ VF = ElementCount::getFixed(1);
- if (VF > 1 && isProfitableToScalarize(I, VF))
+ if (VF.isVector() && isProfitableToScalarize(I, VF))
return VectorizationCostTy(InstsToScalarize[VF][I], false);
// Forced scalars do not have any scalarization overhead.
auto ForcedScalar = ForcedScalars.find(VF);
- if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
+ if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
auto InstSet = ForcedScalar->second;
if (InstSet.count(I))
- return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
+ return VectorizationCostTy(
+ (getInstructionCost(I, ElementCount::getFixed(1)).first *
+ VF.getKnownMinValue()),
+ false);
}
Type *VectorTy;
- unsigned C = getInstructionCost(I, VF, VectorTy);
+ InstructionCost C = getInstructionCost(I, VF, VectorTy);
bool TypeNotScalarized =
- VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
+ VF.isVector() && VectorTy->isVectorTy() &&
+ TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
return VectorizationCostTy(C, TypeNotScalarized);
}
-unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
- unsigned VF) {
+InstructionCost
+LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
+ ElementCount VF) {
- if (VF == 1)
+ assert(!VF.isScalable() &&
+ "cannot compute scalarization overhead for scalable vectorization");
+ if (VF.isScalar())
return 0;
- unsigned Cost = 0;
+ InstructionCost Cost = 0;
Type *RetTy = ToVectorTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(
- cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false);
+ cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
+ true, false);
// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6061,11 +7021,11 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
// Skip operands that do not require extraction/scalarization and do not incur
// any overhead.
return Cost + TTI.getOperandsScalarizationOverhead(
- filterExtractingOperands(Ops, VF), VF);
+ filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
}
-void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
- if (VF == 1)
+void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
+ if (VF.isScalar())
return;
NumPredStores = 0;
for (BasicBlock *BB : TheLoop->blocks()) {
@@ -6082,23 +7042,19 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
NumPredStores++;
- if (Legal->isUniform(Ptr) &&
- // Conditional loads and stores should be scalarized and predicated.
- // isScalarWithPredication cannot be used here since masked
- // gather/scatters are not considered scalar with predication.
- !Legal->blockNeedsPredication(I.getParent())) {
+ if (Legal->isUniformMemOp(I)) {
// TODO: Avoid replicating loads and stores instead of
// relying on instcombine to remove them.
// Load: Scalar load + broadcast
// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
- unsigned Cost = getUniformMemOpCost(&I, VF);
+ InstructionCost Cost = getUniformMemOpCost(&I, VF);
setWideningDecision(&I, VF, CM_Scalarize, Cost);
continue;
}
// We assume that widening is the best solution when possible.
if (memoryInstructionCanBeWidened(&I, VF)) {
- unsigned Cost = getConsecutiveMemOpCost(&I, VF);
+ InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
int ConsecutiveStride =
Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
@@ -6110,7 +7066,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
}
// Choose between Interleaving, Gather/Scatter or Scalarization.
- unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
+ InstructionCost InterleaveCost = std::numeric_limits<int>::max();
unsigned NumAccesses = 1;
if (isAccessInterleaved(&I)) {
auto Group = getInterleavedAccessGroup(&I);
@@ -6125,17 +7081,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
InterleaveCost = getInterleaveGroupCost(&I, VF);
}
- unsigned GatherScatterCost =
+ InstructionCost GatherScatterCost =
isLegalGatherOrScatter(&I)
? getGatherScatterCost(&I, VF) * NumAccesses
- : std::numeric_limits<unsigned>::max();
+ : std::numeric_limits<int>::max();
- unsigned ScalarizationCost =
+ InstructionCost ScalarizationCost =
getMemInstScalarizationCost(&I, VF) * NumAccesses;
// Choose better solution for the current VF,
// write down this decision and use it during vectorization.
- unsigned Cost;
+ InstructionCost Cost;
InstWidening Decision;
if (InterleaveCost <= GatherScatterCost &&
InterleaveCost < ScalarizationCost) {
@@ -6179,8 +7135,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
// Add all instructions used to generate the addresses.
SmallVector<Instruction *, 4> Worklist;
- for (auto *I : AddrDefs)
- Worklist.push_back(I);
+ append_range(Worklist, AddrDefs);
while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
for (auto &Op : I->operands())
@@ -6199,14 +7154,18 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
InstWidening Decision = getWideningDecision(I, VF);
if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
// Scalarize a widened load of address.
- setWideningDecision(I, VF, CM_Scalarize,
- (VF * getMemoryInstructionCost(I, 1)));
+ setWideningDecision(
+ I, VF, CM_Scalarize,
+ (VF.getKnownMinValue() *
+ getMemoryInstructionCost(I, ElementCount::getFixed(1))));
else if (auto Group = getInterleavedAccessGroup(I)) {
// Scalarize an interleave group of address loads.
for (unsigned I = 0; I < Group->getFactor(); ++I) {
if (Instruction *Member = Group->getMember(I))
- setWideningDecision(Member, VF, CM_Scalarize,
- (VF * getMemoryInstructionCost(Member, 1)));
+ setWideningDecision(
+ Member, VF, CM_Scalarize,
+ (VF.getKnownMinValue() *
+ getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
}
}
} else
@@ -6216,9 +7175,9 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
}
}
-unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
- unsigned VF,
- Type *&VectorTy) {
+InstructionCost
+LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
+ Type *&VectorTy) {
Type *RetTy = I->getType();
if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -6240,19 +7199,22 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// blocks requires also an extract of its vector compare i1 element.
bool ScalarPredicatedBB = false;
BranchInst *BI = cast<BranchInst>(I);
- if (VF > 1 && BI->isConditional() &&
+ if (VF.isVector() && BI->isConditional() &&
(PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
ScalarPredicatedBB = true;
if (ScalarPredicatedBB) {
// Return cost for branches around scalarized and predicated blocks.
+ assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto *Vec_i1Ty =
- FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
- return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF),
- false, true) +
- (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF));
- } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
+ VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
+ return (TTI.getScalarizationOverhead(
+ Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
+ false, true) +
+ (TTI.getCFInstrCost(Instruction::Br, CostKind) *
+ VF.getKnownMinValue()));
+ } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
// The back-edge branch will remain, as will all scalar branches.
return TTI.getCFInstrCost(Instruction::Br, CostKind);
else
@@ -6267,20 +7229,20 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// First-order recurrences are replaced by vector shuffles inside the loop.
// NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
- if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
- return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- cast<VectorType>(VectorTy), VF - 1,
- FixedVectorType::get(RetTy, 1));
+ if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
+ return TTI.getShuffleCost(
+ TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
+ VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
// Phi nodes in non-header blocks (not inductions, reductions, etc.) are
// converted into select instructions. We require N - 1 selects per phi
// node, where N is the number of incoming values.
- if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
+ if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
return (Phi->getNumIncomingValues() - 1) *
TTI.getCmpSelInstrCost(
Instruction::Select, ToVectorTy(Phi->getType(), VF),
ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
- CostKind);
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
return TTI.getCFInstrCost(Instruction::PHI, CostKind);
}
@@ -6292,17 +7254,19 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// vector lane. Get the scalarization cost and scale this amount by the
// probability of executing the predicated block. If the instruction is not
// predicated, we fall through to the next case.
- if (VF > 1 && isScalarWithPredication(I)) {
- unsigned Cost = 0;
+ if (VF.isVector() && isScalarWithPredication(I)) {
+ InstructionCost Cost = 0;
// These instructions have a non-void type, so account for the phi nodes
// that we will create. This cost is likely to be zero. The phi node
// cost, if any, should be scaled by the block probability because it
// models a copy at the end of each predicated block.
- Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind);
+ Cost += VF.getKnownMinValue() *
+ TTI.getCFInstrCost(Instruction::PHI, CostKind);
// The cost of the non-predicated instruction.
- Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
+ Cost += VF.getKnownMinValue() *
+ TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
// The cost of insertelement and extractelement instructions needed for
// scalarization.
@@ -6331,6 +7295,13 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// Since we will replace the stride by 1 the multiplication should go away.
if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
return 0;
+
+ // Detect reduction patterns
+ InstructionCost RedCost;
+ if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+ .isValid())
+ return RedCost;
+
// Certain instructions can be cheaper to vectorize if they have a constant
// second vector operand. One example of this are shifts on x86.
Value *Op2 = I->getOperand(1);
@@ -6341,14 +7312,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
Op2VK = TargetTransformInfo::OK_UniformValue;
SmallVector<const Value *, 4> Operands(I->operand_values());
- unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
+ unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
return N * TTI.getArithmeticInstrCost(
I->getOpcode(), VectorTy, CostKind,
TargetTransformInfo::OK_AnyValue,
Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
}
case Instruction::FNeg: {
- unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
+ assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+ unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
return N * TTI.getArithmeticInstrCost(
I->getOpcode(), VectorTy, CostKind,
TargetTransformInfo::OK_AnyValue,
@@ -6362,10 +7334,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
Type *CondTy = SI->getCondition()->getType();
if (!ScalarCond)
- CondTy = FixedVectorType::get(CondTy, VF);
-
+ CondTy = VectorType::get(CondTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
- CostKind, I);
+ CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
@@ -6374,18 +7345,18 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
VectorTy = ToVectorTy(ValTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
- I);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
}
case Instruction::Store:
case Instruction::Load: {
- unsigned Width = VF;
- if (Width > 1) {
+ ElementCount Width = VF;
+ if (Width.isVector()) {
InstWidening Decision = getWideningDecision(I, Width);
assert(Decision != CM_Unknown &&
"CM decision should be taken at this point");
if (Decision == CM_Scalarize)
- Width = 1;
+ Width = ElementCount::getFixed(1);
}
VectorTy = ToVectorTy(getMemInstValueType(I), Width);
return getMemoryInstructionCost(I, VF);
@@ -6402,15 +7373,62 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
+ // Computes the CastContextHint from a Load/Store instruction.
+ auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected a load or a store!");
+
+ if (VF.isScalar() || !TheLoop->contains(I))
+ return TTI::CastContextHint::Normal;
+
+ switch (getWideningDecision(I, VF)) {
+ case LoopVectorizationCostModel::CM_GatherScatter:
+ return TTI::CastContextHint::GatherScatter;
+ case LoopVectorizationCostModel::CM_Interleave:
+ return TTI::CastContextHint::Interleave;
+ case LoopVectorizationCostModel::CM_Scalarize:
+ case LoopVectorizationCostModel::CM_Widen:
+ return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
+ : TTI::CastContextHint::Normal;
+ case LoopVectorizationCostModel::CM_Widen_Reverse:
+ return TTI::CastContextHint::Reversed;
+ case LoopVectorizationCostModel::CM_Unknown:
+ llvm_unreachable("Instr did not go through cost modelling?");
+ }
+
+ llvm_unreachable("Unhandled case!");
+ };
+
+ unsigned Opcode = I->getOpcode();
+ TTI::CastContextHint CCH = TTI::CastContextHint::None;
+ // For Trunc, the context is the only user, which must be a StoreInst.
+ if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
+ if (I->hasOneUse())
+ if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
+ CCH = ComputeCCH(Store);
+ }
+ // For Z/Sext, the context is the operand, which must be a LoadInst.
+ else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
+ Opcode == Instruction::FPExt) {
+ if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
+ CCH = ComputeCCH(Load);
+ }
+
// We optimize the truncation of induction variables having constant
// integer steps. The cost of these truncations is the same as the scalar
// operation.
if (isOptimizableIVTruncate(I, VF)) {
auto *Trunc = cast<TruncInst>(I);
return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
- Trunc->getSrcTy(), CostKind, Trunc);
+ Trunc->getSrcTy(), CCH, CostKind, Trunc);
}
+ // Detect reduction patterns
+ InstructionCost RedCost;
+ if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+ .isValid())
+ return RedCost;
+
Type *SrcScalarTy = I->getOperand(0)->getType();
Type *SrcVecTy =
VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
@@ -6421,35 +7439,39 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
//
// Calculate the modified src and dest types.
Type *MinVecTy = VectorTy;
- if (I->getOpcode() == Instruction::Trunc) {
+ if (Opcode == Instruction::Trunc) {
SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
VectorTy =
largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
- } else if (I->getOpcode() == Instruction::ZExt ||
- I->getOpcode() == Instruction::SExt) {
+ } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
VectorTy =
smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
}
}
- unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
- return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy,
- CostKind, I);
+ assert(!VF.isScalable() && "VF is assumed to be non scalable");
+ unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
+ return N *
+ TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}
case Instruction::Call: {
bool NeedToScalarize;
CallInst *CI = cast<CallInst>(I);
- unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
- if (getVectorIntrinsicIDForCall(CI, TLI))
- return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
+ InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
+ if (getVectorIntrinsicIDForCall(CI, TLI)) {
+ InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
+ return std::min(CallCost, IntrinsicCost);
+ }
return CallCost;
}
+ case Instruction::ExtractValue:
+ return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
default:
// The cost of executing VF copies of the scalar instruction. This opcode
// is unknown. Assume that it is the same as 'mul'.
- return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
- CostKind) +
+ return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
+ Instruction::Mul, VectorTy, CostKind) +
getScalarizationOverhead(I, VF);
} // end of switch.
}
@@ -6502,7 +7524,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// detection.
for (auto &Reduction : Legal->getReductionVars()) {
RecurrenceDescriptor &RedDes = Reduction.second;
- SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
+ const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
// Ignore type-casting instructions we identified during induction
@@ -6514,6 +7536,43 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
}
}
+void LoopVectorizationCostModel::collectInLoopReductions() {
+ for (auto &Reduction : Legal->getReductionVars()) {
+ PHINode *Phi = Reduction.first;
+ RecurrenceDescriptor &RdxDesc = Reduction.second;
+
+ // We don't collect reductions that are type promoted (yet).
+ if (RdxDesc.getRecurrenceType() != Phi->getType())
+ continue;
+
+ // If the target would prefer this reduction to happen "in-loop", then we
+ // want to record it as such.
+ unsigned Opcode = RdxDesc.getOpcode();
+ if (!PreferInLoopReductions &&
+ !TTI.preferInLoopReduction(Opcode, Phi->getType(),
+ TargetTransformInfo::ReductionFlags()))
+ continue;
+
+ // Check that we can correctly put the reductions into the loop, by
+ // finding the chain of operations that leads from the phi to the loop
+ // exit value.
+ SmallVector<Instruction *, 4> ReductionOperations =
+ RdxDesc.getReductionOpChain(Phi, TheLoop);
+ bool InLoop = !ReductionOperations.empty();
+ if (InLoop) {
+ InLoopReductionChains[Phi] = ReductionOperations;
+ // Add the elements to InLoopReductionImmediateChains for cost modelling.
+ Instruction *LastChain = Phi;
+ for (auto *I : ReductionOperations) {
+ InLoopReductionImmediateChains[I] = LastChain;
+ LastChain = I;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
+ << " reduction for phi: " << *Phi << "\n");
+ }
+}
+
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
@@ -6527,37 +7586,40 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
}
VectorizationFactor
-LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
- unsigned VF = UserVF;
+LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
+ assert(!UserVF.isScalable() && "scalable vectors not yet supported");
+ ElementCount VF = UserVF;
// Outer loop handling: They may require CFG and instruction level
// transformations before even evaluating whether vectorization is profitable.
// Since we cannot modify the incoming IR, we need to build VPlan upfront in
// the vectorization pipeline.
- if (!OrigLoop->empty()) {
+ if (!OrigLoop->isInnermost()) {
// If the user doesn't provide a vectorization factor, determine a
// reasonable one.
- if (!UserVF) {
- VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
+ if (UserVF.isZero()) {
+ VF = ElementCount::getFixed(
+ determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
// Make sure we have a VF > 1 for stress testing.
- if (VPlanBuildStressTest && VF < 2) {
+ if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
<< "overriding computed VF.\n");
- VF = 4;
+ VF = ElementCount::getFixed(4);
}
}
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
- assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
- LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
- << " to build VPlans.\n");
+ assert(isPowerOf2_32(VF.getKnownMinValue()) &&
+ "VF needs to be a power of two");
+ LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
+ << "VF " << VF << " to build VPlans.\n");
buildVPlans(VF, VF);
// For VPlan build stress testing, we bail out after VPlan construction.
if (VPlanBuildStressTest)
return VectorizationFactor::Disabled();
- return {VF, 0};
+ return {VF, 0 /*Cost*/};
}
LLVM_DEBUG(
@@ -6566,10 +7628,10 @@ LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
return VectorizationFactor::Disabled();
}
-Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF,
- unsigned UserIC) {
- assert(OrigLoop->empty() && "Inner loop expected.");
- Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
+Optional<VectorizationFactor>
+LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
+ assert(OrigLoop->isInnermost() && "Inner loop expected.");
+ Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
return None;
@@ -6587,40 +7649,55 @@ Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF,
CM.invalidateCostModelingDecisions();
}
- if (UserVF) {
- LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
- assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+ ElementCount MaxVF = MaybeMaxVF.getValue();
+ assert(MaxVF.isNonZero() && "MaxVF is zero.");
+
+ bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
+ if (!UserVF.isZero() &&
+ (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
+ // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable
+ // VFs here, this should be reverted to only use legal UserVFs once the
+ // loop below supports scalable VFs.
+ ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
+ LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
+ << " VF " << VF << ".\n");
+ assert(isPowerOf2_32(VF.getKnownMinValue()) &&
+ "VF needs to be a power of two");
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
- CM.selectUserVectorizationFactor(UserVF);
- buildVPlansWithVPRecipes(UserVF, UserVF);
+ CM.selectUserVectorizationFactor(VF);
+ CM.collectInLoopReductions();
+ buildVPlansWithVPRecipes(VF, VF);
LLVM_DEBUG(printPlans(dbgs()));
- return {{UserVF, 0}};
+ return {{VF, 0}};
}
- unsigned MaxVF = MaybeMaxVF.getValue();
- assert(MaxVF != 0 && "MaxVF is zero.");
+ assert(!MaxVF.isScalable() &&
+ "Scalable vectors not yet supported beyond this point");
- for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
+ for (ElementCount VF = ElementCount::getFixed(1);
+ ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
// Collect Uniform and Scalar instructions after vectorization with VF.
CM.collectUniformsAndScalars(VF);
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
- if (VF > 1)
+ if (VF.isVector())
CM.collectInstsToScalarize(VF);
}
- buildVPlansWithVPRecipes(1, MaxVF);
+ CM.collectInLoopReductions();
+
+ buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
LLVM_DEBUG(printPlans(dbgs()));
- if (MaxVF == 1)
+ if (MaxVF.isScalar())
return VectorizationFactor::Disabled();
// Select the optimal vectorization factor.
return CM.selectVectorizationFactor(MaxVF);
}
-void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
+void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
<< '\n');
BestVF = VF;
@@ -6639,13 +7716,23 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
// 1. Create a new empty loop. Unlink the old loop and connect the new one.
VPCallbackILV CallbackILV(ILV);
- VPTransformState State{BestVF, BestUF, LI,
- DT, ILV.Builder, ILV.VectorLoopValueMap,
- &ILV, CallbackILV};
+ assert(BestVF.hasValue() && "Vectorization Factor is missing");
+
+ VPTransformState State{*BestVF,
+ BestUF,
+ OrigLoop,
+ LI,
+ DT,
+ ILV.Builder,
+ ILV.VectorLoopValueMap,
+ &ILV,
+ CallbackILV};
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
State.TripCount = ILV.getOrCreateTripCount(nullptr);
State.CanonicalIV = ILV.Induction;
+ ILV.printDebugTracesAtStart();
+
//===------------------------------------------------===//
//
// Notice: any optimization or new instruction that go
@@ -6661,25 +7748,48 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
ILV.fixVectorizedLoop();
+
+ ILV.printDebugTracesAtEnd();
}
void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
SmallPtrSetImpl<Instruction *> &DeadInstructions) {
- BasicBlock *Latch = OrigLoop->getLoopLatch();
- // We create new control-flow for the vectorized loop, so the original
- // condition will be dead after vectorization if it's only used by the
- // branch.
- auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
- if (Cmp && Cmp->hasOneUse())
- DeadInstructions.insert(Cmp);
+ // We create new control-flow for the vectorized loop, so the original exit
+ // conditions will be dead after vectorization if it's only used by the
+ // terminator
+ SmallVector<BasicBlock*> ExitingBlocks;
+ OrigLoop->getExitingBlocks(ExitingBlocks);
+ for (auto *BB : ExitingBlocks) {
+ auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
+ if (!Cmp || !Cmp->hasOneUse())
+ continue;
+
+ // TODO: we should introduce a getUniqueExitingBlocks on Loop
+ if (!DeadInstructions.insert(Cmp).second)
+ continue;
+
+ // The operands of the icmp is often a dead trunc, used by IndUpdate.
+ // TODO: can recurse through operands in general
+ for (Value *Op : Cmp->operands()) {
+ if (isa<TruncInst>(Op) && Op->hasOneUse())
+ DeadInstructions.insert(cast<Instruction>(Op));
+ }
+ }
// We create new "steps" for induction variable updates to which the original
// induction variables map. An original update instruction will be dead if
// all its users except the induction variable are dead.
+ auto *Latch = OrigLoop->getLoopLatch();
for (auto &Induction : Legal->getInductionVars()) {
PHINode *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+
+ // If the tail is to be folded by masking, the primary induction variable,
+ // if exists, isn't dead: it will be used for masking. Don't kill it.
+ if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
+ continue;
+
if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
return U == Ind || DeadInstructions.count(cast<Instruction>(U));
}))
@@ -6754,12 +7864,284 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
}
}
+//===--------------------------------------------------------------------===//
+// EpilogueVectorizerMainLoop
+//===--------------------------------------------------------------------===//
+
+/// This function is partially responsible for generating the control flow
+/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
+BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
+ MDNode *OrigLoopID = OrigLoop->getLoopID();
+ Loop *Lp = createVectorLoopSkeleton("");
+
+ // Generate the code to check the minimum iteration count of the vector
+ // epilogue (see below).
+ EPI.EpilogueIterationCountCheck =
+ emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
+ EPI.EpilogueIterationCountCheck->setName("iter.check");
+
+ // Generate the code to check any assumptions that we've made for SCEV
+ // expressions.
+ BasicBlock *SavedPreHeader = LoopVectorPreHeader;
+ emitSCEVChecks(Lp, LoopScalarPreHeader);
+
+ // If a safety check was generated save it.
+ if (SavedPreHeader != LoopVectorPreHeader)
+ EPI.SCEVSafetyCheck = SavedPreHeader;
+
+ // Generate the code that checks at runtime if arrays overlap. We put the
+ // checks into a separate block to make the more common case of few elements
+ // faster.
+ SavedPreHeader = LoopVectorPreHeader;
+ emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
+
+ // If a safety check was generated save/overwite it.
+ if (SavedPreHeader != LoopVectorPreHeader)
+ EPI.MemSafetyCheck = SavedPreHeader;
+
+ // Generate the iteration count check for the main loop, *after* the check
+ // for the epilogue loop, so that the path-length is shorter for the case
+ // that goes directly through the vector epilogue. The longer-path length for
+ // the main loop is compensated for, by the gain from vectorizing the larger
+ // trip count. Note: the branch will get updated later on when we vectorize
+ // the epilogue.
+ EPI.MainLoopIterationCountCheck =
+ emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
+
+ // Generate the induction variable.
+ OldInduction = Legal->getPrimaryInduction();
+ Type *IdxTy = Legal->getWidestInductionType();
+ Value *StartIdx = ConstantInt::get(IdxTy, 0);
+ Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+ Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
+ EPI.VectorTripCount = CountRoundDown;
+ Induction =
+ createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+ getDebugLocFromInstOrOperands(OldInduction));
+
+ // Skip induction resume value creation here because they will be created in
+ // the second pass. If we created them here, they wouldn't be used anyway,
+ // because the vplan in the second pass still contains the inductions from the
+ // original loop.
+
+ return completeLoopSkeleton(Lp, OrigLoopID);
+}
+
+void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
+ LLVM_DEBUG({
+ dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
+ << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
+ << ", Main Loop UF:" << EPI.MainLoopUF
+ << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
+ << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
+ });
+}
+
+void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
+ DEBUG_WITH_TYPE(VerboseDebug, {
+ dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
+ });
+}
+
+BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
+ Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
+ assert(L && "Expected valid Loop.");
+ assert(Bypass && "Expected valid bypass basic block.");
+ unsigned VFactor =
+ ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
+ unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
+ Value *Count = getOrCreateTripCount(L);
+ // Reuse existing vector loop preheader for TC checks.
+ // Note that new preheader block is generated for vector loop.
+ BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+ IRBuilder<> Builder(TCCheckBlock->getTerminator());
+
+ // Generate code to check if the loop's trip count is less than VF * UF of the
+ // main vector loop.
+ auto P =
+ Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+
+ Value *CheckMinIters = Builder.CreateICmp(
+ P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
+ "min.iters.check");
+
+ if (!ForEpilogue)
+ TCCheckBlock->setName("vector.main.loop.iter.check");
+
+ // Create new preheader for vector loop.
+ LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
+ DT, LI, nullptr, "vector.ph");
+
+ if (ForEpilogue) {
+ assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
+ DT->getNode(Bypass)->getIDom()) &&
+ "TC check is expected to dominate Bypass");
+
+ // Update dominator for Bypass & LoopExit.
+ DT->changeImmediateDominator(Bypass, TCCheckBlock);
+ DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
+
+ LoopBypassBlocks.push_back(TCCheckBlock);
+
+ // Save the trip count so we don't have to regenerate it in the
+ // vec.epilog.iter.check. This is safe to do because the trip count
+ // generated here dominates the vector epilog iter check.
+ EPI.TripCount = Count;
+ }
+
+ ReplaceInstWithInst(
+ TCCheckBlock->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+
+ return TCCheckBlock;
+}
+
+//===--------------------------------------------------------------------===//
+// EpilogueVectorizerEpilogueLoop
+//===--------------------------------------------------------------------===//
+
+/// This function is partially responsible for generating the control flow
+/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
+BasicBlock *
+EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
+ MDNode *OrigLoopID = OrigLoop->getLoopID();
+ Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
+
+ // Now, compare the remaining count and if there aren't enough iterations to
+ // execute the vectorized epilogue skip to the scalar part.
+ BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
+ VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
+ LoopVectorPreHeader =
+ SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+ LI, nullptr, "vec.epilog.ph");
+ emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
+ VecEpilogueIterationCountCheck);
+
+ // Adjust the control flow taking the state info from the main loop
+ // vectorization into account.
+ assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
+ "expected this to be saved from the previous pass.");
+ EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
+ VecEpilogueIterationCountCheck, LoopVectorPreHeader);
+
+ DT->changeImmediateDominator(LoopVectorPreHeader,
+ EPI.MainLoopIterationCountCheck);
+
+ EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
+ VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+
+ if (EPI.SCEVSafetyCheck)
+ EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
+ VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+ if (EPI.MemSafetyCheck)
+ EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
+ VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+
+ DT->changeImmediateDominator(
+ VecEpilogueIterationCountCheck,
+ VecEpilogueIterationCountCheck->getSinglePredecessor());
+
+ DT->changeImmediateDominator(LoopScalarPreHeader,
+ EPI.EpilogueIterationCountCheck);
+ DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
+
+ // Keep track of bypass blocks, as they feed start values to the induction
+ // phis in the scalar loop preheader.
+ if (EPI.SCEVSafetyCheck)
+ LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
+ if (EPI.MemSafetyCheck)
+ LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
+ LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
+
+ // Generate a resume induction for the vector epilogue and put it in the
+ // vector epilogue preheader
+ Type *IdxTy = Legal->getWidestInductionType();
+ PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
+ LoopVectorPreHeader->getFirstNonPHI());
+ EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
+ EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
+ EPI.MainLoopIterationCountCheck);
+
+ // Generate the induction variable.
+ OldInduction = Legal->getPrimaryInduction();
+ Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
+ Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+ Value *StartIdx = EPResumeVal;
+ Induction =
+ createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+ getDebugLocFromInstOrOperands(OldInduction));
+
+ // Generate induction resume values. These variables save the new starting
+ // indexes for the scalar loop. They are used to test if there are any tail
+ // iterations left once the vector loop has completed.
+ // Note that when the vectorized epilogue is skipped due to iteration count
+ // check, then the resume value for the induction variable comes from
+ // the trip count of the main vector loop, hence passing the AdditionalBypass
+ // argument.
+ createInductionResumeValues(Lp, CountRoundDown,
+ {VecEpilogueIterationCountCheck,
+ EPI.VectorTripCount} /* AdditionalBypass */);
+
+ AddRuntimeUnrollDisableMetaData(Lp);
+ return completeLoopSkeleton(Lp, OrigLoopID);
+}
+
+BasicBlock *
+EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
+ Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
+
+ assert(EPI.TripCount &&
+ "Expected trip count to have been safed in the first pass.");
+ assert(
+ (!isa<Instruction>(EPI.TripCount) ||
+ DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
+ "saved trip count does not dominate insertion point.");
+ Value *TC = EPI.TripCount;
+ IRBuilder<> Builder(Insert->getTerminator());
+ Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
+
+ // Generate code to check if the loop's trip count is less than VF * UF of the
+ // vector epilogue loop.
+ auto P =
+ Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+
+ Value *CheckMinIters = Builder.CreateICmp(
+ P, Count,
+ ConstantInt::get(Count->getType(),
+ EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
+ "min.epilog.iters.check");
+
+ ReplaceInstWithInst(
+ Insert->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+
+ LoopBypassBlocks.push_back(Insert);
+ return Insert;
+}
+
+void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
+ LLVM_DEBUG({
+ dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
+ << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
+ << ", Main Loop UF:" << EPI.MainLoopUF
+ << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
+ << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
+ });
+}
+
+void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
+ DEBUG_WITH_TYPE(VerboseDebug, {
+ dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
+ });
+}
+
bool LoopVectorizationPlanner::getDecisionAndClampRange(
- const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
- assert(Range.End > Range.Start && "Trying to test an empty VF range.");
+ const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
+ assert(!Range.isEmpty() && "Trying to test an empty VF range.");
bool PredicateAtRangeStart = Predicate(Range.Start);
- for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
+ for (ElementCount TmpVF = Range.Start * 2;
+ ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
if (Predicate(TmpVF) != PredicateAtRangeStart) {
Range.End = TmpVF;
break;
@@ -6773,9 +8155,11 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange(
/// of VF's starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
-void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
- for (unsigned VF = MinVF; VF < MaxVF + 1;) {
- VFRange SubRange = {VF, MaxVF + 1};
+void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
+ ElementCount MaxVF) {
+ auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
+ for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
+ VFRange SubRange = {VF, MaxVFPlusOne};
VPlans.push_back(buildVPlan(SubRange));
VF = SubRange.End;
}
@@ -6800,7 +8184,13 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
return EdgeMaskCache[Edge] = SrcMask;
- VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
+ // If source is an exiting block, we know the exit edge is dynamically dead
+ // in the vector loop, and thus we don't need to restrict the mask. Avoid
+ // adding uses of an otherwise potentially dead instruction.
+ if (OrigLoop->isLoopExiting(Src))
+ return EdgeMaskCache[Edge] = SrcMask;
+
+ VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
assert(EdgeMask && "No Edge Mask found for condition");
if (BI->getSuccessor(0) != Dst)
@@ -6828,23 +8218,34 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
if (!CM.blockNeedsPredication(BB))
return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
+ // Create the block in mask as the first non-phi instruction in the block.
+ VPBuilder::InsertPointGuard Guard(Builder);
+ auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
+ Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
+
// Introduce the early-exit compare IV <= BTC to form header block mask.
// This is used instead of IV < TC because TC may wrap, unlike BTC.
// Start by constructing the desired canonical IV.
VPValue *IV = nullptr;
if (Legal->getPrimaryInduction())
- IV = Plan->getVPValue(Legal->getPrimaryInduction());
+ IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
else {
auto IVRecipe = new VPWidenCanonicalIVRecipe();
- Builder.getInsertBlock()->appendRecipe(IVRecipe);
+ Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
IV = IVRecipe->getVPValue();
}
VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
bool TailFolded = !CM.isScalarEpilogueAllowed();
- if (TailFolded && CM.TTI.emitGetActiveLaneMask())
- BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
- else
+
+ if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
+ // While ActiveLaneMask is a binary op that consumes the loop tripcount
+ // as a second argument, we only pass the IV here and extract the
+ // tripcount from the transform state where codegen of the VP instructions
+ // happen.
+ BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
+ } else {
BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+ }
return BlockMaskCache[BB] = BlockMask;
}
@@ -6865,14 +8266,13 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
return BlockMaskCache[BB] = BlockMask;
}
-VPWidenMemoryInstructionRecipe *
-VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
- VPlanPtr &Plan) {
+VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
+ VPlanPtr &Plan) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
"Must be called with either a load or store");
- auto willWiden = [&](unsigned VF) -> bool {
- if (VF == 1)
+ auto willWiden = [&](ElementCount VF) -> bool {
+ if (VF.isScalar())
return false;
LoopVectorizationCostModel::InstWidening Decision =
CM.getWideningDecision(I, VF);
@@ -6903,20 +8303,22 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
}
VPWidenIntOrFpInductionRecipe *
-VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
+VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const {
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
if (II.getKind() == InductionDescriptor::IK_IntInduction ||
- II.getKind() == InductionDescriptor::IK_FpInduction)
- return new VPWidenIntOrFpInductionRecipe(Phi);
+ II.getKind() == InductionDescriptor::IK_FpInduction) {
+ VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start);
+ }
return nullptr;
}
VPWidenIntOrFpInductionRecipe *
-VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
- VFRange &Range) const {
+VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range,
+ VPlan &Plan) const {
// Optimize the special case where the source is a constant integer
// induction variable. Notice that we can only optimize the 'trunc' case
// because (a) FP conversions lose precision, (b) sext/zext may wrap, and
@@ -6925,15 +8327,21 @@ VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
// Determine whether \p K is a truncation based on an induction variable that
// can be optimized.
auto isOptimizableIVTruncate =
- [&](Instruction *K) -> std::function<bool(unsigned)> {
- return
- [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
+ [&](Instruction *K) -> std::function<bool(ElementCount)> {
+ return [=](ElementCount VF) -> bool {
+ return CM.isOptimizableIVTruncate(K, VF);
+ };
};
if (LoopVectorizationPlanner::getDecisionAndClampRange(
- isOptimizableIVTruncate(I), Range))
+ isOptimizableIVTruncate(I), Range)) {
+
+ InductionDescriptor II =
+ Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
+ VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
- I);
+ Start, I);
+ }
return nullptr;
}
@@ -6962,7 +8370,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
VPlan &Plan) const {
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); },
+ [this, CI](ElementCount VF) {
+ return CM.isScalarWithPredication(CI, VF);
+ },
Range);
if (IsPredicated)
@@ -6970,19 +8380,23 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
- ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
+ ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
+ ID == Intrinsic::pseudoprobe ||
+ ID == Intrinsic::experimental_noalias_scope_decl))
return nullptr;
- auto willWiden = [&](unsigned VF) -> bool {
+ auto willWiden = [&](ElementCount VF) -> bool {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// The following case may be scalarized depending on the VF.
// The flag shows whether we use Intrinsic or a usual Call for vectorized
// version of the instruction.
// Is it beneficial to perform intrinsic call compared to lib call?
bool NeedToScalarize = false;
- unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
- bool UseVectorIntrinsic =
- ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
+ InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
+ InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
+ bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
+ assert(IntrinsicCost.isValid() && CallCost.isValid() &&
+ "Cannot have invalid costs while widening");
return UseVectorIntrinsic || !NeedToScalarize;
};
@@ -6997,7 +8411,7 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
!isa<StoreInst>(I) && "Instruction should have been handled earlier");
// Instruction should be widened, unless it is scalar after vectorization,
// scalarization is profitable or it is predicated.
- auto WillScalarize = [this, I](unsigned VF) -> bool {
+ auto WillScalarize = [this, I](ElementCount VF) -> bool {
return CM.isScalarAfterVectorization(I, VF) ||
CM.isProfitableToScalarize(I, VF) ||
CM.isScalarWithPredication(I, VF);
@@ -7060,15 +8474,17 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
VPlanPtr &Plan) {
bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
+ [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
+ [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
+ Range);
auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
IsUniform, IsPredicated);
setRecipe(I, Recipe);
+ Plan->addVPValue(I, Recipe);
// Find if I uses a predicated instruction. If so, it will use its scalar
// value. Avoid hoisting the insert-element which packs the scalar value into
@@ -7110,8 +8526,9 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
assert(Instr->getParent() && "Predicated instruction not in any basic block");
auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
- auto *PHIRecipe =
- Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
+ auto *PHIRecipe = Instr->getType()->isVoidTy()
+ ? nullptr
+ : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
@@ -7139,13 +8556,21 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
if (auto Phi = dyn_cast<PHINode>(Instr)) {
if (Phi->getParent() != OrigLoop->getHeader())
return tryToBlend(Phi, Plan);
- if ((Recipe = tryToOptimizeInductionPHI(Phi)))
+ if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
return Recipe;
+
+ if (Legal->isReductionVariable(Phi)) {
+ RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
+ VPValue *StartV =
+ Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
+ return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
+ }
+
return new VPWidenPHIRecipe(Phi);
}
- if (isa<TruncInst>(Instr) &&
- (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
+ if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
+ cast<TruncInst>(Instr), Range, *Plan)))
return Recipe;
if (!shouldWiden(Instr, Range))
@@ -7165,35 +8590,9 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return tryToWiden(Instr, *Plan);
}
-void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
- unsigned MaxVF) {
- assert(OrigLoop->empty() && "Inner loop expected.");
-
- // Collect conditions feeding internal conditional branches; they need to be
- // represented in VPlan for it to model masking.
- SmallPtrSet<Value *, 1> NeedDef;
-
- auto *Latch = OrigLoop->getLoopLatch();
- for (BasicBlock *BB : OrigLoop->blocks()) {
- if (BB == Latch)
- continue;
- BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
- if (Branch && Branch->isConditional())
- NeedDef.insert(Branch->getCondition());
- }
-
- // If the tail is to be folded by masking, the primary induction variable, if
- // exists needs to be represented in VPlan for it to model early-exit masking.
- // Also, both the Phi and the live-out instruction of each reduction are
- // required in order to introduce a select between them in VPlan.
- if (CM.foldTailByMasking()) {
- if (Legal->getPrimaryInduction())
- NeedDef.insert(Legal->getPrimaryInduction());
- for (auto &Reduction : Legal->getReductionVars()) {
- NeedDef.insert(Reduction.first);
- NeedDef.insert(Reduction.second.getLoopExitInstr());
- }
- }
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
+ ElementCount MaxVF) {
+ assert(OrigLoop->isInnermost() && "Inner loop expected.");
// Collect instructions from the original loop that will become trivially dead
// in the vectorized loop. We don't need to vectorize these instructions. For
@@ -7216,17 +8615,17 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
for (Instruction *I : DeadInstructions)
SinkAfter.erase(I);
- for (unsigned VF = MinVF; VF < MaxVF + 1;) {
- VFRange SubRange = {VF, MaxVF + 1};
- VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
- DeadInstructions, SinkAfter));
+ auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
+ for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
+ VFRange SubRange = {VF, MaxVFPlusOne};
+ VPlans.push_back(
+ buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
VF = SubRange.End;
}
}
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
- VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
- SmallPtrSetImpl<Instruction *> &DeadInstructions,
+ VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
const DenseMap<Instruction *, Instruction *> &SinkAfter) {
// Hold a mapping from predicated instructions to their recipes, in order to
@@ -7249,14 +8648,28 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
RecipeBuilder.recordRecipeOf(Entry.first);
RecipeBuilder.recordRecipeOf(Entry.second);
}
+ for (auto &Reduction : CM.getInLoopReductionChains()) {
+ PHINode *Phi = Reduction.first;
+ RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
+ const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
+
+ RecipeBuilder.recordRecipeOf(Phi);
+ for (auto &R : ReductionOperations) {
+ RecipeBuilder.recordRecipeOf(R);
+ // For min/max reducitons, where we have a pair of icmp/select, we also
+ // need to record the ICmp recipe, so it can be removed later.
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
+ RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
+ }
+ }
// For each interleave group which is relevant for this (possibly trimmed)
// Range, add it to the set of groups to be later applied to the VPlan and add
// placeholders for its members' Recipes which we'll be replacing with a
// single VPInterleaveRecipe.
for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
- auto applyIG = [IG, this](unsigned VF) -> bool {
- return (VF >= 2 && // Query is illegal for VF == 1
+ auto applyIG = [IG, this](ElementCount VF) -> bool {
+ return (VF.isVector() && // Query is illegal for VF == 1
CM.getWideningDecision(IG->getInsertPos(), VF) ==
LoopVectorizationCostModel::CM_Interleave);
};
@@ -7278,10 +8691,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
Plan->setEntry(VPBB);
- // Represent values that will have defs inside VPlan.
- for (Value *V : NeedDef)
- Plan->addVPValue(V);
-
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
@@ -7308,6 +8717,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
if (auto Recipe =
RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
+ for (auto *Def : Recipe->definedValues()) {
+ auto *UV = Def->getUnderlyingValue();
+ Plan->addVPValue(UV, Def);
+ }
+
RecipeBuilder.setRecipe(Instr, Recipe);
VPBB->appendRecipe(Recipe);
continue;
@@ -7343,6 +8757,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
for (auto &Entry : SinkAfter) {
VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
+ // If the target is in a replication region, make sure to move Sink to the
+ // block after it, not into the replication region itself.
+ if (auto *Region =
+ dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) {
+ if (Region->isReplicator()) {
+ assert(Region->getNumSuccessors() == 1 && "Expected SESE region!");
+ VPBasicBlock *NextBlock =
+ cast<VPBasicBlock>(Region->getSuccessors().front());
+ Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
+ continue;
+ }
+ }
Sink->moveAfter(Target);
}
@@ -7352,33 +8778,52 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
for (auto IG : InterleaveGroups) {
auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
RecipeBuilder.getRecipe(IG->getInsertPos()));
- (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
- ->insertBefore(Recipe);
+ SmallVector<VPValue *, 4> StoredValues;
+ for (unsigned i = 0; i < IG->getFactor(); ++i)
+ if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
+ StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
+ auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
+ Recipe->getMask());
+ VPIG->insertBefore(Recipe);
+ unsigned J = 0;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (Instruction *Member = IG->getMember(i)) {
+ if (!Member->getType()->isVoidTy()) {
+ VPValue *OriginalV = Plan->getVPValue(Member);
+ Plan->removeVPValueFor(Member);
+ Plan->addVPValue(Member, VPIG->getVPValue(J));
+ OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
+ J++;
+ }
RecipeBuilder.getRecipe(Member)->eraseFromParent();
}
}
+ // Adjust the recipes for any inloop reductions.
+ if (Range.Start.isVector())
+ adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
+
// Finally, if tail is folded by masking, introduce selects between the phi
// and the live-out instruction of each reduction, at the end of the latch.
- if (CM.foldTailByMasking()) {
+ if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
Builder.setInsertPoint(VPBB);
auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
for (auto &Reduction : Legal->getReductionVars()) {
- VPValue *Phi = Plan->getVPValue(Reduction.first);
- VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
+ if (CM.isInLoopReduction(Reduction.first))
+ continue;
+ VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
+ VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
}
}
std::string PlanName;
raw_string_ostream RSO(PlanName);
- unsigned VF = Range.Start;
+ ElementCount VF = Range.Start;
Plan->addVF(VF);
RSO << "Initial VPlan for VF={" << VF;
- for (VF *= 2; VF < Range.End; VF *= 2) {
+ for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
Plan->addVF(VF);
RSO << "," << VF;
}
@@ -7394,7 +8839,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
// transformations before even evaluating whether vectorization is profitable.
// Since we cannot modify the incoming IR, we need to build VPlan upfront in
// the vectorization pipeline.
- assert(!OrigLoop->empty());
+ assert(!OrigLoop->isInnermost());
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
// Create new empty VPlan
@@ -7404,7 +8849,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
HCFGBuilder.buildHierarchicalCFG();
- for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
+ for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
+ VF *= 2)
Plan->addVF(VF);
if (EnableVPlanPredication) {
@@ -7422,6 +8868,67 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
return Plan;
}
+// Adjust the recipes for any inloop reductions. The chain of instructions
+// leading from the loop exit instr to the phi need to be converted to
+// reductions, with one operand being vector and the other being the scalar
+// reduction chain.
+void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
+ VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
+ for (auto &Reduction : CM.getInLoopReductionChains()) {
+ PHINode *Phi = Reduction.first;
+ RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
+ const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
+
+ // ReductionOperations are orders top-down from the phi's use to the
+ // LoopExitValue. We keep a track of the previous item (the Chain) to tell
+ // which of the two operands will remain scalar and which will be reduced.
+ // For minmax the chain will be the select instructions.
+ Instruction *Chain = Phi;
+ for (Instruction *R : ReductionOperations) {
+ VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
+ RecurKind Kind = RdxDesc.getRecurrenceKind();
+
+ VPValue *ChainOp = Plan->getVPValue(Chain);
+ unsigned FirstOpId;
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
+ assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
+ "Expected to replace a VPWidenSelectSC");
+ FirstOpId = 1;
+ } else {
+ assert(isa<VPWidenRecipe>(WidenRecipe) &&
+ "Expected to replace a VPWidenSC");
+ FirstOpId = 0;
+ }
+ unsigned VecOpId =
+ R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
+ VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
+
+ auto *CondOp = CM.foldTailByMasking()
+ ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
+ : nullptr;
+ VPReductionRecipe *RedRecipe = new VPReductionRecipe(
+ &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
+ WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
+ Plan->removeVPValueFor(R);
+ Plan->addVPValue(R, RedRecipe);
+ WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
+ WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
+ WidenRecipe->eraseFromParent();
+
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
+ VPRecipeBase *CompareRecipe =
+ RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
+ assert(isa<VPWidenRecipe>(CompareRecipe) &&
+ "Expected to replace a VPWidenSC");
+ assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
+ "Expected no remaining users");
+ CompareRecipe->eraseFromParent();
+ }
+ Chain = R;
+ }
+ }
+}
+
Value* LoopVectorizationPlanner::VPCallbackILV::
getOrCreateVectorValues(Value *V, unsigned Part) {
return ILV.getOrCreateVectorValue(V, Part);
@@ -7449,29 +8956,35 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
}
void VPWidenCallRecipe::execute(VPTransformState &State) {
- State.ILV->widenCallInstruction(Ingredient, User, State);
+ State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
+ *this, State);
}
void VPWidenSelectRecipe::execute(VPTransformState &State) {
- State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State);
+ State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
+ this, *this, InvariantCond, State);
}
void VPWidenRecipe::execute(VPTransformState &State) {
- State.ILV->widenInstruction(Ingredient, User, State);
+ State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
}
void VPWidenGEPRecipe::execute(VPTransformState &State) {
- State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant,
+ State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
+ *this, State.UF, State.VF, IsPtrLoopInvariant,
IsIndexLoopInvariant, State);
}
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
- State.ILV->widenIntOrFpInduction(IV, Trunc);
+ State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
+ Trunc);
}
void VPWidenPHIRecipe::execute(VPTransformState &State) {
- State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
+ Value *StartV =
+ getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr;
+ State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF);
}
void VPBlendRecipe::execute(VPTransformState &State) {
@@ -7515,22 +9028,59 @@ void VPBlendRecipe::execute(VPTransformState &State) {
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Interleave group being replicated.");
- State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
+ State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
+ getStoredValues(), getMask());
+}
+
+void VPReductionRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Reduction being replicated.");
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ RecurKind Kind = RdxDesc->getRecurrenceKind();
+ Value *NewVecOp = State.get(getVecOp(), Part);
+ if (VPValue *Cond = getCondOp()) {
+ Value *NewCond = State.get(Cond, Part);
+ VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
+ Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
+ Kind, VecTy->getElementType());
+ Constant *IdenVec =
+ ConstantVector::getSplat(VecTy->getElementCount(), Iden);
+ Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
+ NewVecOp = Select;
+ }
+ Value *NewRed =
+ createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
+ Value *PrevInChain = State.get(getChainOp(), Part);
+ Value *NextInChain;
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
+ NextInChain =
+ createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
+ NewRed, PrevInChain);
+ } else {
+ NextInChain = State.Builder.CreateBinOp(
+ (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
+ PrevInChain);
+ }
+ State.set(this, getUnderlyingInstr(), NextInChain, Part);
+ }
}
void VPReplicateRecipe::execute(VPTransformState &State) {
if (State.Instance) { // Generate a single instance.
- State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
- IsPredicated, State);
+ assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
+ State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
+ *State.Instance, IsPredicated, State);
// Insert scalar instance packing it into a vector.
- if (AlsoPack && State.VF > 1) {
- // If we're constructing lane 0, initialize to start from undef.
+ if (AlsoPack && State.VF.isVector()) {
+ // If we're constructing lane 0, initialize to start from poison.
if (State.Instance->Lane == 0) {
- Value *Undef = UndefValue::get(
- FixedVectorType::get(Ingredient->getType(), State.VF));
- State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
+ assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
+ Value *Poison = PoisonValue::get(
+ VectorType::get(getUnderlyingValue()->getType(), State.VF));
+ State.ValueMap.setVectorValue(getUnderlyingInstr(),
+ State.Instance->Part, Poison);
}
- State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
+ State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
+ *State.Instance);
}
return;
}
@@ -7538,10 +9088,12 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
// Generate scalar instances for all VF lanes of all UF parts, unless the
// instruction is uniform inwhich case generate only the first lane for each
// of the UF parts.
- unsigned EndLane = IsUniform ? 1 : State.VF;
+ unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
+ assert((!State.VF.isScalable() || IsUniform) &&
+ "Can't scalarize a scalable vector");
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
- State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
+ State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
IsPredicated, State);
}
@@ -7573,8 +9125,8 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
assert(State.Instance && "Predicated instruction PHI works per instance.");
- Instruction *ScalarPredInst = cast<Instruction>(
- State.ValueMap.getScalarValue(PredInst, *State.Instance));
+ Instruction *ScalarPredInst =
+ cast<Instruction>(State.get(getOperand(0), *State.Instance));
BasicBlock *PredicatedBB = ScalarPredInst->getParent();
BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
assert(PredicatingBB && "Predicated block has no single predecessor.");
@@ -7586,6 +9138,8 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
// also do that packing, thereby "hoisting" the insert-element sequence.
// Otherwise, a phi node for the scalar value is needed.
unsigned Part = State.Instance->Part;
+ Instruction *PredInst =
+ cast<Instruction>(getOperand(0)->getUnderlyingValue());
if (State.ValueMap.hasVectorValue(PredInst, Part)) {
Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
@@ -7596,16 +9150,17 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
} else {
Type *PredInstType = PredInst->getType();
PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
- Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
+ Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB);
Phi->addIncoming(ScalarPredInst, PredicatedBB);
State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
}
}
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
- VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
- State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
- getMask());
+ VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
+ State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
+ StoredValue ? nullptr : getVPValue(),
+ getAddr(), StoredValue, getMask());
}
// Determine how to lower the scalar epilogue, which depends on 1) optimising
@@ -7617,35 +9172,53 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
LoopVectorizationLegality &LVL) {
- bool OptSize =
- F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
- PGSOQueryType::IRPass);
// 1) OptSize takes precedence over all other options, i.e. if this is set,
// don't look at hints or options, and don't request a scalar epilogue.
- if (OptSize)
+ // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
+ // LoopAccessInfo (due to code dependency and not being able to reliably get
+ // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
+ // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
+ // versioning when the vectorization is forced, unlike hasOptSize. So revert
+ // back to the old way and vectorize with versioning when forced. See D81345.)
+ if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+ PGSOQueryType::IRPass) &&
+ Hints.getForce() != LoopVectorizeHints::FK_Enabled))
return CM_ScalarEpilogueNotAllowedOptSize;
- bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
- !PreferPredicateOverEpilog;
+ // 2) If set, obey the directives
+ if (PreferPredicateOverEpilogue.getNumOccurrences()) {
+ switch (PreferPredicateOverEpilogue) {
+ case PreferPredicateTy::ScalarEpilogue:
+ return CM_ScalarEpilogueAllowed;
+ case PreferPredicateTy::PredicateElseScalarEpilogue:
+ return CM_ScalarEpilogueNotNeededUsePredicate;
+ case PreferPredicateTy::PredicateOrDontVectorize:
+ return CM_ScalarEpilogueNotAllowedUsePredicate;
+ };
+ }
- // 2) Next, if disabling predication is requested on the command line, honour
- // this and request a scalar epilogue.
- if (PredicateOptDisabled)
+ // 3) If set, obey the hints
+ switch (Hints.getPredicate()) {
+ case LoopVectorizeHints::FK_Enabled:
+ return CM_ScalarEpilogueNotNeededUsePredicate;
+ case LoopVectorizeHints::FK_Disabled:
return CM_ScalarEpilogueAllowed;
+ };
- // 3) and 4) look if enabling predication is requested on the command line,
- // with a loop hint, or if the TTI hook indicates this is profitable, request
- // predication .
- if (PreferPredicateOverEpilog ||
- Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
- (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
- LVL.getLAI()) &&
- Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
+ // 4) if the TTI hook indicates this is profitable, request predication.
+ if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
+ LVL.getLAI()))
return CM_ScalarEpilogueNotNeededUsePredicate;
return CM_ScalarEpilogueAllowed;
}
+void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
+ unsigned Part) {
+ set(Def, V, Part);
+ ILV->setVectorValue(IRDef, Part, V);
+}
+
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
@@ -7657,7 +9230,7 @@ static bool processLoopInVPlanNativePath(
OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
- if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
+ if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
return false;
}
@@ -7676,7 +9249,7 @@ static bool processLoopInVPlanNativePath(
LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
// Get user vectorization factor.
- const unsigned UserVF = Hints.getWidth();
+ ElementCount UserVF = Hints.getWidth();
// Plan how to best vectorize, return the best VF and its cost.
const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
@@ -7691,7 +9264,7 @@ static bool processLoopInVPlanNativePath(
LVP.setBestPlan(VF.Width, 1);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
- &CM);
+ &CM, BFI, PSI);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
LVP.executePlan(LB, DT);
@@ -7710,7 +9283,7 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
!EnableLoopVectorization) {}
bool LoopVectorizePass::processLoop(Loop *L) {
- assert((EnableVPlanNativePath || L->empty()) &&
+ assert((EnableVPlanNativePath || L->isInnermost()) &&
"VPlan-native path is not enabled. Only process inner loops.");
#ifndef NDEBUG
@@ -7755,7 +9328,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements(*ORE);
LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
- &Requirements, &Hints, DB, AC);
+ &Requirements, &Hints, DB, AC, BFI, PSI);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
Hints.emitRemarkWithHints();
@@ -7772,11 +9345,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// even evaluating whether vectorization is profitable. Since we cannot modify
// the incoming IR, we need to build VPlan upfront in the vectorization
// pipeline.
- if (!L->empty())
+ if (!L->isInnermost())
return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
ORE, BFI, PSI, Hints);
- assert(L->empty() && "Inner loop expected.");
+ assert(L->isInnermost() && "Inner loop expected.");
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
@@ -7841,7 +9414,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
// Get user vectorization factor and interleave count.
- unsigned UserVF = Hints.getWidth();
+ ElementCount UserVF = Hints.getWidth();
unsigned UserIC = Hints.getInterleave();
// Plan how to best vectorize, return the best VF and its cost.
@@ -7866,7 +9439,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- if (VF.Width == 1) {
+ if (VF.Width.isScalar()) {
LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
VecDiagMsg = std::make_pair(
"VectorizationNotBeneficial",
@@ -7955,8 +9528,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
assert(IC > 1 && "interleave count should not be 1 or 0");
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
- InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
- &CM);
+ InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
+ BFI, PSI);
LVP.executePlan(Unroller, DT);
ORE->emit([&]() {
@@ -7967,16 +9540,51 @@ bool LoopVectorizePass::processLoop(Loop *L) {
});
} else {
// If we decided that it is *legal* to vectorize the loop, then do it.
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
- &LVL, &CM);
- LVP.executePlan(LB, DT);
- ++LoopsVectorized;
- // Add metadata to disable runtime unrolling a scalar loop when there are
- // no runtime checks about strides and memory. A scalar loop that is
- // rarely used is not worth unrolling.
- if (!LB.areSafetyChecksAdded())
- DisableRuntimeUnroll = true;
+ // Consider vectorizing the epilogue too if it's profitable.
+ VectorizationFactor EpilogueVF =
+ CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
+ if (EpilogueVF.Width.isVector()) {
+
+ // The first pass vectorizes the main loop and creates a scalar epilogue
+ // to be vectorized by executing the plan (potentially with a different
+ // factor) again shortly afterwards.
+ EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
+ EpilogueVF.Width.getKnownMinValue(), 1);
+ EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
+ &LVL, &CM, BFI, PSI);
+
+ LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
+ LVP.executePlan(MainILV, DT);
+ ++LoopsVectorized;
+
+ simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
+ formLCSSARecursively(*L, *DT, LI, SE);
+
+ // Second pass vectorizes the epilogue and adjusts the control flow
+ // edges from the first pass.
+ LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
+ EPI.MainLoopVF = EPI.EpilogueVF;
+ EPI.MainLoopUF = EPI.EpilogueUF;
+ EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
+ ORE, EPI, &LVL, &CM, BFI, PSI);
+ LVP.executePlan(EpilogILV, DT);
+ ++LoopsEpilogueVectorized;
+
+ if (!MainILV.areSafetyChecksAdded())
+ DisableRuntimeUnroll = true;
+ } else {
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
+ &LVL, &CM, BFI, PSI);
+ LVP.executePlan(LB, DT);
+ ++LoopsVectorized;
+
+ // Add metadata to disable runtime unrolling a scalar loop when there are
+ // no runtime checks about strides and memory. A scalar loop that is
+ // rarely used is not worth unrolling.
+ if (!LB.areSafetyChecksAdded())
+ DisableRuntimeUnroll = true;
+ }
// Report the vectorization decision.
ORE->emit([&]() {
@@ -8090,7 +9698,8 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & {
- LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
+ TLI, TTI, nullptr, MSSA};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);