Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp  1687
1 file changed, 787 insertions(+), 900 deletions(-)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 859d0c92ca5a..c45dee590b84 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -58,6 +58,7 @@
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlanHCFGBuilder.h"
+#include "VPlanHCFGTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -151,6 +152,16 @@ using namespace llvm;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
+/// @{
+/// Metadata attribute names
+static const char *const LLVMLoopVectorizeFollowupAll =
+ "llvm.loop.vectorize.followup_all";
+static const char *const LLVMLoopVectorizeFollowupVectorized =
+ "llvm.loop.vectorize.followup_vectorized";
+static const char *const LLVMLoopVectorizeFollowupEpilogue =
+ "llvm.loop.vectorize.followup_epilogue";
+/// @}
+
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
@@ -171,11 +182,11 @@ static cl::opt<bool> EnableInterleavedMemAccesses(
"enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
-/// Maximum factor for an interleaved memory access.
-static cl::opt<unsigned> MaxInterleaveGroupFactor(
- "max-interleave-group-factor", cl::Hidden,
- cl::desc("Maximum factor for an interleaved access group (default = 8)"),
- cl::init(8));
+/// An interleave-group may need masking if it resides in a block that needs
+/// predication, or in order to mask away gaps.
+static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
+ "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
+ cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
/// We don't interleave loops with a known constant trip count below this
/// number.
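
// A minimal illustrative sketch (not from this patch) of the two situations
// the new -enable-masked-interleaved-mem-accesses flag targets, in source
// form; names are made up for illustration only:
void predicated_group(int *A, int *B, const bool *C, int N) {
  for (int i = 0; i < N; ++i)
    if (C[i]) {               // block needs predication
      int r = A[2 * i];       // interleave group of factor 2; its wide load
      int g = A[2 * i + 1];   // must be masked by the block predicate
      B[i] = r + g;
    }
}
void group_with_gap(int *A, int *B, int N) {
  for (int i = 0; i < N; ++i) {
    int r = A[3 * i];         // members 0 and 2 of a factor-3 group;
    int b = A[3 * i + 2];     // member 1 is a gap, masked away (or handled
    B[i] = r + b;             // by a scalar epilogue)
  }
}
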
@@ -240,7 +251,7 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC(
cl::desc("The maximum interleave count to use when interleaving a scalar "
"reduction in a nested loop."));
-static cl::opt<bool> EnableVPlanNativePath(
+cl::opt<bool> EnableVPlanNativePath(
"enable-vplan-native-path", cl::init(false), cl::Hidden,
cl::desc("Enable VPlan-native vectorization path with "
"support for outer loop vectorization."));
@@ -265,10 +276,6 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) {
return VectorType::get(Scalar, VF);
}
-// FIXME: The following helper functions have multiple implementations
-// in the project. They can be effectively organized in a common Load/Store
-// utilities unit.
-
/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
@@ -278,25 +285,6 @@ static Type *getMemInstValueType(Value *I) {
return cast<StoreInst>(I)->getValueOperand()->getType();
}
-/// A helper function that returns the alignment of load or store instruction.
-static unsigned getMemInstAlignment(Value *I) {
- assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
- "Expected Load or Store instruction");
- if (auto *LI = dyn_cast<LoadInst>(I))
- return LI->getAlignment();
- return cast<StoreInst>(I)->getAlignment();
-}
-
-/// A helper function that returns the address space of the pointer operand of
-/// load or store instruction.
-static unsigned getMemInstAddressSpace(Value *I) {
- assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
- "Expected Load or Store instruction");
- if (auto *LI = dyn_cast<LoadInst>(I))
- return LI->getPointerAddressSpace();
- return cast<StoreInst>(I)->getPointerAddressSpace();
-}
-
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
@@ -436,8 +424,10 @@ public:
/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
- /// Try to vectorize the interleaved access group that \p Instr belongs to.
- void vectorizeInterleaveGroup(Instruction *Instr);
+ /// Try to vectorize the interleaved access group that \p Instr belongs to,
+ /// optionally masking the vector operations if \p BlockInMask is non-null.
+ void vectorizeInterleaveGroup(Instruction *Instr,
+ VectorParts *BlockInMask = nullptr);
/// Vectorize Load and Store instructions, optionally masking the vector
/// operations if \p BlockInMask is non-null.
@@ -448,6 +438,9 @@ public:
/// the instruction.
void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
+ /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
+ void fixNonInductionPHIs(void);
+
protected:
friend class LoopVectorizationPlanner;
@@ -584,6 +577,16 @@ protected:
/// Emit bypass checks to check any memory assumptions we may have made.
void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
+ /// Compute the transformed value of Index at offset StartValue using step
+ /// StepValue.
+ /// For integer induction, returns StartValue + Index * StepValue.
+ /// For pointer induction, returns StartValue[Index * StepValue].
+ /// FIXME: The newly created binary instructions should contain nsw/nuw
+ /// flags, which can be found from the original scalar operations.
+ Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
+ const DataLayout &DL,
+ const InductionDescriptor &ID) const;
+
/// Add additional metadata to \p To that was not present on \p Orig.
///
/// Currently this is used to add the noalias annotations based on the
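
// A minimal scalar sketch (not LLVM code) of the transformed-index semantics
// documented above; function names here are illustrative only:
long transformedIntIndex(long StartValue, long Index, long StepValue) {
  return StartValue + Index * StepValue;   // integer induction
}
int *transformedPtrIndex(int *StartValue, long Index, long StepValue) {
  return &StartValue[Index * StepValue];   // pointer induction
}
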
@@ -705,6 +708,10 @@ protected:
// Holds the end values for each induction variable. We save the end values
// so we can later fix-up the external users of the induction variables.
DenseMap<PHINode *, Value *> IVEndValues;
+
+ // Vector of original scalar PHIs whose corresponding widened PHIs need to be
+ // fixed up at the end of vector code generation.
+ SmallVector<PHINode *, 8> OrigPHIsToFix;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -752,8 +759,15 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr)
if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
const DILocation *DIL = Inst->getDebugLoc();
if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
- !isa<DbgInfoIntrinsic>(Inst))
- B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
+ !isa<DbgInfoIntrinsic>(Inst)) {
+ auto NewDIL = DIL->cloneWithDuplicationFactor(UF * VF);
+ if (NewDIL)
+ B.SetCurrentDebugLocation(NewDIL.getValue());
+ else
+ LLVM_DEBUG(dbgs()
+ << "Failed to create new discriminator: "
+ << DIL->getFilename() << " Line: " << DIL->getLine());
+ }
else
B.SetCurrentDebugLocation(DIL);
} else
@@ -801,367 +815,6 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
namespace llvm {
-/// The group of interleaved loads/stores sharing the same stride and
-/// close to each other.
-///
-/// Each member in this group has an index starting from 0, and the largest
-/// index should be less than interleaved factor, which is equal to the absolute
-/// value of the access's stride.
-///
-/// E.g. An interleaved load group of factor 4:
-/// for (unsigned i = 0; i < 1024; i+=4) {
-/// a = A[i]; // Member of index 0
-/// b = A[i+1]; // Member of index 1
-/// d = A[i+3]; // Member of index 3
-/// ...
-/// }
-///
-/// An interleaved store group of factor 4:
-/// for (unsigned i = 0; i < 1024; i+=4) {
-/// ...
-/// A[i] = a; // Member of index 0
-/// A[i+1] = b; // Member of index 1
-/// A[i+2] = c; // Member of index 2
-/// A[i+3] = d; // Member of index 3
-/// }
-///
-/// Note: the interleaved load group could have gaps (missing members), but
-/// the interleaved store group doesn't allow gaps.
-class InterleaveGroup {
-public:
- InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
- : Align(Align), InsertPos(Instr) {
- assert(Align && "The alignment should be non-zero");
-
- Factor = std::abs(Stride);
- assert(Factor > 1 && "Invalid interleave factor");
-
- Reverse = Stride < 0;
- Members[0] = Instr;
- }
-
- bool isReverse() const { return Reverse; }
- unsigned getFactor() const { return Factor; }
- unsigned getAlignment() const { return Align; }
- unsigned getNumMembers() const { return Members.size(); }
-
- /// Try to insert a new member \p Instr with index \p Index and
- /// alignment \p NewAlign. The index is related to the leader and it could be
- /// negative if it is the new leader.
- ///
- /// \returns false if the instruction doesn't belong to the group.
- bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
- assert(NewAlign && "The new member's alignment should be non-zero");
-
- int Key = Index + SmallestKey;
-
- // Skip if there is already a member with the same index.
- if (Members.count(Key))
- return false;
-
- if (Key > LargestKey) {
- // The largest index is always less than the interleave factor.
- if (Index >= static_cast<int>(Factor))
- return false;
-
- LargestKey = Key;
- } else if (Key < SmallestKey) {
- // The largest index is always less than the interleave factor.
- if (LargestKey - Key >= static_cast<int>(Factor))
- return false;
-
- SmallestKey = Key;
- }
-
- // It's always safe to select the minimum alignment.
- Align = std::min(Align, NewAlign);
- Members[Key] = Instr;
- return true;
- }
-
- /// Get the member with the given index \p Index
- ///
- /// \returns nullptr if contains no such member.
- Instruction *getMember(unsigned Index) const {
- int Key = SmallestKey + Index;
- if (!Members.count(Key))
- return nullptr;
-
- return Members.find(Key)->second;
- }
-
- /// Get the index for the given member. Unlike the key in the member
- /// map, the index starts from 0.
- unsigned getIndex(Instruction *Instr) const {
- for (auto I : Members)
- if (I.second == Instr)
- return I.first - SmallestKey;
-
- llvm_unreachable("InterleaveGroup contains no such member");
- }
-
- Instruction *getInsertPos() const { return InsertPos; }
- void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
-
- /// Add metadata (e.g. alias info) from the instructions in this group to \p
- /// NewInst.
- ///
- /// FIXME: this function currently does not add noalias metadata a'la
- /// addNewMedata. To do that we need to compute the intersection of the
- /// noalias info from all members.
- void addMetadata(Instruction *NewInst) const {
- SmallVector<Value *, 4> VL;
- std::transform(Members.begin(), Members.end(), std::back_inserter(VL),
- [](std::pair<int, Instruction *> p) { return p.second; });
- propagateMetadata(NewInst, VL);
- }
-
-private:
- unsigned Factor; // Interleave Factor.
- bool Reverse;
- unsigned Align;
- DenseMap<int, Instruction *> Members;
- int SmallestKey = 0;
- int LargestKey = 0;
-
- // To avoid breaking dependences, vectorized instructions of an interleave
- // group should be inserted at either the first load or the last store in
- // program order.
- //
- // E.g. %even = load i32 // Insert Position
- // %add = add i32 %even // Use of %even
- // %odd = load i32
- //
- // store i32 %even
- // %odd = add i32 // Def of %odd
- // store i32 %odd // Insert Position
- Instruction *InsertPos;
-};
-} // end namespace llvm
-
-namespace {
-
-/// Drive the analysis of interleaved memory accesses in the loop.
-///
-/// Use this class to analyze interleaved accesses only when we can vectorize
-/// a loop. Otherwise it's meaningless to do analysis as the vectorization
-/// on interleaved accesses is unsafe.
-///
-/// The analysis collects interleave groups and records the relationships
-/// between the member and the group in a map.
-class InterleavedAccessInfo {
-public:
- InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
- DominatorTree *DT, LoopInfo *LI,
- const LoopAccessInfo *LAI)
- : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
-
- ~InterleavedAccessInfo() {
- SmallPtrSet<InterleaveGroup *, 4> DelSet;
- // Avoid releasing a pointer twice.
- for (auto &I : InterleaveGroupMap)
- DelSet.insert(I.second);
- for (auto *Ptr : DelSet)
- delete Ptr;
- }
-
- /// Analyze the interleaved accesses and collect them in interleave
- /// groups. Substitute symbolic strides using \p Strides.
- void analyzeInterleaving();
-
- /// Check if \p Instr belongs to any interleave group.
- bool isInterleaved(Instruction *Instr) const {
- return InterleaveGroupMap.count(Instr);
- }
-
- /// Get the interleave group that \p Instr belongs to.
- ///
- /// \returns nullptr if doesn't have such group.
- InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
- if (InterleaveGroupMap.count(Instr))
- return InterleaveGroupMap.find(Instr)->second;
- return nullptr;
- }
-
- /// Returns true if an interleaved group that may access memory
- /// out-of-bounds requires a scalar epilogue iteration for correctness.
- bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
-
-private:
- /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
- /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
- /// The interleaved access analysis can also add new predicates (for example
- /// by versioning strides of pointers).
- PredicatedScalarEvolution &PSE;
-
- Loop *TheLoop;
- DominatorTree *DT;
- LoopInfo *LI;
- const LoopAccessInfo *LAI;
-
- /// True if the loop may contain non-reversed interleaved groups with
- /// out-of-bounds accesses. We ensure we don't speculatively access memory
- /// out-of-bounds by executing at least one scalar epilogue iteration.
- bool RequiresScalarEpilogue = false;
-
- /// Holds the relationships between the members and the interleave group.
- DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
-
- /// Holds dependences among the memory accesses in the loop. It maps a source
- /// access to a set of dependent sink accesses.
- DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
-
- /// The descriptor for a strided memory access.
- struct StrideDescriptor {
- StrideDescriptor() = default;
- StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
- unsigned Align)
- : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
-
- // The access's stride. It is negative for a reverse access.
- int64_t Stride = 0;
-
- // The scalar expression of this access.
- const SCEV *Scev = nullptr;
-
- // The size of the memory object.
- uint64_t Size = 0;
-
- // The alignment of this access.
- unsigned Align = 0;
- };
-
- /// A type for holding instructions and their stride descriptors.
- using StrideEntry = std::pair<Instruction *, StrideDescriptor>;
-
- /// Create a new interleave group with the given instruction \p Instr,
- /// stride \p Stride and alignment \p Align.
- ///
- /// \returns the newly created interleave group.
- InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
- unsigned Align) {
- assert(!InterleaveGroupMap.count(Instr) &&
- "Already in an interleaved access group");
- InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
- return InterleaveGroupMap[Instr];
- }
-
- /// Release the group and remove all the relationships.
- void releaseGroup(InterleaveGroup *Group) {
- for (unsigned i = 0; i < Group->getFactor(); i++)
- if (Instruction *Member = Group->getMember(i))
- InterleaveGroupMap.erase(Member);
-
- delete Group;
- }
-
- /// Collect all the accesses with a constant stride in program order.
- void collectConstStrideAccesses(
- MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
- const ValueToValueMap &Strides);
-
- /// Returns true if \p Stride is allowed in an interleaved group.
- static bool isStrided(int Stride) {
- unsigned Factor = std::abs(Stride);
- return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
- }
-
- /// Returns true if \p BB is a predicated block.
- bool isPredicated(BasicBlock *BB) const {
- return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
- }
-
- /// Returns true if LoopAccessInfo can be used for dependence queries.
- bool areDependencesValid() const {
- return LAI && LAI->getDepChecker().getDependences();
- }
-
- /// Returns true if memory accesses \p A and \p B can be reordered, if
- /// necessary, when constructing interleaved groups.
- ///
- /// \p A must precede \p B in program order. We return false if reordering is
- /// not necessary or is prevented because \p A and \p B may be dependent.
- bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
- StrideEntry *B) const {
- // Code motion for interleaved accesses can potentially hoist strided loads
- // and sink strided stores. The code below checks the legality of the
- // following two conditions:
- //
- // 1. Potentially moving a strided load (B) before any store (A) that
- // precedes B, or
- //
- // 2. Potentially moving a strided store (A) after any load or store (B)
- // that A precedes.
- //
- // It's legal to reorder A and B if we know there isn't a dependence from A
- // to B. Note that this determination is conservative since some
- // dependences could potentially be reordered safely.
-
- // A is potentially the source of a dependence.
- auto *Src = A->first;
- auto SrcDes = A->second;
-
- // B is potentially the sink of a dependence.
- auto *Sink = B->first;
- auto SinkDes = B->second;
-
- // Code motion for interleaved accesses can't violate WAR dependences.
- // Thus, reordering is legal if the source isn't a write.
- if (!Src->mayWriteToMemory())
- return true;
-
- // At least one of the accesses must be strided.
- if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
- return true;
-
- // If dependence information is not available from LoopAccessInfo,
- // conservatively assume the instructions can't be reordered.
- if (!areDependencesValid())
- return false;
-
- // If we know there is a dependence from source to sink, assume the
- // instructions can't be reordered. Otherwise, reordering is legal.
- return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
- }
-
- /// Collect the dependences from LoopAccessInfo.
- ///
- /// We process the dependences once during the interleaved access analysis to
- /// enable constant-time dependence queries.
- void collectDependences() {
- if (!areDependencesValid())
- return;
- auto *Deps = LAI->getDepChecker().getDependences();
- for (auto Dep : *Deps)
- Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
- }
-};
-
-} // end anonymous namespace
-
-static void emitMissedWarning(Function *F, Loop *L,
- const LoopVectorizeHints &LH,
- OptimizationRemarkEmitter *ORE) {
- LH.emitRemarkWithHints();
-
- if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
- if (LH.getWidth() != 1)
- ORE->emit(DiagnosticInfoOptimizationFailure(
- DEBUG_TYPE, "FailedRequestedVectorization",
- L->getStartLoc(), L->getHeader())
- << "loop not vectorized: "
- << "failed explicitly specified loop vectorization");
- else if (LH.getInterleave() != 1)
- ORE->emit(DiagnosticInfoOptimizationFailure(
- DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
- L->getHeader())
- << "loop not interleaved: "
- << "failed explicitly specified loop interleaving");
- }
-}
-
-namespace llvm {
-
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
@@ -1247,34 +900,55 @@ public:
/// vectorization factor \p VF.
bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
+
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return false;
+
auto Scalars = InstsToScalarize.find(VF);
assert(Scalars != InstsToScalarize.end() &&
"VF not yet analyzed for scalarization profitability");
- return Scalars->second.count(I);
+ return Scalars->second.find(I) != Scalars->second.end();
}
/// Returns true if \p I is known to be uniform after vectorization.
bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
if (VF == 1)
return true;
- assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
+
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return false;
+
auto UniformsPerVF = Uniforms.find(VF);
- return UniformsPerVF->second.count(I);
+ assert(UniformsPerVF != Uniforms.end() &&
+ "VF not yet analyzed for uniformity");
+ return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
}
/// Returns true if \p I is known to be scalar after vectorization.
bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
if (VF == 1)
return true;
- assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
+
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return false;
+
auto ScalarsPerVF = Scalars.find(VF);
- return ScalarsPerVF->second.count(I);
+ assert(ScalarsPerVF != Scalars.end() &&
+ "Scalar values are not calculated for VF");
+ return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
}
/// \returns True if instruction \p I can be truncated to a smaller bitwidth
/// for vectorization factor \p VF.
bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
- return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
+ return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
+ !isProfitableToScalarize(I, VF) &&
!isScalarAfterVectorization(I, VF);
}
@@ -1298,7 +972,7 @@ public:
/// Save vectorization decision \p W and \p Cost taken by the cost model for
/// interleaving group \p Grp and vector width \p VF.
- void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
+ void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
InstWidening W, unsigned Cost) {
assert(VF >= 2 && "Expected VF >=2");
/// Broadcast this decision to all instructions inside the group.
@@ -1318,6 +992,12 @@ public:
/// through the cost modeling.
InstWidening getWideningDecision(Instruction *I, unsigned VF) {
assert(VF >= 2 && "Expected VF >=2");
+
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return CM_GatherScatter;
+
std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
auto Itr = WideningDecisions.find(InstOnVF);
if (Itr == WideningDecisions.end())
@@ -1330,7 +1010,8 @@ public:
unsigned getWideningCost(Instruction *I, unsigned VF) {
assert(VF >= 2 && "Expected VF >=2");
std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
- assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
+ assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
+ "The cost is not calculated");
return WideningDecisions[InstOnVF].second;
}
@@ -1369,7 +1050,7 @@ public:
/// that may be vectorized as interleave, gather-scatter or scalarized.
void collectUniformsAndScalars(unsigned VF) {
// Do the analysis once.
- if (VF == 1 || Uniforms.count(VF))
+ if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
return;
setCostBasedWideningDecision(VF);
collectLoopUniforms(VF);
@@ -1414,26 +1095,58 @@ public:
/// Returns true if \p I is an instruction that will be scalarized with
/// predication. Such instructions include conditional stores and
/// instructions that may divide by zero.
- bool isScalarWithPredication(Instruction *I);
+ /// If a non-zero VF has been calculated, we check if I will be scalarized
+ /// with predication for that VF.
+ bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
+
+ // Returns true if \p I is an instruction that will be predicated either
+ // through scalar predication or masked load/store or masked gather/scatter.
+ // Superset of instructions that return true for isScalarWithPredication.
+ bool isPredicatedInst(Instruction *I) {
+ if (!blockNeedsPredication(I->getParent()))
+ return false;
+ // Loads and stores that need some form of masked operation are predicated
+ // instructions.
+ if (isa<LoadInst>(I) || isa<StoreInst>(I))
+ return Legal->isMaskRequired(I);
+ return isScalarWithPredication(I);
+ }
/// Returns true if \p I is a memory instruction with consecutive memory
/// access that can be widened.
bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+ /// Returns true if \p I is a memory instruction in an interleaved-group
+ /// of memory accesses that can be vectorized with wide vector loads/stores
+ /// and shuffles.
+ bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
+
/// Check if \p Instr belongs to any interleaved access group.
bool isAccessInterleaved(Instruction *Instr) {
return InterleaveInfo.isInterleaved(Instr);
}
/// Get the interleaved access group that \p Instr belongs to.
- const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
+ const InterleaveGroup<Instruction> *
+ getInterleavedAccessGroup(Instruction *Instr) {
return InterleaveInfo.getInterleaveGroup(Instr);
}
/// Returns true if an interleaved group requires a scalar iteration
- /// to handle accesses with gaps.
+ /// to handle accesses with gaps, and there is nothing preventing us from
+ /// creating a scalar epilogue.
bool requiresScalarEpilogue() const {
- return InterleaveInfo.requiresScalarEpilogue();
+ return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
+ }
+
+ /// Returns true if a scalar epilogue is not allowed due to optsize.
+ bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
+
+ /// Returns true if all loop blocks should be masked to fold tail loop.
+ bool foldTailByMasking() const { return FoldTailByMasking; }
+
+ bool blockNeedsPredication(BasicBlock *BB) {
+ return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}
private:
@@ -1482,8 +1195,10 @@ private:
/// memory access.
unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
- /// The cost calculation for Load instruction \p I with uniform pointer -
- /// scalar load + broadcast.
+ /// The cost calculation for Load/Store instruction \p I with uniform pointer -
+ /// Load: scalar load + broadcast.
+ /// Store: scalar store + (loop invariant value stored? 0 : extract of last
+ /// element)
unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
/// Returns whether the instruction is a load or store and will be a emitted
@@ -1517,6 +1232,18 @@ private:
/// vectorization as a predicated block.
SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
+ /// Records whether it is allowed to have the original scalar loop execute at
+ /// least once. This may be needed as a fallback loop in case runtime
+ /// aliasing/dependence checks fail, or to handle the tail/remainder
+ /// iterations when the trip count is unknown or doesn't divide by the VF,
+ /// or as a peel-loop to handle gaps in interleave-groups.
+ /// Under optsize and when the trip count is very small we don't allow any
+ /// iterations to execute in the scalar loop.
+ bool IsScalarEpilogueAllowed = true;
+
+ /// All blocks of loop are to be masked to fold tail of scalar iterations.
+ bool FoldTailByMasking = false;
+
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
@@ -1639,14 +1366,15 @@ static bool isExplicitVecOuterLoop(Loop *OuterLp,
return false;
Function *Fn = OuterLp->getHeader()->getParent();
- if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) {
+ if (!Hints.allowVectorization(Fn, OuterLp,
+ true /*VectorizeOnlyWhenForced*/)) {
LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
return false;
}
if (!Hints.getWidth()) {
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
- emitMissedWarning(Fn, OuterLp, Hints, ORE);
+ Hints.emitRemarkWithHints();
return false;
}
@@ -1654,7 +1382,7 @@ static bool isExplicitVecOuterLoop(Loop *OuterLp,
// TODO: Interleave support is future work.
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
"outer loops.\n");
- emitMissedWarning(Fn, OuterLp, Hints, ORE);
+ Hints.emitRemarkWithHints();
return false;
}
@@ -1695,10 +1423,11 @@ struct LoopVectorize : public FunctionPass {
LoopVectorizePass Impl;
- explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
+ explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
+ bool VectorizeOnlyWhenForced = false)
: FunctionPass(ID) {
- Impl.DisableUnrolling = NoUnrolling;
- Impl.AlwaysVectorize = AlwaysVectorize;
+ Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
+ Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}
@@ -1737,8 +1466,16 @@ struct LoopVectorize : public FunctionPass {
AU.addRequired<LoopAccessLegacyAnalysis>();
AU.addRequired<DemandedBitsWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
+
+ // We currently do not preserve loopinfo/dominator analyses with outer loop
+ // vectorization. Until this is addressed, mark these analyses as preserved
+ // only for non-VPlan-native path.
+ // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
+ if (!EnableVPlanNativePath) {
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
AU.addPreserved<BasicAAWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
@@ -1950,7 +1687,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
? Builder.CreateSExtOrTrunc(Induction, IV->getType())
: Builder.CreateCast(Instruction::SIToFP, Induction,
IV->getType());
- ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
+ ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
ScalarIV->setName("offset.idx");
}
if (Trunc) {
@@ -2089,8 +1826,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
assert(!V->getType()->isVoidTy() && "Type does not produce a value");
- // If we have a stride that is replaced by one, do it here.
- if (Legal->hasStride(V))
+ // If we have a stride that is replaced by one, do it here. Defer this for
+ // the VPlan-native path until we start running Legal checks in that path.
+ if (!EnableVPlanNativePath && Legal->hasStride(V))
V = ConstantInt::get(V->getType(), 1);
// If we have a vector mapped to this value, return it.
@@ -2214,6 +1952,17 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
"reverse");
}
+// Return whether we allow using masked interleave-groups (for dealing with
+// strided loads/stores that reside in predicated blocks, or for dealing
+// with gaps).
+static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
+ return EnableMaskedInterleavedMemAccesses;
+
+ return TTI.enableMaskedInterleavedAccessVectorization();
+}
+
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
@@ -2242,8 +1991,10 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
-void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
- const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr);
+void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
+ VectorParts *BlockInMask) {
+ const InterleaveGroup<Instruction> *Group =
+ Cost->getInterleavedAccessGroup(Instr);
assert(Group && "Fail to get an interleaved access group.");
// Skip if current instruction is not the insert position.
@@ -2257,13 +2008,22 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
Type *ScalarTy = getMemInstValueType(Instr);
unsigned InterleaveFactor = Group->getFactor();
Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
- Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr));
+ Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
// Prepare for the new pointers.
setDebugLocFromInst(Builder, Ptr);
SmallVector<Value *, 2> NewPtrs;
unsigned Index = Group->getIndex(Instr);
+ VectorParts Mask;
+ bool IsMaskForCondRequired = BlockInMask;
+ if (IsMaskForCondRequired) {
+ Mask = *BlockInMask;
+ // TODO: extend the masked interleaved-group support to reversed access.
+ assert(!Group->isReverse() && "Reversed masked interleave-group "
+ "not supported.");
+ }
+
// If the group is reverse, adjust the index to refer to the last vector lane
// instead of the first. We adjust the index from the first vector lane,
// rather than directly getting the pointer for lane VF - 1, because the
@@ -2302,13 +2062,39 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
setDebugLocFromInst(Builder, Instr);
Value *UndefVec = UndefValue::get(VecTy);
+ Value *MaskForGaps = nullptr;
+ if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
+ MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
+ assert(MaskForGaps && "Mask for Gaps is required but it is null");
+ }
+
// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
// For each unroll part, create a wide load for the group.
SmallVector<Value *, 2> NewLoads;
for (unsigned Part = 0; Part < UF; Part++) {
- auto *NewLoad = Builder.CreateAlignedLoad(
- NewPtrs[Part], Group->getAlignment(), "wide.vec");
+ Instruction *NewLoad;
+ if (IsMaskForCondRequired || MaskForGaps) {
+ assert(useMaskedInterleavedAccesses(*TTI) &&
+ "masked interleaved groups are not allowed.");
+ Value *GroupMask = MaskForGaps;
+ if (IsMaskForCondRequired) {
+ auto *Undefs = UndefValue::get(Mask[Part]->getType());
+ auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
+ Value *ShuffledMask = Builder.CreateShuffleVector(
+ Mask[Part], Undefs, RepMask, "interleaved.mask");
+ GroupMask = MaskForGaps
+ ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
+ MaskForGaps)
+ : ShuffledMask;
+ }
+ NewLoad =
+ Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
+ GroupMask, UndefVec, "wide.masked.vec");
+ }
+ else
+ NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part],
+ Group->getAlignment(), "wide.vec");
Group->addMetadata(NewLoad);
NewLoads.push_back(NewLoad);
}
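
// A sketch (illustrative only) of the shuffle-mask pattern produced for
// "interleaved.mask" above: each of the VF block-mask lanes is replicated
// InterleaveFactor times, so every member of an iteration's tuple in the
// wide (VF * Factor)-element vector reuses that iteration's predicate bit.
#include <vector>
std::vector<unsigned> replicatedMaskIndices(unsigned Factor, unsigned VF) {
  std::vector<unsigned> Indices;
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    for (unsigned Member = 0; Member < Factor; ++Member)
      Indices.push_back(Lane);  // VF=4, Factor=3 -> 0,0,0,1,1,1,2,2,2,3,3,3
  return Indices;
}
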
@@ -2375,8 +2161,18 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
"interleaved.vec");
- Instruction *NewStoreInstr =
- Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
+ Instruction *NewStoreInstr;
+ if (IsMaskForCondRequired) {
+ auto *Undefs = UndefValue::get(Mask[Part]->getType());
+ auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
+ Value *ShuffledMask = Builder.CreateShuffleVector(
+ Mask[Part], Undefs, RepMask, "interleaved.mask");
+ NewStoreInstr = Builder.CreateMaskedStore(
+ IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
+ }
+ else
+ NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
+ Group->getAlignment());
Group->addMetadata(NewStoreInstr);
}
@@ -2400,13 +2196,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
Type *ScalarDataTy = getMemInstValueType(Instr);
Type *DataTy = VectorType::get(ScalarDataTy, VF);
Value *Ptr = getLoadStorePointerOperand(Instr);
- unsigned Alignment = getMemInstAlignment(Instr);
+ unsigned Alignment = getLoadStoreAlignment(Instr);
// An alignment of 0 means target abi alignment. We need to use the scalar's
// target abi alignment in such a case.
const DataLayout &DL = Instr->getModule()->getDataLayout();
if (!Alignment)
Alignment = DL.getABITypeAlignment(ScalarDataTy);
- unsigned AddressSpace = getMemInstAddressSpace(Instr);
+ unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
@@ -2594,6 +2390,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
if (TripCount)
return TripCount;
+ assert(L && "Create Trip Count for null loop.");
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
// Find the loop boundaries.
ScalarEvolution *SE = PSE.getSE();
@@ -2602,6 +2399,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
"Invalid loop count");
Type *IdxTy = Legal->getWidestInductionType();
+ assert(IdxTy && "No type for induction");
// The exit count might have the type of i64 while the phi is i32. This can
// happen if we have an induction variable that is sign extended before the
@@ -2642,12 +2440,26 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
Value *TC = getOrCreateTripCount(L);
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+ Type *Ty = TC->getType();
+ Constant *Step = ConstantInt::get(Ty, VF * UF);
+
+ // If the tail is to be folded by masking, round the number of iterations N
+ // up to a multiple of Step instead of rounding down. This is done by first
+ // adding Step-1 and then rounding down. Note that it's ok if this addition
+ // overflows: the vector induction variable will eventually wrap to zero given
+ // that it starts at zero and its Step is a power of two; the loop will then
+ // exit, with the last early-exit vector comparison also producing all-true.
+ if (Cost->foldTailByMasking()) {
+ assert(isPowerOf2_32(VF * UF) &&
+ "VF*UF must be a power of 2 when folding tail by masking");
+ TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
+ }
+
// Now we need to generate the expression for the part of the loop that the
// vectorized body will execute. This is equal to N - (N % Step) if scalar
// iterations are not required for correctness, or N - Step, otherwise. Step
// is equal to the vectorization factor (number of SIMD elements) times the
// unroll factor (number of SIMD instructions).
- Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
// If there is a non-reversed interleaved group that may speculatively access
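
// A scalar sketch (illustrative only) of the trip-count computation above:
// with tail folding, N is first rounded up to a multiple of Step = VF * UF,
// so the URem-based rounding then covers every original iteration. The
// N - Step adjustment used when a scalar epilogue is required is omitted.
#include <cassert>
#include <cstdint>
uint64_t vectorTripCount(uint64_t N, uint64_t Step, bool FoldTail) {
  assert((!FoldTail || (Step & (Step - 1)) == 0) &&
         "Step must be a power of two when folding the tail");
  uint64_t TC = FoldTail ? N + (Step - 1) : N;  // "n.rnd.up"
  return TC - TC % Step;                        // N - (N % Step)
}
// E.g. N = 10, Step = 4: without folding -> 8 (two scalar tail iterations);
// with folding -> 12, with the final two lanes masked off.
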
@@ -2710,8 +2522,13 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
// of zero. In this case we will also jump to the scalar loop.
auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
: ICmpInst::ICMP_ULT;
- Value *CheckMinIters = Builder.CreateICmp(
- P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
+
+ // If tail is to be folded, vector loop takes care of all iterations.
+ Value *CheckMinIters = Builder.getFalse();
+ if (!Cost->foldTailByMasking())
+ CheckMinIters = Builder.CreateICmp(
+ P, Count, ConstantInt::get(Count->getType(), VF * UF),
+ "min.iters.check");
BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
// Update dominator tree immediately if the generated block is a
@@ -2740,6 +2557,8 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
if (C->isZero())
return;
+ assert(!Cost->foldTailByMasking() &&
+ "Cannot SCEV check stride or overflow when folding tail");
// Create a new block containing the stride check.
BB->setName("vector.scevcheck");
auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
@@ -2756,6 +2575,10 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
}
void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
+ // VPlan-native path does not do any analysis for runtime checks currently.
+ if (EnableVPlanNativePath)
+ return;
+
BasicBlock *BB = L->getLoopPreheader();
// Generate the code that checks in runtime if arrays overlap. We put the
@@ -2768,6 +2591,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
if (!MemRuntimeCheck)
return;
+ assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
// Create a new block containing the memory check.
BB->setName("vector.memcheck");
auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
@@ -2789,6 +2613,94 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
LVer->prepareNoAliasMetadata();
}
+Value *InnerLoopVectorizer::emitTransformedIndex(
+ IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
+ const InductionDescriptor &ID) const {
+
+ SCEVExpander Exp(*SE, DL, "induction");
+ auto Step = ID.getStep();
+ auto StartValue = ID.getStartValue();
+ assert(Index->getType() == Step->getType() &&
+ "Index type does not match StepValue type");
+
+ // Note: the IR at this point is broken. We cannot use SE to create any new
+ // SCEV and then expand it, hoping that SCEV's simplification will give us
+ // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
+ // lead to various SCEV crashes. So all we can do is to use builder and rely
+ // on InstCombine for future simplifications. Here we handle some trivial
+ // cases only.
+ auto CreateAdd = [&B](Value *X, Value *Y) {
+ assert(X->getType() == Y->getType() && "Types don't match!");
+ if (auto *CX = dyn_cast<ConstantInt>(X))
+ if (CX->isZero())
+ return Y;
+ if (auto *CY = dyn_cast<ConstantInt>(Y))
+ if (CY->isZero())
+ return X;
+ return B.CreateAdd(X, Y);
+ };
+
+ auto CreateMul = [&B](Value *X, Value *Y) {
+ assert(X->getType() == Y->getType() && "Types don't match!");
+ if (auto *CX = dyn_cast<ConstantInt>(X))
+ if (CX->isOne())
+ return Y;
+ if (auto *CY = dyn_cast<ConstantInt>(Y))
+ if (CY->isOne())
+ return X;
+ return B.CreateMul(X, Y);
+ };
+
+ switch (ID.getKind()) {
+ case InductionDescriptor::IK_IntInduction: {
+ assert(Index->getType() == StartValue->getType() &&
+ "Index type does not match StartValue type");
+ if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
+ return B.CreateSub(StartValue, Index);
+ auto *Offset = CreateMul(
+ Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
+ return CreateAdd(StartValue, Offset);
+ }
+ case InductionDescriptor::IK_PtrInduction: {
+ assert(isa<SCEVConstant>(Step) &&
+ "Expected constant step for pointer induction");
+ return B.CreateGEP(
+ nullptr, StartValue,
+ CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
+ &*B.GetInsertPoint())));
+ }
+ case InductionDescriptor::IK_FpInduction: {
+ assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
+ auto InductionBinOp = ID.getInductionBinOp();
+ assert(InductionBinOp &&
+ (InductionBinOp->getOpcode() == Instruction::FAdd ||
+ InductionBinOp->getOpcode() == Instruction::FSub) &&
+ "Original bin op should be defined for FP induction");
+
+ Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
+
+ // Floating point operations had to be 'fast' to enable the induction.
+ FastMathFlags Flags;
+ Flags.setFast();
+
+ Value *MulExp = B.CreateFMul(StepValue, Index);
+ if (isa<Instruction>(MulExp))
+ // We have to check, the MulExp may be a constant.
+ cast<Instruction>(MulExp)->setFastMathFlags(Flags);
+
+ Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
+ "induction");
+ if (isa<Instruction>(BOp))
+ cast<Instruction>(BOp)->setFastMathFlags(Flags);
+
+ return BOp;
+ }
+ case InductionDescriptor::IK_NoInduction:
+ return nullptr;
+ }
+ llvm_unreachable("invalid enum");
+}
+
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
In this function we generate a new loop. The new loop will contain
@@ -2825,6 +2737,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
BasicBlock *OldBasicBlock = OrigLoop->getHeader();
BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
BasicBlock *ExitBlock = OrigLoop->getExitBlock();
+ MDNode *OrigLoopID = OrigLoop->getLoopID();
assert(VectorPH && "Invalid loop structure");
assert(ExitBlock && "Must have an exit block");
@@ -2927,7 +2840,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
- EndValue = II.transform(B, CRD, PSE.getSE(), DL);
+ EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
EndValue->setName("ind.end");
}
@@ -2948,9 +2861,12 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// Add a check in the middle block to see if we have completed
// all of the iterations in the first vector loop.
// If (N - N%VF) == N, then we *don't* need to run the remainder.
- Value *CmpN =
- CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
- CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
+ // If tail is to be folded, we know we don't need to run the remainder.
+ Value *CmpN = Builder.getTrue();
+ if (!Cost->foldTailByMasking())
+ CmpN =
+ CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+ CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
ReplaceInstWithInst(MiddleBlock->getTerminator(),
BranchInst::Create(ExitBlock, ScalarPH, CmpN));
@@ -2965,6 +2881,17 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
LoopVectorBody = VecBody;
LoopScalarBody = OldBasicBlock;
+ Optional<MDNode *> VectorizedLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+ LLVMLoopVectorizeFollowupVectorized});
+ if (VectorizedLoopID.hasValue()) {
+ Lp->setLoopID(VectorizedLoopID.getValue());
+
+ // Do not setAlreadyVectorized if loop attributes have been defined
+ // explicitly.
+ return LoopVectorPreHeader;
+ }
+
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
if (MDNode *LID = OrigLoop->getLoopID())
@@ -3023,7 +2950,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
II.getStep()->getType())
: B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
CMO->setName("cast.cmo");
- Value *Escape = II.transform(B, CMO, PSE.getSE(), DL);
+ Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
Escape->setName("ind.escape");
MissingVals[UI] = Escape;
}
@@ -3109,6 +3036,10 @@ static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
!TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(RetTy, true, false);
+ // Some targets keep addresses scalar.
+ if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
+ return Cost;
+
if (CallInst *CI = dyn_cast<CallInst>(I)) {
SmallVector<const Value *, 4> Operands(CI->arg_operands());
Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
@@ -3212,7 +3143,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
continue;
for (unsigned Part = 0; Part < UF; ++Part) {
Value *I = getOrCreateVectorValue(KV.first, Part);
- if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
+ if (Erased.find(I) != Erased.end() || I->use_empty() ||
+ !isa<Instruction>(I))
continue;
Type *OriginalTy = I->getType();
Type *ScalarTruncatedTy =
@@ -3330,6 +3262,13 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
if (VF > 1)
truncateToMinimalBitwidths();
+ // Fix widened non-induction PHIs by setting up the PHI operands.
+ if (OrigPHIsToFix.size()) {
+ assert(EnableVPlanNativePath &&
+ "Unexpected non-induction PHIs for fixup in non VPlan-native path");
+ fixNonInductionPHIs();
+ }
+
// At this point every instruction in the original loop is widened to a
// vector form. Now we need to fix the recurrences in the loop. These PHI
// nodes are currently empty because we did not want to introduce cycles.
@@ -3666,8 +3605,8 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
ReducedPartRdx, "bin.rdx"));
else
- ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
- Builder, MinMaxKind, ReducedPartRdx, RdxPart);
+ ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
+ RdxPart);
}
if (VF > 1) {
@@ -3720,9 +3659,20 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
void InnerLoopVectorizer::fixLCSSAPHIs() {
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
if (LCSSAPhi.getNumIncomingValues() == 1) {
- assert(OrigLoop->isLoopInvariant(LCSSAPhi.getIncomingValue(0)) &&
- "Incoming value isn't loop invariant");
- LCSSAPhi.addIncoming(LCSSAPhi.getIncomingValue(0), LoopMiddleBlock);
+ auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
+ // Non-instruction incoming values will have only one value.
+ unsigned LastLane = 0;
+ if (isa<Instruction>(IncomingValue))
+ LastLane = Cost->isUniformAfterVectorization(
+ cast<Instruction>(IncomingValue), VF)
+ ? 0
+ : VF - 1;
+ // Can be a loop invariant incoming value or the last scalar value to be
+ // extracted from the vectorized loop.
+ Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
+ Value *lastIncomingValue =
+ getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
+ LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
}
}
}
@@ -3791,12 +3741,62 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
} while (Changed);
}
+void InnerLoopVectorizer::fixNonInductionPHIs() {
+ for (PHINode *OrigPhi : OrigPHIsToFix) {
+ PHINode *NewPhi =
+ cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
+ unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
+
+ SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
+ predecessors(OrigPhi->getParent()));
+ SmallVector<BasicBlock *, 2> VectorBBPredecessors(
+ predecessors(NewPhi->getParent()));
+ assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
+ "Scalar and Vector BB should have the same number of predecessors");
+
+ // The insertion point in Builder may be invalidated by the time we get
+ // here. Force the Builder insertion point to something valid so that we do
+ // not run into issues during insertion point restore in
+ // getOrCreateVectorValue calls below.
+ Builder.SetInsertPoint(NewPhi);
+
+ // The predecessor order is preserved and we can rely on mapping between
+ // scalar and vector block predecessors.
+ for (unsigned i = 0; i < NumIncomingValues; ++i) {
+ BasicBlock *NewPredBB = VectorBBPredecessors[i];
+
+ // When looking up the new scalar/vector values to fix up, use incoming
+ // values from original phi.
+ Value *ScIncV =
+ OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
+
+ // Scalar incoming value may need a broadcast
+ Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
+ NewPhi->addIncoming(NewIncV, NewPredBB);
+ }
+ }
+}
+
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
unsigned VF) {
+ PHINode *P = cast<PHINode>(PN);
+ if (EnableVPlanNativePath) {
+ // Currently we enter here in the VPlan-native path for non-induction
+ // PHIs where all control flow is uniform. We simply widen these PHIs.
+ // Create a vector phi with no operands - the vector phi operands will be
+ // set at the end of vector code generation.
+ Type *VecTy =
+ (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
+ Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
+ VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
+ OrigPHIsToFix.push_back(P);
+
+ return;
+ }
+
assert(PN->getParent() == OrigLoop->getHeader() &&
"Non-header phis should have been handled elsewhere");
- PHINode *P = cast<PHINode>(PN);
// In order to support recurrences we need to be able to vectorize Phi nodes.
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
@@ -3846,7 +3846,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
- Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
+ Value *SclrGep =
+ emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
SclrGep->setName("next.gep");
VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
}
@@ -4151,6 +4152,10 @@ void InnerLoopVectorizer::updateAnalysis() {
// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);
+ // DT is not kept up-to-date for outer loop vectorization
+ if (EnableVPlanNativePath)
+ return;
+
// Update the dominator tree information.
assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
"Entry does not dominate exit.");
@@ -4167,7 +4172,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
// this check. Collecting Scalars for VF=1 does not make any sense.
- assert(VF >= 2 && !Scalars.count(VF) &&
+ assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
"This function should not be visited twice for the same VF");
SmallSetVector<Instruction *, 8> Worklist;
@@ -4253,7 +4258,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
}
}
for (auto *I : ScalarPtrs)
- if (!PossibleNonScalarPtrs.count(I)) {
+ if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
Worklist.insert(I);
}
@@ -4279,8 +4284,9 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// Insert the forced scalars.
// FIXME: Currently widenPHIInstruction() often creates a dead vector
// induction variable when the PHI user is scalarized.
- if (ForcedScalars.count(VF))
- for (auto *I : ForcedScalars.find(VF)->second)
+ auto ForcedScalar = ForcedScalars.find(VF);
+ if (ForcedScalar != ForcedScalars.end())
+ for (auto *I : ForcedScalar->second)
Worklist.insert(I);
// Expand the worklist by looking through any bitcasts and getelementptr
@@ -4348,8 +4354,8 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
-bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) {
- if (!Legal->blockNeedsPredication(I->getParent()))
+bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
+ if (!blockNeedsPredication(I->getParent()))
return false;
switch(I->getOpcode()) {
default:
@@ -4360,6 +4366,14 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) {
return false;
auto *Ptr = getLoadStorePointerOperand(I);
auto *Ty = getMemInstValueType(I);
+ // We have already decided how to vectorize this instruction, get that
+ // result.
+ if (VF > 1) {
+ InstWidening WideningDecision = getWideningDecision(I, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+ return WideningDecision == CM_Scalarize;
+ }
return isa<LoadInst>(I) ?
!(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
: !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
@@ -4373,6 +4387,35 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) {
return false;
}
+bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
+ unsigned VF) {
+ assert(isAccessInterleaved(I) && "Expecting interleaved access.");
+ assert(getWideningDecision(I, VF) == CM_Unknown &&
+ "Decision should not be set yet.");
+ auto *Group = getInterleavedAccessGroup(I);
+ assert(Group && "Must have a group.");
+
+ // Check if masking is required.
+ // A Group may need masking for one of two reasons: it resides in a block that
+ // needs predication, or it was decided to use masking to deal with gaps.
+ bool PredicatedAccessRequiresMasking =
+ Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
+ bool AccessWithGapsRequiresMasking =
+ Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
+ if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
+ return true;
+
+ // If masked interleaving is required, we expect that the user/target had
+ // enabled it, because otherwise it either wouldn't have been created or
+ // it should have been invalidated by the CostModel.
+ assert(useMaskedInterleavedAccesses(TTI) &&
+ "Masked interleave-groups for predicated accesses are not enabled.");
+
+ auto *Ty = getMemInstValueType(I);
+ return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
+ : TTI.isLegalMaskedStore(Ty);
+}
+
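For intuition about the gap condition handled above, here is a plain C++ emulation (not the LLVM API, and not the actual generated code) of a factor-2 load group with a gap: the scalar loop reads only even elements, so a widened access also touches unread odd elements and may run past the end of the array unless the tail lanes are masked or peeled into a scalar epilogue.

#include <cstddef>

// Emulates a masked, tail-folded interleaved access; assumes A holds at
// least 2 * N - 1 elements. The lane mask keeps the final (partial) vector
// iteration from reading past the end without needing a scalar epilogue.
long sumEvenMasked(const long *A, size_t N, size_t VF) {
  long Sum = 0;
  for (size_t I = 0; I < N; I += VF) {         // no scalar remainder loop
    for (size_t Lane = 0; Lane < VF; ++Lane) { // one wide, masked access
      if (I + Lane < N)                        // mask off out-of-range lanes
        Sum += A[2 * (I + Lane)];              // member 0 of a factor-2 group
    }
  }
  return Sum;
}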
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
unsigned VF) {
// Get and ensure we have a valid memory instruction.
@@ -4407,7 +4450,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// already does this check. Collecting Uniforms for VF=1 does not make any
// sense.
- assert(VF >= 2 && !Uniforms.count(VF) &&
+ assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
"This function should not be visited twice for the same VF");
// Visit the list of Uniforms. If we'll not find any uniform value, we'll
@@ -4494,26 +4537,33 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// Add to the Worklist all consecutive and consecutive-like pointers that
// aren't also identified as possibly non-uniform.
for (auto *V : ConsecutiveLikePtrs)
- if (!PossibleNonUniformPtrs.count(V)) {
+ if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
Worklist.insert(V);
}
// Expand Worklist in topological order: whenever a new instruction
- // is added , its users should be either already inside Worklist, or
- // out of scope. It ensures a uniform instruction will only be used
- // by uniform instructions or out of scope instructions.
+  // is added, its users should already be inside Worklist. It ensures
+ // a uniform instruction will only be used by uniform instructions.
unsigned idx = 0;
while (idx != Worklist.size()) {
Instruction *I = Worklist[idx++];
for (auto OV : I->operand_values()) {
+ // isOutOfScope operands cannot be uniform instructions.
if (isOutOfScope(OV))
continue;
+      // First-order recurrence PHIs should typically be considered
+ // non-uniform.
+ auto *OP = dyn_cast<PHINode>(OV);
+ if (OP && Legal->isFirstOrderRecurrence(OP))
+ continue;
+ // If all the users of the operand are uniform, then add the
+ // operand into the uniform worklist.
auto *OI = cast<Instruction>(OV);
if (llvm::all_of(OI->users(), [&](User *U) -> bool {
auto *J = cast<Instruction>(U);
- return !TheLoop->contains(J) || Worklist.count(J) ||
+ return Worklist.count(J) ||
(OI == getLoadStorePointerOperand(J) &&
isUniformDecision(J, VF));
})) {
@@ -4571,318 +4621,6 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
-void InterleavedAccessInfo::collectConstStrideAccesses(
- MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
- const ValueToValueMap &Strides) {
- auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
-
- // Since it's desired that the load/store instructions be maintained in
- // "program order" for the interleaved access analysis, we have to visit the
- // blocks in the loop in reverse postorder (i.e., in a topological order).
- // Such an ordering will ensure that any load/store that may be executed
- // before a second load/store will precede the second load/store in
- // AccessStrideInfo.
- LoopBlocksDFS DFS(TheLoop);
- DFS.perform(LI);
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
- for (auto &I : *BB) {
- auto *LI = dyn_cast<LoadInst>(&I);
- auto *SI = dyn_cast<StoreInst>(&I);
- if (!LI && !SI)
- continue;
-
- Value *Ptr = getLoadStorePointerOperand(&I);
- // We don't check wrapping here because we don't know yet if Ptr will be
- // part of a full group or a group with gaps. Checking wrapping for all
- // pointers (even those that end up in groups with no gaps) will be overly
- // conservative. For full groups, wrapping should be ok since if we would
- // wrap around the address space we would do a memory access at nullptr
- // even without the transformation. The wrapping checks are therefore
- // deferred until after we've formed the interleaved groups.
- int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides,
- /*Assume=*/true, /*ShouldCheckWrap=*/false);
-
- const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
- PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
- uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
-
- // An alignment of 0 means target ABI alignment.
- unsigned Align = getMemInstAlignment(&I);
- if (!Align)
- Align = DL.getABITypeAlignment(PtrTy->getElementType());
-
- AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, Align);
- }
-}
-
-// Analyze interleaved accesses and collect them into interleaved load and
-// store groups.
-//
-// When generating code for an interleaved load group, we effectively hoist all
-// loads in the group to the location of the first load in program order. When
-// generating code for an interleaved store group, we sink all stores to the
-// location of the last store. This code motion can change the order of load
-// and store instructions and may break dependences.
-//
-// The code generation strategy mentioned above ensures that we won't violate
-// any write-after-read (WAR) dependences.
-//
-// E.g., for the WAR dependence: a = A[i]; // (1)
-// A[i] = b; // (2)
-//
-// The store group of (2) is always inserted at or below (2), and the load
-// group of (1) is always inserted at or above (1). Thus, the instructions will
-// never be reordered. All other dependences are checked to ensure the
-// correctness of the instruction reordering.
-//
-// The algorithm visits all memory accesses in the loop in bottom-up program
-// order. Program order is established by traversing the blocks in the loop in
-// reverse postorder when collecting the accesses.
-//
-// We visit the memory accesses in bottom-up order because it can simplify the
-// construction of store groups in the presence of write-after-write (WAW)
-// dependences.
-//
-// E.g., for the WAW dependence: A[i] = a; // (1)
-// A[i] = b; // (2)
-// A[i + 1] = c; // (3)
-//
-// We will first create a store group with (3) and (2). (1) can't be added to
-// this group because it and (2) are dependent. However, (1) can be grouped
-// with other accesses that may precede it in program order. Note that a
-// bottom-up order does not imply that WAW dependences should not be checked.
-void InterleavedAccessInfo::analyzeInterleaving() {
- LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
- const ValueToValueMap &Strides = LAI->getSymbolicStrides();
-
- // Holds all accesses with a constant stride.
- MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
- collectConstStrideAccesses(AccessStrideInfo, Strides);
-
- if (AccessStrideInfo.empty())
- return;
-
- // Collect the dependences in the loop.
- collectDependences();
-
- // Holds all interleaved store groups temporarily.
- SmallSetVector<InterleaveGroup *, 4> StoreGroups;
- // Holds all interleaved load groups temporarily.
- SmallSetVector<InterleaveGroup *, 4> LoadGroups;
-
- // Search in bottom-up program order for pairs of accesses (A and B) that can
- // form interleaved load or store groups. In the algorithm below, access A
- // precedes access B in program order. We initialize a group for B in the
- // outer loop of the algorithm, and then in the inner loop, we attempt to
- // insert each A into B's group if:
- //
- // 1. A and B have the same stride,
- // 2. A and B have the same memory object size, and
- // 3. A belongs in B's group according to its distance from B.
- //
- // Special care is taken to ensure group formation will not break any
- // dependences.
- for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
- BI != E; ++BI) {
- Instruction *B = BI->first;
- StrideDescriptor DesB = BI->second;
-
- // Initialize a group for B if it has an allowable stride. Even if we don't
- // create a group for B, we continue with the bottom-up algorithm to ensure
- // we don't break any of B's dependences.
- InterleaveGroup *Group = nullptr;
- if (isStrided(DesB.Stride)) {
- Group = getInterleaveGroup(B);
- if (!Group) {
- LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
- << '\n');
- Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
- }
- if (B->mayWriteToMemory())
- StoreGroups.insert(Group);
- else
- LoadGroups.insert(Group);
- }
-
- for (auto AI = std::next(BI); AI != E; ++AI) {
- Instruction *A = AI->first;
- StrideDescriptor DesA = AI->second;
-
- // Our code motion strategy implies that we can't have dependences
- // between accesses in an interleaved group and other accesses located
- // between the first and last member of the group. Note that this also
- // means that a group can't have more than one member at a given offset.
- // The accesses in a group can have dependences with other accesses, but
- // we must ensure we don't extend the boundaries of the group such that
- // we encompass those dependent accesses.
- //
- // For example, assume we have the sequence of accesses shown below in a
- // stride-2 loop:
- //
- // (1, 2) is a group | A[i] = a; // (1)
- // | A[i-1] = b; // (2) |
- // A[i-3] = c; // (3)
- // A[i] = d; // (4) | (2, 4) is not a group
- //
- // Because accesses (2) and (3) are dependent, we can group (2) with (1)
- // but not with (4). If we did, the dependent access (3) would be within
- // the boundaries of the (2, 4) group.
- if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {
- // If a dependence exists and A is already in a group, we know that A
- // must be a store since A precedes B and WAR dependences are allowed.
- // Thus, A would be sunk below B. We release A's group to prevent this
- // illegal code motion. A will then be free to form another group with
- // instructions that precede it.
- if (isInterleaved(A)) {
- InterleaveGroup *StoreGroup = getInterleaveGroup(A);
- StoreGroups.remove(StoreGroup);
- releaseGroup(StoreGroup);
- }
-
- // If a dependence exists and A is not already in a group (or it was
- // and we just released it), B might be hoisted above A (if B is a
- // load) or another store might be sunk below A (if B is a store). In
- // either case, we can't add additional instructions to B's group. B
- // will only form a group with instructions that it precedes.
- break;
- }
-
- // At this point, we've checked for illegal code motion. If either A or B
- // isn't strided, there's nothing left to do.
- if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
- continue;
-
- // Ignore A if it's already in a group or isn't the same kind of memory
- // operation as B.
-      // Note that mayReadFromMemory() isn't mutually exclusive with
-      // mayWriteToMemory in the case of atomic loads. We shouldn't see those
-      // here; canVectorizeMemory() should have returned false, except when we
-      // asked for optimization remarks.
- if (isInterleaved(A) || (A->mayReadFromMemory() != B->mayReadFromMemory())
- || (A->mayWriteToMemory() != B->mayWriteToMemory()))
- continue;
-
- // Check rules 1 and 2. Ignore A if its stride or size is different from
- // that of B.
- if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
- continue;
-
-      // Ignore A if the memory objects of A and B don't belong to the same
-      // address space.
- if (getMemInstAddressSpace(A) != getMemInstAddressSpace(B))
- continue;
-
- // Calculate the distance from A to B.
- const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
- PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
- if (!DistToB)
- continue;
- int64_t DistanceToB = DistToB->getAPInt().getSExtValue();
-
- // Check rule 3. Ignore A if its distance to B is not a multiple of the
- // size.
- if (DistanceToB % static_cast<int64_t>(DesB.Size))
- continue;
-
- // Ignore A if either A or B is in a predicated block. Although we
- // currently prevent group formation for predicated accesses, we may be
- // able to relax this limitation in the future once we handle more
- // complicated blocks.
- if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
- continue;
-
- // The index of A is the index of B plus A's distance to B in multiples
- // of the size.
- int IndexA =
- Group->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);
-
- // Try to insert A into B's group.
- if (Group->insertMember(A, IndexA, DesA.Align)) {
- LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
- << " into the interleave group with" << *B
- << '\n');
- InterleaveGroupMap[A] = Group;
-
- // Set the first load in program order as the insert position.
- if (A->mayReadFromMemory())
- Group->setInsertPos(A);
- }
- } // Iteration over A accesses.
- } // Iteration over B accesses.
-
- // Remove interleaved store groups with gaps.
- for (InterleaveGroup *Group : StoreGroups)
- if (Group->getNumMembers() != Group->getFactor()) {
- LLVM_DEBUG(
- dbgs() << "LV: Invalidate candidate interleaved store group due "
- "to gaps.\n");
- releaseGroup(Group);
- }
- // Remove interleaved groups with gaps (currently only loads) whose memory
- // accesses may wrap around. We have to revisit the getPtrStride analysis,
- // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
- // not check wrapping (see documentation there).
- // FORNOW we use Assume=false;
- // TODO: Change to Assume=true but making sure we don't exceed the threshold
- // of runtime SCEV assumptions checks (thereby potentially failing to
- // vectorize altogether).
- // Additional optional optimizations:
- // TODO: If we are peeling the loop and we know that the first pointer doesn't
- // wrap then we can deduce that all pointers in the group don't wrap.
- // This means that we can forcefully peel the loop in order to only have to
-  // check the first pointer for no-wrap. Once we change to use Assume=true,
-  // we'll need at most one runtime check per interleaved group.
- for (InterleaveGroup *Group : LoadGroups) {
-    // Case 1: A full group. Can skip the checks; for full groups, if the wide
- // load would wrap around the address space we would do a memory access at
- // nullptr even without the transformation.
- if (Group->getNumMembers() == Group->getFactor())
- continue;
-
- // Case 2: If first and last members of the group don't wrap this implies
- // that all the pointers in the group don't wrap.
- // So we check only group member 0 (which is always guaranteed to exist),
-    // and group member Factor - 1; if the latter doesn't exist we rely on
-    // peeling (if it is a non-reversed access -- see Case 3).
- Value *FirstMemberPtr = getLoadStorePointerOperand(Group->getMember(0));
- if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
- /*ShouldCheckWrap=*/true)) {
- LLVM_DEBUG(
- dbgs() << "LV: Invalidate candidate interleaved group due to "
- "first group member potentially pointer-wrapping.\n");
- releaseGroup(Group);
- continue;
- }
- Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
- if (LastMember) {
- Value *LastMemberPtr = getLoadStorePointerOperand(LastMember);
- if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
- /*ShouldCheckWrap=*/true)) {
- LLVM_DEBUG(
- dbgs() << "LV: Invalidate candidate interleaved group due to "
- "last group member potentially pointer-wrapping.\n");
- releaseGroup(Group);
- }
- } else {
- // Case 3: A non-reversed interleaved load group with gaps: We need
- // to execute at least one scalar epilogue iteration. This will ensure
- // we don't speculatively access memory out-of-bounds. We only need
- // to look for a member at index factor - 1, since every group must have
- // a member at index zero.
- if (Group->isReverse()) {
- LLVM_DEBUG(
- dbgs() << "LV: Invalidate candidate interleaved group due to "
- "a reverse access with gaps.\n");
- releaseGroup(Group);
- continue;
- }
- LLVM_DEBUG(
- dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
- RequiresScalarEpilogue = true;
- }
- }
-}
-
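To make the WAW example from the removed comment concrete, a small illustrative loop (the (1)/(2)/(3) labels match the comment): visiting bottom-up, (3) starts a store group and (2) joins it at the adjacent offset, while (1) is kept out because of its WAW dependence on (2), and remains free to group with earlier accesses.

void wawExample(int *A, int N, int a, int b, int c) {
  for (int i = 0; i + 1 < N; ++i) {
    A[i] = a;     // (1) -- WAW-dependent on (2), not added to (2, 3)'s group
    A[i] = b;     // (2) -- joins the group initialized for (3)
    A[i + 1] = c; // (3) -- visited first in bottom-up order
  }
}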
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do since it's still likely to be dynamically
@@ -4912,39 +4650,78 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
return None;
}
+ if (!PSE.getUnionPredicate().getPredicates().empty()) {
+ ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
+ << "runtime SCEV checks needed. Enable vectorization of this "
+ "loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz");
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
+ return None;
+ }
+
+ // FIXME: Avoid specializing for stride==1 instead of bailing out.
+ if (!Legal->getLAI()->getSymbolicStrides().empty()) {
+ ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
+ << "runtime stride == 1 checks needed. Enable vectorization of "
+ "this loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz");
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
+ return None;
+ }
+
// If we optimize the program for size, avoid creating the tail loop.
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
- // If we don't know the precise trip count, don't try to vectorize.
- if (TC < 2) {
- ORE->emit(
- createMissedAnalysis("UnknownLoopCountComplexCFG")
- << "unable to calculate the loop count due to complex control flow");
- LLVM_DEBUG(
- dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ if (TC == 1) {
+ ORE->emit(createMissedAnalysis("SingleIterationLoop")
+ << "loop trip count is one, irrelevant for vectorization");
+ LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
return None;
}
+  // Record that a scalar epilogue is not allowed when optimizing for size.
+ LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
+
+ IsScalarEpilogueAllowed = !OptForSize;
+
+ // We don't create an epilogue when optimizing for size.
+ // Invalidate interleave groups that require an epilogue if we can't mask
+ // the interleave-group.
+ if (!useMaskedInterleavedAccesses(TTI))
+ InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+
unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
- if (TC % MaxVF != 0) {
- // If the trip count that we found modulo the vectorization factor is not
- // zero then we require a tail.
- // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
- // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
- // smaller MaxVF that does not require a scalar epilog.
-
- ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
- << "cannot optimize for size and vectorize at the "
- "same time. Enable vectorization of this loop "
- "with '#pragma clang loop vectorize(enable)' "
- "when compiling with -Os/-Oz");
- LLVM_DEBUG(
- dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ if (TC > 0 && TC % MaxVF == 0) {
+ LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+ return MaxVF;
+ }
+
+ // If we don't know the precise trip count, or if the trip count that we
+ // found modulo the vectorization factor is not zero, try to fold the tail
+ // by masking.
+ // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+ if (Legal->canFoldTailByMasking()) {
+ FoldTailByMasking = true;
+ return MaxVF;
+ }
+
+ if (TC == 0) {
+ ORE->emit(
+ createMissedAnalysis("UnknownLoopCountComplexCFG")
+ << "unable to calculate the loop count due to complex control flow");
return None;
}
- return MaxVF;
+ ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
+ << "cannot optimize for size and vectorize at the same time. "
+ "Enable vectorization of this loop with '#pragma clang loop "
+ "vectorize(enable)' when compiling with -Os/-Oz");
+ return None;
}
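A worked example of the tail-folding decision above, as an illustrative emulation rather than the code the vectorizer actually emits: with TC = 10 and MaxVF = 4, TC % MaxVF != 0, so instead of a scalar remainder the loop can run ceil(10/4) = 3 vector iterations whose last two lanes are masked off.

#include <cstddef>

// Tail folded by masking; assumes TC > 0 so the backedge-taken count
// TC - 1 does not underflow.
void saxpyTailFolded(float *Y, const float *X, float A, size_t TC) {
  const size_t VF = 4;
  const size_t BTC = TC - 1;                 // backedge-taken count
  for (size_t IV = 0; IV < TC; IV += VF) {   // 3 iterations when TC == 10
    for (size_t Lane = 0; Lane < VF; ++Lane) {
      if (IV + Lane <= BTC)                  // header block mask
        Y[IV + Lane] += A * X[IV + Lane];    // masked load/store lanes
    }
  }
}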
unsigned
@@ -5080,11 +4857,11 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
// For each instruction in the loop.
- for (Instruction &I : *BB) {
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
Type *T = I.getType();
// Skip ignored values.
- if (ValuesToIgnore.count(&I))
+ if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
continue;
// Only examine Loads, Stores and PHINodes.
@@ -5182,6 +4959,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// fit without causing spills. All of this is rounded down if necessary to be
// a power of two. We want power of two interleave count to simplify any
// addressing operations or alignment considerations.
+  // We also want power-of-two interleave counts to ensure that the induction
+  // variable of the vector loop wraps to zero when the tail is folded by
+  // masking; this currently happens when OptForSize, in which case IC is set
+  // to 1 above.
unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
R.MaxLocalUsers);
@@ -5307,7 +5087,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
using IntervalMap = DenseMap<Instruction *, unsigned>;
// Maps instruction to its index.
- DenseMap<unsigned, Instruction *> IdxToInstr;
+ SmallVector<Instruction *, 64> IdxToInstr;
// Marks the end of each interval.
IntervalMap EndPoint;
// Saves the list of instruction indices that are used in the loop.
@@ -5316,10 +5096,9 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
// defined outside the loop, such as arguments and constants.
SmallPtrSet<Value *, 8> LoopInvariants;
- unsigned Index = 0;
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
- for (Instruction &I : *BB) {
- IdxToInstr[Index++] = &I;
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ IdxToInstr.push_back(&I);
// Save the end location of each USE.
for (Value *U : I.operands()) {
@@ -5336,7 +5115,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
}
// Overwrite previous end points.
- EndPoint[Instr] = Index;
+ EndPoint[Instr] = IdxToInstr.size();
Ends.insert(Instr);
}
}
@@ -5373,7 +5152,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
};
- for (unsigned int i = 0; i < Index; ++i) {
+ for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
Instruction *I = IdxToInstr[i];
// Remove all of the instructions that end at this location.
@@ -5382,11 +5161,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
OpenIntervals.erase(ToRemove);
// Ignore instructions that are never used within the loop.
- if (!Ends.count(I))
+ if (Ends.find(I) == Ends.end())
continue;
// Skip ignored values.
- if (ValuesToIgnore.count(I))
+ if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
continue;
// For each VF find the maximum usage of registers.
@@ -5400,7 +5179,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
unsigned RegUsage = 0;
for (auto Inst : OpenIntervals) {
// Skip ignored values for VF > 1.
- if (VecValuesToIgnore.count(Inst) ||
+ if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
isScalarAfterVectorization(Inst, VFs[j]))
continue;
RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
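The interval bookkeeping above (IdxToInstr, EndPoint, OpenIntervals) can be summarized in a self-contained sketch; the function below is a hypothetical stand-in, not the cost model's interface, that computes the maximum number of simultaneously live values over an abstract program, which is the quantity the per-VF register estimate is built on.

#include <algorithm>
#include <map>
#include <set>
#include <vector>

// Instruction Idx uses the values defined at the indices in UsesOf[Idx].
unsigned maxOpenIntervals(const std::vector<std::vector<unsigned>> &UsesOf) {
  std::map<unsigned, unsigned> EndPoint;           // def -> last use index
  for (unsigned Idx = 0; Idx < UsesOf.size(); ++Idx)
    for (unsigned Def : UsesOf[Idx])
      EndPoint[Def] = Idx;                         // overwrite previous ends
  std::multimap<unsigned, unsigned> TransposeEnds; // last use index -> defs
  for (const auto &KV : EndPoint)
    TransposeEnds.insert({KV.second, KV.first});
  std::set<unsigned> OpenIntervals;
  unsigned MaxUsage = 0;
  for (unsigned Idx = 0; Idx < UsesOf.size(); ++Idx) {
    auto Dying = TransposeEnds.equal_range(Idx);
    for (auto It = Dying.first; It != Dying.second; ++It)
      OpenIntervals.erase(It->second);             // intervals ending here
    if (EndPoint.find(Idx) == EndPoint.end())
      continue;                                    // value never used
    MaxUsage = std::max<unsigned>(MaxUsage, OpenIntervals.size());
    OpenIntervals.insert(Idx);                     // this def is now live
  }
  return MaxUsage;
}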
@@ -5446,8 +5225,7 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
// from moving "masked load/store" check from legality to cost model.
// Masked Load/Gather emulation was previously never allowed.
// Limited number of Masked Store/Scatter emulation was allowed.
- assert(isScalarWithPredication(I) &&
- "Expecting a scalar emulated instruction");
+ assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
return isa<LoadInst>(I) ||
(isa<StoreInst>(I) &&
NumPredStores > NumberOfStoresToPredicate);
@@ -5458,7 +5236,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
// instructions to scalarize, there's nothing to do. Collection may already
// have occurred if we have a user-selected VF and are now computing the
// expected cost for interleaving.
- if (VF < 2 || InstsToScalarize.count(VF))
+ if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
return;
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
@@ -5470,7 +5248,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
// determine if it would be better to not if-convert the blocks they are in.
// If so, we also record the instructions to scalarize.
for (BasicBlock *BB : TheLoop->blocks()) {
- if (!Legal->blockNeedsPredication(BB))
+ if (!blockNeedsPredication(BB))
continue;
for (Instruction &I : *BB)
if (isScalarWithPredication(&I)) {
@@ -5553,7 +5331,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
Instruction *I = Worklist.pop_back_val();
// If we've already analyzed the instruction, there's nothing to do.
- if (ScalarCosts.count(I))
+ if (ScalarCosts.find(I) != ScalarCosts.end())
continue;
// Compute the cost of the vector instruction. Note that this cost already
@@ -5612,8 +5390,8 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
// For each instruction in the old loop.
for (Instruction &I : BB->instructionsWithoutDebug()) {
// Skip ignored values.
- if (ValuesToIgnore.count(&I) ||
- (VF > 1 && VecValuesToIgnore.count(&I)))
+ if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
+ (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
continue;
VectorizationCostTy C = getInstructionCost(&I, VF);
@@ -5635,7 +5413,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
// unconditionally executed. For the scalar case, we may not always execute
// the predicated block. Thus, scale the block's cost by the probability of
// executing it.
- if (VF == 1 && Legal->blockNeedsPredication(BB))
+ if (VF == 1 && blockNeedsPredication(BB))
BlockCost.first /= getReciprocalPredBlockProb();
Cost.first += BlockCost.first;
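A small worked example of the scaling above, assuming getReciprocalPredBlockProb() returns 2 (i.e., a predicated block is modeled as executing on roughly half of the iterations); the helper name is illustrative:

unsigned scaledPredBlockCost(unsigned BlockCost) {
  const unsigned ReciprocalPredBlockProb = 2; // assumed 50% execution rate
  return BlockCost / ReciprocalPredBlockProb; // e.g. 8 / 2 == 4
}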
@@ -5682,11 +5460,12 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
unsigned VF) {
+ assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
Type *ValTy = getMemInstValueType(I);
auto SE = PSE.getSE();
- unsigned Alignment = getMemInstAlignment(I);
- unsigned AS = getMemInstAddressSpace(I);
+ unsigned Alignment = getLoadStoreAlignment(I);
+ unsigned AS = getLoadStoreAddressSpace(I);
Value *Ptr = getLoadStorePointerOperand(I);
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
@@ -5697,9 +5476,11 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Get the cost of the scalar memory instruction and address computation.
unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+ // Don't pass *I here, since it is scalar but will actually be part of a
+  // vectorized loop where its user is a vectorized instruction.
Cost += VF *
TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
- AS, I);
+ AS);
// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
@@ -5708,7 +5489,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// If we have a predicated store, it may not be executed for each vector
// lane. Scale the cost by the probability of executing the predicated
// block.
- if (isScalarWithPredication(I)) {
+ if (isPredicatedInst(I)) {
Cost /= getReciprocalPredBlockProb();
if (useEmulatedMaskMemRefHack(I))
@@ -5724,9 +5505,9 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
unsigned VF) {
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
- unsigned Alignment = getMemInstAlignment(I);
+ unsigned Alignment = getLoadStoreAlignment(I);
Value *Ptr = getLoadStorePointerOperand(I);
- unsigned AS = getMemInstAddressSpace(I);
+ unsigned AS = getLoadStoreAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
@@ -5745,22 +5526,30 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
unsigned VF) {
- LoadInst *LI = cast<LoadInst>(I);
- Type *ValTy = LI->getType();
+ Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
- unsigned Alignment = LI->getAlignment();
- unsigned AS = LI->getPointerAddressSpace();
+ unsigned Alignment = getLoadStoreAlignment(I);
+ unsigned AS = getLoadStoreAddressSpace(I);
+ if (isa<LoadInst>(I)) {
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+ }
+ StoreInst *SI = cast<StoreInst>(I);
+ bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
- TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+ TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
+ (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
+ Instruction::ExtractElement,
+ VectorTy, VF - 1));
}
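To summarize the two cost formulas above in one place, an illustrative sketch with made-up unit costs standing in for the TTI queries:

struct UnitCosts { unsigned Addr, MemOp, Broadcast, ExtractLastLane; };

// Uniform-address load: scalar load of the invariant address, then a
// broadcast of the loaded value to all vector lanes.
unsigned uniformLoadCost(const UnitCosts &C) {
  return C.Addr + C.MemOp + C.Broadcast;
}

// Uniform-address store: scalar store; if the stored value varies per lane,
// lane VF - 1 must first be extracted from the vector.
unsigned uniformStoreCost(const UnitCosts &C, bool LoopInvariantStoreValue) {
  return C.Addr + C.MemOp + (LoopInvariantStoreValue ? 0 : C.ExtractLastLane);
}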
unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
unsigned VF) {
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
- unsigned Alignment = getMemInstAlignment(I);
+ unsigned Alignment = getLoadStoreAlignment(I);
Value *Ptr = getLoadStorePointerOperand(I);
return TTI.getAddressComputationCost(VectorTy) +
@@ -5772,7 +5561,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
unsigned VF) {
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
- unsigned AS = getMemInstAddressSpace(I);
+ unsigned AS = getLoadStoreAddressSpace(I);
auto Group = getInterleavedAccessGroup(I);
assert(Group && "Fail to get an interleaved access group.");
@@ -5790,13 +5579,19 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
}
// Calculate the cost of the whole interleaved group.
- unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
- Group->getFactor(), Indices,
- Group->getAlignment(), AS);
-
- if (Group->isReverse())
+ bool UseMaskForGaps =
+ Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
+ unsigned Cost = TTI.getInterleavedMemoryOpCost(
+ I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
+ Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
+
+ if (Group->isReverse()) {
+ // TODO: Add support for reversed masked interleaved access.
+ assert(!Legal->isMaskRequired(I) &&
+ "Reverse masked interleaved access not supported.");
Cost += Group->getNumMembers() *
TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ }
return Cost;
}
@@ -5806,8 +5601,8 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
// moment.
if (VF == 1) {
Type *ValTy = getMemInstValueType(I);
- unsigned Alignment = getMemInstAlignment(I);
- unsigned AS = getMemInstAddressSpace(I);
+ unsigned Alignment = getLoadStoreAlignment(I);
+ unsigned AS = getLoadStoreAddressSpace(I);
return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
@@ -5826,9 +5621,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
return VectorizationCostTy(InstsToScalarize[VF][I], false);
// Forced scalars do not have any scalarization overhead.
- if (VF > 1 && ForcedScalars.count(VF) &&
- ForcedScalars.find(VF)->second.count(I))
- return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
+ auto ForcedScalar = ForcedScalars.find(VF);
+ if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
+ auto InstSet = ForcedScalar->second;
+ if (InstSet.find(I) != InstSet.end())
+ return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
+ }
Type *VectorTy;
unsigned C = getInstructionCost(I, VF, VectorTy);
@@ -5849,10 +5647,22 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
if (!Ptr)
continue;
+ // TODO: We should generate better code and update the cost model for
+ // predicated uniform stores. Today they are treated as any other
+ // predicated store (see added test cases in
+ // invariant-store-vectorization.ll).
if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
NumPredStores++;
- if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
- // Scalar load + broadcast
+
+ if (Legal->isUniform(Ptr) &&
+ // Conditional loads and stores should be scalarized and predicated.
+ // isScalarWithPredication cannot be used here since masked
+ // gather/scatters are not considered scalar with predication.
+ !Legal->blockNeedsPredication(I.getParent())) {
+ // TODO: Avoid replicating loads and stores instead of
+ // relying on instcombine to remove them.
+ // Load: Scalar load + broadcast
+ // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
unsigned Cost = getUniformMemOpCost(&I, VF);
setWideningDecision(&I, VF, CM_Scalarize, Cost);
continue;
@@ -5883,7 +5693,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
continue;
NumAccesses = Group->getNumMembers();
- InterleaveCost = getInterleaveGroupCost(&I, VF);
+ if (interleavedAccessCanBeWidened(&I, VF))
+ InterleaveCost = getInterleaveGroupCost(&I, VF);
}
unsigned GatherScatterCost =
@@ -6001,8 +5812,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
bool ScalarPredicatedBB = false;
BranchInst *BI = cast<BranchInst>(I);
if (VF > 1 && BI->isConditional() &&
- (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
- PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
+ (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
+ PredicatedBBsAfterVectorization.end() ||
+ PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
+ PredicatedBBsAfterVectorization.end()))
ScalarPredicatedBB = true;
if (ScalarPredicatedBB) {
@@ -6025,9 +5838,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
auto *Phi = cast<PHINode>(I);
// First-order recurrences are replaced by vector shuffles inside the loop.
+ // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- VectorTy, VF - 1, VectorTy);
+ VectorTy, VF - 1, VectorType::get(RetTy, 1));
// Phi nodes in non-header blocks (not inductions, reductions, etc.) are
// converted into select instructions. We require N - 1 selects per phi
@@ -6089,38 +5903,18 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return 0;
// Certain instructions can be cheaper to vectorize if they have a constant
  // second vector operand. One example of this is shifts on x86.
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueProperties Op1VP =
- TargetTransformInfo::OP_None;
- TargetTransformInfo::OperandValueProperties Op2VP =
- TargetTransformInfo::OP_None;
Value *Op2 = I->getOperand(1);
-
-  // Check for a splat or for a non-uniform vector of constants.
- if (isa<ConstantInt>(Op2)) {
- ConstantInt *CInt = cast<ConstantInt>(Op2);
- if (CInt && CInt->getValue().isPowerOf2())
- Op2VP = TargetTransformInfo::OP_PowerOf2;
- Op2VK = TargetTransformInfo::OK_UniformConstantValue;
- } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
- Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
- Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
- if (SplatValue) {
- ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
- if (CInt && CInt->getValue().isPowerOf2())
- Op2VP = TargetTransformInfo::OP_PowerOf2;
- Op2VK = TargetTransformInfo::OK_UniformConstantValue;
- }
- } else if (Legal->isUniform(Op2)) {
+ TargetTransformInfo::OperandValueProperties Op2VP;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TTI.getOperandInfo(Op2, Op2VP);
+ if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
Op2VK = TargetTransformInfo::OK_UniformValue;
- }
+
SmallVector<const Value *, 4> Operands(I->operand_values());
unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
- return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
- Op2VK, Op1VP, Op2VP, Operands);
+ return N * TTI.getArithmeticInstrCost(
+ I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
+ Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
}
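For intuition about why the operand kind feeds the cost query above, compare these two loops: a shift by a uniform power-of-two constant typically lowers to a single immediate-shift vector instruction, while a per-lane variable shift can be considerably more expensive on some targets.

#include <cstdint>

void shiftByConstant(uint32_t *A, int N) {
  for (int I = 0; I < N; ++I)
    A[I] >>= 4;    // OK_UniformConstantValue with OP_PowerOf2
}

void shiftByVariable(uint32_t *A, const uint32_t *S, int N) {
  for (int I = 0; I < N; ++I)
    A[I] >>= S[I]; // OK_AnyValue: no constant-operand discount
}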
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
@@ -6237,8 +6031,9 @@ INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
namespace llvm {
-Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
- return new LoopVectorize(NoUnrolling, AlwaysVectorize);
+Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
+ bool VectorizeOnlyWhenForced) {
+ return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
}
} // end namespace llvm
@@ -6316,6 +6111,16 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
return NoVectorization;
+ // Invalidate interleave groups if all blocks of loop will be predicated.
+ if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
+ !useMaskedInterleavedAccesses(*TTI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Invalidate all interleaved groups due to fold-tail by masking "
+ "which requires masked-interleaved support.\n");
+ CM.InterleaveInfo.reset();
+ }
+
if (UserVF) {
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
@@ -6372,6 +6177,7 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
DT, ILV.Builder, ILV.VectorLoopValueMap,
&ILV, CallbackILV};
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
+ State.TripCount = ILV.getOrCreateTripCount(nullptr);
//===------------------------------------------------===//
//
@@ -6408,7 +6214,8 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
PHINode *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
- return U == Ind || DeadInstructions.count(cast<Instruction>(U));
+ return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
+ DeadInstructions.end();
}))
DeadInstructions.insert(IndUpdate);
@@ -6551,9 +6358,17 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
// load/store/gather/scatter. Initialize BlockMask to no-mask.
VPValue *BlockMask = nullptr;
- // Loop incoming mask is all-one.
- if (OrigLoop->getHeader() == BB)
+ if (OrigLoop->getHeader() == BB) {
+ if (!CM.blockNeedsPredication(BB))
+ return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
+
+ // Introduce the early-exit compare IV <= BTC to form header block mask.
+ // This is used instead of IV < TC because TC may wrap, unlike BTC.
+ VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
+ VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
+ BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
return BlockMaskCache[BB] = BlockMask;
+ }
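A self-contained illustration of the wrapping argument behind IV <= BTC, using an 8-bit IV so the effect is easy to trigger: a trip count of 256 is not representable in the IV's type, but the backedge-taken count 255 is.

#include <cassert>
#include <cstdint>

int main() {
  uint8_t TC = static_cast<uint8_t>(256); // wraps to 0 in the IV's type
  uint8_t BTC = 255;                      // TC - 1 is representable
  uint8_t IV = 0;                         // first lane of first iteration
  assert(!(IV < TC));                     // IV < TC: lane wrongly inactive
  assert(IV <= BTC);                      // IV <= BTC: lane correctly active
  return 0;
}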
// This is the block mask. We OR all incoming edges.
for (auto *Predecessor : predecessors(BB)) {
@@ -6573,8 +6388,9 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
}
VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
- VFRange &Range) {
- const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I);
+ VFRange &Range,
+ VPlanPtr &Plan) {
+ const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
if (!IG)
return nullptr;
@@ -6595,7 +6411,11 @@ VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
assert(I == IG->getInsertPos() &&
"Generating a recipe for an adjunct member of an interleave group");
- return new VPInterleaveRecipe(IG);
+ VPValue *Mask = nullptr;
+ if (Legal->isMaskRequired(I))
+ Mask = createBlockInMask(I->getParent(), Plan);
+
+ return new VPInterleaveRecipe(IG, Mask);
}
VPWidenMemoryInstructionRecipe *
@@ -6688,7 +6508,11 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
VFRange &Range) {
- if (CM.isScalarWithPredication(I))
+
+ bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
+
+ if (IsPredicated)
return false;
auto IsVectorizableOpcode = [](unsigned Opcode) {
@@ -6795,7 +6619,9 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
[&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
- bool IsPredicated = CM.isScalarWithPredication(I);
+ bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
+
auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
// Find if I uses a predicated instruction. If so, it will use its scalar
@@ -6857,7 +6683,7 @@ bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
VPRecipeBase *Recipe = nullptr;
// Check if Instr should belong to an interleave memory recipe, or already
// does. In the latter case Instr is irrelevant.
- if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
+ if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
VPBB->appendRecipe(Recipe);
return true;
}
@@ -6908,6 +6734,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
NeedDef.insert(Branch->getCondition());
}
+ // If the tail is to be folded by masking, the primary induction variable
+ // needs to be represented in VPlan for it to model early-exit masking.
+ if (CM.foldTailByMasking())
+ NeedDef.insert(Legal->getPrimaryInduction());
+
// Collect instructions from the original loop that will become trivially dead
// in the vectorized loop. We don't need to vectorize these instructions. For
// example, original induction update instructions can become dead because we
@@ -6969,18 +6800,21 @@ LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// First filter out irrelevant instructions, to ensure no recipes are
// built for them.
- if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
+ if (isa<BranchInst>(Instr) ||
+ DeadInstructions.find(Instr) != DeadInstructions.end())
continue;
// I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
// member of the IG, do not construct any Recipe for it.
- const InterleaveGroup *IG = CM.getInterleavedAccessGroup(Instr);
+ const InterleaveGroup<Instruction> *IG =
+ CM.getInterleavedAccessGroup(Instr);
if (IG && Instr != IG->getInsertPos() &&
Range.Start >= 2 && // Query is illegal for VF == 1
CM.getWideningDecision(Instr, Range.Start) ==
LoopVectorizationCostModel::CM_Interleave) {
- if (SinkAfterInverse.count(Instr))
- Ingredients.push_back(SinkAfterInverse.find(Instr)->second);
+ auto SinkCandidate = SinkAfterInverse.find(Instr);
+ if (SinkCandidate != SinkAfterInverse.end())
+ Ingredients.push_back(SinkCandidate->second);
continue;
}
@@ -7063,6 +6897,13 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
HCFGBuilder.buildHierarchicalCFG();
+ SmallPtrSet<Instruction *, 1> DeadInstructions;
+ VPlanHCFGTransforms::VPInstructionsToVPRecipes(
+ Plan, Legal->getInductionVars(), DeadInstructions);
+
+ for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
+ Plan->addVF(VF);
+
return Plan;
}
@@ -7075,6 +6916,10 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
O << " +\n"
<< Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
IG->getInsertPos()->printAsOperand(O, false);
+ if (User) {
+ O << ", ";
+ User->getOperand(0)->printAsOperand(O);
+ }
O << "\\l\"";
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (Instruction *I = IG->getMember(i))
@@ -7137,7 +6982,15 @@ void VPBlendRecipe::execute(VPTransformState &State) {
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Interleave group being replicated.");
- State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
+ if (!User)
+ return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
+
+ // Last (and currently only) operand is a mask.
+ InnerLoopVectorizer::VectorParts MaskValues(State.UF);
+ VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ MaskValues[Part] = State.get(Mask, Part);
+ State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
}
void VPReplicateRecipe::execute(VPTransformState &State) {
@@ -7264,11 +7117,26 @@ static bool processLoopInVPlanNativePath(
Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
// Plan how to best vectorize, return the best VF and its cost.
- LVP.planInVPlanNativePath(OptForSize, UserVF);
+ VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
- // Returning false. We are currently not generating vector code in the VPlan
- // native path.
- return false;
+ // If we are stress testing VPlan builds, do not attempt to generate vector
+ // code.
+ if (VPlanBuildStressTest)
+ return false;
+
+ LVP.setBestPlan(VF.Width, 1);
+
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, UserVF, 1, LVL,
+ &CM);
+ LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
+ << L->getHeader()->getParent()->getName() << "\"\n");
+ LVP.executePlan(LB, DT);
+
+ // Mark the loop as already vectorized to avoid vectorizing again.
+ Hints.setAlreadyVectorized();
+
+ LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
+ return true;
}
bool LoopVectorizePass::processLoop(Loop *L) {
@@ -7283,7 +7151,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
<< L->getHeader()->getParent()->getName() << "\" from "
<< DebugLocStr << "\n");
- LoopVectorizeHints Hints(L, DisableUnrolling, *ORE);
+ LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
LLVM_DEBUG(
dbgs() << "LV: Loop hints:"
@@ -7307,7 +7175,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// less verbose reporting vectorized loops and unvectorized loops that may
// benefit from vectorization, respectively.
- if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
+ if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
return false;
}
@@ -7320,7 +7188,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
&Requirements, &Hints, DB, AC);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
- emitMissedWarning(F, L, Hints, ORE);
+ Hints.emitRemarkWithHints();
return false;
}
@@ -7393,7 +7261,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
"NoImplicitFloat", L)
<< "loop not vectorized due to NoImplicitFloat attribute");
- emitMissedWarning(F, L, Hints, ORE);
+ Hints.emitRemarkWithHints();
return false;
}
@@ -7408,7 +7276,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
ORE->emit(
createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
<< "loop not vectorized due to unsafe FP support.");
- emitMissedWarning(F, L, Hints, ORE);
+ Hints.emitRemarkWithHints();
return false;
}
@@ -7421,7 +7289,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Analyze interleaved memory accesses.
if (UseInterleaved) {
- IAI.analyzeInterleaving();
+ IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
}
// Use the cost model.
@@ -7450,7 +7318,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (Requirements.doesNotMeet(F, L, Hints)) {
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
"requirements.\n");
- emitMissedWarning(F, L, Hints, ORE);
+ Hints.emitRemarkWithHints();
return false;
}
@@ -7527,6 +7395,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LVP.setBestPlan(VF.Width, IC);
using namespace ore;
+ bool DisableRuntimeUnroll = false;
+ MDNode *OrigLoopID = L->getLoopID();
if (!VectorizeLoop) {
assert(IC > 1 && "interleave count should not be 1 or 0");
@@ -7553,7 +7423,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// no runtime checks about strides and memory. A scalar loop that is
// rarely used is not worth unrolling.
if (!LB.areSafetyChecksAdded())
- AddRuntimeUnrollDisableMetaData(L);
+ DisableRuntimeUnroll = true;
// Report the vectorization decision.
ORE->emit([&]() {
@@ -7565,8 +7435,18 @@ bool LoopVectorizePass::processLoop(Loop *L) {
});
}
- // Mark the loop as already vectorized to avoid vectorizing again.
- Hints.setAlreadyVectorized();
+ Optional<MDNode *> RemainderLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+ LLVMLoopVectorizeFollowupEpilogue});
+ if (RemainderLoopID.hasValue()) {
+ L->setLoopID(RemainderLoopID.getValue());
+ } else {
+ if (DisableRuntimeUnroll)
+ AddRuntimeUnrollDisableMetaData(L);
+
+ // Mark the loop as already vectorized to avoid vectorizing again.
+ Hints.setAlreadyVectorized();
+ }
LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
return true;
@@ -7659,8 +7539,15 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
+
+  // We currently do not preserve the LoopInfo and DominatorTree analyses with
+  // outer loop vectorization. Until this is addressed, mark these analyses as
+  // preserved only for the non-VPlan-native path.
+  // TODO: Preserve the Loop and Dominator analyses for the VPlan-native path.
+ if (!EnableVPlanNativePath) {
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ }
PA.preserve<BasicAA>();
PA.preserve<GlobalsAA>();
return PA;