author     Dimitry Andric <dim@FreeBSD.org>    2015-01-18 16:17:27 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2015-01-18 16:17:27 +0000
commit     67c32a98315f785a9ec9d531c1f571a0196c7463 (patch)
tree       4abb9cbeecc7901726dd0b4a37369596c852e9ef /lib/Transforms/Vectorize
parent     9f61947910e6ab40de38e6b4034751ef1513200f (diff)
Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--  lib/Transforms/Vectorize/BBVectorize.cpp    |   40
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp  |  714
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp  | 1369
3 files changed, 1520 insertions, 603 deletions
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index 28ec83bf8683..a0ccf9d7b8cd 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -391,8 +391,6 @@ namespace {
Instruction *&InsertionPt,
Instruction *I, Instruction *J);
- void combineMetadata(Instruction *K, const Instruction *J);
-
bool vectorizeBB(BasicBlock &BB) {
if (skipOptnoneFunction(BB))
return false;
@@ -687,6 +685,8 @@ namespace {
case Intrinsic::trunc:
case Intrinsic::floor:
case Intrinsic::fabs:
+ case Intrinsic::minnum:
+ case Intrinsic::maxnum:
return Config.VectorizeMath;
case Intrinsic::bswap:
case Intrinsic::ctpop:
@@ -2609,7 +2609,6 @@ namespace {
true, o, 1));
NewI1->insertBefore(IBeforeJ ? J : I);
I1 = NewI1;
- I1T = I2T;
I1Elem = I2Elem;
} else if (I1Elem > I2Elem) {
std::vector<Constant *> Mask(I1Elem);
@@ -2626,8 +2625,6 @@ namespace {
true, o, 1));
NewI2->insertBefore(IBeforeJ ? J : I);
I2 = NewI2;
- I2T = I1T;
- I2Elem = I1Elem;
}
// Now that both I1 and I2 are the same length we can shuffle them
@@ -2964,31 +2961,6 @@ namespace {
}
}
- // When the first instruction in each pair is cloned, it will inherit its
- // parent's metadata. This metadata must be combined with that of the other
- // instruction in a safe way.
- void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) {
- SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata;
- K->getAllMetadataOtherThanDebugLoc(Metadata);
- for (unsigned i = 0, n = Metadata.size(); i < n; ++i) {
- unsigned Kind = Metadata[i].first;
- MDNode *JMD = J->getMetadata(Kind);
- MDNode *KMD = Metadata[i].second;
-
- switch (Kind) {
- default:
- K->setMetadata(Kind, nullptr); // Remove unknown metadata
- break;
- case LLVMContext::MD_tbaa:
- K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
- break;
- case LLVMContext::MD_fpmath:
- K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
- break;
- }
- }
- }
-
// This function fuses the chosen instruction pairs into vector instructions,
// taking care preserve any needed scalar outputs and, then, it reorders the
// remaining instructions as needed (users of the first member of the pair
@@ -3138,7 +3110,13 @@ namespace {
if (!isa<StoreInst>(K))
K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
- combineMetadata(K, H);
+ unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_fpmath
+ };
+ combineMetadata(K, H, KnownIDs);
K->intersectOptionalDataWith(H);
for (unsigned o = 0; o < NumOperands; ++o)
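Before the LoopVectorize.cpp hunks, an illustrative aside (not part of the commit): the private BBVectorize::combineMetadata removed above is superseded by the shared combineMetadata() utility, now called with an explicit list of metadata kinds known to be safely mergeable. The helper below is a minimal sketch of those per-kind merge rules, assuming the same alias.scope/noalias handling that the SLP propagateMetadata hunks further down use; the helper name is hypothetical.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

static void combineKnownMetadata(Instruction *K, const Instruction *J) {
  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  K->getAllMetadataOtherThanDebugLoc(Metadata);
  for (const auto &MDPair : Metadata) {
    unsigned Kind = MDPair.first;
    MDNode *JMD = J->getMetadata(Kind);
    MDNode *KMD = MDPair.second;
    switch (Kind) {
    default:
      K->setMetadata(Kind, nullptr); // Unknown kinds cannot be merged safely.
      break;
    case LLVMContext::MD_tbaa:
      K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
      break;
    case LLVMContext::MD_alias_scope:
    case LLVMContext::MD_noalias:
      K->setMetadata(Kind, MDNode::intersect(JMD, KMD)); // keep common scopes
      break;
    case LLVMContext::MD_fpmath:
      K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
      break;
    }
  }
}

Anything not on the whitelist is dropped from the fused instruction, which is the conservative choice when the two original instructions may have disagreed.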
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 79fcb09f8913..557304ed56c5 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -55,7 +55,9 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
@@ -108,8 +110,8 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
cl::desc("Sets the SIMD width. Zero is autoselect."));
static cl::opt<unsigned>
-VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
- cl::desc("Sets the vectorization unroll count. "
+VectorizationInterleave("force-vector-interleave", cl::init(0), cl::Hidden,
+ cl::desc("Sets the vectorization interleave count. "
"Zero is autoselect."));
static cl::opt<bool>
@@ -157,17 +159,17 @@ static cl::opt<unsigned> ForceTargetNumVectorRegs(
"force-target-num-vector-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of vector registers."));
-/// Maximum vectorization unroll count.
-static const unsigned MaxUnrollFactor = 16;
+/// Maximum vectorization interleave count.
+static const unsigned MaxInterleaveFactor = 16;
-static cl::opt<unsigned> ForceTargetMaxScalarUnrollFactor(
- "force-target-max-scalar-unroll", cl::init(0), cl::Hidden,
- cl::desc("A flag that overrides the target's max unroll factor for scalar "
- "loops."));
+static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
+ "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's max interleave factor for "
+ "scalar loops."));
-static cl::opt<unsigned> ForceTargetMaxVectorUnrollFactor(
- "force-target-max-vector-unroll", cl::init(0), cl::Hidden,
- cl::desc("A flag that overrides the target's max unroll factor for "
+static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
+ "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's max interleave factor for "
"vectorized loops."));
static cl::opt<unsigned> ForceTargetInstructionCost(
@@ -204,11 +206,17 @@ static cl::opt<bool> EnableCondStoresVectorization(
"enable-cond-stores-vec", cl::init(false), cl::Hidden,
cl::desc("Enable if predication of stores during vectorization."));
+static cl::opt<unsigned> MaxNestedScalarReductionUF(
+ "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden,
+ cl::desc("The maximum unroll factor to use when unrolling a scalar "
+ "reduction in a nested loop."));
+
namespace {
// Forward declarations.
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
+class LoopVectorizeHints;
/// Optimization analysis message produced during vectorization. Messages inform
/// the user why vectorization did not occur.
@@ -535,6 +543,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) {
// non-speculated memory access when the condition was false, this would be
// caught by the runtime overlap checks).
if (Kind != LLVMContext::MD_tbaa &&
+ Kind != LLVMContext::MD_alias_scope &&
+ Kind != LLVMContext::MD_noalias &&
Kind != LLVMContext::MD_fpmath)
continue;
@@ -570,9 +580,10 @@ public:
LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
DominatorTree *DT, TargetLibraryInfo *TLI,
- AliasAnalysis *AA, Function *F)
+ AliasAnalysis *AA, Function *F,
+ const TargetTransformInfo *TTI)
: NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
- DT(DT), TLI(TLI), AA(AA), TheFunction(F), Induction(nullptr),
+ DT(DT), TLI(TLI), AA(AA), TheFunction(F), TTI(TTI), Induction(nullptr),
WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
}
@@ -758,6 +769,21 @@ public:
}
SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }
+ /// Returns true if the target machine supports masked store operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
+ return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr));
+ }
+ /// Returns true if the target machine supports masked load operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
+ return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr));
+ }
+ /// Returns true if vector representation of the instruction \p I
+ /// requires mask.
+ bool isMaskRequired(const Instruction* I) {
+ return (MaskedOp.count(I) != 0);
+ }
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
@@ -780,7 +806,7 @@ private:
/// Return true if all of the instructions in the block can be speculatively
/// executed. \p SafePtrs is a list of addresses that are known to be legal
/// and we know that we can read from them without segfault.
- bool blockCanBePredicated(BasicBlock *BB, SmallPtrSet<Value *, 8>& SafePtrs);
+ bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
/// Returns True, if 'Phi' is the kind of reduction variable for type
/// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
@@ -804,7 +830,7 @@ private:
///
/// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
/// invariant.
- void collectStridedAcccess(Value *LoadOrStoreInst);
+ void collectStridedAccess(Value *LoadOrStoreInst);
/// Report an analysis message to assist the user in diagnosing loops that are
/// not vectorized.
@@ -830,6 +856,8 @@ private:
AliasAnalysis *AA;
/// Parent function
Function *TheFunction;
+ /// Target Transform Info
+ const TargetTransformInfo *TTI;
// --- vectorization state --- //
@@ -861,6 +889,10 @@ private:
ValueToValueMap Strides;
SmallPtrSet<Value *, 8> StrideSet;
+
+ /// While vectorizing these instructions we have to generate a
+ /// call to the appropriate masked intrinsic
+ SmallPtrSet<const Instruction*, 8> MaskedOp;
};
/// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -875,8 +907,13 @@ public:
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
- const DataLayout *DL, const TargetLibraryInfo *TLI)
- : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
+ const DataLayout *DL, const TargetLibraryInfo *TLI,
+ AssumptionCache *AC, const Function *F,
+ const LoopVectorizeHints *Hints)
+ : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI),
+ TheFunction(F), Hints(Hints) {
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+ }
/// Information about vectorization costs
struct VectorizationFactor {
@@ -887,9 +924,7 @@ public:
/// This method checks every power of two up to VF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
/// possible.
- VectorizationFactor selectVectorizationFactor(bool OptForSize,
- unsigned UserVF,
- bool ForceVectorization);
+ VectorizationFactor selectVectorizationFactor(bool OptForSize);
/// \return The size (in bits) of the widest type in the code that
/// needs to be vectorized. We ignore values that remain scalar such as
@@ -901,8 +936,7 @@ public:
/// based on register pressure and other parameters.
/// VF and LoopCost are the selected vectorization factor and the cost of the
/// selected VF.
- unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF,
- unsigned LoopCost);
+ unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost);
/// \brief A struct that represents some properties of the register usage
/// of a loop.
@@ -938,6 +972,19 @@ private:
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);
+ /// Report an analysis message to assist the user in diagnosing loops that are
+ /// not vectorized.
+ void emitAnalysis(Report &Message) {
+ DebugLoc DL = TheLoop->getStartLoc();
+ if (Instruction *I = Message.getInstr())
+ DL = I->getDebugLoc();
+ emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
+ *TheFunction, DL, Message.str());
+ }
+
+ /// Values used only by @llvm.assume calls.
+ SmallPtrSet<const Value *, 32> EphValues;
+
/// The loop that we evaluate.
Loop *TheLoop;
/// Scev analysis.
@@ -952,11 +999,59 @@ private:
const DataLayout *DL;
/// Target Library Info.
const TargetLibraryInfo *TLI;
+ const Function *TheFunction;
+ // Loop Vectorize Hint.
+ const LoopVectorizeHints *Hints;
};
/// Utility class for getting and setting loop vectorizer hints in the form
/// of loop metadata.
+/// This class keeps a number of loop annotations locally (as member variables)
+/// and can, upon request, write them back as metadata on the loop. It will
+/// initially scan the loop for existing metadata, and will update the local
+/// values based on information in the loop.
+/// We cannot write all values to metadata, as the mere presence of some info,
+/// for example 'force', means a decision has been made. So, we need to be
+/// careful NOT to add them if the user hasn't specifically asked so.
class LoopVectorizeHints {
+ enum HintKind {
+ HK_WIDTH,
+ HK_UNROLL,
+ HK_FORCE
+ };
+
+ /// Hint - associates name and validation with the hint value.
+ struct Hint {
+ const char * Name;
+ unsigned Value; // This may have to change for non-numeric values.
+ HintKind Kind;
+
+ Hint(const char * Name, unsigned Value, HintKind Kind)
+ : Name(Name), Value(Value), Kind(Kind) { }
+
+ bool validate(unsigned Val) {
+ switch (Kind) {
+ case HK_WIDTH:
+ return isPowerOf2_32(Val) && Val <= MaxVectorWidth;
+ case HK_UNROLL:
+ return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
+ case HK_FORCE:
+ return (Val <= 1);
+ }
+ return false;
+ }
+ };
+
+ /// Vectorization width.
+ Hint Width;
+ /// Vectorization interleave factor.
+ Hint Interleave;
+ /// Vectorization forced
+ Hint Force;
+
+ /// Return the loop metadata prefix.
+ static StringRef Prefix() { return "llvm.loop."; }
+
public:
enum ForceKind {
FK_Undefined = -1, ///< Not selected.
@@ -964,90 +1059,57 @@ public:
FK_Enabled = 1, ///< Forcing enabled.
};
- LoopVectorizeHints(const Loop *L, bool DisableUnrolling)
- : Width(VectorizationFactor),
- Unroll(DisableUnrolling),
- Force(FK_Undefined),
- LoopID(L->getLoopID()) {
- getHints(L);
- // force-vector-unroll overrides DisableUnrolling.
- if (VectorizationUnroll.getNumOccurrences() > 0)
- Unroll = VectorizationUnroll;
+ LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
+ : Width("vectorize.width", VectorizationFactor, HK_WIDTH),
+ Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
+ Force("vectorize.enable", FK_Undefined, HK_FORCE),
+ TheLoop(L) {
+ // Populate values with existing loop metadata.
+ getHintsFromMetadata();
- DEBUG(if (DisableUnrolling && Unroll == 1) dbgs()
- << "LV: Unrolling disabled by the pass manager\n");
- }
-
- /// Return the loop metadata prefix.
- static StringRef Prefix() { return "llvm.loop."; }
+ // force-vector-interleave overrides DisableInterleaving.
+ if (VectorizationInterleave.getNumOccurrences() > 0)
+ Interleave.Value = VectorizationInterleave;
- MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) const {
- SmallVector<Value*, 2> Vals;
- Vals.push_back(MDString::get(Context, Name));
- Vals.push_back(ConstantInt::get(Type::getInt32Ty(Context), V));
- return MDNode::get(Context, Vals);
+ DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
+ << "LV: Interleaving disabled by the pass manager\n");
}
/// Mark the loop L as already vectorized by setting the width to 1.
- void setAlreadyVectorized(Loop *L) {
- LLVMContext &Context = L->getHeader()->getContext();
-
- Width = 1;
-
- // Create a new loop id with one more operand for the already_vectorized
- // hint. If the loop already has a loop id then copy the existing operands.
- SmallVector<Value*, 4> Vals(1);
- if (LoopID)
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i)
- Vals.push_back(LoopID->getOperand(i));
-
- Vals.push_back(
- createHint(Context, Twine(Prefix(), "vectorize.width").str(), Width));
- Vals.push_back(
- createHint(Context, Twine(Prefix(), "interleave.count").str(), 1));
-
- MDNode *NewLoopID = MDNode::get(Context, Vals);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
-
- L->setLoopID(NewLoopID);
- if (LoopID)
- LoopID->replaceAllUsesWith(NewLoopID);
-
- LoopID = NewLoopID;
+ void setAlreadyVectorized() {
+ Width.Value = Interleave.Value = 1;
+ Hint Hints[] = {Width, Interleave};
+ writeHintsToMetadata(Hints);
}
+ /// Dumps all the hint information.
std::string emitRemark() const {
Report R;
- R << "vectorization ";
- switch (Force) {
- case LoopVectorizeHints::FK_Disabled:
- R << "is explicitly disabled";
- break;
- case LoopVectorizeHints::FK_Enabled:
- R << "is explicitly enabled";
- if (Width != 0 && Unroll != 0)
- R << " with width " << Width << " and interleave count " << Unroll;
- else if (Width != 0)
- R << " with width " << Width;
- else if (Unroll != 0)
- R << " with interleave count " << Unroll;
- break;
- case LoopVectorizeHints::FK_Undefined:
- R << "was not specified";
- break;
+ if (Force.Value == LoopVectorizeHints::FK_Disabled)
+ R << "vectorization is explicitly disabled";
+ else {
+ R << "use -Rpass-analysis=loop-vectorize for more info";
+ if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+ R << " (Force=true";
+ if (Width.Value != 0)
+ R << ", Vector Width=" << Width.Value;
+ if (Interleave.Value != 0)
+ R << ", Interleave Count=" << Interleave.Value;
+ R << ")";
+ }
}
+
return R.str();
}
- unsigned getWidth() const { return Width; }
- unsigned getUnroll() const { return Unroll; }
- enum ForceKind getForce() const { return Force; }
- MDNode *getLoopID() const { return LoopID; }
+ unsigned getWidth() const { return Width.Value; }
+ unsigned getInterleave() const { return Interleave.Value; }
+ enum ForceKind getForce() const { return (ForceKind)Force.Value; }
private:
- /// Find hints specified in the loop metadata.
- void getHints(const Loop *L) {
+ /// Find hints specified in the loop metadata and update local values.
+ void getHintsFromMetadata() {
+ MDNode *LoopID = TheLoop->getLoopID();
if (!LoopID)
return;
@@ -1057,7 +1119,7 @@ private:
for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
const MDString *S = nullptr;
- SmallVector<Value*, 4> Args;
+ SmallVector<Metadata *, 4> Args;
// The expected hint is either a MDString or a MDNode with the first
// operand a MDString.
@@ -1076,52 +1138,88 @@ private:
continue;
// Check if the hint starts with the loop metadata prefix.
- StringRef Hint = S->getString();
- if (!Hint.startswith(Prefix()))
- continue;
- // Remove the prefix.
- Hint = Hint.substr(Prefix().size(), StringRef::npos);
-
+ StringRef Name = S->getString();
if (Args.size() == 1)
- getHint(Hint, Args[0]);
+ setHint(Name, Args[0]);
}
}
- // Check string hint with one operand.
- void getHint(StringRef Hint, Value *Arg) {
- const ConstantInt *C = dyn_cast<ConstantInt>(Arg);
+ /// Checks string hint with one operand and set value if valid.
+ void setHint(StringRef Name, Metadata *Arg) {
+ if (!Name.startswith(Prefix()))
+ return;
+ Name = Name.substr(Prefix().size(), StringRef::npos);
+
+ const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
if (!C) return;
unsigned Val = C->getZExtValue();
- if (Hint == "vectorize.width") {
- if (isPowerOf2_32(Val) && Val <= MaxVectorWidth)
- Width = Val;
- else
- DEBUG(dbgs() << "LV: ignoring invalid width hint metadata\n");
- } else if (Hint == "vectorize.enable") {
- if (C->getBitWidth() == 1)
- Force = Val == 1 ? LoopVectorizeHints::FK_Enabled
- : LoopVectorizeHints::FK_Disabled;
- else
- DEBUG(dbgs() << "LV: ignoring invalid enable hint metadata\n");
- } else if (Hint == "interleave.count") {
- if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor)
- Unroll = Val;
- else
- DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n");
- } else {
- DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n');
+ Hint *Hints[] = {&Width, &Interleave, &Force};
+ for (auto H : Hints) {
+ if (Name == H->Name) {
+ if (H->validate(Val))
+ H->Value = Val;
+ else
+ DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
+ break;
+ }
}
}
- /// Vectorization width.
- unsigned Width;
- /// Vectorization unroll factor.
- unsigned Unroll;
- /// Vectorization forced
- enum ForceKind Force;
+ /// Create a new hint from name / value pair.
+ MDNode *createHintMetadata(StringRef Name, unsigned V) const {
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ Metadata *MDs[] = {MDString::get(Context, Name),
+ ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt32Ty(Context), V))};
+ return MDNode::get(Context, MDs);
+ }
+
+ /// Matches metadata with hint name.
+ bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
+ MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
+ if (!Name)
+ return false;
- MDNode *LoopID;
+ for (auto H : HintTypes)
+ if (Name->getString().endswith(H.Name))
+ return true;
+ return false;
+ }
+
+ /// Sets current hints into loop metadata, keeping other values intact.
+ void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
+ if (HintTypes.size() == 0)
+ return;
+
+ // Reserve the first element to LoopID (see below).
+ SmallVector<Metadata *, 4> MDs(1);
+ // If the loop already has metadata, then ignore the existing operands.
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (LoopID) {
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
+ // If node in update list, ignore old value.
+ if (!matchesHintMetadataName(Node, HintTypes))
+ MDs.push_back(Node);
+ }
+ }
+
+ // Now, add the missing hints.
+ for (auto H : HintTypes)
+ MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
+
+ // Replace current metadata node with new one.
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+
+ TheLoop->setLoopID(NewLoopID);
+ }
+
+ /// The loop these hints belong to.
+ const Loop *TheLoop;
};
static void emitMissedWarning(Function *F, Loop *L,
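Before the remaining LoopVectorize.cpp hunks, an illustrative aside (not part of the commit): the rewritten LoopVectorizeHints class above now keeps the hint values as members, populates them from !llvm.loop metadata on construction, and writes back only the hints it changes. The sketch below condenses how the pass itself drives the class in the hunks that follow; the function name is hypothetical.

#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

static bool tryToVectorizeLoop(Loop *L) {
  // Reads llvm.loop.vectorize.width, llvm.loop.interleave.count and
  // llvm.loop.vectorize.enable from the loop metadata; invalid values are
  // ignored with a debug message.
  LoopVectorizeHints Hints(L, /*DisableInterleaving=*/false);

  if (Hints.getForce() == LoopVectorizeHints::FK_Disabled)
    return false;                     // vectorization explicitly disabled
  if (Hints.getWidth() == 1 && Hints.getInterleave() == 1)
    return false;                     // disabled or already vectorized

  // ... legality checks, cost model and the actual transformation ...

  // Record width=1 and interleave.count=1 back into !llvm.loop, keeping any
  // unrelated loop metadata operands, so the loop is not vectorized twice.
  Hints.setAlreadyVectorized();
  return true;
}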
@@ -1134,7 +1232,7 @@ static void emitMissedWarning(Function *F, Loop *L,
emitLoopVectorizeWarning(
F->getContext(), *F, L->getStartLoc(),
"failed explicitly specified loop vectorization");
- else if (LH.getUnroll() != 1)
+ else if (LH.getInterleave() != 1)
emitLoopInterleaveWarning(
F->getContext(), *F, L->getStartLoc(),
"failed explicitly specified loop interleaving");
@@ -1169,6 +1267,7 @@ struct LoopVectorize : public FunctionPass {
BlockFrequencyInfo *BFI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
+ AssumptionCache *AC;
bool DisableUnrolling;
bool AlwaysVectorize;
@@ -1184,6 +1283,7 @@ struct LoopVectorize : public FunctionPass {
BFI = &getAnalysis<BlockFrequencyInfo>();
TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
AA = &getAnalysis<AliasAnalysis>();
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
// Compute some weights outside of the loop over the loops. Compute this
// using a BranchProbability to re-use its scaling math.
@@ -1240,7 +1340,7 @@ struct LoopVectorize : public FunctionPass {
: (Hints.getForce() == LoopVectorizeHints::FK_Enabled
? "enabled"
: "?")) << " width=" << Hints.getWidth()
- << " unroll=" << Hints.getUnroll() << "\n");
+ << " unroll=" << Hints.getInterleave() << "\n");
// Function containing loop
Function *F = L->getHeader()->getParent();
@@ -1267,7 +1367,7 @@ struct LoopVectorize : public FunctionPass {
return false;
}
- if (Hints.getWidth() == 1 && Hints.getUnroll() == 1) {
+ if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) {
DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
emitOptimizationRemarkAnalysis(
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
@@ -1278,8 +1378,7 @@ struct LoopVectorize : public FunctionPass {
// Check the loop for a trip count threshold:
// do not vectorize loops with a tiny trip count.
- BasicBlock *Latch = L->getLoopLatch();
- const unsigned TC = SE->getSmallConstantTripCount(L, Latch);
+ const unsigned TC = SE->getSmallConstantTripCount(L);
if (TC > 0u && TC < TinyTripCountVectorThreshold) {
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
<< "This loop is not worth vectorizing.");
@@ -1295,7 +1394,7 @@ struct LoopVectorize : public FunctionPass {
}
// Check if it is legal to vectorize the loop.
- LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F);
+ LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
emitMissedWarning(F, L, Hints);
@@ -1303,7 +1402,8 @@ struct LoopVectorize : public FunctionPass {
}
// Use the cost model.
- LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI);
+ LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AC, F,
+ &Hints);
// Check the function attributes to find out if this function should be
// optimized for size.
@@ -1338,13 +1438,11 @@ struct LoopVectorize : public FunctionPass {
// Select the optimal vectorization factor.
const LoopVectorizationCostModel::VectorizationFactor VF =
- CM.selectVectorizationFactor(OptForSize, Hints.getWidth(),
- Hints.getForce() ==
- LoopVectorizeHints::FK_Enabled);
+ CM.selectVectorizationFactor(OptForSize);
// Select the unroll factor.
const unsigned UF =
- CM.selectUnrollFactor(OptForSize, Hints.getUnroll(), VF.Width, VF.Cost);
+ CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost);
DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
<< DebugLocStr << '\n');
@@ -1385,13 +1483,14 @@ struct LoopVectorize : public FunctionPass {
}
// Mark the loop as already vectorized to avoid vectorizing again.
- Hints.setAlreadyVectorized(L);
+ Hints.setAlreadyVectorized();
DEBUG(verifyFunction(*L->getHeader()->getParent()));
return true;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
AU.addRequired<BlockFrequencyInfo>();
@@ -1683,7 +1782,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
- if (SI && Legal->blockNeedsPredication(SI->getParent()))
+ if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
+ !Legal->isMaskRequired(SI))
return scalarizeInstruction(Instr, true);
if (ScalarAllocatedSize != VectorElementSize)
@@ -1752,6 +1852,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
}
+ VectorParts Mask = createBlockInMask(Instr->getParent());
// Handle Stores:
if (SI) {
assert(!Legal->isUniform(SI->getPointerOperand()) &&
@@ -1760,7 +1861,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
// We don't want to update the value in the map as it might be used in
// another expression. So don't use a reference type for "StoredVal".
VectorParts StoredVal = getVectorValue(SI->getValueOperand());
-
+
for (unsigned Part = 0; Part < UF; ++Part) {
// Calculate the pointer for the specific unroll-part.
Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
@@ -1777,8 +1878,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Value *VecPtr = Builder.CreateBitCast(PartPtr,
DataTy->getPointerTo(AddressSpace));
- StoreInst *NewSI =
- Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
+
+ Instruction *NewSI;
+ if (Legal->isMaskRequired(SI))
+ NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
+ Mask[Part]);
+ else
+ NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
propagateMetadata(NewSI, SI);
}
return;
@@ -1793,14 +1899,20 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
if (Reverse) {
// If the address is consecutive but reversed, then the
- // wide store needs to start at the last vector element.
+ // wide load needs to start at the last vector element.
PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
}
+ Instruction* NewLI;
Value *VecPtr = Builder.CreateBitCast(PartPtr,
DataTy->getPointerTo(AddressSpace));
- LoadInst *NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
+ if (Legal->isMaskRequired(LI))
+ NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
+ UndefValue::get(DataTy),
+ "wide.masked.load");
+ else
+ NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
propagateMetadata(NewLI, LI);
Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
}
@@ -2487,7 +2599,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
LoopScalarBody = OldBasicBlock;
LoopVectorizeHints Hints(Lp, true);
- Hints.setAlreadyVectorized(Lp);
+ Hints.setAlreadyVectorized();
}
/// This function returns the identity element (or neutral element) for
@@ -2755,9 +2867,6 @@ void InnerLoopVectorizer::vectorizeLoop() {
}
// Fix the vector-loop phi.
- // We created the induction variable so we know that the
- // preheader is the first entry.
- BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
// Reductions do not have to start at zero. They can start with
// any loop invariant values.
@@ -2769,7 +2878,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
// Make sure to add the reduction stat value only to the
// first unroll part.
Value *StartVal = (part == 0) ? VectorStart : Identity;
- cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
+ cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
+ LoopVectorPreHeader);
cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
LoopVectorBody.back());
}
@@ -2901,7 +3011,7 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
LoopMiddleBlock);
}
-}
+}
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
@@ -3168,18 +3278,8 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
for (unsigned Part = 0; Part < UF; ++Part) {
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
- // Update the NSW, NUW and Exact flags. Notice: V can be an Undef.
- BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V);
- if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) {
- VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
- VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
- }
- if (VecOp && isa<PossiblyExactOperator>(VecOp))
- VecOp->setIsExact(BinOp->isExact());
-
- // Copy the fast-math flags.
- if (VecOp && isa<FPMathOperator>(V))
- VecOp->setFastMathFlags(it->getFastMathFlags());
+ if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
+ VecOp->copyIRFlags(BinOp);
Entry[Part] = V;
}
@@ -3292,6 +3392,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
assert(ID && "Not an intrinsic call!");
switch (ID) {
+ case Intrinsic::assume:
case Intrinsic::lifetime_end:
case Intrinsic::lifetime_start:
scalarizeInstruction(it);
@@ -3542,7 +3643,7 @@ static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
/// \brief Check that the instruction has outside loop users and is not an
/// identified reduction variable.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
- SmallPtrSet<Value *, 4> &Reductions) {
+ SmallPtrSetImpl<Value *> &Reductions) {
// Reduction instructions are allowed to have exit users. All other
// instructions must not have external users.
if (!Reductions.count(Inst))
@@ -3597,12 +3698,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// identified reduction value with an outside user.
if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
continue;
- emitAnalysis(Report(it) << "value that could not be identified as "
- "reduction is used outside the loop");
+ emitAnalysis(Report(it) << "value could not be identified as "
+ "an induction or reduction variable");
return false;
}
- // We only allow if-converted PHIs with more than two incoming values.
+ // We only allow if-converted PHIs with exactly two incoming values.
if (Phi->getNumIncomingValues() != 2) {
emitAnalysis(Report(it)
<< "control flow not understood by vectorizer");
@@ -3683,7 +3784,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
continue;
}
- emitAnalysis(Report(it) << "unvectorizable operation");
+ emitAnalysis(Report(it) << "value that could not be identified as "
+ "reduction is used outside the loop");
DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
return false;
}// end of PHI handling
@@ -3727,12 +3829,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
return false;
}
if (EnableMemAccessVersioning)
- collectStridedAcccess(ST);
+ collectStridedAccess(ST);
}
if (EnableMemAccessVersioning)
if (LoadInst *LI = dyn_cast<LoadInst>(it))
- collectStridedAcccess(LI);
+ collectStridedAccess(LI);
// Reduction instructions are allowed to have exit users.
// All other instructions must not have external users.
@@ -3870,7 +3972,7 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
return Stride;
}
-void LoopVectorizationLegality::collectStridedAcccess(Value *MemAccess) {
+void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
Value *Ptr = nullptr;
if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
Ptr = LI->getPointerOperand();
@@ -3946,7 +4048,7 @@ public:
/// \brief Register a load and whether it is only read from.
void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) {
Value *Ptr = const_cast<Value*>(Loc.Ptr);
- AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.TBAATag);
+ AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
Accesses.insert(MemAccessInfo(Ptr, false));
if (IsReadOnly)
ReadOnlyPtr.insert(Ptr);
@@ -3955,7 +4057,7 @@ public:
/// \brief Register a store.
void addStore(AliasAnalysis::Location &Loc) {
Value *Ptr = const_cast<Value*>(Loc.Ptr);
- AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.TBAATag);
+ AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
Accesses.insert(MemAccessInfo(Ptr, true));
}
@@ -4166,57 +4268,66 @@ void AccessAnalysis::processMemAccesses() {
bool UseDeferred = SetIteration > 0;
PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
- for (auto A : AS) {
- Value *Ptr = A.getValue();
- bool IsWrite = S.count(MemAccessInfo(Ptr, true));
+ for (auto AV : AS) {
+ Value *Ptr = AV.getValue();
- // If we're using the deferred access set, then it contains only reads.
- bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
- if (UseDeferred && !IsReadOnlyPtr)
- continue;
- // Otherwise, the pointer must be in the PtrAccessSet, either as a read
- // or a write.
- assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
- S.count(MemAccessInfo(Ptr, false))) &&
- "Alias-set pointer not in the access set?");
-
- MemAccessInfo Access(Ptr, IsWrite);
- DepCands.insert(Access);
-
- // Memorize read-only pointers for later processing and skip them in the
- // first round (they need to be checked after we have seen all write
- // pointers). Note: we also mark pointer that are not consecutive as
- // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need
- // the second check for "!IsWrite".
- if (!UseDeferred && IsReadOnlyPtr) {
- DeferredAccesses.insert(Access);
- continue;
- }
+ // For a single memory access in AliasSetTracker, Accesses may contain
+ // both read and write, and they both need to be handled for CheckDeps.
+ for (auto AC : S) {
+ if (AC.getPointer() != Ptr)
+ continue;
- // If this is a write - check other reads and writes for conflicts. If
- // this is a read only check other writes for conflicts (but only if
- // there is no other write to the ptr - this is an optimization to
- // catch "a[i] = a[i] + " without having to do a dependence check).
- if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
- CheckDeps.insert(Access);
- IsRTCheckNeeded = true;
- }
+ bool IsWrite = AC.getInt();
+
+ // If we're using the deferred access set, then it contains only
+ // reads.
+ bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
+ if (UseDeferred && !IsReadOnlyPtr)
+ continue;
+ // Otherwise, the pointer must be in the PtrAccessSet, either as a
+ // read or a write.
+ assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
+ S.count(MemAccessInfo(Ptr, false))) &&
+ "Alias-set pointer not in the access set?");
+
+ MemAccessInfo Access(Ptr, IsWrite);
+ DepCands.insert(Access);
+
+ // Memorize read-only pointers for later processing and skip them in
+ // the first round (they need to be checked after we have seen all
+ // write pointers). Note: we also mark pointer that are not
+ // consecutive as "read-only" pointers (so that we check
+ // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
+ if (!UseDeferred && IsReadOnlyPtr) {
+ DeferredAccesses.insert(Access);
+ continue;
+ }
+
+ // If this is a write - check other reads and writes for conflicts. If
+ // this is a read only check other writes for conflicts (but only if
+ // there is no other write to the ptr - this is an optimization to
+ // catch "a[i] = a[i] + " without having to do a dependence check).
+ if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
+ CheckDeps.insert(Access);
+ IsRTCheckNeeded = true;
+ }
- if (IsWrite)
- SetHasWrite = true;
-
- // Create sets of pointers connected by a shared alias set and
- // underlying object.
- typedef SmallVector<Value*, 16> ValueVector;
- ValueVector TempObjects;
- GetUnderlyingObjects(Ptr, TempObjects, DL);
- for (Value *UnderlyingObj : TempObjects) {
- UnderlyingObjToAccessMap::iterator Prev =
- ObjToLastAccess.find(UnderlyingObj);
- if (Prev != ObjToLastAccess.end())
- DepCands.unionSets(Access, Prev->second);
-
- ObjToLastAccess[UnderlyingObj] = Access;
+ if (IsWrite)
+ SetHasWrite = true;
+
+ // Create sets of pointers connected by a shared alias set and
+ // underlying object.
+ typedef SmallVector<Value *, 16> ValueVector;
+ ValueVector TempObjects;
+ GetUnderlyingObjects(Ptr, TempObjects, DL);
+ for (Value *UnderlyingObj : TempObjects) {
+ UnderlyingObjToAccessMap::iterator Prev =
+ ObjToLastAccess.find(UnderlyingObj);
+ if (Prev != ObjToLastAccess.end())
+ DepCands.unionSets(Access, Prev->second);
+
+ ObjToLastAccess[UnderlyingObj] = Access;
+ }
}
}
}
@@ -4566,7 +4677,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// Bail out early if passed-in parameters make vectorization not feasible.
unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1;
- unsigned ForcedUnroll = VectorizationUnroll ? VectorizationUnroll : 1;
+ unsigned ForcedUnroll = VectorizationInterleave ? VectorizationInterleave : 1;
// The distance must be bigger than the size needed for a vectorized version
// of the operation and the size of the vectorized operation must not be
@@ -4738,7 +4849,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
// If we did *not* see this pointer before, insert it to the read-write
// list. At this phase it is only a 'write' list.
- if (Seen.insert(Ptr)) {
+ if (Seen.insert(Ptr).second) {
++NumReadWrites;
AliasAnalysis::Location Loc = AA->getLocation(ST);
@@ -4746,7 +4857,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
// condition, so we cannot rely on it when determining whether or not we
// need runtime pointer checks.
if (blockNeedsPredication(ST->getParent()))
- Loc.TBAATag = nullptr;
+ Loc.AATags.TBAA = nullptr;
Accesses.addStore(Loc);
}
@@ -4771,7 +4882,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
// read a few words, modify, and write a few words, and some of the
// words may be written to the same address.
bool IsReadOnlyPtr = false;
- if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
+ if (Seen.insert(Ptr).second ||
+ !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
++NumReads;
IsReadOnlyPtr = true;
}
@@ -4781,7 +4893,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
// condition, so we cannot rely on it when determining whether or not we
// need runtime pointer checks.
if (blockNeedsPredication(LD->getParent()))
- Loc.TBAATag = nullptr;
+ Loc.AATags.TBAA = nullptr;
Accesses.addLoad(Loc, IsReadOnlyPtr);
}
@@ -4884,7 +4996,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
}
static bool hasMultipleUsesOf(Instruction *I,
- SmallPtrSet<Instruction *, 8> &Insts) {
+ SmallPtrSetImpl<Instruction *> &Insts) {
unsigned NumUses = 0;
for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
if (Insts.count(dyn_cast<Instruction>(*Use)))
@@ -4896,7 +5008,7 @@ static bool hasMultipleUsesOf(Instruction *I,
return false;
}
-static bool areAllUsesIn(Instruction *I, SmallPtrSet<Instruction *, 8> &Set) {
+static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set) {
for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
if (!Set.count(dyn_cast<Instruction>(*Use)))
return false;
@@ -5034,7 +5146,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
// value must only be used once, except by phi nodes and min/max
// reductions which are represented as a cmp followed by a select.
ReductionInstDesc IgnoredVal(false, nullptr);
- if (VisitedInsts.insert(UI)) {
+ if (VisitedInsts.insert(UI).second) {
if (isa<PHINode>(UI))
PHIs.push_back(UI);
else
@@ -5136,7 +5248,7 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
ReductionKind Kind,
ReductionInstDesc &Prev) {
bool FP = I->getType()->isFloatingPointTy();
- bool FastMath = (FP && I->isCommutative() && I->isAssociative());
+ bool FastMath = FP && I->hasUnsafeAlgebra();
switch (I->getOpcode()) {
default:
return ReductionInstDesc(false, I);
@@ -5158,6 +5270,7 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
return ReductionInstDesc(Kind == RK_IntegerXor, I);
case Instruction::FMul:
return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
+ case Instruction::FSub:
case Instruction::FAdd:
return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
case Instruction::FCmp:
@@ -5234,13 +5347,28 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
}
bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
- SmallPtrSet<Value *, 8>& SafePtrs) {
+ SmallPtrSetImpl<Value *> &SafePtrs) {
+
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ // Check that we don't have a constant expression that can trap as operand.
+ for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
+ OI != OE; ++OI) {
+ if (Constant *C = dyn_cast<Constant>(*OI))
+ if (C->canTrap())
+ return false;
+ }
// We might be able to hoist the load.
if (it->mayReadFromMemory()) {
LoadInst *LI = dyn_cast<LoadInst>(it);
- if (!LI || !SafePtrs.count(LI->getPointerOperand()))
+ if (!LI)
+ return false;
+ if (!SafePtrs.count(LI->getPointerOperand())) {
+ if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {
+ MaskedOp.insert(LI);
+ continue;
+ }
return false;
+ }
}
// We don't predicate stores at the moment.
@@ -5248,22 +5376,30 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
StoreInst *SI = dyn_cast<StoreInst>(it);
// We only support predication of stores in basic blocks with one
// predecessor.
- if (!SI || ++NumPredStores > NumberOfStoresToPredicate ||
- !SafePtrs.count(SI->getPointerOperand()) ||
- !SI->getParent()->getSinglePredecessor())
+ if (!SI)
+ return false;
+
+ bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
+ bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
+
+ if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
+ !isSinglePredecessor) {
+ // Build a masked store if it is legal for the target, otherwise scalarize
+ // the block.
+ bool isLegalMaskedOp =
+ isLegalMaskedStore(SI->getValueOperand()->getType(),
+ SI->getPointerOperand());
+ if (isLegalMaskedOp) {
+ --NumPredStores;
+ MaskedOp.insert(SI);
+ continue;
+ }
return false;
+ }
}
if (it->mayThrow())
return false;
- // Check that we don't have a constant expression that can trap as operand.
- for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
- OI != OE; ++OI) {
- if (Constant *C = dyn_cast<Constant>(*OI))
- if (C->canTrap())
- return false;
- }
-
// The instructions below can trap.
switch (it->getOpcode()) {
default: continue;
@@ -5271,7 +5407,7 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
- return false;
+ return false;
}
}
@@ -5279,23 +5415,23 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
}
LoopVectorizationCostModel::VectorizationFactor
-LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
- unsigned UserVF,
- bool ForceVectorization) {
+LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
// Width 1 means no vectorize
VectorizationFactor Factor = { 1U, 0U };
if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
+ emitAnalysis(Report() << "runtime pointer checks needed. Enable vectorization of this loop with '#pragma clang loop vectorize(enable)' when compiling with -Os");
DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
return Factor;
}
if (!EnableCondStoresVectorization && Legal->NumPredStores) {
+ emitAnalysis(Report() << "store that is conditionally executed prevents vectorization");
DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
return Factor;
}
// Find the trip count.
- unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
+ unsigned TC = SE->getSmallConstantTripCount(TheLoop);
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
unsigned WidestType = getWidestType();
@@ -5315,7 +5451,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
MaxVectorSize = 1;
}
- assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
+ assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
" into one vector!");
unsigned VF = MaxVectorSize;
@@ -5324,6 +5460,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
if (OptForSize) {
// If we are unable to calculate the trip count then don't try to vectorize.
if (TC < 2) {
+ emitAnalysis(Report() << "unable to calculate the loop count due to complex control flow");
DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
return Factor;
}
@@ -5337,11 +5474,16 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
// If the trip count that we found modulo the vectorization factor is not
// zero then we require a tail.
if (VF < 2) {
+ emitAnalysis(Report() << "cannot optimize for size and vectorize at the "
+ "same time. Enable vectorization of this loop "
+ "with '#pragma clang loop vectorize(enable)' "
+ "when compiling with -Os");
DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
return Factor;
}
}
+ int UserVF = Hints->getWidth();
if (UserVF != 0) {
assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
@@ -5357,6 +5499,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
unsigned Width = 1;
DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
+ bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
// Ignore scalar width, because the user explicitly wants vectorization.
if (ForceVectorization && VF > 1) {
Width = 2;
@@ -5397,6 +5540,10 @@ unsigned LoopVectorizationCostModel::getWidestType() {
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
Type *T = it->getType();
+ // Ignore ephemeral values.
+ if (EphValues.count(it))
+ continue;
+
// Only examine Loads, Stores and PHINodes.
if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
continue;
@@ -5426,29 +5573,29 @@ unsigned LoopVectorizationCostModel::getWidestType() {
unsigned
LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
- unsigned UserUF,
unsigned VF,
unsigned LoopCost) {
// -- The unroll heuristics --
// We unroll the loop in order to expose ILP and reduce the loop overhead.
// There are many micro-architectural considerations that we can't predict
- // at this level. For example frontend pressure (on decode or fetch) due to
+ // at this level. For example, frontend pressure (on decode or fetch) due to
// code size, or the number and capabilities of the execution ports.
//
// We use the following heuristics to select the unroll factor:
- // 1. If the code has reductions the we unroll in order to break the cross
+ // 1. If the code has reductions, then we unroll in order to break the cross
// iteration dependency.
- // 2. If the loop is really small then we unroll in order to reduce the loop
+ // 2. If the loop is really small, then we unroll in order to reduce the loop
// overhead.
// 3. We don't unroll if we think that we will spill registers to memory due
// to the increased register pressure.
// Use the user preference, unless 'auto' is selected.
+ int UserUF = Hints->getInterleave();
if (UserUF != 0)
return UserUF;
- // When we optimize for size we don't unroll.
+ // When we optimize for size, we don't unroll.
if (OptForSize)
return 1;
@@ -5457,8 +5604,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
return 1;
// Do not unroll loops with a relatively small trip count.
- unsigned TC = SE->getSmallConstantTripCount(TheLoop,
- TheLoop->getLoopLatch());
+ unsigned TC = SE->getSmallConstantTripCount(TheLoop);
if (TC > 1 && TC < TinyTripCountUnrollThreshold)
return 1;
@@ -5497,15 +5643,15 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
std::max(1U, (R.MaxLocalUsers - 1)));
// Clamp the unroll factor ranges to reasonable factors.
- unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
+ unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor();
// Check if the user has overridden the unroll max.
if (VF == 1) {
- if (ForceTargetMaxScalarUnrollFactor.getNumOccurrences() > 0)
- MaxUnrollSize = ForceTargetMaxScalarUnrollFactor;
+ if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
+ MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor;
} else {
- if (ForceTargetMaxVectorUnrollFactor.getNumOccurrences() > 0)
- MaxUnrollSize = ForceTargetMaxVectorUnrollFactor;
+ if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
+ MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor;
}
// If we did not calculate the cost for VF (because the user selected the VF)
@@ -5515,8 +5661,8 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
// Clamp the calculated UF to be between the 1 and the max unroll factor
// that the target allows.
- if (UF > MaxUnrollSize)
- UF = MaxUnrollSize;
+ if (UF > MaxInterleaveSize)
+ UF = MaxInterleaveSize;
else if (UF < 1)
UF = 1;
@@ -5547,6 +5693,18 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1);
unsigned LoadsUF = UF / (Legal->NumLoads ? Legal->NumLoads : 1);
+ // If we have a scalar reduction (vector reductions are already dealt with
+ // by this point), we can increase the critical path length if the loop
+ // we're unrolling is inside another loop. Limit, by default to 2, so the
+ // critical path only gets increased by one reduction operation.
+ if (Legal->getReductionVars()->size() &&
+ TheLoop->getLoopDepth() > 1) {
+ unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF);
+ SmallUF = std::min(SmallUF, F);
+ StoresUF = std::min(StoresUF, F);
+ LoadsUF = std::min(LoadsUF, F);
+ }
+
if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
return std::max(StoresUF, LoadsUF);
@@ -5648,6 +5806,10 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
// Ignore instructions that are never used within the loop.
if (!Ends.count(I)) continue;
+ // Ignore ephemeral values.
+ if (EphValues.count(I))
+ continue;
+
// Remove all of the instructions that end at this location.
InstrList &List = TransposeEnds[i];
for (unsigned int j=0, e = List.size(); j < e; ++j)
@@ -5688,6 +5850,10 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
if (isa<DbgInfoIntrinsic>(it))
continue;
+ // Ignore ephemeral values.
+ if (EphValues.count(it))
+ continue;
+
unsigned C = getInstructionCost(it, VF);
// Check if we should override the cost.
@@ -5821,18 +5987,31 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueProperties Op1VP =
+ TargetTransformInfo::OP_None;
+ TargetTransformInfo::OperandValueProperties Op2VP =
+ TargetTransformInfo::OP_None;
Value *Op2 = I->getOperand(1);
// Check for a splat of a constant or for a non uniform vector of constants.
- if (isa<ConstantInt>(Op2))
+ if (isa<ConstantInt>(Op2)) {
+ ConstantInt *CInt = cast<ConstantInt>(Op2);
+ if (CInt && CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_PowerOf2;
Op2VK = TargetTransformInfo::OK_UniformConstantValue;
- else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
+ } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
- if (cast<Constant>(Op2)->getSplatValue() != nullptr)
+ Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
+ if (SplatValue) {
+ ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
+ if (CInt && CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_PowerOf2;
Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+ }
}
- return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
+ return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
+ Op1VP, Op2VP);
}
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
@@ -5975,6 +6154,7 @@ static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
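Before the SLPVectorizer.cpp hunks, an illustrative aside (not part of the commit): the loop vectorizer can now emit llvm.masked.load/llvm.masked.store for predicated accesses instead of scalarizing them, whenever LoopVectorizationLegality recorded the access in MaskedOp. The two helpers below sketch the shape of that decision inside vectorizeMemoryInstruction(); the helper and parameter names are hypothetical, while the IRBuilder calls mirror the ones in the hunks above.

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static Instruction *emitWideLoad(IRBuilder<> &Builder, Value *VecPtr,
                                 unsigned Alignment, Value *BlockMask,
                                 Type *VecDataTy, bool MaskRequired) {
  if (MaskRequired)
    // Lanes whose mask bit is false produce the undef pass-through value.
    return Builder.CreateMaskedLoad(VecPtr, Alignment, BlockMask,
                                    UndefValue::get(VecDataTy),
                                    "wide.masked.load");
  return Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
}

static Instruction *emitWideStore(IRBuilder<> &Builder, Value *StoredVal,
                                  Value *VecPtr, unsigned Alignment,
                                  Value *BlockMask, bool MaskRequired) {
  if (MaskRequired)
    // Lanes whose mask bit is false leave memory untouched.
    return Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, BlockMask);
  return Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
}

When the block needs predication but the target reports no legal masked operation for the type, the access is still scalarized with branches, exactly as before this change.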
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 53a43d9851e9..4834782ecc14 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -19,7 +19,11 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -42,12 +46,15 @@
#include "llvm/Transforms/Utils/VectorUtils.h"
#include <algorithm>
#include <map>
+#include <memory>
using namespace llvm;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
+STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
+
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
cl::desc("Only vectorize if you gain more than this "
@@ -68,53 +75,6 @@ static const unsigned MinVecRegSize = 128;
static const unsigned RecursionMaxDepth = 12;
-/// A helper class for numbering instructions in multiple blocks.
-/// Numbers start at zero for each basic block.
-struct BlockNumbering {
-
- BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {}
-
- void numberInstructions() {
- unsigned Loc = 0;
- InstrIdx.clear();
- InstrVec.clear();
- // Number the instructions in the block.
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- InstrIdx[it] = Loc++;
- InstrVec.push_back(it);
- assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
- }
- Valid = true;
- }
-
- int getIndex(Instruction *I) {
- assert(I->getParent() == BB && "Invalid instruction");
- if (!Valid)
- numberInstructions();
- assert(InstrIdx.count(I) && "Unknown instruction");
- return InstrIdx[I];
- }
-
- Instruction *getInstruction(unsigned loc) {
- if (!Valid)
- numberInstructions();
- assert(InstrVec.size() > loc && "Invalid Index");
- return InstrVec[loc];
- }
-
- void forget() { Valid = false; }
-
-private:
- /// The block we are numbering.
- BasicBlock *BB;
- /// Is the block numbered.
- bool Valid;
- /// Maps instructions to numbers and back.
- SmallDenseMap<Instruction *, int> InstrIdx;
- /// Maps integers to Instructions.
- SmallVector<Instruction *, 32> InstrVec;
-};
-
/// \returns the parent basic block if all of the instructions in \p VL
/// are in the same block or null otherwise.
static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
@@ -209,6 +169,23 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
return Opcode;
}
+/// Get the intersection (logical and) of all of the potential IR flags
+/// of each scalar operation (VL) that will be converted into a vector (I).
+/// Flag set: NSW, NUW, exact, and all of fast-math.
+static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+ if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
+ if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
+ // Intersection is initialized to the 0th scalar,
+ // so start counting from index '1'.
+ for (int i = 1, e = VL.size(); i < e; ++i) {
+ if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
+ Intersection->andIRFlags(Scalar);
+ }
+ VecOp->copyIRFlags(Intersection);
+ }
+ }
+}
+
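
The propagateIRFlags() helper above keeps a wrap/exact/fast-math flag on the vector operation only when every scalar in the bundle carried it. A standalone sketch of that intersection, using a plain bitmask instead of the LLVM API (all names here are invented for illustration):

#include <cstdio>

enum Flags : unsigned { NSW = 1u << 0, NUW = 1u << 1, Exact = 1u << 2 };

// The "logical and" described above: start from the flags of the first scalar
// and drop anything a later scalar does not also have.
static unsigned intersectFlags(const unsigned *Scalars, int N) {
  unsigned Result = Scalars[0];
  for (int i = 1; i < N; ++i)
    Result &= Scalars[i];
  return Result;
}

int main() {
  unsigned Bundle[] = { NSW | NUW, NSW };   // 'add nsw nuw' and 'add nsw'
  unsigned VecFlags = intersectFlags(Bundle, 2);
  std::printf("vector add keeps nsw: %d, nuw: %d\n",
              (VecFlags & NSW) != 0, (VecFlags & NUW) != 0);   // 1, 0
  return 0;
}
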
/// \returns \p I after propagating metadata from \p VL.
static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
Instruction *I0 = cast<Instruction>(VL[0]);
@@ -230,6 +207,10 @@ static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
case LLVMContext::MD_tbaa:
MD = MDNode::getMostGenericTBAA(MD, IMD);
break;
+ case LLVMContext::MD_alias_scope:
+ case LLVMContext::MD_noalias:
+ MD = MDNode::intersect(MD, IMD);
+ break;
case LLVMContext::MD_fpmath:
MD = MDNode::getMostGenericFPMath(MD, IMD);
break;
@@ -381,6 +362,42 @@ static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
}
}
+/// \returns True if the in-tree use also needs an extract. This refers to a
+/// possible scalar operand in a vectorized instruction.
+static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
+ TargetLibraryInfo *TLI) {
+
+ unsigned Opcode = UserInst->getOpcode();
+ switch (Opcode) {
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(UserInst);
+ return (LI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(UserInst);
+ return (SI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(UserInst);
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ if (hasVectorInstrinsicScalarOpd(ID, 1)) {
+ return (CI->getArgOperand(1) == Scalar);
+ }
+ }
+ default:
+ return false;
+ }
+}
+
+/// \returns the AA location that is being accessed by the instruction.
+static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return AA->getLocation(SI);
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return AA->getLocation(LI);
+ return AliasAnalysis::Location();
+}
+
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
@@ -391,14 +408,21 @@ public:
BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl,
TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa,
- LoopInfo *Li, DominatorTree *Dt)
- : F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
- Builder(Se->getContext()) {}
+ LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC)
+ : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
+ SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
+ Builder(Se->getContext()) {
+ CodeMetrics::collectEphemeralValues(F, AC, EphValues);
+ }
/// \brief Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
+ /// \returns the cost incurred by unwanted spills and fills, caused by
+ /// holding live values over call sites.
+ int getSpillCost();
+
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
int getTreeCost();
@@ -414,7 +438,12 @@ public:
ScalarToTreeEntry.clear();
MustGather.clear();
ExternalUses.clear();
- MemBarrierIgnoreList.clear();
+ NumLoadsWantToKeepOrder = 0;
+ NumLoadsWantToChangeOrder = 0;
+ for (auto &Iter : BlocksSchedules) {
+ BlockScheduling *BS = Iter.second.get();
+ BS->clear();
+ }
}
/// \returns true if the memory operations A and B are consecutive.
@@ -423,6 +452,11 @@ public:
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
+ /// \returns true if it is beneficial to reverse the vector order.
+ bool shouldReorder() const {
+ return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
+ }
+
private:
struct TreeEntry;
@@ -459,20 +493,6 @@ private:
/// roots. This method calculates the cost of extracting the values.
int getGatherCost(ArrayRef<Value *> VL);
- /// \returns the AA location that is being access by the instruction.
- AliasAnalysis::Location getLocation(Instruction *I);
-
- /// \brief Checks if it is possible to sink an instruction from
- /// \p Src to \p Dst.
- /// \returns the pointer to the barrier instruction if we can't sink.
- Value *getSinkBarrier(Instruction *Src, Instruction *Dst);
-
- /// \returns the index of the last instruction in the BB from \p VL.
- int getLastIndex(ArrayRef<Value *> VL);
-
- /// \returns the Instruction in the bundle \p VL.
- Instruction *getLastInstruction(ArrayRef<Value *> VL);
-
/// \brief Set the Builder insert point to one after the last instruction in
/// the bundle
void setInsertPointAfterBundle(ArrayRef<Value *> VL);
@@ -485,7 +505,7 @@ private:
bool isFullyVectorizableTinyTree();
struct TreeEntry {
- TreeEntry() : Scalars(), VectorizedValue(nullptr), LastScalarIndex(0),
+ TreeEntry() : Scalars(), VectorizedValue(nullptr),
NeedToGather(0) {}
/// \returns true if the scalars in VL are equal to this entry.
@@ -500,9 +520,6 @@ private:
/// The Scalars are vectorized into this value. It is initialized to Null.
Value *VectorizedValue;
- /// The index in the basic block of the last scalar.
- int LastScalarIndex;
-
/// Do we need to gather this sequence ?
bool NeedToGather;
};
@@ -515,18 +532,16 @@ private:
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized;
if (Vectorized) {
- Last->LastScalarIndex = getLastIndex(VL);
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
ScalarToTreeEntry[VL[i]] = idx;
}
} else {
- Last->LastScalarIndex = 0;
MustGather.insert(VL.begin(), VL.end());
}
return Last;
}
-
+
/// -- Vectorization State --
/// Holds all of the tree entries.
std::vector<TreeEntry> VectorizableTree;
@@ -550,32 +565,369 @@ private:
};
typedef SmallVector<ExternalUser, 16> UserList;
+ /// Checks if two instructions may access the same memory.
+ ///
+ /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
+ /// is invariant in the calling loop.
+ bool isAliased(const AliasAnalysis::Location &Loc1, Instruction *Inst1,
+ Instruction *Inst2) {
+
+ // First check if the result is already in the cache.
+ AliasCacheKey key = std::make_pair(Inst1, Inst2);
+ Optional<bool> &result = AliasCache[key];
+ if (result.hasValue()) {
+ return result.getValue();
+ }
+ AliasAnalysis::Location Loc2 = getLocation(Inst2, AA);
+ bool aliased = true;
+ if (Loc1.Ptr && Loc2.Ptr) {
+ // Do the alias check.
+ aliased = AA->alias(Loc1, Loc2);
+ }
+ // Store the result in the cache.
+ result = aliased;
+ return aliased;
+ }
+
+ typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
+
+ /// Cache for alias results.
+ /// TODO: consider moving this to the AliasAnalysis itself.
+ DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
+
+ /// Removes an instruction from its block and eventually deletes it.
+ /// It's like Instruction::eraseFromParent() except that the actual deletion
+ /// is delayed until BoUpSLP is destructed.
+ /// This is required to ensure that there are no incorrect collisions in the
+ /// AliasCache, which can happen if a new instruction is allocated at the
+ /// same address as a previously deleted instruction.
+ void eraseInstruction(Instruction *I) {
+ I->removeFromParent();
+ I->dropAllReferences();
+ DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
+ }
+
+ /// Temporary store for deleted instructions. Instructions will be deleted
+ /// eventually when the BoUpSLP is destructed.
+ SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
+
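
The AliasCache/eraseInstruction pair above exists because the cache is keyed by raw Instruction pointers: if a deleted instruction's memory were reused for a newly created one, a stale cache entry could be returned for the new instruction. A small plain-C++ sketch of both ideas, the memoized pairwise query and the deferred deletion (types and names are invented for illustration):

#include <cstdio>
#include <map>
#include <memory>
#include <utility>
#include <vector>

struct Instr { int Id; };                       // stand-in for llvm::Instruction

using Key = std::pair<Instr *, Instr *>;
std::map<Key, bool> AliasCache;                 // memoized pairwise answers
std::vector<std::unique_ptr<Instr>> Deleted;    // kept alive until the pass ends
int ExpensiveQueries = 0;

bool mayAlias(Instr *A, Instr *B) {
  auto It = AliasCache.find(Key(A, B));
  if (It != AliasCache.end())                   // cache hit: no analysis query
    return It->second;
  ++ExpensiveQueries;                           // stands in for the real AA query
  bool Result = (A == B);                       // toy rule, just for the sketch
  AliasCache[Key(A, B)] = Result;
  return Result;
}

void eraseInstr(Instr *I) {
  // Deleting I here could let a later allocation reuse its address and hit a
  // stale AliasCache entry; instead the object is parked until the end.
  Deleted.push_back(std::unique_ptr<Instr>(I));
}

int main() {
  Instr *A = new Instr{0}, *B = new Instr{1};
  mayAlias(A, B);
  mayAlias(A, B);                               // second call is a cache hit
  eraseInstr(B);
  std::printf("expensive queries: %d\n", ExpensiveQueries);   // 1
  delete A;                                     // A was never "erased"
  return 0;
}
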
/// A list of values that need to be extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User).
UserList ExternalUses;
- /// A list of instructions to ignore while sinking
- /// memory instructions. This map must be reset between runs of getCost.
- ValueSet MemBarrierIgnoreList;
+ /// Values used only by @llvm.assume calls.
+ SmallPtrSet<const Value *, 32> EphValues;
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> GatherSeq;
/// A list of blocks that we are going to CSE.
SetVector<BasicBlock *> CSEBlocks;
- /// Numbers instructions in different blocks.
- DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers;
+ /// Contains all scheduling relevant data for an instruction.
+ /// A ScheduleData either represents a single instruction or a member of an
+ /// instruction bundle (= a group of instructions which is combined into a
+ /// vector instruction).
+ struct ScheduleData {
+
+ // The initial value for the dependency counters. It means that the
+ // dependencies are not calculated yet.
+ enum { InvalidDeps = -1 };
+
+ ScheduleData()
+ : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
+ NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
+ Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
+ UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
+
+ void init(int BlockSchedulingRegionID) {
+ FirstInBundle = this;
+ NextInBundle = nullptr;
+ NextLoadStore = nullptr;
+ IsScheduled = false;
+ SchedulingRegionID = BlockSchedulingRegionID;
+ UnscheduledDepsInBundle = UnscheduledDeps;
+ clearDependencies();
+ }
+
+ /// Returns true if the dependency information has been calculated.
+ bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
+
+ /// Returns true for single instructions and for bundle representatives
+ /// (= the head of a bundle).
+ bool isSchedulingEntity() const { return FirstInBundle == this; }
+
+ /// Returns true if it represents an instruction bundle and not only a
+ /// single instruction.
+ bool isPartOfBundle() const {
+ return NextInBundle != nullptr || FirstInBundle != this;
+ }
+
+ /// Returns true if it is ready for scheduling, i.e. it has no more
+ /// unscheduled dependent instructions/bundles.
+ bool isReady() const {
+ assert(isSchedulingEntity() &&
+ "can't consider non-scheduling entity for ready list");
+ return UnscheduledDepsInBundle == 0 && !IsScheduled;
+ }
+
+ /// Modifies the number of unscheduled dependencies, also updating it for
+ /// the whole bundle.
+ int incrementUnscheduledDeps(int Incr) {
+ UnscheduledDeps += Incr;
+ return FirstInBundle->UnscheduledDepsInBundle += Incr;
+ }
+
+ /// Sets the number of unscheduled dependencies to the number of
+ /// dependencies.
+ void resetUnscheduledDeps() {
+ incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
+ }
+
+ /// Clears all dependency information.
+ void clearDependencies() {
+ Dependencies = InvalidDeps;
+ resetUnscheduledDeps();
+ MemoryDependencies.clear();
+ }
+
+ void dump(raw_ostream &os) const {
+ if (!isSchedulingEntity()) {
+ os << "/ " << *Inst;
+ } else if (NextInBundle) {
+ os << '[' << *Inst;
+ ScheduleData *SD = NextInBundle;
+ while (SD) {
+ os << ';' << *SD->Inst;
+ SD = SD->NextInBundle;
+ }
+ os << ']';
+ } else {
+ os << *Inst;
+ }
+ }
+
+ Instruction *Inst;
- /// \brief Get the corresponding instruction numbering list for a given
- /// BasicBlock. The list is allocated lazily.
- BlockNumbering &getBlockNumbering(BasicBlock *BB) {
- auto I = BlocksNumbers.insert(std::make_pair(BB, BlockNumbering(BB)));
- return I.first->second;
- }
+ /// Points to the head in an instruction bundle (and always to this for
+ /// single instructions).
+ ScheduleData *FirstInBundle;
+
+ /// Single linked list of all instructions in a bundle. Null if it is a
+ /// single instruction.
+ ScheduleData *NextInBundle;
+
+ /// Single linked list of all memory instructions (e.g. load, store, call)
+ /// in the block - until the end of the scheduling region.
+ ScheduleData *NextLoadStore;
+
+ /// The dependent memory instructions.
+ /// This list is derived on demand in calculateDependencies().
+ SmallVector<ScheduleData *, 4> MemoryDependencies;
+
+ /// This ScheduleData is in the current scheduling region if this matches
+ /// the current SchedulingRegionID of BlockScheduling.
+ int SchedulingRegionID;
+
+ /// Used for getting a "good" final ordering of instructions.
+ int SchedulingPriority;
+
+ /// The number of dependencies. Consists of the number of users of the
+ /// instruction plus the number of dependent memory instructions (if any).
+ /// This value is calculated on demand.
+ /// If InvalidDeps, the number of dependencies is not calculated yet.
+ ///
+ int Dependencies;
+
+ /// The number of dependencies minus the number of already scheduled
+ /// dependencies. As soon as this is zero, the instruction/bundle gets ready
+ /// for scheduling.
+ /// Note that this is negative as long as Dependencies is not calculated.
+ int UnscheduledDeps;
+
+ /// The sum of UnscheduledDeps in a bundle. Equals UnscheduledDeps for
+ /// single instructions.
+ int UnscheduledDepsInBundle;
+
+ /// True if this instruction is scheduled (or considered as scheduled in the
+ /// dry-run).
+ bool IsScheduled;
+ };
+
+#ifndef NDEBUG
+ friend raw_ostream &operator<<(raw_ostream &os,
+ const BoUpSLP::ScheduleData &SD);
+#endif
+
+ /// Contains all scheduling data for a basic block.
+ ///
+ struct BlockScheduling {
+
+ BlockScheduling(BasicBlock *BB)
+ : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
+ ScheduleStart(nullptr), ScheduleEnd(nullptr),
+ FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
+ // Make sure that the initial SchedulingRegionID is greater than the
+ // initial SchedulingRegionID in ScheduleData (which is 0).
+ SchedulingRegionID(1) {}
+
+ void clear() {
+ ReadyInsts.clear();
+ ScheduleStart = nullptr;
+ ScheduleEnd = nullptr;
+ FirstLoadStoreInRegion = nullptr;
+ LastLoadStoreInRegion = nullptr;
+
+ // Make a new scheduling region, i.e. all existing ScheduleData is not
+ // in the new region yet.
+ ++SchedulingRegionID;
+ }
+
+ ScheduleData *getScheduleData(Value *V) {
+ ScheduleData *SD = ScheduleDataMap[V];
+ if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+ return SD;
+ return nullptr;
+ }
+
+ bool isInSchedulingRegion(ScheduleData *SD) {
+ return SD->SchedulingRegionID == SchedulingRegionID;
+ }
+
+ /// Marks an instruction as scheduled and puts all dependent ready
+ /// instructions into the ready-list.
+ template <typename ReadyListType>
+ void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
+ SD->IsScheduled = true;
+ DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
+
+ ScheduleData *BundleMember = SD;
+ while (BundleMember) {
+ // Handle the def-use chain dependencies.
+ for (Use &U : BundleMember->Inst->operands()) {
+ ScheduleData *OpDef = getScheduleData(U.get());
+ if (OpDef && OpDef->hasValidDependencies() &&
+ OpDef->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after decrementing,
+ // so we can put the dependent instruction into the ready list.
+ ScheduleData *DepBundle = OpDef->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
+ }
+ }
+ // Handle the memory dependencies.
+ for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
+ if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after decrementing,
+ // so we can put the dependent instruction into the ready list.
+ ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
+ }
+ }
+ BundleMember = BundleMember->NextInBundle;
+ }
+ }
+
+ /// Put all instructions that are ready for scheduling into the ReadyList.
+ template <typename ReadyListType>
+ void initialFillReadyList(ReadyListType &ReadyList) {
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ ScheduleData *SD = getScheduleData(I);
+ if (SD->isSchedulingEntity() && SD->isReady()) {
+ ReadyList.insert(SD);
+ DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
+ }
+ }
+ }
+
+ /// Checks if a bundle of instructions can be scheduled, i.e. has no
+ /// cyclic dependencies. This is only a dry-run, no instructions are
+ /// actually moved at this stage.
+ bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
+
+ /// Un-bundles a group of instructions.
+ void cancelScheduling(ArrayRef<Value *> VL);
+
+ /// Extends the scheduling region so that V is inside the region.
+ void extendSchedulingRegion(Value *V);
+
+ /// Initialize the ScheduleData structures for new instructions in the
+ /// scheduling region.
+ void initScheduleData(Instruction *FromI, Instruction *ToI,
+ ScheduleData *PrevLoadStore,
+ ScheduleData *NextLoadStore);
+
+ /// Updates the dependency information of a bundle and of all instructions/
+ /// bundles which depend on the original bundle.
+ void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
+ BoUpSLP *SLP);
+
+ /// Sets all instructions in the scheduling region to un-scheduled.
+ void resetSchedule();
+
+ BasicBlock *BB;
+
+ /// Simple memory allocation for ScheduleData.
+ std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
+
+ /// The size of a ScheduleData array in ScheduleDataChunks.
+ int ChunkSize;
+
+ /// The allocator position in the current chunk, which is the last entry
+ /// of ScheduleDataChunks.
+ int ChunkPos;
+
+ /// Attaches ScheduleData to Instruction.
+ /// Note that the mapping survives during all vectorization iterations, i.e.
+ /// ScheduleData structures are recycled.
+ DenseMap<Value *, ScheduleData *> ScheduleDataMap;
+
+ struct ReadyList : SmallVector<ScheduleData *, 8> {
+ void insert(ScheduleData *SD) { push_back(SD); }
+ };
+
+ /// The ready-list for scheduling (only used for the dry-run).
+ ReadyList ReadyInsts;
+
+ /// The first instruction of the scheduling region.
+ Instruction *ScheduleStart;
+
+ /// The first instruction _after_ the scheduling region.
+ Instruction *ScheduleEnd;
+
+ /// The first memory accessing instruction in the scheduling region
+ /// (can be null).
+ ScheduleData *FirstLoadStoreInRegion;
+
+ /// The last memory accessing instruction in the scheduling region
+ /// (can be null).
+ ScheduleData *LastLoadStoreInRegion;
+
+ /// The ID of the scheduling region. For a new vectorization iteration this
+ /// is incremented which "removes" all ScheduleData from the region.
+ int SchedulingRegionID;
+ };
+
+ /// Attaches the BlockScheduling structures to basic blocks.
+ MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+
+ /// Performs the "real" scheduling. Done before vectorization is actually
+ /// performed in a basic block.
+ void scheduleBlock(BlockScheduling *BS);
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
+ // Number of load-bundles, which contain consecutive loads.
+ int NumLoadsWantToKeepOrder;
+
+ // Number of load-bundles of size 2, which are consecutive loads if reversed.
+ int NumLoadsWantToChangeOrder;
+
// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
@@ -589,6 +941,13 @@ private:
IRBuilder<> Builder;
};
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
+ SD.dump(os);
+ return os;
+}
+#endif
+
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) {
deleteTree();
@@ -612,18 +971,27 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
- // Skip in-tree scalars that become vectors.
- if (ScalarToTreeEntry.count(U)) {
- DEBUG(dbgs() << "SLP: \tInternal user will be removed:" <<
- *U << ".\n");
- int Idx = ScalarToTreeEntry[U]; (void) Idx;
- assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
- continue;
- }
Instruction *UserInst = dyn_cast<Instruction>(U);
if (!UserInst)
continue;
+ // Skip in-tree scalars that become vectors
+ if (ScalarToTreeEntry.count(U)) {
+ int Idx = ScalarToTreeEntry[U];
+ TreeEntry *UseEntry = &VectorizableTree[Idx];
+ Value *UseScalar = UseEntry->Scalars[0];
+ // Some in-tree scalars will remain as scalar in vectorized
+ // instructions. If that is the case, the one in Lane 0 will
+ // be used.
+ if (UseScalar != U ||
+ !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
+ DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+ << ".\n");
+ assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
+ continue;
+ }
+ }
+
// Ignore users in the user ignore list.
if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
UserIgnoreList.end())
@@ -683,6 +1051,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// We now know that this is a vector of instructions of the same type from
// the same block.
+ // Don't vectorize ephemeral values.
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ if (EphValues.count(VL[i])) {
+ DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
+ ") is ephemeral.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
// Check if this is a duplicate of another entry.
if (ScalarToTreeEntry.count(VL[0])) {
int Idx = ScalarToTreeEntry[VL[0]];
@@ -709,11 +1087,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
}
- // If any of the scalars appears in the table OR it is marked as a value that
- // needs to stat scalar then we need to gather the scalars.
+ // If any of the scalars is marked as a value that needs to stay scalar then
+ // we need to gather the scalars.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) {
- DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n");
+ if (MustGather.count(VL[i])) {
+ DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
newTreeEntry(VL, false);
return;
}
@@ -722,69 +1100,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// Check that all of the users of the scalars that we want to vectorize are
// schedulable.
Instruction *VL0 = cast<Instruction>(VL[0]);
- int MyLastIndex = getLastIndex(VL);
BasicBlock *BB = cast<Instruction>(VL0)->getParent();
- for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- Instruction *Scalar = cast<Instruction>(VL[i]);
- DEBUG(dbgs() << "SLP: Checking users of " << *Scalar << ". \n");
- for (User *U : Scalar->users()) {
- DEBUG(dbgs() << "SLP: \tUser " << *U << ". \n");
- Instruction *UI = dyn_cast<Instruction>(U);
- if (!UI) {
- DEBUG(dbgs() << "SLP: Gathering due unknown user. \n");
- newTreeEntry(VL, false);
- return;
- }
-
- // We don't care if the user is in a different basic block.
- BasicBlock *UserBlock = UI->getParent();
- if (UserBlock != BB) {
- DEBUG(dbgs() << "SLP: User from a different basic block "
- << *UI << ". \n");
- continue;
- }
-
- // If this is a PHINode within this basic block then we can place the
- // extract wherever we want.
- if (isa<PHINode>(*UI)) {
- DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *UI << ". \n");
- continue;
- }
-
- // Check if this is a safe in-tree user.
- if (ScalarToTreeEntry.count(UI)) {
- int Idx = ScalarToTreeEntry[UI];
- int VecLocation = VectorizableTree[Idx].LastScalarIndex;
- if (VecLocation <= MyLastIndex) {
- DEBUG(dbgs() << "SLP: Gathering due to unschedulable vector. \n");
- newTreeEntry(VL, false);
- return;
- }
- DEBUG(dbgs() << "SLP: In-tree user (" << *UI << ") at #" <<
- VecLocation << " vector value (" << *Scalar << ") at #"
- << MyLastIndex << ".\n");
- continue;
- }
-
- // Ignore users in the user ignore list.
- if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UI) !=
- UserIgnoreList.end())
- continue;
-
- // Make sure that we can schedule this unknown user.
- BlockNumbering &BN = getBlockNumbering(BB);
- int UserIndex = BN.getIndex(UI);
- if (UserIndex < MyLastIndex) {
-
- DEBUG(dbgs() << "SLP: Can't schedule extractelement for "
- << *UI << ". \n");
- newTreeEntry(VL, false);
- return;
- }
- }
+ if (!DT->isReachableFromEntry(BB)) {
+ // Don't go into unreachable blocks. They may contain instructions with
+ // dependency cycles which confuse the final scheduling.
+ DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
+ newTreeEntry(VL, false);
+ return;
}
-
+
// Check that every instruction appears once in this bundle.
for (unsigned i = 0, e = VL.size(); i < e; ++i)
for (unsigned j = i+1; j < e; ++j)
@@ -794,38 +1119,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
return;
}
- // Check that instructions in this bundle don't reference other instructions.
- // The runtime of this check is O(N * N-1 * uses(N)) and a typical N is 4.
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- for (User *U : VL[i]->users()) {
- for (unsigned j = 0; j < e; ++j) {
- if (i != j && U == VL[j]) {
- DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << *U << ". \n");
- newTreeEntry(VL, false);
- return;
- }
- }
- }
+ auto &BSRef = BlocksSchedules[BB];
+ if (!BSRef) {
+ BSRef = llvm::make_unique<BlockScheduling>(BB);
}
+ BlockScheduling &BS = *BSRef.get();
- DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
-
- // Check if it is safe to sink the loads or the stores.
- if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
- Instruction *Last = getLastInstruction(VL);
-
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- if (VL[i] == Last)
- continue;
- Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last);
- if (Barrier) {
- DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last
- << "\n because of " << *Barrier << ". Gathering.\n");
- newTreeEntry(VL, false);
- return;
- }
- }
+ if (!BS.tryScheduleBundle(VL, this)) {
+ DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ return;
}
+ DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
switch (Opcode) {
case Instruction::PHI: {
@@ -838,6 +1144,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
if (Term) {
DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
+ BS.cancelScheduling(VL);
newTreeEntry(VL, false);
return;
}
@@ -861,6 +1168,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
bool Reuse = CanReuseExtract(VL);
if (Reuse) {
DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
+ } else {
+ BS.cancelScheduling(VL);
}
newTreeEntry(VL, Reuse);
return;
@@ -869,12 +1178,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// Check if the loads are consecutive or if we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
LoadInst *L = cast<LoadInst>(VL[i]);
- if (!L->isSimple() || !isConsecutiveAccess(VL[i], VL[i + 1])) {
+ if (!L->isSimple()) {
+ BS.cancelScheduling(VL);
newTreeEntry(VL, false);
- DEBUG(dbgs() << "SLP: Need to swizzle loads.\n");
+ DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+ return;
+ }
+ if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+ if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) {
+ ++NumLoadsWantToChangeOrder;
+ }
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
return;
}
}
+ ++NumLoadsWantToKeepOrder;
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
return;
@@ -895,6 +1215,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0; i < VL.size(); ++i) {
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) {
+ BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
return;
@@ -922,6 +1243,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
CmpInst *Cmp = cast<CmpInst>(VL[i]);
if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) {
+ BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
return;
@@ -968,20 +1290,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right);
- BasicBlock *LeftBB = getSameBlock(Left);
- BasicBlock *RightBB = getSameBlock(Right);
- // If we have common uses on separate paths in the tree make sure we
- // process the one with greater common depth first.
- // We can use block numbering to determine the subtree traversal as
- // earler user has to come in between the common use and the later user.
- if (LeftBB && RightBB && LeftBB == RightBB &&
- getLastIndex(Right) > getLastIndex(Left)) {
- buildTree_rec(Right, Depth + 1);
- buildTree_rec(Left, Depth + 1);
- } else {
- buildTree_rec(Left, Depth + 1);
- buildTree_rec(Right, Depth + 1);
- }
+ buildTree_rec(Left, Depth + 1);
+ buildTree_rec(Right, Depth + 1);
return;
}
@@ -1000,6 +1310,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned j = 0; j < VL.size(); ++j) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ BS.cancelScheduling(VL);
newTreeEntry(VL, false);
return;
}
@@ -1012,6 +1323,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
if (Ty0 != CurTy) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+ BS.cancelScheduling(VL);
newTreeEntry(VL, false);
return;
}
@@ -1023,6 +1335,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (!isa<ConstantInt>(Op)) {
DEBUG(
dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ BS.cancelScheduling(VL);
newTreeEntry(VL, false);
return;
}
@@ -1044,6 +1357,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// Check if the stores are consecutive or if we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+ BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
@@ -1056,8 +1370,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned j = 0; j < VL.size(); ++j)
Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
- // We can ignore these values because we are sinking them down.
- MemBarrierIgnoreList.insert(VL.begin(), VL.end());
buildTree_rec(Operands, Depth + 1);
return;
}
@@ -1068,6 +1380,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// represented by an intrinsic call
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
+ BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
@@ -1080,6 +1393,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
if (!CI2 || CI2->getCalledFunction() != Int ||
getIntrinsicIDForCall(CI2, TLI) != ID) {
+ BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
<< "\n");
@@ -1090,6 +1404,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (hasVectorInstrinsicScalarOpd(ID, 1)) {
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
+ BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument "<< A1I<<"!=" << A1J
@@ -1115,6 +1430,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// If this is not an alternate sequence of opcode like add-sub
// then do not vectorize this instruction.
if (!isAltShuffle) {
+ BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
@@ -1132,6 +1448,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
return;
}
default:
+ BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
@@ -1234,6 +1551,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue;
+ TargetTransformInfo::OperandValueProperties Op1VP =
+ TargetTransformInfo::OP_None;
+ TargetTransformInfo::OperandValueProperties Op2VP =
+ TargetTransformInfo::OP_None;
// If all operands are exactly the same ConstantInt then set the
// operand kind to OK_UniformConstantValue.
@@ -1255,11 +1576,17 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
CInt != cast<ConstantInt>(I->getOperand(1)))
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
}
+ // FIXME: Currently the cost model modification for division by a
+ // power of 2 is handled only for X86. Add support for other targets.
+ if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
+ CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_PowerOf2;
- ScalarCost =
- VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK);
- VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK);
+ ScalarCost = VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
+ Op1VP, Op2VP);
+ VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
+ Op1VP, Op2VP);
}
return VecCost - ScalarCost;
}
@@ -1364,6 +1691,68 @@ bool BoUpSLP::isFullyVectorizableTinyTree() {
return true;
}
+int BoUpSLP::getSpillCost() {
+ // Walk from the bottom of the tree to the top, tracking which values are
+ // live. When we see a call instruction that is not part of our tree,
+ // query TTI to see if there is a cost to keeping values live over it
+ // (for example, if spills and fills are required).
+ unsigned BundleWidth = VectorizableTree.front().Scalars.size();
+ int Cost = 0;
+
+ SmallPtrSet<Instruction*, 4> LiveValues;
+ Instruction *PrevInst = nullptr;
+
+ for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
+ Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
+ if (!Inst)
+ continue;
+
+ if (!PrevInst) {
+ PrevInst = Inst;
+ continue;
+ }
+
+ DEBUG(
+ dbgs() << "SLP: #LV: " << LiveValues.size();
+ for (auto *X : LiveValues)
+ dbgs() << " " << X->getName();
+ dbgs() << ", Looking at ";
+ Inst->dump();
+ );
+
+ // Update LiveValues.
+ LiveValues.erase(PrevInst);
+ for (auto &J : PrevInst->operands()) {
+ if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
+ LiveValues.insert(cast<Instruction>(&*J));
+ }
+
+ // Now find the sequence of instructions between PrevInst and Inst.
+ BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
+ --PrevInstIt;
+ while (InstIt != PrevInstIt) {
+ if (PrevInstIt == PrevInst->getParent()->rend()) {
+ PrevInstIt = Inst->getParent()->rbegin();
+ continue;
+ }
+
+ if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
+ SmallVector<Type*, 4> V;
+ for (auto *II : LiveValues)
+ V.push_back(VectorType::get(II->getType(), BundleWidth));
+ Cost += TTI->getCostOfKeepingLiveOverCall(V);
+ }
+
+ ++PrevInstIt;
+ }
+
+ PrevInst = Inst;
+ }
+
+ DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
+ return Cost;
+}
+
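
The getSpillCost() walk above goes through the scalar roots bottom-up, maintains the set of tree values that are still live, and charges a target-specific cost for every call those values must stay live across. A much-simplified standalone model of that walk (plain C++; the per-value cost constant and the toy block are invented):

#include <cstdio>
#include <set>
#include <string>
#include <vector>

struct Inst {
  std::string Name;
  bool IsCall;
  std::vector<std::string> Ops;   // operands that must be live at this point
};

int main() {
  // %a = load; %b = load; call @f(); %c = add %a, %b
  std::vector<Inst> Block = {
      {"a", false, {}}, {"b", false, {}}, {"f", true, {}}, {"c", false, {"a", "b"}}};

  std::set<std::string> Live;
  const int CostPerLiveVector = 1;               // stand-in for the TTI query
  int Cost = 0;

  for (auto I = Block.rbegin(); I != Block.rend(); ++I) {   // bottom-up walk
    Live.erase(I->Name);                         // a value dies at its definition
    for (const std::string &Op : I->Ops)
      Live.insert(Op);                           // its operands become live
    if (I->IsCall)
      Cost += CostPerLiveVector * static_cast<int>(Live.size());
  }
  std::printf("spill cost = %d\n", Cost);        // 2: %a and %b are live over @f
  return 0;
}
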
int BoUpSLP::getTreeCost() {
int Cost = 0;
DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
@@ -1391,7 +1780,13 @@ int BoUpSLP::getTreeCost() {
for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
I != E; ++I) {
// We only add extract cost once for the same scalar.
- if (!ExtractCostCalculated.insert(I->Scalar))
+ if (!ExtractCostCalculated.insert(I->Scalar).second)
+ continue;
+
+ // Uses by ephemeral values are free (because the ephemeral value will be
+ // removed prior to code generation, and so the extraction will be
+ // removed as well).
+ if (EphValues.count(I->User))
continue;
VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
@@ -1399,6 +1794,8 @@ int BoUpSLP::getTreeCost() {
I->Lane);
}
+ Cost += getSpillCost();
+
DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
return Cost + ExtractCost;
}
@@ -1420,14 +1817,6 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
return getGatherCost(VecTy);
}
-AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return AA->getLocation(SI);
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return AA->getLocation(LI);
- return AliasAnalysis::Location();
-}
-
Value *BoUpSLP::getPointerOperand(Value *I) {
if (LoadInst *LI = dyn_cast<LoadInst>(I))
return LI->getPointerOperand();
@@ -1485,59 +1874,9 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
return X == PtrSCEVB;
}
-Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) {
- assert(Src->getParent() == Dst->getParent() && "Not the same BB");
- BasicBlock::iterator I = Src, E = Dst;
- /// Scan all of the instruction from SRC to DST and check if
- /// the source may alias.
- for (++I; I != E; ++I) {
- // Ignore store instructions that are marked as 'ignore'.
- if (MemBarrierIgnoreList.count(I))
- continue;
- if (Src->mayWriteToMemory()) /* Write */ {
- if (!I->mayReadOrWriteMemory())
- continue;
- } else /* Read */ {
- if (!I->mayWriteToMemory())
- continue;
- }
- AliasAnalysis::Location A = getLocation(&*I);
- AliasAnalysis::Location B = getLocation(Src);
-
- if (!A.Ptr || !B.Ptr || AA->alias(A, B))
- return I;
- }
- return nullptr;
-}
-
-int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) {
- BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
- assert(BB == getSameBlock(VL) && "Invalid block");
- BlockNumbering &BN = getBlockNumbering(BB);
-
- int MaxIdx = BN.getIndex(BB->getFirstNonPHI());
- for (unsigned i = 0, e = VL.size(); i < e; ++i)
- MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
- return MaxIdx;
-}
-
-Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) {
- BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
- assert(BB == getSameBlock(VL) && "Invalid block");
- BlockNumbering &BN = getBlockNumbering(BB);
-
- int MaxIdx = BN.getIndex(cast<Instruction>(VL[0]));
- for (unsigned i = 1, e = VL.size(); i < e; ++i)
- MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
- Instruction *I = BN.getInstruction(MaxIdx);
- assert(I && "bad location");
- return I;
-}
-
void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
Instruction *VL0 = cast<Instruction>(VL[0]);
- Instruction *LastInst = getLastInstruction(VL);
- BasicBlock::iterator NextInst = LastInst;
+ BasicBlock::iterator NextInst = VL0;
++NextInst;
Builder.SetInsertPoint(VL0->getParent(), NextInst);
Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
@@ -1620,6 +1959,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E->Scalars);
return Gather(E->Scalars, VecTy);
}
+
unsigned Opcode = getSameOpcode(E->Scalars);
switch (Opcode) {
@@ -1638,7 +1978,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ValueList Operands;
BasicBlock *IBB = PH->getIncomingBlock(i);
- if (!VisitedBBs.insert(IBB)) {
+ if (!VisitedBBs.insert(IBB).second) {
NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
continue;
}
@@ -1693,6 +2033,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
CastInst *CI = dyn_cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
E->VectorizedValue = V;
+ ++NumVectorInstructions;
return V;
}
case Instruction::FCmp:
@@ -1719,6 +2060,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
V = Builder.CreateICmp(P0, L, R);
E->VectorizedValue = V;
+ ++NumVectorInstructions;
return V;
}
case Instruction::Select: {
@@ -1740,6 +2082,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *V = Builder.CreateSelect(Cond, True, False);
E->VectorizedValue = V;
+ ++NumVectorInstructions;
return V;
}
case Instruction::Add:
@@ -1784,6 +2127,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
E->VectorizedValue = V;
+ propagateIRFlags(E->VectorizedValue, E->Scalars);
+ ++NumVectorInstructions;
if (Instruction *I = dyn_cast<Instruction>(V))
return propagateMetadata(I, E->Scalars);
@@ -1796,16 +2141,26 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E->Scalars);
LoadInst *LI = cast<LoadInst>(VL0);
+ Type *ScalarLoadTy = LI->getType();
unsigned AS = LI->getPointerAddressSpace();
Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
VecTy->getPointerTo(AS));
+
+ // The pointer operand uses an in-tree scalar so we add the new BitCast to
+ // ExternalUses list to make sure that an extract will be generated in the
+ // future.
+ if (ScalarToTreeEntry.count(LI->getPointerOperand()))
+ ExternalUses.push_back(
+ ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
+
unsigned Alignment = LI->getAlignment();
LI = Builder.CreateLoad(VecPtr);
if (!Alignment)
- Alignment = DL->getABITypeAlignment(LI->getPointerOperand()->getType());
+ Alignment = DL->getABITypeAlignment(ScalarLoadTy);
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
+ ++NumVectorInstructions;
return propagateMetadata(LI, E->Scalars);
}
case Instruction::Store: {
@@ -1823,10 +2178,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
VecTy->getPointerTo(AS));
StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+
+ // The pointer operand uses an in-tree scalar so we add the new BitCast to
+ // ExternalUses list to make sure that an extract will be generated in the
+ // future.
+ if (ScalarToTreeEntry.count(SI->getPointerOperand()))
+ ExternalUses.push_back(
+ ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
+
if (!Alignment)
- Alignment = DL->getABITypeAlignment(SI->getPointerOperand()->getType());
+ Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
S->setAlignment(Alignment);
E->VectorizedValue = S;
+ ++NumVectorInstructions;
return propagateMetadata(S, E->Scalars);
}
case Instruction::GetElementPtr: {
@@ -1851,6 +2215,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *V = Builder.CreateGEP(Op0, OpVecs);
E->VectorizedValue = V;
+ ++NumVectorInstructions;
if (Instruction *I = dyn_cast<Instruction>(V))
return propagateMetadata(I, E->Scalars);
@@ -1862,6 +2227,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E->Scalars);
Function *FI;
Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ Value *ScalarArg = nullptr;
if (CI && (FI = CI->getCalledFunction())) {
IID = (Intrinsic::ID) FI->getIntrinsicID();
}
@@ -1872,6 +2238,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// a scalar. This argument should not be vectorized.
if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
CallInst *CEI = cast<CallInst>(E->Scalars[0]);
+ ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
continue;
}
@@ -1890,7 +2257,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
Value *V = Builder.CreateCall(CF, OpVecs);
+
+ // The scalar argument uses an in-tree scalar so we add the new vectorized
+ // call to ExternalUses list to make sure that an extract will be
+ // generated in the future.
+ if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
+ ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+
E->VectorizedValue = V;
+ ++NumVectorInstructions;
return V;
}
case Instruction::ShuffleVector: {
@@ -1916,21 +2291,29 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
- // Create appropriate shuffle to take alternative operations from
- // the vector.
- std::vector<Constant *> Mask(E->Scalars.size());
+ // Create shuffle to take alternate operations from the vector.
+ // Also, gather up odd and even scalar ops to propagate IR flags to
+ // each vector operation.
+ ValueList OddScalars, EvenScalars;
unsigned e = E->Scalars.size();
+ SmallVector<Constant *, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
- if (i & 1)
+ if (i & 1) {
Mask[i] = Builder.getInt32(e + i);
- else
+ OddScalars.push_back(E->Scalars[i]);
+ } else {
Mask[i] = Builder.getInt32(i);
+ EvenScalars.push_back(E->Scalars[i]);
+ }
}
Value *ShuffleMask = ConstantVector::get(Mask);
+ propagateIRFlags(V0, EvenScalars);
+ propagateIRFlags(V1, OddScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
E->VectorizedValue = V;
+ ++NumVectorInstructions;
if (Instruction *I = dyn_cast<Instruction>(V))
return propagateMetadata(I, E->Scalars);
@@ -1943,6 +2326,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *BoUpSLP::vectorizeTree() {
+
+ // All blocks must be scheduled before any instructions are inserted.
+ for (auto &BSIter : BlocksSchedules) {
+ scheduleBlock(BSIter.second.get());
+ }
+
Builder.SetInsertPoint(F->getEntryBlock().begin());
vectorizeTree(&VectorizableTree[0]);
@@ -2027,13 +2416,10 @@ Value *BoUpSLP::vectorizeTree() {
Scalar->replaceAllUsesWith(Undef);
}
DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
- cast<Instruction>(Scalar)->eraseFromParent();
+ eraseInstruction(cast<Instruction>(Scalar));
}
}
- for (auto &BN : BlocksNumbers)
- BN.second.forget();
-
Builder.ClearInsertionPoint();
return VectorizableTree[0].VectorizedValue;
@@ -2112,7 +2498,7 @@ void BoUpSLP::optimizeGatherSequence() {
if (In->isIdenticalTo(*v) &&
DT->dominates((*v)->getParent(), In->getParent())) {
In->replaceAllUsesWith(*v);
- In->eraseFromParent();
+ eraseInstruction(In);
In = nullptr;
break;
}
@@ -2127,6 +2513,354 @@ void BoUpSLP::optimizeGatherSequence() {
GatherSeq.clear();
}
+// Groups the instructions into a bundle (which is then a single scheduling entity)
+// and schedules instructions until the bundle gets ready.
+bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
+ BoUpSLP *SLP) {
+ if (isa<PHINode>(VL[0]))
+ return true;
+
+ // Initialize the instruction bundle.
+ Instruction *OldScheduleEnd = ScheduleEnd;
+ ScheduleData *PrevInBundle = nullptr;
+ ScheduleData *Bundle = nullptr;
+ bool ReSchedule = false;
+ DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
+ for (Value *V : VL) {
+ extendSchedulingRegion(V);
+ ScheduleData *BundleMember = getScheduleData(V);
+ assert(BundleMember &&
+ "no ScheduleData for bundle member (maybe not in same basic block)");
+ if (BundleMember->IsScheduled) {
+ // A bundle member was scheduled as single instruction before and now
+ // needs to be scheduled as part of the bundle. We just get rid of the
+ // existing schedule.
+ DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
+ << " was already scheduled\n");
+ ReSchedule = true;
+ }
+ assert(BundleMember->isSchedulingEntity() &&
+ "bundle member already part of other bundle");
+ if (PrevInBundle) {
+ PrevInBundle->NextInBundle = BundleMember;
+ } else {
+ Bundle = BundleMember;
+ }
+ BundleMember->UnscheduledDepsInBundle = 0;
+ Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
+
+ // Group the instructions to a bundle.
+ BundleMember->FirstInBundle = Bundle;
+ PrevInBundle = BundleMember;
+ }
+ if (ScheduleEnd != OldScheduleEnd) {
+ // The scheduling region got new instructions at the lower end (or it is a
+ // new region for the first bundle). This makes it necessary to
+ // recalculate all dependencies.
+ // It is seldom that this needs to be done a second time after adding the
+ // initial bundle to the region.
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ ScheduleData *SD = getScheduleData(I);
+ SD->clearDependencies();
+ }
+ ReSchedule = true;
+ }
+ if (ReSchedule) {
+ resetSchedule();
+ initialFillReadyList(ReadyInsts);
+ }
+
+ DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
+ << BB->getName() << "\n");
+
+ calculateDependencies(Bundle, true, SLP);
+
+ // Now try to schedule the new bundle. As soon as the bundle is "ready" it
+ // means that there are no cyclic dependencies and we can schedule it.
+ // Note that it's important that we don't "schedule" the bundle yet (see
+ // cancelScheduling).
+ while (!Bundle->isReady() && !ReadyInsts.empty()) {
+
+ ScheduleData *pickedSD = ReadyInsts.back();
+ ReadyInsts.pop_back();
+
+ if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
+ schedule(pickedSD, ReadyInsts);
+ }
+ }
+ return Bundle->isReady();
+}
+
+void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
+ if (isa<PHINode>(VL[0]))
+ return;
+
+ ScheduleData *Bundle = getScheduleData(VL[0]);
+ DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
+ assert(!Bundle->IsScheduled &&
+ "Can't cancel bundle which is already scheduled");
+ assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
+ "tried to unbundle something which is not a bundle");
+
+ // Un-bundle: make single instructions out of the bundle.
+ ScheduleData *BundleMember = Bundle;
+ while (BundleMember) {
+ assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
+ BundleMember->FirstInBundle = BundleMember;
+ ScheduleData *Next = BundleMember->NextInBundle;
+ BundleMember->NextInBundle = nullptr;
+ BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
+ if (BundleMember->UnscheduledDepsInBundle == 0) {
+ ReadyInsts.insert(BundleMember);
+ }
+ BundleMember = Next;
+ }
+}
+
+void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
+ if (getScheduleData(V))
+ return;
+ Instruction *I = dyn_cast<Instruction>(V);
+ assert(I && "bundle member must be an instruction");
+ assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
+ if (!ScheduleStart) {
+ // It's the first instruction in the new region.
+ initScheduleData(I, I->getNextNode(), nullptr, nullptr);
+ ScheduleStart = I;
+ ScheduleEnd = I->getNextNode();
+ assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
+ DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
+ return;
+ }
+ // Search up and down at the same time, because we don't know if the new
+ // instruction is above or below the existing scheduling region.
+ BasicBlock::reverse_iterator UpIter(ScheduleStart);
+ BasicBlock::reverse_iterator UpperEnd = BB->rend();
+ BasicBlock::iterator DownIter(ScheduleEnd);
+ BasicBlock::iterator LowerEnd = BB->end();
+ for (;;) {
+ if (UpIter != UpperEnd) {
+ if (&*UpIter == I) {
+ initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
+ ScheduleStart = I;
+ DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
+ return;
+ }
+ UpIter++;
+ }
+ if (DownIter != LowerEnd) {
+ if (&*DownIter == I) {
+ initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
+ nullptr);
+ ScheduleEnd = I->getNextNode();
+ assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
+ DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
+ return;
+ }
+ DownIter++;
+ }
+ assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
+ "instruction not found in block");
+ }
+}
+
+void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
+ Instruction *ToI,
+ ScheduleData *PrevLoadStore,
+ ScheduleData *NextLoadStore) {
+ ScheduleData *CurrentLoadStore = PrevLoadStore;
+ for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
+ ScheduleData *SD = ScheduleDataMap[I];
+ if (!SD) {
+ // Allocate a new ScheduleData for the instruction.
+ if (ChunkPos >= ChunkSize) {
+ ScheduleDataChunks.push_back(
+ llvm::make_unique<ScheduleData[]>(ChunkSize));
+ ChunkPos = 0;
+ }
+ SD = &(ScheduleDataChunks.back()[ChunkPos++]);
+ ScheduleDataMap[I] = SD;
+ SD->Inst = I;
+ }
+ assert(!isInSchedulingRegion(SD) &&
+ "new ScheduleData already in scheduling region");
+ SD->init(SchedulingRegionID);
+
+ if (I->mayReadOrWriteMemory()) {
+ // Update the linked list of memory accessing instructions.
+ if (CurrentLoadStore) {
+ CurrentLoadStore->NextLoadStore = SD;
+ } else {
+ FirstLoadStoreInRegion = SD;
+ }
+ CurrentLoadStore = SD;
+ }
+ }
+ if (NextLoadStore) {
+ if (CurrentLoadStore)
+ CurrentLoadStore->NextLoadStore = NextLoadStore;
+ } else {
+ LastLoadStoreInRegion = CurrentLoadStore;
+ }
+}
+
+void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
+ bool InsertInReadyList,
+ BoUpSLP *SLP) {
+ assert(SD->isSchedulingEntity());
+
+ SmallVector<ScheduleData *, 10> WorkList;
+ WorkList.push_back(SD);
+
+ while (!WorkList.empty()) {
+ ScheduleData *SD = WorkList.back();
+ WorkList.pop_back();
+
+ ScheduleData *BundleMember = SD;
+ while (BundleMember) {
+ assert(isInSchedulingRegion(BundleMember));
+ if (!BundleMember->hasValidDependencies()) {
+
+ DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
+ BundleMember->Dependencies = 0;
+ BundleMember->resetUnscheduledDeps();
+
+ // Handle def-use chain dependencies.
+ for (User *U : BundleMember->Inst->users()) {
+ if (isa<Instruction>(U)) {
+ ScheduleData *UseSD = getScheduleData(U);
+ if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if (!DestBundle->IsScheduled) {
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ if (!DestBundle->hasValidDependencies()) {
+ WorkList.push_back(DestBundle);
+ }
+ }
+ } else {
+ // I'm not sure if this can ever happen. But we need to be safe.
+ // This prevents the instruction/bundle from ever being scheduled, which
+ // eventually disables vectorization.
+ BundleMember->Dependencies++;
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ }
+
+ // Handle the memory dependencies.
+ ScheduleData *DepDest = BundleMember->NextLoadStore;
+ if (DepDest) {
+ Instruction *SrcInst = BundleMember->Inst;
+ AliasAnalysis::Location SrcLoc = getLocation(SrcInst, SLP->AA);
+ bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+
+ while (DepDest) {
+ assert(isInSchedulingRegion(DepDest));
+ if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) {
+ if (SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)) {
+ DepDest->MemoryDependencies.push_back(BundleMember);
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = DepDest->FirstInBundle;
+ if (!DestBundle->IsScheduled) {
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ if (!DestBundle->hasValidDependencies()) {
+ WorkList.push_back(DestBundle);
+ }
+ }
+ }
+ DepDest = DepDest->NextLoadStore;
+ }
+ }
+ }
+ BundleMember = BundleMember->NextInBundle;
+ }
+ if (InsertInReadyList && SD->isReady()) {
+ ReadyInsts.push_back(SD);
+ DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
+ }
+ }
+}
+
+void BoUpSLP::BlockScheduling::resetSchedule() {
+ assert(ScheduleStart &&
+ "tried to reset schedule on block which has not been scheduled");
+ for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ ScheduleData *SD = getScheduleData(I);
+ assert(isInSchedulingRegion(SD));
+ SD->IsScheduled = false;
+ SD->resetUnscheduledDeps();
+ }
+ ReadyInsts.clear();
+}
+
+void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
+
+ if (!BS->ScheduleStart)
+ return;
+
+ DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
+
+ BS->resetSchedule();
+
+ // For the real scheduling we use a more sophisticated ready-list: it is
+ // sorted by the original instruction location. This lets the final schedule
+ // be as close as possible to the original instruction order.
+ struct ScheduleDataCompare {
+ bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
+ return SD2->SchedulingPriority < SD1->SchedulingPriority;
+ }
+ };
+ std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
+
+ // Ensure that all dependency data is updated and fill the ready-list with
+ // initial instructions.
+ int Idx = 0;
+ int NumToSchedule = 0;
+ for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+ I = I->getNextNode()) {
+ ScheduleData *SD = BS->getScheduleData(I);
+ assert(
+ SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
+ "scheduler and vectorizer have different opinion on what is a bundle");
+ SD->FirstInBundle->SchedulingPriority = Idx++;
+ if (SD->isSchedulingEntity()) {
+ BS->calculateDependencies(SD, false, this);
+ NumToSchedule++;
+ }
+ }
+ BS->initialFillReadyList(ReadyInsts);
+
+ Instruction *LastScheduledInst = BS->ScheduleEnd;
+
+ // Do the "real" scheduling.
+ while (!ReadyInsts.empty()) {
+ ScheduleData *picked = *ReadyInsts.begin();
+ ReadyInsts.erase(ReadyInsts.begin());
+
+ // Move the scheduled instruction(s) to their dedicated places, if not
+ // there yet.
+ ScheduleData *BundleMember = picked;
+ while (BundleMember) {
+ Instruction *pickedInst = BundleMember->Inst;
+ if (LastScheduledInst->getNextNode() != pickedInst) {
+ BS->BB->getInstList().remove(pickedInst);
+ BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
+ }
+ LastScheduledInst = pickedInst;
+ BundleMember = BundleMember->NextInBundle;
+ }
+
+ BS->schedule(picked, ReadyInsts);
+ NumToSchedule--;
+ }
+ assert(NumToSchedule == 0 && "could not schedule all instructions");
+
+ // Avoid duplicate scheduling of the block.
+ BS->ScheduleStart = nullptr;
+}
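[Editor's note: the std::set above hands out ready bundles ordered by SchedulingPriority, and the placement loop inserts every picked instruction in front of the previously placed one, so the region is rebuilt upward from ScheduleEnd. The stand-alone snippet below, with a made-up struct, only illustrates the direction of ScheduleDataCompare; it is not part of this patch.]

    #include <cassert>
    #include <set>

    struct SD { int SchedulingPriority; };

    struct Cmp {
      // Same predicate as ScheduleDataCompare: A orders before B when A has
      // the larger priority, so *begin() is the ready bundle whose head sat
      // latest in the original instruction order.
      bool operator()(SD *A, SD *B) const {
        return B->SchedulingPriority < A->SchedulingPriority;
      }
    };

    int main() {
      SD X{0}, Y{1}, Z{2};
      std::set<SD *, Cmp> Ready = {&X, &Y, &Z};
      assert((*Ready.begin())->SchedulingPriority == 2);
      return 0;
    }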
+
/// The SLPVectorizer Pass.
struct SLPVectorizer : public FunctionPass {
typedef SmallVector<StoreInst *, 8> StoreList;
@@ -2146,6 +2880,7 @@ struct SLPVectorizer : public FunctionPass {
AliasAnalysis *AA;
LoopInfo *LI;
DominatorTree *DT;
+ AssumptionCache *AC;
bool runOnFunction(Function &F) override {
if (skipOptnoneFunction(F))
@@ -2159,6 +2894,7 @@ struct SLPVectorizer : public FunctionPass {
AA = &getAnalysis<AliasAnalysis>();
LI = &getAnalysis<LoopInfo>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
StoreRefs.clear();
bool Changed = false;
@@ -2181,7 +2917,10 @@ struct SLPVectorizer : public FunctionPass {
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
- BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT);
+ BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC);
+
+ // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
+ // delete instructions.
// Scan the blocks in the function in post order.
for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
@@ -2208,6 +2947,7 @@ struct SLPVectorizer : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolution>();
AU.addRequired<AliasAnalysis>();
AU.addRequired<TargetTransformInfo>();
@@ -2234,7 +2974,8 @@ private:
/// scheduling and that don't need extracting.
/// \returns true if a value was vectorized.
bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- ArrayRef<Value *> BuildVector = None);
+ ArrayRef<Value *> BuildVector = None,
+ bool allowReorder = false);
  /// \brief Try to vectorize a chain that may start at the operands of \p V.
bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
@@ -2404,11 +3145,12 @@ bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
Value *VL[] = { A, B };
- return tryToVectorizeList(VL, R);
+ return tryToVectorizeList(VL, R, None, true);
}
bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- ArrayRef<Value *> BuildVector) {
+ ArrayRef<Value *> BuildVector,
+ bool allowReorder) {
if (VL.size() < 2)
return false;
@@ -2463,6 +3205,14 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
BuildVectorSlice = BuildVector.slice(i, OpsWidth);
R.buildTree(Ops, BuildVectorSlice);
+      // TODO: check if we can also allow reordering for cases other than
+      // tryToVectorizePair().
+ if (allowReorder && R.shouldReorder()) {
+ assert(Ops.size() == 2);
+ assert(BuildVectorSlice.empty());
+ Value *ReorderedOps[] = { Ops[1], Ops[0] };
+ R.buildTree(ReorderedOps, None);
+ }
int Cost = R.getTreeCost();
if (Cost < -SLPCostThreshold) {
@@ -2514,11 +3264,9 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
if (tryToVectorizePair(A, B0, R)) {
- B->moveBefore(V);
return true;
}
if (tryToVectorizePair(A, B1, R)) {
- B->moveBefore(V);
return true;
}
}
@@ -2528,11 +3276,9 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
if (tryToVectorizePair(A0, B, R)) {
- A->moveBefore(V);
return true;
}
if (tryToVectorizePair(A1, B, R)) {
- A->moveBefore(V);
return true;
}
}
@@ -2728,8 +3474,7 @@ public:
unsigned i = 0;
for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
- ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth);
- V.buildTree(ValsToReduce, ReductionOps);
+ V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
// Estimate cost.
int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
@@ -2807,11 +3552,10 @@ private:
/// \brief Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
assert(VectorizedValue && "Need to have a vectorized tree node");
- Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
- Value *TmpVec = ValToReduce;
+ Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
if (IsPairwiseReduction) {
Value *LeftMask =
@@ -2921,8 +3665,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Try to vectorize them.
unsigned NumElts = (SameTypeIt - IncIt);
DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
- if (NumElts > 1 &&
- tryToVectorizeList(ArrayRef<Value *>(IncIt, NumElts), R)) {
+ if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
        // Success. Start over because instructions might have been changed.
HaveVectorizedPhiNodes = true;
Changed = true;
@@ -2938,7 +3681,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
// We may go through BB multiple times so skip the one we have checked.
- if (!VisitedInstrs.insert(it))
+ if (!VisitedInstrs.insert(it).second)
continue;
if (isa<DbgInfoIntrinsic>(it))
@@ -3002,6 +3745,21 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
}
+ // Try to vectorize horizontal reductions feeding into a return.
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
+ if (RI->getNumOperands() != 0)
+ if (BinaryOperator *BinOp =
+ dyn_cast<BinaryOperator>(RI->getOperand(0))) {
+ DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
+ if (tryToVectorizePair(BinOp->getOperand(0),
+ BinOp->getOperand(1), R)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+ }
+
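[Editor's note: a purely illustrative example of the kind of source the new return handling above targets; the function below is hypothetical and not part of this patch.]

    // The operands of the returned add are two multiplies that form a pair;
    // tryToVectorizePair() can (subject to the cost model) turn them into a
    // single <2 x float> multiply whose lanes are then summed.
    float dot2(const float *A, const float *B) {
      return A[0] * B[0] + A[1] * B[1];
    }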
// Try to vectorize trees that start at compare instructions.
if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
@@ -3014,15 +3772,15 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
for (int i = 0; i < 2; ++i) {
- if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
- if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
- Changed = true;
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- it = BB->begin();
- e = BB->end();
- }
- }
+ if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
+ if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
+ Changed = true;
+          // We would like to start over since some instructions are deleted
+          // and the iterator may be invalidated.
+ it = BB->begin();
+ e = BB->end();
+ }
+ }
}
continue;
}
@@ -3064,8 +3822,8 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
// Process the stores in chunks of 16.
for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
unsigned Len = std::min<unsigned>(CE - CI, 16);
- ArrayRef<StoreInst *> Chunk(&it->second[CI], Len);
- Changed |= vectorizeStores(Chunk, -SLPCostThreshold, R);
+ Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
+ -SLPCostThreshold, R);
}
}
return Changed;
@@ -3078,6 +3836,7 @@ static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
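[Editor's note: a minimal usage sketch, not part of this patch, showing the legacy pass being scheduled over a module. It assumes an LLVM tree of this vintage with the corresponding libraries linked; creating the pass registers it and its dependencies (including the AssumptionCacheTracker added above), and BasicAA is added explicitly so the AliasAnalysis group has an implementation.]

    #include "llvm/Analysis/Passes.h"
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Vectorize.h"

    // Run only the SLP vectorizer (plus BasicAA) over a module; the other
    // required analyses are materialized on demand by the pass manager.
    void runSLPOnly(llvm::Module &M) {
      llvm::legacy::PassManager PM;
      PM.add(llvm::createBasicAliasAnalysisPass());
      PM.add(llvm::createSLPVectorizerPass());
      PM.run(M);
    }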