Diffstat (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp  820
1 file changed, 522 insertions(+), 298 deletions(-)
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 27a86c0bca91..974eff9974d9 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -194,10 +194,13 @@ static bool allSameBlock(ArrayRef<Value *> VL) {
return true;
}
-/// \returns True if all of the values in \p VL are constants.
+/// \returns True if all of the values in \p VL are constants (but not
+/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
+ // Constant expressions and globals can't be vectorized like normal integer/FP
+ // constants.
for (Value *i : VL)
- if (!isa<Constant>(i))
+ if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i))
return false;
return true;
}
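
For illustration only (assumed example bundles, not taken from the patch), the distinction the tightened check draws:

    // { i32 1, i32 2, i32 3, i32 4 }                          -> allConstant() == true
    // { i32 1, i32 ptrtoint (i32* @g to i32), i32 3, i32 4 }  -> now false (ConstantExpr)
    // { i8* @a, i8* @b, i8* @c, i8* @d }                      -> now false (GlobalValue)
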
@@ -486,6 +489,7 @@ namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
struct TreeEntry;
+ struct ScheduleData;
public:
using ValueList = SmallVector<Value *, 8>;
@@ -614,6 +618,15 @@ public:
/// vectorizable. We do not vectorize such trees.
bool isTreeTinyAndNotFullyVectorizable() const;
+ /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
+ /// can be load combined in the backend. Load combining may not be allowed in
+ /// the IR optimizer, so we do not want to alter the pattern. For example,
+ /// partially transforming a scalar bswap() pattern into vector code is
+ /// effectively impossible for the backend to undo.
+ /// TODO: If load combining is allowed in the IR optimizer, this analysis
+ /// may not be necessary.
+ bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;
+
OptimizationRemarkEmitter *getORE() { return ORE; }
/// This structure holds any data we need about the edges being traversed
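
As a rough sketch of the scalar pattern the new comment refers to (illustrative C++, assumed names, not part of the patch): an 'or'-reduction of shifted, zero-extended loads that the backend can fold into one wide load, and that a partial vectorization would make unrecognizable.

    #include <stdint.h>

    // Assumed example: the backend can combine these four byte loads into a
    // single 32-bit load (plus a bswap on big-endian targets), so SLP should
    // leave the scalar form alone.
    uint32_t load_le32(const uint8_t *p) {
      return (uint32_t)p[0]
           | ((uint32_t)p[1] << 8)
           | ((uint32_t)p[2] << 16)
           | ((uint32_t)p[3] << 24);
    }
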
@@ -1117,6 +1130,14 @@ public:
#endif
};
+ /// Checks if the instruction is marked for deletion.
+ bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
+
+  /// Marks the given values for later deletion, replacing their uses with Undefs.
+ void eraseInstructions(ArrayRef<Value *> AV);
+
+ ~BoUpSLP();
+
private:
/// Checks if all users of \p I are the part of the vectorization tree.
bool areAllUsersVectorized(Instruction *I) const;
@@ -1153,8 +1174,7 @@ private:
/// Set the Builder insert point to one after the last instruction in
/// the bundle
- void setInsertPointAfterBundle(ArrayRef<Value *> VL,
- const InstructionsState &S);
+ void setInsertPointAfterBundle(TreeEntry *E);
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
@@ -1220,27 +1240,37 @@ private:
/// reordering of operands during buildTree_rec() and vectorizeTree().
SmallVector<ValueList, 2> Operands;
+ /// The main/alternate instruction.
+ Instruction *MainOp = nullptr;
+ Instruction *AltOp = nullptr;
+
public:
/// Set this bundle's \p OpIdx'th operand to \p OpVL.
- void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL,
- ArrayRef<unsigned> ReuseShuffleIndices) {
+ void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
if (Operands.size() < OpIdx + 1)
Operands.resize(OpIdx + 1);
assert(Operands[OpIdx].size() == 0 && "Already resized?");
Operands[OpIdx].resize(Scalars.size());
for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
- Operands[OpIdx][Lane] = (!ReuseShuffleIndices.empty())
- ? OpVL[ReuseShuffleIndices[Lane]]
- : OpVL[Lane];
- }
-
- /// If there is a user TreeEntry, then set its operand.
- void trySetUserTEOperand(const EdgeInfo &UserTreeIdx,
- ArrayRef<Value *> OpVL,
- ArrayRef<unsigned> ReuseShuffleIndices) {
- if (UserTreeIdx.UserTE)
- UserTreeIdx.UserTE->setOperand(UserTreeIdx.EdgeIdx, OpVL,
- ReuseShuffleIndices);
+ Operands[OpIdx][Lane] = OpVL[Lane];
+ }
+
+ /// Set the operands of this bundle in their original order.
+ void setOperandsInOrder() {
+ assert(Operands.empty() && "Already initialized?");
+ auto *I0 = cast<Instruction>(Scalars[0]);
+ Operands.resize(I0->getNumOperands());
+ unsigned NumLanes = Scalars.size();
+ for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
+ OpIdx != NumOperands; ++OpIdx) {
+ Operands[OpIdx].resize(NumLanes);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ auto *I = cast<Instruction>(Scalars[Lane]);
+ assert(I->getNumOperands() == NumOperands &&
+ "Expected same number of operands");
+ Operands[OpIdx][Lane] = I->getOperand(OpIdx);
+ }
+ }
}
/// \returns the \p OpIdx operand of this TreeEntry.
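
As an assumed example (not from the patch) of the layout setOperandsInOrder() produces, Operands is indexed as [operand index][lane]:

    // Assumed two-lane bundle:
    //   lane 0:  %x = add i32 %a, %b
    //   lane 1:  %y = add i32 %c, %d
    // After setOperandsInOrder():
    //   Operands[0] = { %a, %c }   // operand 0 of each lane
    //   Operands[1] = { %b, %d }   // operand 1 of each lane
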
@@ -1249,6 +1279,9 @@ private:
return Operands[OpIdx];
}
+ /// \returns the number of operands.
+ unsigned getNumOperands() const { return Operands.size(); }
+
/// \return the single \p OpIdx operand.
Value *getSingleOperand(unsigned OpIdx) const {
assert(OpIdx < Operands.size() && "Off bounds");
@@ -1256,6 +1289,58 @@ private:
return Operands[OpIdx][0];
}
+ /// Some of the instructions in the list have alternate opcodes.
+ bool isAltShuffle() const {
+ return getOpcode() != getAltOpcode();
+ }
+
+ bool isOpcodeOrAlt(Instruction *I) const {
+ unsigned CheckedOpcode = I->getOpcode();
+ return (getOpcode() == CheckedOpcode ||
+ getAltOpcode() == CheckedOpcode);
+ }
+
+ /// Chooses the correct key for scheduling data. If \p Op has the same (or
+ /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
+ /// \p OpValue.
+ Value *isOneOf(Value *Op) const {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (I && isOpcodeOrAlt(I))
+ return Op;
+ return MainOp;
+ }
+
+ void setOperations(const InstructionsState &S) {
+ MainOp = S.MainOp;
+ AltOp = S.AltOp;
+ }
+
+ Instruction *getMainOp() const {
+ return MainOp;
+ }
+
+ Instruction *getAltOp() const {
+ return AltOp;
+ }
+
+ /// The main/alternate opcodes for the list of instructions.
+ unsigned getOpcode() const {
+ return MainOp ? MainOp->getOpcode() : 0;
+ }
+
+ unsigned getAltOpcode() const {
+ return AltOp ? AltOp->getOpcode() : 0;
+ }
+
+ /// Update operations state of this entry if reorder occurred.
+ bool updateStateIfReorder() {
+ if (ReorderIndices.empty())
+ return false;
+ InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front());
+ setOperations(S);
+ return true;
+ }
+
#ifndef NDEBUG
/// Debug printer.
LLVM_DUMP_METHOD void dump() const {
@@ -1269,6 +1354,8 @@ private:
for (Value *V : Scalars)
dbgs().indent(2) << *V << "\n";
dbgs() << "NeedToGather: " << NeedToGather << "\n";
+ dbgs() << "MainOp: " << *MainOp << "\n";
+ dbgs() << "AltOp: " << *AltOp << "\n";
dbgs() << "VectorizedValue: ";
if (VectorizedValue)
dbgs() << *VectorizedValue;
@@ -1279,12 +1366,12 @@ private:
if (ReuseShuffleIndices.empty())
dbgs() << "Emtpy";
else
- for (unsigned Idx : ReuseShuffleIndices)
- dbgs() << Idx << ", ";
+ for (unsigned ReuseIdx : ReuseShuffleIndices)
+ dbgs() << ReuseIdx << ", ";
dbgs() << "\n";
dbgs() << "ReorderIndices: ";
- for (unsigned Idx : ReorderIndices)
- dbgs() << Idx << ", ";
+ for (unsigned ReorderIdx : ReorderIndices)
+ dbgs() << ReorderIdx << ", ";
dbgs() << "\n";
dbgs() << "UserTreeIndices: ";
for (const auto &EInfo : UserTreeIndices)
@@ -1295,11 +1382,13 @@ private:
};
/// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
+ const InstructionsState &S,
const EdgeInfo &UserTreeIdx,
ArrayRef<unsigned> ReuseShuffleIndices = None,
ArrayRef<unsigned> ReorderIndices = None) {
- VectorizableTree.push_back(llvm::make_unique<TreeEntry>(VectorizableTree));
+ bool Vectorized = (bool)Bundle;
+ VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
TreeEntry *Last = VectorizableTree.back().get();
Last->Idx = VectorizableTree.size() - 1;
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
@@ -1307,11 +1396,22 @@ private:
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
ReuseShuffleIndices.end());
Last->ReorderIndices = ReorderIndices;
+ Last->setOperations(S);
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
- ScalarToTreeEntry[VL[i]] = Last->Idx;
- }
+ ScalarToTreeEntry[VL[i]] = Last;
+ }
+ // Update the scheduler bundle to point to this TreeEntry.
+ unsigned Lane = 0;
+ for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
+ BundleMember->TE = Last;
+ BundleMember->Lane = Lane;
+ ++Lane;
+ }
+ assert((!Bundle.getValue() || Lane == VL.size()) &&
+ "Bundle and VL out of sync");
} else {
MustGather.insert(VL.begin(), VL.end());
}
@@ -1319,7 +1419,6 @@ private:
if (UserTreeIdx.UserTE)
Last->UserTreeIndices.push_back(UserTreeIdx);
- Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices);
return Last;
}
@@ -1340,19 +1439,19 @@ private:
TreeEntry *getTreeEntry(Value *V) {
auto I = ScalarToTreeEntry.find(V);
if (I != ScalarToTreeEntry.end())
- return VectorizableTree[I->second].get();
+ return I->second;
return nullptr;
}
const TreeEntry *getTreeEntry(Value *V) const {
auto I = ScalarToTreeEntry.find(V);
if (I != ScalarToTreeEntry.end())
- return VectorizableTree[I->second].get();
+ return I->second;
return nullptr;
}
/// Maps a specific scalar to its tree entry.
- SmallDenseMap<Value*, int> ScalarToTreeEntry;
+ SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
@@ -1408,15 +1507,14 @@ private:
/// This is required to ensure that there are no incorrect collisions in the
/// AliasCache, which can happen if a new instruction is allocated at the
/// same address as a previously deleted instruction.
- void eraseInstruction(Instruction *I) {
- I->removeFromParent();
- I->dropAllReferences();
- DeletedInstructions.emplace_back(I);
+ void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) {
+ auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first;
+ It->getSecond() = It->getSecond() && ReplaceOpsWithUndef;
}
/// Temporary store for deleted instructions. Instructions will be deleted
/// eventually when the BoUpSLP is destructed.
- SmallVector<unique_value, 8> DeletedInstructions;
+ DenseMap<Instruction *, bool> DeletedInstructions;
/// A list of values that need to extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User). External User
@@ -1453,6 +1551,8 @@ private:
UnscheduledDepsInBundle = UnscheduledDeps;
clearDependencies();
OpValue = OpVal;
+ TE = nullptr;
+ Lane = -1;
}
/// Returns true if the dependency information has been calculated.
@@ -1559,6 +1659,12 @@ private:
/// Opcode of the current instruction in the schedule data.
Value *OpValue = nullptr;
+
+ /// The TreeEntry that this instruction corresponds to.
+ TreeEntry *TE = nullptr;
+
+ /// The lane of this node in the TreeEntry.
+ int Lane = -1;
};
#ifndef NDEBUG
@@ -1633,10 +1739,9 @@ private:
continue;
}
// Handle the def-use chain dependencies.
- for (Use &U : BundleMember->Inst->operands()) {
- auto *I = dyn_cast<Instruction>(U.get());
- if (!I)
- continue;
+
+ // Decrement the unscheduled counter and insert to ready list if ready.
+ auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
if (OpDef && OpDef->hasValidDependencies() &&
OpDef->incrementUnscheduledDeps(-1) == 0) {
@@ -1651,6 +1756,24 @@ private:
<< "SLP: gets ready (def): " << *DepBundle << "\n");
}
});
+ };
+
+ // If BundleMember is a vector bundle, its operands may have been
+      // reordered during buildTree(). We therefore need to get its operands
+ // through the TreeEntry.
+ if (TreeEntry *TE = BundleMember->TE) {
+ int Lane = BundleMember->Lane;
+ assert(Lane >= 0 && "Lane not set");
+ for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
+ OpIdx != NumOperands; ++OpIdx)
+ if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
+ DecrUnsched(I);
+ } else {
+ // If BundleMember is a stand-alone instruction, no operand reordering
+ // has taken place, so we directly access its operands.
+ for (Use &U : BundleMember->Inst->operands())
+ if (auto *I = dyn_cast<Instruction>(U.get()))
+ DecrUnsched(I);
}
// Handle the memory dependencies.
for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
@@ -1697,8 +1820,11 @@ private:
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
- bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
- const InstructionsState &S);
+ /// \returns the scheduling bundle. The returned Optional value is non-None
+ /// if \p VL is allowed to be scheduled.
+ Optional<ScheduleData *>
+ tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+ const InstructionsState &S);
/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
@@ -1945,6 +2071,30 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
} // end namespace llvm
+BoUpSLP::~BoUpSLP() {
+ for (const auto &Pair : DeletedInstructions) {
+    // Replace operands of ignored instructions with Undefs in case they were
+    // marked for deletion.
+ if (Pair.getSecond()) {
+ Value *Undef = UndefValue::get(Pair.getFirst()->getType());
+ Pair.getFirst()->replaceAllUsesWith(Undef);
+ }
+ Pair.getFirst()->dropAllReferences();
+ }
+ for (const auto &Pair : DeletedInstructions) {
+ assert(Pair.getFirst()->use_empty() &&
+ "trying to erase instruction with users.");
+ Pair.getFirst()->eraseFromParent();
+ }
+}
+
+void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
+ for (auto *V : AV) {
+ if (auto *I = dyn_cast<Instruction>(V))
+      eraseInstruction(I, /*ReplaceOpsWithUndef=*/true);
+ };
+}
+
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) {
ExtraValueToDebugLocsMap ExternallyUsedValues;
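
A minimal standalone sketch (assumed names and types, not LLVM's API) of the deferred two-phase teardown implemented by the destructor above: every marked instruction first drops its references, and only then is anything freed, so nodes that still use each other can be deleted in any order. The real destructor additionally replaces remaining uses with Undef for entries whose flag is set.

    #include <unordered_set>
    #include <vector>

    struct Node {
      std::vector<Node *> Operands;              // references to other nodes
      void dropAllReferences() { Operands.clear(); }
    };

    struct DeferredEraser {
      std::unordered_set<Node *> Deleted;        // marked, not yet freed (deduplicated)

      void markForDeletion(Node *N) { Deleted.insert(N); }

      ~DeferredEraser() {
        for (Node *N : Deleted)                  // phase 1: break use chains
          N->dropAllReferences();
        for (Node *N : Deleted)                  // phase 2: now safe to free
          delete N;
      }
    };
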
@@ -2026,28 +2176,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
InstructionsState S = getSameOpcode(VL);
if (Depth == RecursionMaxDepth) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
// Don't handle vectors.
if (S.OpValue->getType()->isVectorTy()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
if (SI->getValueOperand()->getType()->isVectorTy()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
// If all of the operands are identical or constant we have a simple solution.
if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
@@ -2055,11 +2205,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// the same block.
// Don't vectorize ephemeral values.
- for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- if (EphValues.count(VL[i])) {
- LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
+ for (Value *V : VL) {
+ if (EphValues.count(V)) {
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
<< ") is ephemeral.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
}
@@ -2069,7 +2219,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
if (!E->isSame(VL)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
// Record the reuse of the tree node. FIXME, currently this is only used to
@@ -2077,19 +2227,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
E->UserTreeIndices.push_back(UserTreeIdx);
LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
<< ".\n");
- E->trySetUserTEOperand(UserTreeIdx, VL, None);
return;
}
// Check that none of the instructions in the bundle are already in the tree.
- for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- auto *I = dyn_cast<Instruction>(VL[i]);
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
if (getTreeEntry(I)) {
- LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
<< ") is already in tree.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
}
@@ -2097,10 +2246,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// If any of the scalars is marked as a value that needs to stay scalar, then
// we need to gather the scalars.
// The reduction nodes (stored in UserIgnoreList) also should stay scalar.
- for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) {
+ for (Value *V : VL) {
+ if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
}
@@ -2114,7 +2263,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
@@ -2128,13 +2277,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Res.second)
UniqueValues.emplace_back(V);
}
- if (UniqueValues.size() == VL.size()) {
+ size_t NumUniqueScalarValues = UniqueValues.size();
+ if (NumUniqueScalarValues == VL.size()) {
ReuseShuffleIndicies.clear();
} else {
LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
- if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) {
+ if (NumUniqueScalarValues <= 1 ||
+ !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
VL = UniqueValues;
@@ -2142,16 +2293,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
- BSRef = llvm::make_unique<BlockScheduling>(BB);
+ BSRef = std::make_unique<BlockScheduling>(BB);
BlockScheduling &BS = *BSRef.get();
- if (!BS.tryScheduleBundle(VL, this, S)) {
+ Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+ if (!Bundle) {
LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL0) ||
!BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -2160,7 +2313,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
(unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI: {
- PHINode *PH = dyn_cast<PHINode>(VL0);
+ auto *PH = cast<PHINode>(VL0);
// Check for terminator values (e.g. invoke).
for (unsigned j = 0; j < VL.size(); ++j)
@@ -2172,23 +2325,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs()
<< "SLP: Need to swizzle PHINodes (terminator use).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
}
- auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ TreeEntry *TE =
+ newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
+ // Keeps the reordered operands to avoid code duplication.
+ SmallVector<ValueList, 2> OperandsVec;
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
-
- buildTree_rec(Operands, Depth + 1, {TE, i});
+ TE->setOperand(i, Operands);
+ OperandsVec.push_back(Operands);
}
+ for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
+ buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
return;
}
case Instruction::ExtractValue:
@@ -2198,13 +2357,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Reuse) {
LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
++NumOpsWantToKeepOriginalOrder;
- newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
ValueList Op0;
Op0.assign(VL.size(), VL0->getOperand(0));
- VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies);
+ VectorizableTree.back()->setOperand(0, Op0);
return;
}
if (!CurrentOrder.empty()) {
@@ -2220,17 +2379,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
auto StoredCurrentOrderAndNum =
NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
++StoredCurrentOrderAndNum->getSecond();
- newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies,
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies,
StoredCurrentOrderAndNum->getFirst());
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
ValueList Op0;
Op0.assign(VL.size(), VL0->getOperand(0));
- VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies);
+ VectorizableTree.back()->setOperand(0, Op0);
return;
}
LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
- newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
BS.cancelScheduling(VL, VL0);
return;
}
@@ -2246,7 +2407,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
@@ -2259,7 +2421,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
auto *L = cast<LoadInst>(V);
if (!L->isSimple()) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
@@ -2289,15 +2452,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (CurrentOrder.empty()) {
// Original loads are consecutive and does not require reordering.
++NumOpsWantToKeepOriginalOrder;
- newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
- ReuseShuffleIndicies);
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
+ UserTreeIdx, ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
} else {
// Need to reorder.
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
++I->getSecond();
- newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
- ReuseShuffleIndicies, I->getFirst());
+ TreeEntry *TE =
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies, I->getFirst());
+ TE->setOperandsInOrder();
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
}
return;
@@ -2306,7 +2472,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
case Instruction::ZExt:
@@ -2322,24 +2489,27 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
- for (unsigned i = 0; i < VL.size(); ++i) {
- Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
+ for (Value *V : VL) {
+ Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs()
<< "SLP: Gathering casts with different src types.\n");
return;
}
}
- auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
+ TE->setOperandsInOrder();
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (Value *j : VL)
- Operands.push_back(cast<Instruction>(j)->getOperand(i));
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(i));
buildTree_rec(Operands, Depth + 1, {TE, i});
}
@@ -2351,19 +2521,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
Type *ComparedTy = VL0->getOperand(0)->getType();
- for (unsigned i = 1, e = VL.size(); i < e; ++i) {
- CmpInst *Cmp = cast<CmpInst>(VL[i]);
+ for (Value *V : VL) {
+ CmpInst *Cmp = cast<CmpInst>(V);
if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs()
<< "SLP: Gathering cmp with different predicate.\n");
return;
}
}
- auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
ValueList Left, Right;
@@ -2384,7 +2556,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Right.push_back(RHS);
}
}
-
+ TE->setOperand(0, Left);
+ TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
return;
@@ -2409,7 +2582,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
// Sort operands of the instructions so that each side is more likely to
@@ -2417,11 +2591,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
+ TE->setOperand(0, Left);
+ TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
return;
}
+ TE->setOperandsInOrder();
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -2434,11 +2611,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
case Instruction::GetElementPtr: {
// We don't combine GEPs with complicated (nested) indexing.
- for (unsigned j = 0; j < VL.size(); ++j) {
- if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
+ for (Value *V : VL) {
+ if (cast<Instruction>(V)->getNumOperands() != 2) {
LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
}
@@ -2446,58 +2624,64 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// We can't combine several GEPs into one vector if they operate on
// different types.
Type *Ty0 = VL0->getOperand(0)->getType();
- for (unsigned j = 0; j < VL.size(); ++j) {
- Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
+ for (Value *V : VL) {
+ Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
if (Ty0 != CurTy) {
LLVM_DEBUG(dbgs()
<< "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
}
// We don't combine GEPs with non-constant indexes.
- for (unsigned j = 0; j < VL.size(); ++j) {
- auto Op = cast<Instruction>(VL[j])->getOperand(1);
+ for (Value *V : VL) {
+ auto Op = cast<Instruction>(V)->getOperand(1);
if (!isa<ConstantInt>(Op)) {
LLVM_DEBUG(dbgs()
<< "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
}
- auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+ TE->setOperandsInOrder();
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (Value *j : VL)
- Operands.push_back(cast<Instruction>(j)->getOperand(i));
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(i));
buildTree_rec(Operands, Depth + 1, {TE, i});
}
return;
}
case Instruction::Store: {
- // Check if the stores are consecutive or of we need to swizzle them.
+ // Check if the stores are consecutive or if we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
- auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
- for (Value *j : VL)
- Operands.push_back(cast<Instruction>(j)->getOperand(0));
-
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(0));
+ TE->setOperandsInOrder();
buildTree_rec(Operands, Depth + 1, {TE, 0});
return;
}
@@ -2509,7 +2693,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
@@ -2519,14 +2704,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned j = 0; j != NumArgs; ++j)
if (hasVectorInstrinsicScalarOpd(ID, j))
ScalarArgs[j] = CI->getArgOperand(j);
- for (unsigned i = 1, e = VL.size(); i != e; ++i) {
- CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
+ for (Value *V : VL) {
+ CallInst *CI2 = dyn_cast<CallInst>(V);
if (!CI2 || CI2->getCalledFunction() != Int ||
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
<< "\n");
return;
}
@@ -2537,7 +2723,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Value *A1J = CI2->getArgOperand(j);
if (ScalarArgs[j] != A1J) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument " << ScalarArgs[j] << "!=" << A1J
<< "\n");
@@ -2551,19 +2738,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
- << *CI << "!=" << *VL[i] << '\n');
+ << *CI << "!=" << *V << '\n');
return;
}
}
- auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (Value *j : VL) {
- CallInst *CI2 = dyn_cast<CallInst>(j);
+ for (Value *V : VL) {
+ auto *CI2 = cast<CallInst>(V);
Operands.push_back(CI2->getArgOperand(i));
}
buildTree_rec(Operands, Depth + 1, {TE, i});
@@ -2575,27 +2765,32 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// then do not vectorize this instruction.
if (!S.isAltShuffle()) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
- auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
+ TE->setOperand(0, Left);
+ TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
return;
}
+ TE->setOperandsInOrder();
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (Value *j : VL)
- Operands.push_back(cast<Instruction>(j)->getOperand(i));
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(i));
buildTree_rec(Operands, Depth + 1, {TE, i});
}
@@ -2603,7 +2798,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
default:
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
@@ -2738,7 +2934,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
return ReuseShuffleCost +
TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
}
- if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement &&
+ if (E->getOpcode() == Instruction::ExtractElement &&
allSameType(VL) && allSameBlock(VL)) {
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
if (ShuffleKind.hasValue()) {
@@ -2761,11 +2957,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
return ReuseShuffleCost + getGatherCost(VL);
}
- InstructionsState S = getSameOpcode(VL);
- assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
- Instruction *VL0 = cast<Instruction>(S.OpValue);
- unsigned ShuffleOrOp = S.isAltShuffle() ?
- (unsigned) Instruction::ShuffleVector : S.getOpcode();
+ assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+ Instruction *VL0 = E->getMainOp();
+ unsigned ShuffleOrOp =
+ E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI:
return 0;
@@ -2851,7 +3046,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
int ScalarEltCost =
- TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0);
+ TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
@@ -2864,7 +3059,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Check if the values are candidates to demote.
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
VecCost = ReuseShuffleCost +
- TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0);
+ TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0);
}
return VecCost - ScalarCost;
}
@@ -2872,14 +3067,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::ICmp:
case Instruction::Select: {
// Calculate the cost of this instruction.
- int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy,
+ int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
Builder.getInt1Ty(), VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
- int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0);
+ int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::FNeg:
@@ -2940,12 +3135,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
SmallVector<const Value *, 4> Operands(VL0->operand_values());
int ScalarEltCost = TTI->getArithmeticInstrCost(
- S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands);
+ E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
- int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK,
+ int VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, Op1VK,
Op2VK, Op1VP, Op2VP, Operands);
return ReuseShuffleCost + VecCost - ScalarCost;
}
@@ -3027,11 +3222,11 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
return ReuseShuffleCost + VecCallCost - ScalarCallCost;
}
case Instruction::ShuffleVector: {
- assert(S.isAltShuffle() &&
- ((Instruction::isBinaryOp(S.getOpcode()) &&
- Instruction::isBinaryOp(S.getAltOpcode())) ||
- (Instruction::isCast(S.getOpcode()) &&
- Instruction::isCast(S.getAltOpcode()))) &&
+ assert(E->isAltShuffle() &&
+ ((Instruction::isBinaryOp(E->getOpcode()) &&
+ Instruction::isBinaryOp(E->getAltOpcode())) ||
+ (Instruction::isCast(E->getOpcode()) &&
+ Instruction::isCast(E->getAltOpcode()))) &&
"Invalid Shuffle Vector Operand");
int ScalarCost = 0;
if (NeedToShuffleReuses) {
@@ -3046,25 +3241,25 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
I, TargetTransformInfo::TCK_RecipThroughput);
}
}
- for (Value *i : VL) {
- Instruction *I = cast<Instruction>(i);
- assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ for (Value *V : VL) {
+ Instruction *I = cast<Instruction>(V);
+ assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
ScalarCost += TTI->getInstructionCost(
I, TargetTransformInfo::TCK_RecipThroughput);
}
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
int VecCost = 0;
- if (Instruction::isBinaryOp(S.getOpcode())) {
- VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy);
- VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy);
+ if (Instruction::isBinaryOp(E->getOpcode())) {
+ VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy);
+ VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy);
} else {
- Type *Src0SclTy = S.MainOp->getOperand(0)->getType();
- Type *Src1SclTy = S.AltOp->getOperand(0)->getType();
+ Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
+ Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size());
VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size());
- VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty);
- VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty);
+ VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty);
+ VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty);
}
VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
return ReuseShuffleCost + VecCost - ScalarCost;
@@ -3098,6 +3293,43 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
return true;
}
+bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
+ if (RdxOpcode != Instruction::Or)
+ return false;
+
+ unsigned NumElts = VectorizableTree[0]->Scalars.size();
+ Value *FirstReduced = VectorizableTree[0]->Scalars[0];
+
+ // Look past the reduction to find a source value. Arbitrarily follow the
+ // path through operand 0 of any 'or'. Also, peek through optional
+ // shift-left-by-constant.
+ Value *ZextLoad = FirstReduced;
+ while (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
+ match(ZextLoad, m_Shl(m_Value(), m_Constant())))
+ ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
+
+ // Check if the input to the reduction is an extended load.
+ Value *LoadPtr;
+ if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+ return false;
+
+ // Require that the total load bit width is a legal integer type.
+ // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
+ // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
+ Type *SrcTy = LoadPtr->getType()->getPointerElementType();
+ unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
+ LLVMContext &Context = FirstReduced->getContext();
+ if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth)))
+ return false;
+
+ // Everything matched - assume that we can fold the whole sequence using
+ // load combining.
+ LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of "
+ << *(cast<Instruction>(FirstReduced)) << "\n");
+
+ return true;
+}
+
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
// We can vectorize the tree if its size is greater than or equal to the
// minimum size specified by the MinTreeSize command line option.
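
Tying the legality check in isLoadCombineReductionCandidate() above to concrete numbers (assumed values, mirroring the example already given in its comments): following operand 0 of each 'or'/'shl' ends at a zext(load), and the candidate is kept only if the combined width is a legal integer type.

    // SrcTy = i8 (8 bits), NumElts = 8  -> LoadBitWidth = 8 * 8  = 64
    //   -> i64 is legal on a 64-bit target -> candidate accepted
    // SrcTy = i8 (8 bits), NumElts = 16 -> LoadBitWidth = 8 * 16 = 128
    //   -> i128 is typically not legal -> candidate rejected
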
@@ -3319,16 +3551,16 @@ void BoUpSLP::reorderInputsAccordingToOpcode(
Right = Ops.getVL(1);
}
-void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL,
- const InstructionsState &S) {
+void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block.
- auto *Front = cast<Instruction>(S.OpValue);
+ auto *Front = E->getMainOp();
auto *BB = Front->getParent();
- assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool {
- auto *I = cast<Instruction>(V);
- return !S.isOpcodeOrAlt(I) || I->getParent() == BB;
- }));
+ assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()),
+ [=](Value *V) -> bool {
+ auto *I = cast<Instruction>(V);
+ return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
+ }));
// The last instruction in the bundle in program order.
Instruction *LastInst = nullptr;
@@ -3339,7 +3571,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL,
// bundle. The end of the bundle is marked by null ScheduleData.
if (BlocksSchedules.count(BB)) {
auto *Bundle =
- BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back()));
+ BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
if (Bundle->OpValue == Bundle->Inst)
@@ -3365,14 +3597,15 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL,
// we both exit early from buildTree_rec and that the bundle be out-of-order
// (causing us to iterate all the way to the end of the block).
if (!LastInst) {
- SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
+ SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end());
for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
- if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I))
+ if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I))
LastInst = &I;
if (Bundle.empty())
break;
}
}
+ assert(LastInst && "Failed to find last instruction in bundle");
// Set the insertion point after the last instruction in the bundle. Set the
// debug location to Front.
@@ -3385,7 +3618,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
// Generate the 'InsertElement' instruction.
for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
- if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
+ if (auto *Insrt = dyn_cast<InsertElementInst>(Vec)) {
GatherSeq.insert(Insrt);
CSEBlocks.insert(Insrt->getParent());
@@ -3494,8 +3727,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return E->VectorizedValue;
}
- InstructionsState S = getSameOpcode(E->Scalars);
- Instruction *VL0 = cast<Instruction>(S.OpValue);
+ Instruction *VL0 = E->getMainOp();
Type *ScalarTy = VL0->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
ScalarTy = SI->getValueOperand()->getType();
@@ -3504,7 +3736,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
if (E->NeedToGather) {
- setInsertPointAfterBundle(E->Scalars, S);
+ setInsertPointAfterBundle(E);
auto *V = Gather(E->Scalars, VecTy);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
@@ -3518,11 +3750,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
- unsigned ShuffleOrOp = S.isAltShuffle() ?
- (unsigned) Instruction::ShuffleVector : S.getOpcode();
+ unsigned ShuffleOrOp =
+ E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI: {
- PHINode *PH = dyn_cast<PHINode>(VL0);
+ auto *PH = cast<PHINode>(VL0);
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
@@ -3577,7 +3809,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
E->VectorizedValue = V;
return V;
}
- setInsertPointAfterBundle(E->Scalars, S);
+ setInsertPointAfterBundle(E);
auto *V = Gather(E->Scalars, VecTy);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
@@ -3612,7 +3844,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
E->VectorizedValue = NewV;
return NewV;
}
- setInsertPointAfterBundle(E->Scalars, S);
+ setInsertPointAfterBundle(E);
auto *V = Gather(E->Scalars, VecTy);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
@@ -3637,7 +3869,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- setInsertPointAfterBundle(E->Scalars, S);
+ setInsertPointAfterBundle(E);
Value *InVec = vectorizeTree(E->getOperand(0));
@@ -3646,7 +3878,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return E->VectorizedValue;
}
- CastInst *CI = dyn_cast<CastInst>(VL0);
+ auto *CI = cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
@@ -3658,7 +3890,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::FCmp:
case Instruction::ICmp: {
- setInsertPointAfterBundle(E->Scalars, S);
+ setInsertPointAfterBundle(E);
Value *L = vectorizeTree(E->getOperand(0));
Value *R = vectorizeTree(E->getOperand(1));
@@ -3670,7 +3902,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V;
- if (S.getOpcode() == Instruction::FCmp)
+ if (E->getOpcode() == Instruction::FCmp)
V = Builder.CreateFCmp(P0, L, R);
else
V = Builder.CreateICmp(P0, L, R);
@@ -3685,7 +3917,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
case Instruction::Select: {
- setInsertPointAfterBundle(E->Scalars, S);
+ setInsertPointAfterBundle(E);
Value *Cond = vectorizeTree(E->getOperand(0));
Value *True = vectorizeTree(E->getOperand(1));
@@ -3706,7 +3938,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
case Instruction::FNeg: {
- setInsertPointAfterBundle(E->Scalars, S);
+ setInsertPointAfterBundle(E);
Value *Op = vectorizeTree(E->getOperand(0));
@@ -3716,7 +3948,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *V = Builder.CreateUnOp(
- static_cast<Instruction::UnaryOps>(S.getOpcode()), Op);
+ static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
propagateIRFlags(V, E->Scalars, VL0);
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
@@ -3748,7 +3980,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- setInsertPointAfterBundle(E->Scalars, S);
+ setInsertPointAfterBundle(E);
Value *LHS = vectorizeTree(E->getOperand(0));
Value *RHS = vectorizeTree(E->getOperand(1));
@@ -3759,7 +3991,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *V = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS);
+ static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
+ RHS);
propagateIRFlags(V, E->Scalars, VL0);
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
@@ -3776,12 +4009,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Load: {
// Loads are inserted at the head of the tree because we don't want to
// sink them all the way down past store instructions.
- bool IsReorder = !E->ReorderIndices.empty();
- if (IsReorder) {
- S = getSameOpcode(E->Scalars, E->ReorderIndices.front());
- VL0 = cast<Instruction>(S.OpValue);
- }
- setInsertPointAfterBundle(E->Scalars, S);
+ bool IsReorder = E->updateStateIfReorder();
+ if (IsReorder)
+ VL0 = E->getMainOp();
+ setInsertPointAfterBundle(E);
LoadInst *LI = cast<LoadInst>(VL0);
Type *ScalarLoadTy = LI->getType();
@@ -3797,11 +4028,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (getTreeEntry(PO))
ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
- unsigned Alignment = LI->getAlignment();
+ MaybeAlign Alignment = MaybeAlign(LI->getAlignment());
LI = Builder.CreateLoad(VecTy, VecPtr);
- if (!Alignment) {
- Alignment = DL->getABITypeAlignment(ScalarLoadTy);
- }
+ if (!Alignment)
+ Alignment = MaybeAlign(DL->getABITypeAlignment(ScalarLoadTy));
LI->setAlignment(Alignment);
Value *V = propagateMetadata(LI, E->Scalars);
if (IsReorder) {
@@ -3824,7 +4054,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
unsigned Alignment = SI->getAlignment();
unsigned AS = SI->getPointerAddressSpace();
- setInsertPointAfterBundle(E->Scalars, S);
+ setInsertPointAfterBundle(E);
Value *VecValue = vectorizeTree(E->getOperand(0));
Value *ScalarPtr = SI->getPointerOperand();
@@ -3840,7 +4070,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (!Alignment)
Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
- ST->setAlignment(Alignment);
+ ST->setAlignment(Align(Alignment));
Value *V = propagateMetadata(ST, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
@@ -3851,7 +4081,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
case Instruction::GetElementPtr: {
- setInsertPointAfterBundle(E->Scalars, S);
+ setInsertPointAfterBundle(E);
Value *Op0 = vectorizeTree(E->getOperand(0));
@@ -3878,13 +4108,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
- setInsertPointAfterBundle(E->Scalars, S);
- Function *FI;
+ setInsertPointAfterBundle(E);
+
Intrinsic::ID IID = Intrinsic::not_intrinsic;
- Value *ScalarArg = nullptr;
- if (CI && (FI = CI->getCalledFunction())) {
+ if (Function *FI = CI->getCalledFunction())
IID = FI->getIntrinsicID();
- }
+
+ Value *ScalarArg = nullptr;
std::vector<Value *> OpVecs;
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
ValueList OpVL;
@@ -3926,20 +4156,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
case Instruction::ShuffleVector: {
- assert(S.isAltShuffle() &&
- ((Instruction::isBinaryOp(S.getOpcode()) &&
- Instruction::isBinaryOp(S.getAltOpcode())) ||
- (Instruction::isCast(S.getOpcode()) &&
- Instruction::isCast(S.getAltOpcode()))) &&
+ assert(E->isAltShuffle() &&
+ ((Instruction::isBinaryOp(E->getOpcode()) &&
+ Instruction::isBinaryOp(E->getAltOpcode())) ||
+ (Instruction::isCast(E->getOpcode()) &&
+ Instruction::isCast(E->getAltOpcode()))) &&
"Invalid Shuffle Vector Operand");
- Value *LHS, *RHS;
- if (Instruction::isBinaryOp(S.getOpcode())) {
- setInsertPointAfterBundle(E->Scalars, S);
+ Value *LHS = nullptr, *RHS = nullptr;
+ if (Instruction::isBinaryOp(E->getOpcode())) {
+ setInsertPointAfterBundle(E);
LHS = vectorizeTree(E->getOperand(0));
RHS = vectorizeTree(E->getOperand(1));
} else {
- setInsertPointAfterBundle(E->Scalars, S);
+ setInsertPointAfterBundle(E);
LHS = vectorizeTree(E->getOperand(0));
}
@@ -3949,16 +4179,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *V0, *V1;
- if (Instruction::isBinaryOp(S.getOpcode())) {
+ if (Instruction::isBinaryOp(E->getOpcode())) {
V0 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS);
+ static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
V1 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(S.getAltOpcode()), LHS, RHS);
+ static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
} else {
V0 = Builder.CreateCast(
- static_cast<Instruction::CastOps>(S.getOpcode()), LHS, VecTy);
+ static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
V1 = Builder.CreateCast(
- static_cast<Instruction::CastOps>(S.getAltOpcode()), LHS, VecTy);
+ static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
}
// Create shuffle to take alternate operations from the vector.
@@ -3969,8 +4199,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<Constant *, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
auto *OpInst = cast<Instruction>(E->Scalars[i]);
- assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
- if (OpInst->getOpcode() == S.getAltOpcode()) {
+ assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
+ if (OpInst->getOpcode() == E->getAltOpcode()) {
Mask[i] = Builder.getInt32(e + i);
AltScalars.push_back(E->Scalars[i]);
} else {
@@ -4136,20 +4366,18 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
+#ifndef NDEBUG
Type *Ty = Scalar->getType();
if (!Ty->isVoidTy()) {
-#ifndef NDEBUG
for (User *U : Scalar->users()) {
LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
- // It is legal to replace users in the ignorelist by undef.
+ // It is legal to delete users in the ignorelist.
assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
- "Replacing out-of-tree value with undef");
+ "Deleting out-of-tree value");
}
-#endif
- Value *Undef = UndefValue::get(Ty);
- Scalar->replaceAllUsesWith(Undef);
}
+#endif
LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
eraseInstruction(cast<Instruction>(Scalar));
}
@@ -4165,7 +4393,7 @@ void BoUpSLP::optimizeGatherSequence() {
<< " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
for (Instruction *I : GatherSeq) {
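+ // Skip gather instructions that were already marked for deletion.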
- if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I))
+ if (isDeleted(I))
continue;
// Check if this block is inside a loop.
@@ -4219,6 +4447,8 @@ void BoUpSLP::optimizeGatherSequence() {
// For all instructions in blocks containing gather sequences:
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
Instruction *In = &*it++;
+ if (isDeleted(In))
+ continue;
if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
continue;
@@ -4245,11 +4475,11 @@ void BoUpSLP::optimizeGatherSequence() {
// Groups the instructions into a bundle (which is then a single scheduling entity)
// and schedules instructions until the bundle gets ready.
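+// Returns None if the bundle could not be scheduled, the bundle's ScheduleData
+// on success, and a null pointer when no scheduling is needed (e.g. for PHIs).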
-bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
- BoUpSLP *SLP,
- const InstructionsState &S) {
+Optional<BoUpSLP::ScheduleData *>
+BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+ const InstructionsState &S) {
if (isa<PHINode>(S.OpValue))
- return true;
+ return nullptr;
// Initialize the instruction bundle.
Instruction *OldScheduleEnd = ScheduleEnd;
@@ -4262,7 +4492,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
// instructions of the bundle.
for (Value *V : VL) {
if (!extendSchedulingRegion(V, S))
- return false;
+ return None;
}
for (Value *V : VL) {
@@ -4308,6 +4538,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
resetSchedule();
initialFillReadyList(ReadyInsts);
}
+ assert(Bundle && "Failed to find schedule bundle");
LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
<< BB->getName() << "\n");
@@ -4329,9 +4560,9 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
}
if (!Bundle->isReady()) {
cancelScheduling(VL, S.OpValue);
- return false;
+ return None;
}
- return true;
+ return Bundle;
}
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
@@ -4364,7 +4595,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
// Allocate a new ScheduleData for the instruction.
if (ChunkPos >= ChunkSize) {
- ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize));
+ ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
ChunkPos = 0;
}
return &(ScheduleDataChunks.back()[ChunkPos++]);
@@ -4977,7 +5208,7 @@ struct SLPVectorizer : public FunctionPass {
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -5052,7 +5283,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// If the target claims to have no vector registers, don't attempt
// vectorization.
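+ // getRegisterClassForType(true) identifies the register class used for vector types.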
- if (!TTI->getNumberOfRegisters(true))
+ if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)))
return false;
// Don't vectorize when the attribute NoImplicitFloat is used.
@@ -5100,19 +5331,6 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
return Changed;
}
-/// Check that the Values in the slice in VL array are still existent in
-/// the WeakTrackingVH array.
-/// Vectorization of part of the VL array may cause later values in the VL array
-/// to become invalid. We track when this has happened in the WeakTrackingVH
-/// array.
-static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
- ArrayRef<WeakTrackingVH> VH, unsigned SliceBegin,
- unsigned SliceSize) {
- VL = VL.slice(SliceBegin, SliceSize);
- VH = VH.slice(SliceBegin, SliceSize);
- return !std::equal(VL.begin(), VL.end(), VH.begin());
-}
-
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned VecRegSize) {
const unsigned ChainLen = Chain.size();
@@ -5124,20 +5342,20 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
if (!isPowerOf2_32(Sz) || VF < 2)
return false;
- // Keep track of values that were deleted by vectorizing in the loop below.
- const SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
-
bool Changed = false;
// Look for profitable vectorizable trees at all offsets, starting at zero.
for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {
+ ArrayRef<Value *> Operands = Chain.slice(i, VF);
// Check that a previous iteration of this loop did not delete the Value.
- if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
+ if (llvm::any_of(Operands, [&R](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return I && R.isDeleted(I);
+ }))
continue;
LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
<< "\n");
- ArrayRef<Value *> Operands = Chain.slice(i, VF);
R.buildTree(Operands);
if (R.isTreeTinyAndNotFullyVectorizable())
@@ -5329,12 +5547,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
bool CandidateFound = false;
int MinCost = SLPCostThreshold;
- // Keep track of values that were deleted by vectorizing in the loop below.
- SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end());
-
unsigned NextInst = 0, MaxInst = VL.size();
- for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
- VF /= 2) {
+ for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
// No actual vectorization should happen if the number of parts is the same as
// the provided vectorization factor (i.e. the scalar type is used for vector
// code during codegen).
@@ -5352,13 +5566,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
break;
+ ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
// Check that a previous iteration of this loop did not delete the Value.
- if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
+ if (llvm::any_of(Ops, [&R](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return I && R.isDeleted(I);
+ }))
continue;
LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
<< "\n");
- ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
R.buildTree(Ops);
Optional<ArrayRef<unsigned>> Order = R.bestOrder();
@@ -5571,7 +5788,7 @@ class HorizontalReduction {
Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {
assert(isVectorizable() &&
"Expected add|fadd or min/max reduction operation.");
- Value *Cmp;
+ Value *Cmp = nullptr;
switch (Kind) {
case RK_Arithmetic:
return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
@@ -5579,23 +5796,23 @@ class HorizontalReduction {
case RK_Min:
Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
: Builder.CreateFCmpOLT(LHS, RHS);
- break;
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
case RK_Max:
Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
: Builder.CreateFCmpOGT(LHS, RHS);
- break;
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
case RK_UMin:
assert(Opcode == Instruction::ICmp && "Expected integer types.");
Cmp = Builder.CreateICmpULT(LHS, RHS);
- break;
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
case RK_UMax:
assert(Opcode == Instruction::ICmp && "Expected integer types.");
Cmp = Builder.CreateICmpUGT(LHS, RHS);
- break;
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
case RK_None:
- llvm_unreachable("Unknown reduction operation.");
+ break;
}
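+ // Only RK_None falls through the switch; every other kind returned above.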
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ llvm_unreachable("Unknown reduction operation.");
}
public:
@@ -6203,6 +6420,8 @@ public:
}
if (V.isTreeTinyAndNotFullyVectorizable())
break;
+ if (V.isLoadCombineReductionCandidate(ReductionData.getOpcode()))
+ break;
V.computeMinimumValueSizes();
@@ -6275,6 +6494,9 @@ public:
}
// Update users.
ReductionRoot->replaceAllUsesWith(VectorizedTree);
+ // Mark all scalar reduction ops for deletion; they are replaced by the
+ // vector reductions.
+ V.eraseInstructions(IgnoreList);
}
return VectorizedTree != nullptr;
}
@@ -6323,7 +6545,7 @@ private:
IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
- int ScalarReduxCost;
+ int ScalarReduxCost = 0;
switch (ReductionData.getKind()) {
case RK_Arithmetic:
ScalarReduxCost =
@@ -6429,10 +6651,9 @@ static bool findBuildVector(InsertElementInst *LastInsertElem,
/// \return true if it matches.
static bool findBuildAggregate(InsertValueInst *IV,
SmallVectorImpl<Value *> &BuildVectorOpds) {
- Value *V;
do {
BuildVectorOpds.push_back(IV->getInsertedValueOperand());
- V = IV->getAggregateOperand();
+ Value *V = IV->getAggregateOperand();
if (isa<UndefValue>(V))
break;
IV = dyn_cast<InsertValueInst>(V);
@@ -6530,18 +6751,13 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// horizontal reduction.
// Interrupt the process if the Root instruction itself was vectorized or all
// sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
- SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
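+ // Stack entries are (instruction, traversal depth) pairs.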
+ SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0});
SmallPtrSet<Value *, 8> VisitedInstrs;
bool Res = false;
while (!Stack.empty()) {
- Value *V;
+ Instruction *Inst;
unsigned Level;
- std::tie(V, Level) = Stack.pop_back_val();
- if (!V)
- continue;
- auto *Inst = dyn_cast<Instruction>(V);
- if (!Inst)
- continue;
+ std::tie(Inst, Level) = Stack.pop_back_val();
auto *BI = dyn_cast<BinaryOperator>(Inst);
auto *SI = dyn_cast<SelectInst>(Inst);
if (BI || SI) {
@@ -6582,8 +6798,8 @@ static bool tryToVectorizeHorReductionOrInstOperands(
for (auto *Op : Inst->operand_values())
if (VisitedInstrs.insert(Op).second)
if (auto *I = dyn_cast<Instruction>(Op))
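+ // Queue only non-PHI operands from this block that are not marked for deletion.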
- if (!isa<PHINode>(I) && I->getParent() == BB)
- Stack.emplace_back(Op, Level);
+ if (!isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB)
+ Stack.emplace_back(I, Level);
}
return Res;
}
@@ -6652,11 +6868,10 @@ bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
}
bool SLPVectorizerPass::vectorizeSimpleInstructions(
- SmallVectorImpl<WeakVH> &Instructions, BasicBlock *BB, BoUpSLP &R) {
+ SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) {
bool OpsChanged = false;
- for (auto &VH : reverse(Instructions)) {
- auto *I = dyn_cast_or_null<Instruction>(VH);
- if (!I)
+ for (auto *I : reverse(Instructions)) {
+ if (R.isDeleted(I))
continue;
if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
@@ -6685,7 +6900,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (!P)
break;
- if (!VisitedInstrs.count(P))
+ if (!VisitedInstrs.count(P) && !R.isDeleted(P))
Incoming.push_back(P);
}
@@ -6729,9 +6944,12 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
VisitedInstrs.clear();
- SmallVector<WeakVH, 8> PostProcessInstructions;
+ SmallVector<Instruction *, 8> PostProcessInstructions;
SmallDenseSet<Instruction *, 4> KeyNodes;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ // Skip instructions marked for deletion.
+ if (R.isDeleted(&*it))
+ continue;
// We may go through BB multiple times, so skip instructions we have already checked.
if (!VisitedInstrs.insert(&*it).second) {
if (it->use_empty() && KeyNodes.count(&*it) > 0 &&
@@ -6811,10 +7029,16 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
<< Entry.second.size() << ".\n");
- // We process the getelementptr list in chunks of 16 (like we do for
- // stores) to minimize compile-time.
- for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
- auto Len = std::min<unsigned>(BE - BI, 16);
+ // Process the GEP list in chunks suitable for the target's supported
+ // vector size. If a vector register can't hold 1 element, we are done.
+ unsigned MaxVecRegSize = R.getMaxVecRegSize();
+ unsigned EltSize = R.getVectorElementSize(Entry.second[0]);
+ if (MaxVecRegSize < EltSize)
+ continue;
+
+ unsigned MaxElts = MaxVecRegSize / EltSize;
+ for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
+ auto Len = std::min<unsigned>(BE - BI, MaxElts);
auto GEPList = makeArrayRef(&Entry.second[BI], Len);
// Initialize a set of candidate getelementptrs. Note that we use a
@@ -6824,10 +7048,10 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
// Some of the candidates may have already been vectorized after we
- // initially collected them. If so, the WeakTrackingVHs will have
- // nullified the
- // values, so remove them from the set of candidates.
- Candidates.remove(nullptr);
+ // initially collected them. If so, they are marked as deleted, so remove
+ // them from the set of candidates.
+ Candidates.remove_if(
+ [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); });
// Remove from the set of candidates all pairs of getelementptrs with
// constant differences. Such getelementptrs are likely not good
@@ -6835,18 +7059,18 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
// computed from the other. We also ensure all candidate getelementptr
// indices are unique.
for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
- auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
+ auto *GEPI = GEPList[I];
if (!Candidates.count(GEPI))
continue;
auto *SCEVI = SE->getSCEV(GEPList[I]);
for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
- auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
+ auto *GEPJ = GEPList[J];
auto *SCEVJ = SE->getSCEV(GEPList[J]);
if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
- Candidates.remove(GEPList[I]);
- Candidates.remove(GEPList[J]);
+ Candidates.remove(GEPI);
+ Candidates.remove(GEPJ);
} else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
- Candidates.remove(GEPList[J]);
+ Candidates.remove(GEPJ);
}
}
}