Diffstat (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/SLPVectorizer.cpp | 820
1 file changed, 522 insertions, 298 deletions
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 27a86c0bca91..974eff9974d9 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -194,10 +194,13 @@ static bool allSameBlock(ArrayRef<Value *> VL) { return true; } -/// \returns True if all of the values in \p VL are constants. +/// \returns True if all of the values in \p VL are constants (but not +/// globals/constant expressions). static bool allConstant(ArrayRef<Value *> VL) { + // Constant expressions and globals can't be vectorized like normal integer/FP + // constants. for (Value *i : VL) - if (!isa<Constant>(i)) + if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i)) return false; return true; } @@ -486,6 +489,7 @@ namespace slpvectorizer { /// Bottom Up SLP Vectorizer. class BoUpSLP { struct TreeEntry; + struct ScheduleData; public: using ValueList = SmallVector<Value *, 8>; @@ -614,6 +618,15 @@ public: /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable() const; + /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values + /// can be load combined in the backend. Load combining may not be allowed in + /// the IR optimizer, so we do not want to alter the pattern. For example, + /// partially transforming a scalar bswap() pattern into vector code is + /// effectively impossible for the backend to undo. + /// TODO: If load combining is allowed in the IR optimizer, this analysis + /// may not be necessary. + bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const; + OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -1117,6 +1130,14 @@ public: #endif }; + /// Checks if the instruction is marked for deletion. + bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } + + /// Marks values operands for later deletion by replacing them with Undefs. + void eraseInstructions(ArrayRef<Value *> AV); + + ~BoUpSLP(); + private: /// Checks if all users of \p I are the part of the vectorization tree. bool areAllUsersVectorized(Instruction *I) const; @@ -1153,8 +1174,7 @@ private: /// Set the Builder insert point to one after the last instruction in /// the bundle - void setInsertPointAfterBundle(ArrayRef<Value *> VL, - const InstructionsState &S); + void setInsertPointAfterBundle(TreeEntry *E); /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); @@ -1220,27 +1240,37 @@ private: /// reordering of operands during buildTree_rec() and vectorizeTree(). SmallVector<ValueList, 2> Operands; + /// The main/alternate instruction. + Instruction *MainOp = nullptr; + Instruction *AltOp = nullptr; + public: /// Set this bundle's \p OpIdx'th operand to \p OpVL. - void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL, - ArrayRef<unsigned> ReuseShuffleIndices) { + void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) { if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); assert(Operands[OpIdx].size() == 0 && "Already resized?"); Operands[OpIdx].resize(Scalars.size()); for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) - Operands[OpIdx][Lane] = (!ReuseShuffleIndices.empty()) - ? OpVL[ReuseShuffleIndices[Lane]] - : OpVL[Lane]; - } - - /// If there is a user TreeEntry, then set its operand. 
- void trySetUserTEOperand(const EdgeInfo &UserTreeIdx, - ArrayRef<Value *> OpVL, - ArrayRef<unsigned> ReuseShuffleIndices) { - if (UserTreeIdx.UserTE) - UserTreeIdx.UserTE->setOperand(UserTreeIdx.EdgeIdx, OpVL, - ReuseShuffleIndices); + Operands[OpIdx][Lane] = OpVL[Lane]; + } + + /// Set the operands of this bundle in their original order. + void setOperandsInOrder() { + assert(Operands.empty() && "Already initialized?"); + auto *I0 = cast<Instruction>(Scalars[0]); + Operands.resize(I0->getNumOperands()); + unsigned NumLanes = Scalars.size(); + for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); + OpIdx != NumOperands; ++OpIdx) { + Operands[OpIdx].resize(NumLanes); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + auto *I = cast<Instruction>(Scalars[Lane]); + assert(I->getNumOperands() == NumOperands && + "Expected same number of operands"); + Operands[OpIdx][Lane] = I->getOperand(OpIdx); + } + } } /// \returns the \p OpIdx operand of this TreeEntry. @@ -1249,6 +1279,9 @@ private: return Operands[OpIdx]; } + /// \returns the number of operands. + unsigned getNumOperands() const { return Operands.size(); } + /// \return the single \p OpIdx operand. Value *getSingleOperand(unsigned OpIdx) const { assert(OpIdx < Operands.size() && "Off bounds"); @@ -1256,6 +1289,58 @@ private: return Operands[OpIdx][0]; } + /// Some of the instructions in the list have alternate opcodes. + bool isAltShuffle() const { + return getOpcode() != getAltOpcode(); + } + + bool isOpcodeOrAlt(Instruction *I) const { + unsigned CheckedOpcode = I->getOpcode(); + return (getOpcode() == CheckedOpcode || + getAltOpcode() == CheckedOpcode); + } + + /// Chooses the correct key for scheduling data. If \p Op has the same (or + /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is + /// \p OpValue. + Value *isOneOf(Value *Op) const { + auto *I = dyn_cast<Instruction>(Op); + if (I && isOpcodeOrAlt(I)) + return Op; + return MainOp; + } + + void setOperations(const InstructionsState &S) { + MainOp = S.MainOp; + AltOp = S.AltOp; + } + + Instruction *getMainOp() const { + return MainOp; + } + + Instruction *getAltOp() const { + return AltOp; + } + + /// The main/alternate opcodes for the list of instructions. + unsigned getOpcode() const { + return MainOp ? MainOp->getOpcode() : 0; + } + + unsigned getAltOpcode() const { + return AltOp ? AltOp->getOpcode() : 0; + } + + /// Update operations state of this entry if reorder occurred. + bool updateStateIfReorder() { + if (ReorderIndices.empty()) + return false; + InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front()); + setOperations(S); + return true; + } + #ifndef NDEBUG /// Debug printer. 
LLVM_DUMP_METHOD void dump() const { @@ -1269,6 +1354,8 @@ private: for (Value *V : Scalars) dbgs().indent(2) << *V << "\n"; dbgs() << "NeedToGather: " << NeedToGather << "\n"; + dbgs() << "MainOp: " << *MainOp << "\n"; + dbgs() << "AltOp: " << *AltOp << "\n"; dbgs() << "VectorizedValue: "; if (VectorizedValue) dbgs() << *VectorizedValue; @@ -1279,12 +1366,12 @@ private: if (ReuseShuffleIndices.empty()) dbgs() << "Emtpy"; else - for (unsigned Idx : ReuseShuffleIndices) - dbgs() << Idx << ", "; + for (unsigned ReuseIdx : ReuseShuffleIndices) + dbgs() << ReuseIdx << ", "; dbgs() << "\n"; dbgs() << "ReorderIndices: "; - for (unsigned Idx : ReorderIndices) - dbgs() << Idx << ", "; + for (unsigned ReorderIdx : ReorderIndices) + dbgs() << ReorderIdx << ", "; dbgs() << "\n"; dbgs() << "UserTreeIndices: "; for (const auto &EInfo : UserTreeIndices) @@ -1295,11 +1382,13 @@ private: }; /// Create a new VectorizableTree entry. - TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle, + const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef<unsigned> ReuseShuffleIndices = None, ArrayRef<unsigned> ReorderIndices = None) { - VectorizableTree.push_back(llvm::make_unique<TreeEntry>(VectorizableTree)); + bool Vectorized = (bool)Bundle; + VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); @@ -1307,11 +1396,22 @@ private: Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; + Last->setOperations(S); if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = Last->Idx; - } + ScalarToTreeEntry[VL[i]] = Last; + } + // Update the scheduler bundle to point to this TreeEntry. + unsigned Lane = 0; + for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember; + BundleMember = BundleMember->NextInBundle) { + BundleMember->TE = Last; + BundleMember->Lane = Lane; + ++Lane; + } + assert((!Bundle.getValue() || Lane == VL.size()) && + "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); } @@ -1319,7 +1419,6 @@ private: if (UserTreeIdx.UserTE) Last->UserTreeIndices.push_back(UserTreeIdx); - Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices); return Last; } @@ -1340,19 +1439,19 @@ private: TreeEntry *getTreeEntry(Value *V) { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) - return VectorizableTree[I->second].get(); + return I->second; return nullptr; } const TreeEntry *getTreeEntry(Value *V) const { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) - return VectorizableTree[I->second].get(); + return I->second; return nullptr; } /// Maps a specific scalar to its tree entry. - SmallDenseMap<Value*, int> ScalarToTreeEntry; + SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -1408,15 +1507,14 @@ private: /// This is required to ensure that there are no incorrect collisions in the /// AliasCache, which can happen if a new instruction is allocated at the /// same address as a previously deleted instruction. 
- void eraseInstruction(Instruction *I) { - I->removeFromParent(); - I->dropAllReferences(); - DeletedInstructions.emplace_back(I); + void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) { + auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first; + It->getSecond() = It->getSecond() && ReplaceOpsWithUndef; } /// Temporary store for deleted instructions. Instructions will be deleted /// eventually when the BoUpSLP is destructed. - SmallVector<unique_value, 8> DeletedInstructions; + DenseMap<Instruction *, bool> DeletedInstructions; /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). External User @@ -1453,6 +1551,8 @@ private: UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); OpValue = OpVal; + TE = nullptr; + Lane = -1; } /// Returns true if the dependency information has been calculated. @@ -1559,6 +1659,12 @@ private: /// Opcode of the current instruction in the schedule data. Value *OpValue = nullptr; + + /// The TreeEntry that this instruction corresponds to. + TreeEntry *TE = nullptr; + + /// The lane of this node in the TreeEntry. + int Lane = -1; }; #ifndef NDEBUG @@ -1633,10 +1739,9 @@ private: continue; } // Handle the def-use chain dependencies. - for (Use &U : BundleMember->Inst->operands()) { - auto *I = dyn_cast<Instruction>(U.get()); - if (!I) - continue; + + // Decrement the unscheduled counter and insert to ready list if ready. + auto &&DecrUnsched = [this, &ReadyList](Instruction *I) { doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { if (OpDef && OpDef->hasValidDependencies() && OpDef->incrementUnscheduledDeps(-1) == 0) { @@ -1651,6 +1756,24 @@ private: << "SLP: gets ready (def): " << *DepBundle << "\n"); } }); + }; + + // If BundleMember is a vector bundle, its operands may have been + // reordered duiring buildTree(). We therefore need to get its operands + // through the TreeEntry. + if (TreeEntry *TE = BundleMember->TE) { + int Lane = BundleMember->Lane; + assert(Lane >= 0 && "Lane not set"); + for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); + OpIdx != NumOperands; ++OpIdx) + if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane])) + DecrUnsched(I); + } else { + // If BundleMember is a stand-alone instruction, no operand reordering + // has taken place, so we directly access its operands. + for (Use &U : BundleMember->Inst->operands()) + if (auto *I = dyn_cast<Instruction>(U.get())) + DecrUnsched(I); } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { @@ -1697,8 +1820,11 @@ private: /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. - bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, - const InstructionsState &S); + /// \returns the scheduling bundle. The returned Optional value is non-None + /// if \p VL is allowed to be scheduled. + Optional<ScheduleData *> + tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, + const InstructionsState &S); /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue); @@ -1945,6 +2071,30 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { } // end namespace llvm +BoUpSLP::~BoUpSLP() { + for (const auto &Pair : DeletedInstructions) { + // Replace operands of ignored instructions with Undefs in case if they were + // marked for deletion. 
+ if (Pair.getSecond()) { + Value *Undef = UndefValue::get(Pair.getFirst()->getType()); + Pair.getFirst()->replaceAllUsesWith(Undef); + } + Pair.getFirst()->dropAllReferences(); + } + for (const auto &Pair : DeletedInstructions) { + assert(Pair.getFirst()->use_empty() && + "trying to erase instruction with users."); + Pair.getFirst()->eraseFromParent(); + } +} + +void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { + for (auto *V : AV) { + if (auto *I = dyn_cast<Instruction>(V)) + eraseInstruction(I, /*ReplaceWithUndef=*/true); + }; +} + void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ArrayRef<Value *> UserIgnoreLst) { ExtraValueToDebugLocsMap ExternallyUsedValues; @@ -2026,28 +2176,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } @@ -2055,11 +2205,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // the same block. // Don't vectorize ephemeral values. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (EphValues.count(VL[i])) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + for (Value *V : VL) { + if (EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2069,7 +2219,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // Record the reuse of the tree node. FIXME, currently this is only used to @@ -2077,19 +2227,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, E->UserTreeIndices.push_back(UserTreeIdx); LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n"); - E->trySetUserTEOperand(UserTreeIdx, VL, None); return; } // Check that none of the instructions in the bundle are already in the tree. 
- for (unsigned i = 0, e = VL.size(); i != e; ++i) { - auto *I = dyn_cast<Instruction>(VL[i]); + for (Value *V : VL) { + auto *I = dyn_cast<Instruction>(V); if (!I) continue; if (getTreeEntry(I)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2097,10 +2246,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { + for (Value *V : VL) { + if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2114,7 +2263,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } @@ -2128,13 +2277,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (Res.second) UniqueValues.emplace_back(V); } - if (UniqueValues.size() == VL.size()) { + size_t NumUniqueScalarValues = UniqueValues.size(); + if (NumUniqueScalarValues == VL.size()) { ReuseShuffleIndicies.clear(); } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { + if (NumUniqueScalarValues <= 1 || + !llvm::isPowerOf2_32(NumUniqueScalarValues)) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } VL = UniqueValues; @@ -2142,16 +2293,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto &BSRef = BlocksSchedules[BB]; if (!BSRef) - BSRef = llvm::make_unique<BlockScheduling>(BB); + BSRef = std::make_unique<BlockScheduling>(BB); BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this, S)) { + Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S); + if (!Bundle) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -2160,7 +2313,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, (unsigned) Instruction::ShuffleVector : S.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); + auto *PH = cast<PHINode>(VL0); // Check for terminator values (e.g. invoke). 
for (unsigned j = 0; j < VL.size(); ++j) @@ -2172,23 +2325,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = + newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); + // Keeps the reordered operands to avoid code duplication. + SmallVector<ValueList, 2> OperandsVec; for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - - buildTree_rec(Operands, Depth + 1, {TE, i}); + TE->setOperand(i, Operands); + OperandsVec.push_back(Operands); } + for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) + buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx}); return; } case Instruction::ExtractValue: @@ -2198,13 +2357,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); + VectorizableTree.back()->setOperand(0, Op0); return; } if (!CurrentOrder.empty()) { @@ -2220,17 +2379,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. 
ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); + VectorizableTree.back()->setOperand(0, Op0); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); BS.cancelScheduling(VL, VL0); return; } @@ -2246,7 +2407,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -2259,7 +2421,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto *L = cast<LoadInst>(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -2289,15 +2452,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, - ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, - ReuseShuffleIndicies, I->getFirst()); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, I->getFirst()); + TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } return; @@ -2306,7 +2472,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -2322,24 +2489,27 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - for (unsigned i = 0; i < VL.size(); ++i) { - Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); + for (Value *V : VL) { + Type *Ty = cast<Instruction>(V)->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // 
Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2351,19 +2521,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); Type *ComparedTy = VL0->getOperand(0)->getType(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast<CmpInst>(VL[i]); + for (Value *V : VL) { + CmpInst *Cmp = cast<CmpInst>(V); if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); ValueList Left, Right; @@ -2384,7 +2556,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Right.push_back(RHS); } } - + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; @@ -2409,7 +2582,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::And: case Instruction::Or: case Instruction::Xor: { - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -2417,11 +2591,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; } + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -2434,11 +2611,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. - for (unsigned j = 0; j < VL.size(); ++j) { - if (cast<Instruction>(VL[j])->getNumOperands() != 2) { + for (Value *V : VL) { + if (cast<Instruction>(V)->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } @@ -2446,58 +2624,64 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // We can't combine several GEPs into one vector if they operate on // different types. 
Type *Ty0 = VL0->getOperand(0)->getType(); - for (unsigned j = 0; j < VL.size(); ++j) { - Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType(); + for (Value *V : VL) { + Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } // We don't combine GEPs with non-constant indexes. - for (unsigned j = 0; j < VL.size(); ++j) { - auto Op = cast<Instruction>(VL[j])->getOperand(1); + for (Value *V : VL) { + auto Op = cast<Instruction>(V)->getOperand(1); if (!isa<ConstantInt>(Op)) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); + TE->setOperandsInOrder(); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } case Instruction::Store: { - // Check if the stores are consecutive or of we need to swizzle them. + // Check if the stores are consecutive or if we need to swizzle them. 
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(0)); - + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(0)); + TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); return; } @@ -2509,7 +2693,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -2519,14 +2704,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned j = 0; j != NumArgs; ++j) if (hasVectorInstrinsicScalarOpd(ID, j)) ScalarArgs[j] = CI->getArgOperand(j); - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - CallInst *CI2 = dyn_cast<CallInst>(VL[i]); + for (Value *V : VL) { + CallInst *CI2 = dyn_cast<CallInst>(V); if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V << "\n"); return; } @@ -2537,7 +2723,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Value *A1J = CI2->getArgOperand(j); if (ScalarArgs[j] != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << ScalarArgs[j] << "!=" << A1J << "\n"); @@ -2551,19 +2738,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" - << *CI << "!=" << *VL[i] << '\n'); + << *CI << "!=" << *V << '\n'); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + TE->setOperandsInOrder(); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. 
- for (Value *j : VL) { - CallInst *CI2 = dyn_cast<CallInst>(j); + for (Value *V : VL) { + auto *CI2 = cast<CallInst>(V); Operands.push_back(CI2->getArgOperand(i)); } buildTree_rec(Operands, Depth + 1, {TE, i}); @@ -2575,27 +2765,32 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // then do not vectorize this instruction. if (!S.isAltShuffle()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; } + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2603,7 +2798,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -2738,7 +2934,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && + if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && allSameBlock(VL)) { Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { @@ -2761,11 +2957,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return ReuseShuffleCost + getGatherCost(VL); } - InstructionsState S = getSameOpcode(VL); - assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast<Instruction>(S.OpValue); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + Instruction *VL0 = E->getMainOp(); + unsigned ShuffleOrOp = + E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: return 0; @@ -2851,7 +3046,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } @@ -2864,7 +3059,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Check if the values are candidates to demote. 
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0); } return VecCost - ScalarCost; } @@ -2872,14 +3067,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. - int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, + int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::FNeg: @@ -2940,12 +3135,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { SmallVector<const Value *, 4> Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, + int VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); return ReuseShuffleCost + VecCost - ScalarCost; } @@ -3027,11 +3222,11 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->isAltShuffle() && + ((Instruction::isBinaryOp(E->getOpcode()) && + Instruction::isBinaryOp(E->getAltOpcode())) || + (Instruction::isCast(E->getOpcode()) && + Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { @@ -3046,25 +3241,25 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { I, TargetTransformInfo::TCK_RecipThroughput); } } - for (Value *i : VL) { - Instruction *I = cast<Instruction>(i); - assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + for (Value *V : VL) { + Instruction *I = cast<Instruction>(V); + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. 
int VecCost = 0; - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + if (Instruction::isBinaryOp(E->getOpcode())) { + VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy); + VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy); } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); + Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType(); + Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -3098,6 +3293,43 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { return true; } +bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const { + if (RdxOpcode != Instruction::Or) + return false; + + unsigned NumElts = VectorizableTree[0]->Scalars.size(); + Value *FirstReduced = VectorizableTree[0]->Scalars[0]; + + // Look past the reduction to find a source value. Arbitrarily follow the + // path through operand 0 of any 'or'. Also, peek through optional + // shift-left-by-constant. + Value *ZextLoad = FirstReduced; + while (match(ZextLoad, m_Or(m_Value(), m_Value())) || + match(ZextLoad, m_Shl(m_Value(), m_Constant()))) + ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0); + + // Check if the input to the reduction is an extended load. + Value *LoadPtr; + if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + return false; + + // Require that the total load bit width is a legal integer type. + // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. + // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. + Type *SrcTy = LoadPtr->getType()->getPointerElementType(); + unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; + LLVMContext &Context = FirstReduced->getContext(); + if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth))) + return false; + + // Everything matched - assume that we can fold the whole sequence using + // load combining. + LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of " + << *(cast<Instruction>(FirstReduced)) << "\n"); + + return true; +} + bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. @@ -3319,16 +3551,16 @@ void BoUpSLP::reorderInputsAccordingToOpcode( Right = Ops.getVL(1); } -void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, - const InstructionsState &S) { +void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. 
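A minimal sketch (not part of the patch) of the scalar pattern that the new isLoadCombineReductionCandidate() above is meant to protect: an 'or'-reduction of zero-extended, constant-shifted byte loads that the backend can fold into a single wide load, so SLP should leave it alone rather than partially vectorize it. The function name load_le32 and the 4-byte width are illustrative assumptions, not taken from the patch.

#include <stdint.h>

// Four i8 loads, zero-extended and shifted into one i32 via 'or'.
// 8 bits * 4 lanes = i32, a legal integer type, so the backend is expected
// to combine this into a single 32-bit load; vectorizing the 'or' reduction
// here would only obscure that pattern.
uint32_t load_le32(const uint8_t *p) {
  return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
         ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}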
- auto *Front = cast<Instruction>(S.OpValue); + auto *Front = E->getMainOp(); auto *BB = Front->getParent(); - assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { - auto *I = cast<Instruction>(V); - return !S.isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()), + [=](Value *V) -> bool { + auto *I = cast<Instruction>(V); + return !E->isOpcodeOrAlt(I) || I->getParent() == BB; + })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -3339,7 +3571,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); + BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -3365,14 +3597,15 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). if (!LastInst) { - SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end()); + SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I)) + if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I)) LastInst = &I; if (Bundle.empty()) break; } } + assert(LastInst && "Failed to find last instruction in bundle"); // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. @@ -3385,7 +3618,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { // Generate the 'InsertElement' instruction. for (unsigned i = 0; i < Ty->getNumElements(); ++i) { Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); - if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) { + if (auto *Insrt = dyn_cast<InsertElementInst>(Vec)) { GatherSeq.insert(Insrt); CSEBlocks.insert(Insrt->getParent()); @@ -3494,8 +3727,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast<Instruction>(S.OpValue); + Instruction *VL0 = E->getMainOp(); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -3504,7 +3736,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3518,11 +3750,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = + E->isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : E->getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); + auto *PH = cast<PHINode>(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); @@ -3577,7 +3809,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3612,7 +3844,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = NewV; return NewV; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3637,7 +3869,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *InVec = vectorizeTree(E->getOperand(0)); @@ -3646,7 +3878,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } - CastInst *CI = dyn_cast<CastInst>(VL0); + auto *CI = cast<CastInst>(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3658,7 +3890,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::FCmp: case Instruction::ICmp: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *L = vectorizeTree(E->getOperand(0)); Value *R = vectorizeTree(E->getOperand(1)); @@ -3670,7 +3902,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V; - if (S.getOpcode() == Instruction::FCmp) + if (E->getOpcode() == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3685,7 +3917,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Select: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Cond = vectorizeTree(E->getOperand(0)); Value *True = vectorizeTree(E->getOperand(1)); @@ -3706,7 +3938,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::FNeg: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Op = vectorizeTree(E->getOperand(0)); @@ -3716,7 +3948,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateUnOp( - static_cast<Instruction::UnaryOps>(S.getOpcode()), Op); + static_cast<Instruction::UnaryOps>(E->getOpcode()), Op); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); @@ -3748,7 +3980,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *LHS = vectorizeTree(E->getOperand(0)); Value *RHS = vectorizeTree(E->getOperand(1)); @@ -3759,7 +3991,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS); + static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, + RHS); propagateIRFlags(V, 
E->Scalars, VL0); if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); @@ -3776,12 +4009,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. - bool IsReorder = !E->ReorderIndices.empty(); - if (IsReorder) { - S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); - VL0 = cast<Instruction>(S.OpValue); - } - setInsertPointAfterBundle(E->Scalars, S); + bool IsReorder = E->updateStateIfReorder(); + if (IsReorder) + VL0 = E->getMainOp(); + setInsertPointAfterBundle(E); LoadInst *LI = cast<LoadInst>(VL0); Type *ScalarLoadTy = LI->getType(); @@ -3797,11 +4028,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (getTreeEntry(PO)) ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); - unsigned Alignment = LI->getAlignment(); + MaybeAlign Alignment = MaybeAlign(LI->getAlignment()); LI = Builder.CreateLoad(VecTy, VecPtr); - if (!Alignment) { - Alignment = DL->getABITypeAlignment(ScalarLoadTy); - } + if (!Alignment) + Alignment = MaybeAlign(DL->getABITypeAlignment(ScalarLoadTy)); LI->setAlignment(Alignment); Value *V = propagateMetadata(LI, E->Scalars); if (IsReorder) { @@ -3824,7 +4054,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *VecValue = vectorizeTree(E->getOperand(0)); Value *ScalarPtr = SI->getPointerOperand(); @@ -3840,7 +4070,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (!Alignment) Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); - ST->setAlignment(Alignment); + ST->setAlignment(Align(Alignment)); Value *V = propagateMetadata(ST, E->Scalars); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3851,7 +4081,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Op0 = vectorizeTree(E->getOperand(0)); @@ -3878,13 +4108,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::Call: { CallInst *CI = cast<CallInst>(VL0); - setInsertPointAfterBundle(E->Scalars, S); - Function *FI; + setInsertPointAfterBundle(E); + Intrinsic::ID IID = Intrinsic::not_intrinsic; - Value *ScalarArg = nullptr; - if (CI && (FI = CI->getCalledFunction())) { + if (Function *FI = CI->getCalledFunction()) IID = FI->getIntrinsicID(); - } + + Value *ScalarArg = nullptr; std::vector<Value *> OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { ValueList OpVL; @@ -3926,20 +4156,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->isAltShuffle() && + ((Instruction::isBinaryOp(E->getOpcode()) && + Instruction::isBinaryOp(E->getAltOpcode())) || + (Instruction::isCast(E->getOpcode()) && + Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); - Value *LHS, *RHS; - if (Instruction::isBinaryOp(S.getOpcode())) { - setInsertPointAfterBundle(E->Scalars, S); + Value *LHS = nullptr, *RHS = nullptr; + if (Instruction::isBinaryOp(E->getOpcode())) { + 
setInsertPointAfterBundle(E); LHS = vectorizeTree(E->getOperand(0)); RHS = vectorizeTree(E->getOperand(1)); } else { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); LHS = vectorizeTree(E->getOperand(0)); } @@ -3949,16 +4179,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V0, *V1; - if (Instruction::isBinaryOp(S.getOpcode())) { + if (Instruction::isBinaryOp(E->getOpcode())) { V0 = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS); + static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS); V1 = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.getAltOpcode()), LHS, RHS); + static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS); } else { V0 = Builder.CreateCast( - static_cast<Instruction::CastOps>(S.getOpcode()), LHS, VecTy); + static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy); V1 = Builder.CreateCast( - static_cast<Instruction::CastOps>(S.getAltOpcode()), LHS, VecTy); + static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); } // Create shuffle to take alternate operations from the vector. @@ -3969,8 +4199,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector<Constant *, 8> Mask(e); for (unsigned i = 0; i < e; ++i) { auto *OpInst = cast<Instruction>(E->Scalars[i]); - assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == S.getAltOpcode()) { + assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); + if (OpInst->getOpcode() == E->getAltOpcode()) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { @@ -4136,20 +4366,18 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; +#ifndef NDEBUG Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { -#ifndef NDEBUG for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - // It is legal to replace users in the ignorelist by undef. + // It is legal to delete users in the ignorelist. assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && - "Replacing out-of-tree value with undef"); + "Deleting out-of-tree value"); } -#endif - Value *Undef = UndefValue::get(Ty); - Scalar->replaceAllUsesWith(Undef); } +#endif LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); eraseInstruction(cast<Instruction>(Scalar)); } @@ -4165,7 +4393,7 @@ void BoUpSLP::optimizeGatherSequence() { << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. for (Instruction *I : GatherSeq) { - if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I)) + if (isDeleted(I)) continue; // Check if this block is inside a loop. @@ -4219,6 +4447,8 @@ void BoUpSLP::optimizeGatherSequence() { // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { Instruction *In = &*it++; + if (isDeleted(In)) + continue; if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) continue; @@ -4245,11 +4475,11 @@ void BoUpSLP::optimizeGatherSequence() { // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. 
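As the buildTree_rec() changes above show, tryScheduleBundle() now reports success by returning the scheduling bundle itself instead of a bool; a condensed sketch of the caller pattern, taken from the diff above (not a standalone compilable unit):

Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
if (!Bundle) {
  // Scheduling failed: record this group as a gather (not vectorized).
  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies);
  return;
}
// Success: pass the bundle into newTreeEntry() so each ScheduleData member
// can be linked back to its TreeEntry and lane for later operand lookup.
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies);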
-bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
-                                                 BoUpSLP *SLP,
-                                                 const InstructionsState &S) {
+Optional<BoUpSLP::ScheduleData *>
+BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+                                            const InstructionsState &S) {
   if (isa<PHINode>(S.OpValue))
-    return true;
+    return nullptr;

   // Initialize the instruction bundle.
   Instruction *OldScheduleEnd = ScheduleEnd;
@@ -4262,7 +4492,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
   // instructions of the bundle.
   for (Value *V : VL) {
     if (!extendSchedulingRegion(V, S))
-      return false;
+      return None;
   }

   for (Value *V : VL) {
@@ -4308,6 +4538,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
     resetSchedule();
     initialFillReadyList(ReadyInsts);
   }
+  assert(Bundle && "Failed to find schedule bundle");

   LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                     << " in block " << BB->getName() << "\n");
@@ -4329,9 +4560,9 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
   }
   if (!Bundle->isReady()) {
     cancelScheduling(VL, S.OpValue);
-    return false;
+    return None;
   }
-  return true;
+  return Bundle;
 }

 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
@@ -4364,7 +4595,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
 BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
   // Allocate a new ScheduleData for the instruction.
   if (ChunkPos >= ChunkSize) {
-    ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize));
+    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
     ChunkPos = 0;
   }
   return &(ScheduleDataChunks.back()[ChunkPos++]);
@@ -4977,7 +5208,7 @@ struct SLPVectorizer : public FunctionPass {
     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
-    auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -5052,7 +5283,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
   // If the target claims to have no vector registers don't attempt
   // vectorization.
-  if (!TTI->getNumberOfRegisters(true))
+  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)))
     return false;

   // Don't vectorize when the attribute NoImplicitFloat is used.
@@ -5100,19 +5331,6 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
   return Changed;
 }

-/// Check that the Values in the slice in VL array are still existent in
-/// the WeakTrackingVH array.
-/// Vectorization of part of the VL array may cause later values in the VL array
-/// to become invalid. We track when this has happened in the WeakTrackingVH
-/// array.
-static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
-                               ArrayRef<WeakTrackingVH> VH, unsigned SliceBegin,
-                               unsigned SliceSize) {
-  VL = VL.slice(SliceBegin, SliceSize);
-  VH = VH.slice(SliceBegin, SliceSize);
-  return !std::equal(VL.begin(), VL.end(), VH.begin());
-}
-
 bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                             unsigned VecRegSize) {
   const unsigned ChainLen = Chain.size();
@@ -5124,20 +5342,20 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   if (!isPowerOf2_32(Sz) || VF < 2)
     return false;

-  // Keep track of values that were deleted by vectorizing in the loop below.
-  const SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
-
   bool Changed = false;
   // Look for profitable vectorizable trees at all offsets, starting at zero.
   for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {
+    ArrayRef<Value *> Operands = Chain.slice(i, VF);
     // Check that a previous iteration of this loop did not delete the Value.
-    if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
+    if (llvm::any_of(Operands, [&R](Value *V) {
+          auto *I = dyn_cast<Instruction>(V);
+          return I && R.isDeleted(I);
+        }))
       continue;

     LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
                       << "\n");
-    ArrayRef<Value *> Operands = Chain.slice(i, VF);

     R.buildTree(Operands);
     if (R.isTreeTinyAndNotFullyVectorizable())
@@ -5329,12 +5547,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   bool CandidateFound = false;
   int MinCost = SLPCostThreshold;

-  // Keep track of values that were deleted by vectorizing in the loop below.
-  SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end());
-
   unsigned NextInst = 0, MaxInst = VL.size();
-  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
-       VF /= 2) {
+  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
     // No actual vectorization should happen, if number of parts is the same as
     // provided vectorization factor (i.e. the scalar type is used for vector
     // code during codegen).
@@ -5352,13 +5566,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
       if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
         break;

+      ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
       // Check that a previous iteration of this loop did not delete the Value.
-      if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
+      if (llvm::any_of(Ops, [&R](Value *V) {
+            auto *I = dyn_cast<Instruction>(V);
+            return I && R.isDeleted(I);
+          }))
        continue;

       LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
                         << "\n");
-      ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);

       R.buildTree(Ops);
       Optional<ArrayRef<unsigned>> Order = R.bestOrder();
@@ -5571,7 +5788,7 @@ class HorizontalReduction {
   Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {
     assert(isVectorizable() &&
            "Expected add|fadd or min/max reduction operation.");
-    Value *Cmp;
+    Value *Cmp = nullptr;
     switch (Kind) {
     case RK_Arithmetic:
       return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
@@ -5579,23 +5796,23 @@ class HorizontalReduction {
     case RK_Min:
       Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
                                         : Builder.CreateFCmpOLT(LHS, RHS);
-      break;
+      return Builder.CreateSelect(Cmp, LHS, RHS, Name);
     case RK_Max:
       Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
                                         : Builder.CreateFCmpOGT(LHS, RHS);
-      break;
+      return Builder.CreateSelect(Cmp, LHS, RHS, Name);
     case RK_UMin:
       assert(Opcode == Instruction::ICmp && "Expected integer types.");
       Cmp = Builder.CreateICmpULT(LHS, RHS);
-      break;
+      return Builder.CreateSelect(Cmp, LHS, RHS, Name);
     case RK_UMax:
       assert(Opcode == Instruction::ICmp && "Expected integer types.");
       Cmp = Builder.CreateICmpUGT(LHS, RHS);
-      break;
+      return Builder.CreateSelect(Cmp, LHS, RHS, Name);
     case RK_None:
-      llvm_unreachable("Unknown reduction operation.");
+      break;
     }
-    return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+    llvm_unreachable("Unknown reduction operation.");
   }

 public:
@@ -6203,6 +6420,8 @@ public:
       }
       if (V.isTreeTinyAndNotFullyVectorizable())
         break;
+      if (V.isLoadCombineReductionCandidate(ReductionData.getOpcode()))
+        break;

       V.computeMinimumValueSizes();

@@ -6275,6 +6494,9 @@ public:
       }
       // Update users.
       ReductionRoot->replaceAllUsesWith(VectorizedTree);
+      // Mark all scalar reduction ops for deletion, they are replaced by the
+      // vector reductions.
+      V.eraseInstructions(IgnoreList);
     }
     return VectorizedTree != nullptr;
   }
@@ -6323,7 +6545,7 @@ private:
     IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
     int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;

-    int ScalarReduxCost;
+    int ScalarReduxCost = 0;
     switch (ReductionData.getKind()) {
     case RK_Arithmetic:
       ScalarReduxCost =
@@ -6429,10 +6651,9 @@ static bool findBuildVector(InsertElementInst *LastInsertElem,
 /// \return true if it matches.
 static bool findBuildAggregate(InsertValueInst *IV,
                                SmallVectorImpl<Value *> &BuildVectorOpds) {
-  Value *V;
   do {
     BuildVectorOpds.push_back(IV->getInsertedValueOperand());
-    V = IV->getAggregateOperand();
+    Value *V = IV->getAggregateOperand();
     if (isa<UndefValue>(V))
       break;
     IV = dyn_cast<InsertValueInst>(V);
@@ -6530,18 +6751,13 @@ static bool tryToVectorizeHorReductionOrInstOperands(
   // horizontal reduction.
   // Interrupt the process if the Root instruction itself was vectorized or all
   // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
-  SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
+  SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0});
   SmallPtrSet<Value *, 8> VisitedInstrs;
   bool Res = false;
   while (!Stack.empty()) {
-    Value *V;
+    Instruction *Inst;
     unsigned Level;
-    std::tie(V, Level) = Stack.pop_back_val();
-    if (!V)
-      continue;
-    auto *Inst = dyn_cast<Instruction>(V);
-    if (!Inst)
-      continue;
+    std::tie(Inst, Level) = Stack.pop_back_val();
     auto *BI = dyn_cast<BinaryOperator>(Inst);
     auto *SI = dyn_cast<SelectInst>(Inst);
     if (BI || SI) {
@@ -6582,8 +6798,8 @@ static bool tryToVectorizeHorReductionOrInstOperands(
     for (auto *Op : Inst->operand_values())
       if (VisitedInstrs.insert(Op).second)
         if (auto *I = dyn_cast<Instruction>(Op))
-          if (!isa<PHINode>(I) && I->getParent() == BB)
-            Stack.emplace_back(Op, Level);
+          if (!isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB)
+            Stack.emplace_back(I, Level);
   }
   return Res;
 }
@@ -6652,11 +6868,10 @@ bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
 }

 bool SLPVectorizerPass::vectorizeSimpleInstructions(
-    SmallVectorImpl<WeakVH> &Instructions, BasicBlock *BB, BoUpSLP &R) {
+    SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) {
   bool OpsChanged = false;
-  for (auto &VH : reverse(Instructions)) {
-    auto *I = dyn_cast_or_null<Instruction>(VH);
-    if (!I)
+  for (auto *I : reverse(Instructions)) {
+    if (R.isDeleted(I))
       continue;
     if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
       OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
@@ -6685,7 +6900,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       if (!P)
        break;

-      if (!VisitedInstrs.count(P))
+      if (!VisitedInstrs.count(P) && !R.isDeleted(P))
        Incoming.push_back(P);
     }

@@ -6729,9 +6944,12 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   VisitedInstrs.clear();

-  SmallVector<WeakVH, 8> PostProcessInstructions;
+  SmallVector<Instruction *, 8> PostProcessInstructions;
   SmallDenseSet<Instruction *, 4> KeyNodes;
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    // Skip instructions marked for the deletion.
+    if (R.isDeleted(&*it))
+      continue;
     // We may go through BB multiple times so skip the one we have checked.
     if (!VisitedInstrs.insert(&*it).second) {
       if (it->use_empty() && KeyNodes.count(&*it) > 0 &&
@@ -6811,10 +7029,16 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
     LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                       << Entry.second.size() << ".\n");

-    // We process the getelementptr list in chunks of 16 (like we do for
-    // stores) to minimize compile-time.
-    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
-      auto Len = std::min<unsigned>(BE - BI, 16);
+    // Process the GEP list in chunks suitable for the target's supported
+    // vector size. If a vector register can't hold 1 element, we are done.
+    unsigned MaxVecRegSize = R.getMaxVecRegSize();
+    unsigned EltSize = R.getVectorElementSize(Entry.second[0]);
+    if (MaxVecRegSize < EltSize)
+      continue;
+
+    unsigned MaxElts = MaxVecRegSize / EltSize;
+    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
+      auto Len = std::min<unsigned>(BE - BI, MaxElts);
       auto GEPList = makeArrayRef(&Entry.second[BI], Len);

       // Initialize a set a candidate getelementptrs. Note that we use a
@@ -6824,10 +7048,10 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
       SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

       // Some of the candidates may have already been vectorized after we
-      // initially collected them. If so, the WeakTrackingVHs will have
-      // nullified the
-      // values, so remove them from the set of candidates.
-      Candidates.remove(nullptr);
+      // initially collected them. If so, they are marked as deleted, so remove
+      // them from the set of candidates.
+      Candidates.remove_if(
+          [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); });

       // Remove from the set of candidates all pairs of getelementptrs with
       // constant differences. Such getelementptrs are likely not good
       // computed from the other. We also ensure all candidate getelementptr
       // indices are unique.
       for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
-        auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
+        auto *GEPI = GEPList[I];
         if (!Candidates.count(GEPI))
           continue;
         auto *SCEVI = SE->getSCEV(GEPList[I]);
         for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
-          auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
+          auto *GEPJ = GEPList[J];
           auto *SCEVJ = SE->getSCEV(GEPList[J]);
           if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
-            Candidates.remove(GEPList[I]);
-            Candidates.remove(GEPList[J]);
+            Candidates.remove(GEPI);
+            Candidates.remove(GEPJ);
           } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
-            Candidates.remove(GEPList[J]);
+            Candidates.remove(GEPJ);
           }
         }
       }