author    | Dimitry Andric <dim@FreeBSD.org> | 2021-12-25 22:36:56 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2022-05-14 11:44:01 +0000
commit    | 0eae32dcef82f6f06de6419a0d623d7def0cc8f6 (patch)
tree      | 55b7e05be47b835fd137915bee1e64026c35e71c /contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
parent    | 4824e7fd18a1223177218d4aec1b3c6c5c4a444e (diff)
parent    | 77fc4c146f0870ffb09c1afb823ccbe742c5e6ff (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 633
1 file changed, 432 insertions(+), 201 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 95061e9053fa..37ae13666f7a 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -631,27 +631,26 @@ static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
 /// after:  6 3 5 4 7 2 1 0
 static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {
   const unsigned Sz = Order.size();
-  SmallBitVector UsedIndices(Sz);
-  SmallVector<int> MaskedIndices;
+  SmallBitVector UnusedIndices(Sz, /*t=*/true);
+  SmallBitVector MaskedIndices(Sz);
   for (unsigned I = 0; I < Sz; ++I) {
     if (Order[I] < Sz)
-      UsedIndices.set(Order[I]);
+      UnusedIndices.reset(Order[I]);
     else
-      MaskedIndices.push_back(I);
+      MaskedIndices.set(I);
   }
-  if (MaskedIndices.empty())
+  if (MaskedIndices.none())
     return;
-  SmallVector<int> AvailableIndices(MaskedIndices.size());
-  unsigned Cnt = 0;
-  int Idx = UsedIndices.find_first();
-  do {
-    AvailableIndices[Cnt] = Idx;
-    Idx = UsedIndices.find_next(Idx);
-    ++Cnt;
-  } while (Idx > 0);
-  assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices.");
-  for (int I = 0, E = MaskedIndices.size(); I < E; ++I)
-    Order[MaskedIndices[I]] = AvailableIndices[I];
+  assert(UnusedIndices.count() == MaskedIndices.count() &&
+         "Non-synced masked/available indices.");
+  int Idx = UnusedIndices.find_first();
+  int MIdx = MaskedIndices.find_first();
+  while (MIdx >= 0) {
+    assert(Idx >= 0 && "Indices must be synced.");
+    Order[MIdx] = Idx;
+    Idx = UnusedIndices.find_next(Idx);
+    MIdx = MaskedIndices.find_next(MIdx);
+  }
 }
 
 namespace llvm {
@@ -812,6 +811,13 @@ public:
   /// ExtractElement, ExtractValue), which can be part of the graph.
   Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
 
+  /// Gets reordering data for the given tree entry. If the entry is vectorized
+  /// - just return ReorderIndices, otherwise check if the scalars can be
+  /// reordered and return the most optimal order.
+  /// \param TopToBottom If true, include the order of vectorized stores and
+  /// insertelement nodes, otherwise skip them.
+  Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
+
   /// Reorders the current graph to the most profitable order starting from the
   /// root node to the leaf nodes. The best order is chosen only from the nodes
   /// of the same size (vectorization factor). Smaller nodes are considered
@@ -1010,18 +1016,25 @@ public:
       std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
     }
 
-    // The hard-coded scores listed here are not very important. When computing
-    // the scores of matching one sub-tree with another, we are basically
-    // counting the number of values that are matching. So even if all scores
-    // are set to 1, we would still get a decent matching result.
+    // The hard-coded scores listed here are not very important, though it shall
+    // be higher for better matches to improve the resulting cost. When
+    // computing the scores of matching one sub-tree with another, we are
+    // basically counting the number of values that are matching. So even if all
+    // scores are set to 1, we would still get a decent matching result.
     // However, sometimes we have to break ties. For example we may have to
     // choose between matching loads vs matching opcodes. This is what these
-    // scores are helping us with: they provide the order of preference.
+    // scores are helping us with: they provide the order of preference. Also,
+    // this is important if the scalar is externally used or used in another
+    // tree entry node in the different lane.
 
     /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
-    static const int ScoreConsecutiveLoads = 3;
+    static const int ScoreConsecutiveLoads = 4;
+    /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
+    static const int ScoreReversedLoads = 3;
     /// ExtractElementInst from same vector and consecutive indexes.
-    static const int ScoreConsecutiveExtracts = 3;
+    static const int ScoreConsecutiveExtracts = 4;
+    /// ExtractElementInst from same vector and reversed indices.
+    static const int ScoreReversedExtracts = 3;
     /// Constants.
     static const int ScoreConstants = 2;
     /// Instructions with the same opcode.
@@ -1041,7 +1054,10 @@ public:
     /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
     static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
-                               ScalarEvolution &SE) {
+                               ScalarEvolution &SE, int NumLanes) {
+      if (V1 == V2)
+        return VLOperands::ScoreSplat;
+
       auto *LI1 = dyn_cast<LoadInst>(V1);
       auto *LI2 = dyn_cast<LoadInst>(V2);
       if (LI1 && LI2) {
@@ -1051,8 +1067,17 @@ public:
         Optional<int> Dist = getPointersDiff(
             LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
             LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
-        return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads
-                                    : VLOperands::ScoreFail;
+        if (!Dist)
+          return VLOperands::ScoreFail;
+        // The distance is too large - still may be profitable to use masked
+        // loads/gathers.
+        if (std::abs(*Dist) > NumLanes / 2)
+          return VLOperands::ScoreAltOpcodes;
+        // This still will detect consecutive loads, but we might have "holes"
+        // in some cases. It is ok for non-power-2 vectorization and may produce
+        // better results. It should not affect current vectorization.
+        return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads
+                           : VLOperands::ScoreReversedLoads;
       }
 
       auto *C1 = dyn_cast<Constant>(V1);
@@ -1062,18 +1087,41 @@ public:
       // Extracts from consecutive indexes of the same vector better score as
       // the extracts could be optimized away.
-      Value *EV;
-      ConstantInt *Ex1Idx, *Ex2Idx;
-      if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
-          match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
-          Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
-        return VLOperands::ScoreConsecutiveExtracts;
+      Value *EV1;
+      ConstantInt *Ex1Idx;
+      if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
+        // Undefs are always profitable for extractelements.
+        if (isa<UndefValue>(V2))
+          return VLOperands::ScoreConsecutiveExtracts;
+        Value *EV2 = nullptr;
+        ConstantInt *Ex2Idx = nullptr;
+        if (match(V2,
+                  m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
+                                                         m_Undef())))) {
+          // Undefs are always profitable for extractelements.
+          if (!Ex2Idx)
+            return VLOperands::ScoreConsecutiveExtracts;
+          if (isUndefVector(EV2) && EV2->getType() == EV1->getType())
+            return VLOperands::ScoreConsecutiveExtracts;
+          if (EV2 == EV1) {
+            int Idx1 = Ex1Idx->getZExtValue();
+            int Idx2 = Ex2Idx->getZExtValue();
+            int Dist = Idx2 - Idx1;
+            // The distance is too large - still may be profitable to use
+            // shuffles.
+            if (std::abs(Dist) > NumLanes / 2)
+              return VLOperands::ScoreAltOpcodes;
+            return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts
+                              : VLOperands::ScoreReversedExtracts;
+          }
+        }
+      }
 
       auto *I1 = dyn_cast<Instruction>(V1);
       auto *I2 = dyn_cast<Instruction>(V2);
       if (I1 && I2) {
-        if (I1 == I2)
-          return VLOperands::ScoreSplat;
+        if (I1->getParent() != I2->getParent())
+          return VLOperands::ScoreFail;
         InstructionsState S = getSameOpcode({I1, I2});
         // Note: Only consider instructions with <= 2 operands to avoid
         // complexity explosion.
@@ -1088,11 +1136,13 @@ public:
       return VLOperands::ScoreFail;
     }
 
-    /// Holds the values and their lane that are taking part in the look-ahead
+    /// Holds the values and their lanes that are taking part in the look-ahead
     /// score calculation. This is used in the external uses cost calculation.
-    SmallDenseMap<Value *, int> InLookAheadValues;
+    /// Need to hold all the lanes in case of splat/broadcast at least to
+    /// correctly check for the use in the different lane.
+    SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues;
 
-    /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are
+    /// \returns the additional cost due to uses of \p LHS and \p RHS that are
     /// either external to the vectorized code, or require shuffling.
     int getExternalUsesCost(const std::pair<Value *, int> &LHS,
                             const std::pair<Value *, int> &RHS) {
@@ -1116,22 +1166,30 @@ public:
         for (User *U : V->users()) {
           if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
             // The user is in the VectorizableTree. Check if we need to insert.
-            auto It = llvm::find(UserTE->Scalars, U);
-            assert(It != UserTE->Scalars.end() && "U is in UserTE");
-            int UserLn = std::distance(UserTE->Scalars.begin(), It);
+            int UserLn = UserTE->findLaneForValue(U);
             assert(UserLn >= 0 && "Bad lane");
-            if (UserLn != Ln)
+            // If the values are different, check just the line of the current
+            // value. If the values are the same, need to add UserInDiffLaneCost
+            // only if UserLn does not match both line numbers.
+            if ((LHS.first != RHS.first && UserLn != Ln) ||
+                (LHS.first == RHS.first && UserLn != LHS.second &&
+                 UserLn != RHS.second)) {
               Cost += UserInDiffLaneCost;
+              break;
+            }
           } else {
             // Check if the user is in the look-ahead code.
             auto It2 = InLookAheadValues.find(U);
             if (It2 != InLookAheadValues.end()) {
               // The user is in the look-ahead code. Check the lane.
-              if (It2->second != Ln)
+              if (!It2->getSecond().contains(Ln)) {
                 Cost += UserInDiffLaneCost;
+                break;
+              }
             } else {
               // The user is neither in SLP tree nor in the look-ahead code.
               Cost += ExternalUseCost;
+              break;
             }
           }
           // Limit the number of visited uses to cap compilation time.
@@ -1170,32 +1228,36 @@ public:
       Value *V1 = LHS.first;
       Value *V2 = RHS.first;
       // Get the shallow score of V1 and V2.
-      int ShallowScoreAtThisLevel =
-          std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
-                                       getExternalUsesCost(LHS, RHS));
+      int ShallowScoreAtThisLevel = std::max(
+          (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) -
+                              getExternalUsesCost(LHS, RHS));
       int Lane1 = LHS.second;
       int Lane2 = RHS.second;
 
       // If reached MaxLevel,
       // or if V1 and V2 are not instructions,
       // or if they are SPLAT,
-      // or if they are not consecutive, early return the current cost.
+      // or if they are not consecutive,
+      // or if profitable to vectorize loads or extractelements, early return
+      // the current cost.
       auto *I1 = dyn_cast<Instruction>(V1);
       auto *I2 = dyn_cast<Instruction>(V2);
       if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
-          (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
+          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
+            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
+           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
       assert(I1 && I2 && "Should have early exited.");
 
       // Keep track of in-tree values for determining the external-use cost.
-      InLookAheadValues[V1] = Lane1;
-      InLookAheadValues[V2] = Lane2;
+      InLookAheadValues[V1].insert(Lane1);
+      InLookAheadValues[V2].insert(Lane2);
 
       // Contains the I2 operand indexes that got matched with I1 operands.
       SmallSet<unsigned, 4> Op2Used;
 
-      // Recursion towards the operands of I1 and I2. We are trying all possbile
+      // Recursion towards the operands of I1 and I2. We are trying all possible
       // operand pairs, and keeping track of the best score.
       for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
            OpIdx1 != NumOperands1; ++OpIdx1) {
@@ -1319,27 +1381,79 @@ public:
       return None;
     }
 
-    /// Helper for reorderOperandVecs. \Returns the lane that we should start
-    /// reordering from. This is the one which has the least number of operands
-    /// that can freely move about.
+    /// Helper for reorderOperandVecs.
+    /// \returns the lane that we should start reordering from. This is the one
+    /// which has the least number of operands that can freely move about or
+    /// less profitable because it already has the most optimal set of operands.
     unsigned getBestLaneToStartReordering() const {
-      unsigned BestLane = 0;
       unsigned Min = UINT_MAX;
-      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
-           ++Lane) {
-        unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
-        if (NumFreeOps < Min) {
-          Min = NumFreeOps;
-          BestLane = Lane;
+      unsigned SameOpNumber = 0;
+      // std::pair<unsigned, unsigned> is used to implement a simple voting
+      // algorithm and choose the lane with the least number of operands that
+      // can freely move about or less profitable because it already has the
+      // most optimal set of operands. The first unsigned is a counter for
+      // voting, the second unsigned is the counter of lanes with instructions
+      // with same/alternate opcodes and same parent basic block.
+      MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
+      // Try to be closer to the original results, if we have multiple lanes
+      // with same cost. If 2 lanes have the same cost, use the one with the
+      // lowest index.
+      for (int I = getNumLanes(); I > 0; --I) {
+        unsigned Lane = I - 1;
+        OperandsOrderData NumFreeOpsHash =
+            getMaxNumOperandsThatCanBeReordered(Lane);
+        // Compare the number of operands that can move and choose the one with
+        // the least number.
+        if (NumFreeOpsHash.NumOfAPOs < Min) {
+          Min = NumFreeOpsHash.NumOfAPOs;
+          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
+          HashMap.clear();
+          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
+        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
+                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
+          // Select the most optimal lane in terms of number of operands that
+          // should be moved around.
+          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
+          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
+        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
+                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
+          ++HashMap[NumFreeOpsHash.Hash].first;
+        }
+      }
+      // Select the lane with the minimum counter.
+      unsigned BestLane = 0;
+      unsigned CntMin = UINT_MAX;
+      for (const auto &Data : reverse(HashMap)) {
+        if (Data.second.first < CntMin) {
+          CntMin = Data.second.first;
+          BestLane = Data.second.second;
         }
       }
       return BestLane;
     }
 
-    /// \Returns the maximum number of operands that are allowed to be reordered
-    /// for \p Lane. This is used as a heuristic for selecting the first lane to
-    /// start operand reordering.
-    unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
+    /// Data structure that helps to reorder operands.
+    struct OperandsOrderData {
+      /// The best number of operands with the same APOs, which can be
+      /// reordered.
+      unsigned NumOfAPOs = UINT_MAX;
+      /// Number of operands with the same/alternate instruction opcode and
+      /// parent.
+      unsigned NumOpsWithSameOpcodeParent = 0;
+      /// Hash for the actual operands ordering.
+      /// Used to count operands, actually their position id and opcode
+      /// value. It is used in the voting mechanism to find the lane with the
+      /// least number of operands that can freely move about or less profitable
+      /// because it already has the most optimal set of operands. Can be
+      /// replaced with SmallVector<unsigned> instead but hash code is faster
+      /// and requires less memory.
+      unsigned Hash = 0;
+    };
+    /// \returns the maximum number of operands that are allowed to be reordered
+    /// for \p Lane and the number of compatible instructions(with the same
+    /// parent/opcode). This is used as a heuristic for selecting the first lane
+    /// to start operand reordering.
+    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
       unsigned CntTrue = 0;
       unsigned NumOperands = getNumOperands();
       // Operands with the same APO can be reordered. We therefore need to count
@@ -1348,11 +1462,45 @@ public:
      // a map. Instead we can simply count the number of operands that
      // correspond to one of them (in this case the 'true' APO), and calculate
      // the other by subtracting it from the total number of operands.
-      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
-        if (getData(OpIdx, Lane).APO)
+      // Operands with the same instruction opcode and parent are more
+      // profitable since we don't need to move them in many cases, with a high
+      // probability such lane already can be vectorized effectively.
+      bool AllUndefs = true;
+      unsigned NumOpsWithSameOpcodeParent = 0;
+      Instruction *OpcodeI = nullptr;
+      BasicBlock *Parent = nullptr;
+      unsigned Hash = 0;
+      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+        const OperandData &OpData = getData(OpIdx, Lane);
+        if (OpData.APO)
          ++CntTrue;
-      unsigned CntFalse = NumOperands - CntTrue;
-      return std::max(CntTrue, CntFalse);
+        // Use Boyer-Moore majority voting for finding the majority opcode and
+        // the number of times it occurs.
+        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
+          if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() ||
+              I->getParent() != Parent) {
+            if (NumOpsWithSameOpcodeParent == 0) {
+              NumOpsWithSameOpcodeParent = 1;
+              OpcodeI = I;
+              Parent = I->getParent();
+            } else {
+              --NumOpsWithSameOpcodeParent;
+            }
+          } else {
+            ++NumOpsWithSameOpcodeParent;
+          }
+        }
+        Hash = hash_combine(
+            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
+        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
+      }
+      if (AllUndefs)
+        return {};
+      OperandsOrderData Data;
+      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
+      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
+      Data.Hash = Hash;
+      return Data;
    }
 
    /// Go through the instructions in VL and append their operands.
@@ -1500,11 +1648,37 @@ public:
          ReorderingModes[OpIdx] = ReorderingMode::Failed;
      }
 
+      // Check that we don't have same operands. No need to reorder if operands
+      // are just perfect diamond or shuffled diamond match. Do not do it only
+      // for possible broadcasts or non-power of 2 number of scalars (just for
+      // now).
+      auto &&SkipReordering = [this]() {
+        SmallPtrSet<Value *, 4> UniqueValues;
+        ArrayRef<OperandData> Op0 = OpsVec.front();
+        for (const OperandData &Data : Op0)
+          UniqueValues.insert(Data.V);
+        for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
+          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
+                return !UniqueValues.contains(Data.V);
+              }))
+            return false;
+        }
+        // TODO: Check if we can remove a check for non-power-2 number of
+        // scalars after full support of non-power-2 vectorization.
+        return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
+      };
+
      // If the initial strategy fails for any of the operand indexes, then we
      // perform reordering again in a second pass. This helps avoid assigning
      // high priority to the failed strategy, and should improve reordering for
      // the non-failed operand indexes.
      for (int Pass = 0; Pass != 2; ++Pass) {
+        // Check if no need to reorder operands since they're are perfect or
+        // shuffled diamond match.
+        // Need to to do it to avoid extra external use cost counting for
+        // shuffled matches, which may cause regressions.
+        if (SkipReordering())
+          break;
        // Skip the second pass if the first pass did not fail.
        bool StrategyFailed = false;
        // Mark all operand data as free to use.
@@ -1792,9 +1966,10 @@ private:
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
-      Operands[OpIdx].resize(Scalars.size());
-      for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
-        Operands[OpIdx][Lane] = OpVL[Lane];
+      assert(OpVL.size() <= Scalars.size() &&
+             "Number of operands is greater than the number of scalars.");
+      Operands[OpIdx].resize(OpVL.size());
+      copy(OpVL, Operands[OpIdx].begin());
    }
 
    /// Set the operands of this bundle in their original order.
@@ -1944,7 +2119,7 @@ private:
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
-        for (unsigned ReuseIdx : ReuseShuffleIndices)
+        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
@@ -2819,6 +2994,50 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  return None;
 }
 
+Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
+                                                         bool TopToBottom) {
+  // No need to reorder if need to shuffle reuses, still need to shuffle the
+  // node.
+  if (!TE.ReuseShuffleIndices.empty())
+    return None;
+  if (TE.State == TreeEntry::Vectorize &&
+      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
+       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
+      !TE.isAltShuffle())
+    return TE.ReorderIndices;
+  if (TE.State == TreeEntry::NeedToGather) {
+    // TODO: add analysis of other gather nodes with extractelement
+    // instructions and other values/instructions, not only undefs.
+    if (((TE.getOpcode() == Instruction::ExtractElement &&
+          !TE.isAltShuffle()) ||
+         (all_of(TE.Scalars,
+                 [](Value *V) {
+                   return isa<UndefValue, ExtractElementInst>(V);
+                 }) &&
+          any_of(TE.Scalars,
+                 [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
+        all_of(TE.Scalars,
+               [](Value *V) {
+                 auto *EE = dyn_cast<ExtractElementInst>(V);
+                 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
+               }) &&
+        allSameType(TE.Scalars)) {
+      // Check that gather of extractelements can be represented as
+      // just a shuffle of a single vector.
+      OrdersType CurrentOrder;
+      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
+      if (Reuse || !CurrentOrder.empty()) {
+        if (!CurrentOrder.empty())
+          fixupOrderingIndices(CurrentOrder);
+        return CurrentOrder;
+      }
+    }
+    if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
+      return CurrentOrder;
+  }
+  return None;
+}
+
 void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries;
@@ -2826,42 +3045,15 @@ void BoUpSLP::reorderTopToBottom() {
  // their ordering.
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
  // Find all reorderable nodes with the given VF.
-  // Currently the are vectorized loads,extracts + some gathering of extracts.
+  // Currently the are vectorized stores,loads,extracts + some gathering of
+  // extracts.
  for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders](
                                 const std::unique_ptr<TreeEntry> &TE) {
-    // No need to reorder if need to shuffle reuses, still need to shuffle the
-    // node.
-    if (!TE->ReuseShuffleIndices.empty())
-      return;
-    if (TE->State == TreeEntry::Vectorize &&
-        isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
-            InsertElementInst>(TE->getMainOp()) &&
-        !TE->isAltShuffle()) {
+    if (Optional<OrdersType> CurrentOrder =
+            getReorderingData(*TE.get(), /*TopToBottom=*/true)) {
      VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
-      return;
-    }
-    if (TE->State == TreeEntry::NeedToGather) {
-      if (TE->getOpcode() == Instruction::ExtractElement &&
-          !TE->isAltShuffle() &&
-          isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
-                                   ->getVectorOperandType()) &&
-          allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
-        // Check that gather of extractelements can be represented as
-        // just a shuffle of a single vector.
-        OrdersType CurrentOrder;
-        bool Reuse =
-            canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
-        if (Reuse || !CurrentOrder.empty()) {
-          VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
-          GathersToOrders.try_emplace(TE.get(), CurrentOrder);
-          return;
-        }
-      }
-      if (Optional<OrdersType> CurrentOrder =
-              findReusedOrderedScalars(*TE.get())) {
-        VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+      if (TE->State != TreeEntry::Vectorize)
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
-      }
    }
  });
 
@@ -2993,44 +3185,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
                                 const std::unique_ptr<TreeEntry> &TE) {
    if (TE->State != TreeEntry::Vectorize)
      NonVectorized.push_back(TE.get());
-    // No need to reorder if need to shuffle reuses, still need to shuffle the
-    // node.
-    if (!TE->ReuseShuffleIndices.empty())
-      return;
-    if (TE->State == TreeEntry::Vectorize &&
-        isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE->getMainOp()) &&
-        !TE->isAltShuffle()) {
+    if (Optional<OrdersType> CurrentOrder =
+            getReorderingData(*TE.get(), /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
-      return;
-    }
-    if (TE->State == TreeEntry::NeedToGather) {
-      if (TE->getOpcode() == Instruction::ExtractElement &&
-          !TE->isAltShuffle() &&
-          isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
-                                   ->getVectorOperandType()) &&
-          allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
-        // Check that gather of extractelements can be represented as
-        // just a shuffle of a single vector with a single user only.
-        OrdersType CurrentOrder;
-        bool Reuse =
-            canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
-        if ((Reuse || !CurrentOrder.empty()) &&
-            !any_of(VectorizableTree,
-                    [&TE](const std::unique_ptr<TreeEntry> &Entry) {
-                      return Entry->State == TreeEntry::NeedToGather &&
-                             Entry.get() != TE.get() &&
-                             Entry->isSame(TE->Scalars);
-                    })) {
-          OrderedEntries.insert(TE.get());
-          GathersToOrders.try_emplace(TE.get(), CurrentOrder);
-          return;
-        }
-      }
-      if (Optional<OrdersType> CurrentOrder =
-              findReusedOrderedScalars(*TE.get())) {
-        OrderedEntries.insert(TE.get());
+      if (TE->State != TreeEntry::Vectorize)
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
-      }
    }
  });
 
@@ -3392,9 +3551,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
    // Check that every instruction appears once in this bundle.
    DenseMap<Value *, unsigned> UniquePositions;
    for (Value *V : VL) {
+      if (isConstant(V)) {
+        ReuseShuffleIndicies.emplace_back(
+            isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size());
+        UniqueValues.emplace_back(V);
+        continue;
+      }
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
-      ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? -1
-                                                           : Res.first->second);
+      ReuseShuffleIndicies.emplace_back(Res.first->second);
      if (Res.second)
        UniqueValues.emplace_back(V);
    }
@@ -3404,6 +3568,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
    } else {
      LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
      if (NumUniqueScalarValues <= 1 ||
+          (UniquePositions.size() == 1 && all_of(UniqueValues,
+                                                 [](Value *V) {
+                                                   return isa<UndefValue>(V) ||
+                                                          !isConstant(V);
+                                                 })) ||
          !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
@@ -3508,11 +3677,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
    }
  }
 
-  // If any of the scalars is marked as a value that needs to stay scalar, then
-  // we need to gather the scalars.
  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  for (Value *V : VL) {
-    if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
+    if (is_contained(UserIgnoreList, V)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
@@ -4219,10 +4386,17 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
 
 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
                              SmallVectorImpl<unsigned> &CurrentOrder) const {
-  Instruction *E0 = cast<Instruction>(OpValue);
-  assert(E0->getOpcode() == Instruction::ExtractElement ||
-         E0->getOpcode() == Instruction::ExtractValue);
-  assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
+  const auto *It = find_if(VL, [](Value *V) {
+    return isa<ExtractElementInst, ExtractValueInst>(V);
+  });
+  assert(It != VL.end() && "Expected at least one extract instruction.");
+  auto *E0 = cast<Instruction>(*It);
+  assert(all_of(VL,
+                [](Value *V) {
+                  return isa<UndefValue, ExtractElementInst, ExtractValueInst>(
+                      V);
+                }) &&
+         "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(0);
@@ -4255,23 +4429,28 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
  // Also, later we can check that all the indices are used and we have a
  // consecutive access in the extract instructions, by checking that no
  // element of CurrentOrder still has value E + 1.
-  CurrentOrder.assign(E, E + 1);
+  CurrentOrder.assign(E, E);
  unsigned I = 0;
  for (; I < E; ++I) {
-    auto *Inst = cast<Instruction>(VL[I]);
+    auto *Inst = dyn_cast<Instruction>(VL[I]);
+    if (!Inst)
+      continue;
    if (Inst->getOperand(0) != Vec)
      break;
+    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
+      if (isa<UndefValue>(EE->getIndexOperand()))
+        continue;
    Optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      break;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx != I) {
-      if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
+      if (ExtIdx >= E || CurrentOrder[ExtIdx] != E)
        break;
      ShouldKeepOrder = false;
      CurrentOrder[ExtIdx] = I;
    } else {
-      if (CurrentOrder[I] != E + 1)
+      if (CurrentOrder[I] != E)
        break;
      CurrentOrder[I] = I;
    }
@@ -4287,8 +4466,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
 bool BoUpSLP::areAllUsersVectorized(Instruction *I,
                                    ArrayRef<Value *> VectorizedVals) const {
  return (I->hasOneUse() && is_contained(VectorizedVals, I)) ||
-         llvm::all_of(I->users(), [this](User *U) {
-           return ScalarToTreeEntry.count(U) > 0;
+         all_of(I->users(), [this](User *U) {
+           return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U);
         });
 }
 
@@ -4348,6 +4527,10 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
  for (auto *V : VL) {
    ++Idx;
 
+    // Need to exclude undefs from analysis.
+    if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
+      continue;
+
    // Reached the start of a new vector registers.
    if (Idx % EltsPerVector == 0) {
      AllConsecutive = true;
@@ -4357,9 +4540,11 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
    // Check all extracts for a vector register on the target directly
    // extract values in order.
    unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
-    unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
-    AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
-                      CurrentIdx % EltsPerVector == Idx % EltsPerVector;
+    if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) {
+      unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
+      AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
+                        CurrentIdx % EltsPerVector == Idx % EltsPerVector;
+    }
 
    if (AllConsecutive)
      continue;
@@ -4442,9 +4627,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
    // FIXME: it tries to fix a problem with MSVC buildbots.
    TargetTransformInfo &TTIRef = *TTI;
    auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
-                                 VectorizedVals](InstructionCost &Cost,
-                                                 bool IsGather) {
+                                 VectorizedVals, E](InstructionCost &Cost) {
      DenseMap<Value *, int> ExtractVectorsTys;
+      SmallPtrSet<Value *, 4> CheckedExtracts;
      for (auto *V : VL) {
        if (isa<UndefValue>(V))
          continue;
        // instruction itself is not going to be vectorized, consider this
        // instruction as dead and remove its cost from the final cost of the
        // vectorized tree.
-        if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals))
+        // Also, avoid adjusting the cost for extractelements with multiple uses
+        // in different graph entries.
+        const TreeEntry *VE = getTreeEntry(V);
+        if (!CheckedExtracts.insert(V).second ||
+            !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
+            (VE && VE != E))
          continue;
        auto *EE = cast<ExtractElementInst>(V);
        Optional<unsigned> EEIdx = getExtractIndex(EE);
@@ -4549,11 +4739,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
        }
        return GatherCost;
      }
-      if (isSplat(VL)) {
-        // Found the broadcasting of the single scalar, calculate the cost as the
-        // broadcast.
-        return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
-      }
      if ((E->getOpcode() == Instruction::ExtractElement ||
           all_of(E->Scalars,
                  [](Value *V) {
@@ -4571,13 +4756,20 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
        // single input vector or of 2 input vectors.
        InstructionCost Cost =
            computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
-        AdjustExtractsCost(Cost, /*IsGather=*/true);
+        AdjustExtractsCost(Cost);
        if (NeedToShuffleReuses)
          Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                      FinalVecTy, E->ReuseShuffleIndices);
        return Cost;
      }
    }
+    if (isSplat(VL)) {
+      // Found the broadcasting of the single scalar, calculate the cost as the
+      // broadcast.
+      assert(VecTy == FinalVecTy &&
+             "No reused scalars expected for broadcast.");
+      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
+    }
    InstructionCost ReuseShuffleCost = 0;
    if (NeedToShuffleReuses)
      ReuseShuffleCost = TTI->getShuffleCost(
@@ -4755,7 +4947,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
            TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
      }
    } else {
-      AdjustExtractsCost(CommonCost, /*IsGather=*/false);
+      AdjustExtractsCost(CommonCost);
    }
    return CommonCost;
  }
@@ -5211,15 +5403,15 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift expression.
-  Value *LoadPtr;
+  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
-      !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;
 
  // Require that the total load bit width is a legal integer type.
  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
-  Type *SrcTy = LoadPtr->getType()->getPointerElementType();
+  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;
@@ -9061,8 +9253,7 @@ private:
           "A call to the llvm.fmuladd intrinsic is not handled yet");
 
    ++NumVectorInstructions;
-    return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind,
-                                       ReductionOps.back());
+    return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind);
  }
 };
 
@@ -9473,6 +9664,59 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
  return Changed;
 }
 
+/// Compare two cmp instructions. If IsCompatibility is true, function returns
+/// true if 2 cmps have same/swapped predicates and mos compatible corresponding
+/// operands. If IsCompatibility is false, function implements strict weak
+/// ordering relation between two cmp instructions, returning true if the first
+/// instruction is "less" than the second, i.e. its predicate is less than the
+/// predicate of the second or the operands IDs are less than the operands IDs
+/// of the second cmp instruction.
+template <bool IsCompatibility>
+static bool compareCmp(Value *V, Value *V2,
+                       function_ref<bool(Instruction *)> IsDeleted) {
+  auto *CI1 = cast<CmpInst>(V);
+  auto *CI2 = cast<CmpInst>(V2);
+  if (IsDeleted(CI2) || !isValidElementType(CI2->getType()))
+    return false;
+  if (CI1->getOperand(0)->getType()->getTypeID() <
+      CI2->getOperand(0)->getType()->getTypeID())
+    return !IsCompatibility;
+  if (CI1->getOperand(0)->getType()->getTypeID() >
+      CI2->getOperand(0)->getType()->getTypeID())
+    return false;
+  CmpInst::Predicate Pred1 = CI1->getPredicate();
+  CmpInst::Predicate Pred2 = CI2->getPredicate();
+  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
+  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
+  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
+  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
+  if (BasePred1 < BasePred2)
+    return !IsCompatibility;
+  if (BasePred1 > BasePred2)
+    return false;
+  // Compare operands.
+  bool LEPreds = Pred1 <= Pred2;
+  bool GEPreds = Pred1 >= Pred2;
+  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
+    auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1);
+    auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1);
+    if (Op1->getValueID() < Op2->getValueID())
+      return !IsCompatibility;
+    if (Op1->getValueID() > Op2->getValueID())
+      return false;
+    if (auto *I1 = dyn_cast<Instruction>(Op1))
+      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
+        if (I1->getParent() != I2->getParent())
+          return false;
+        InstructionsState S = getSameOpcode({I1, I2});
+        if (S.getOpcode())
+          continue;
+        return false;
+      }
+  }
+  return IsCompatibility;
+}
+
 bool SLPVectorizerPass::vectorizeSimpleInstructions(
    SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R,
    bool AtTerminator) {
@@ -9504,37 +9748,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
  }
  // Try to vectorize list of compares.
  // Sort by type, compare predicate, etc.
-  // TODO: Add analysis on the operand opcodes (profitable to vectorize
-  // instructions with same/alternate opcodes/const values).
  auto &&CompareSorter = [&R](Value *V, Value *V2) {
-    auto *CI1 = cast<CmpInst>(V);
-    auto *CI2 = cast<CmpInst>(V2);
-    if (R.isDeleted(CI2) || !isValidElementType(CI2->getType()))
-      return false;
-    if (CI1->getOperand(0)->getType()->getTypeID() <
-        CI2->getOperand(0)->getType()->getTypeID())
-      return true;
-    if (CI1->getOperand(0)->getType()->getTypeID() >
-        CI2->getOperand(0)->getType()->getTypeID())
-      return false;
-    return CI1->getPredicate() < CI2->getPredicate() ||
-           (CI1->getPredicate() > CI2->getPredicate() &&
-            CI1->getPredicate() <
-                CmpInst::getSwappedPredicate(CI2->getPredicate()));
+    return compareCmp<false>(V, V2,
+                             [&R](Instruction *I) { return R.isDeleted(I); });
  };
 
  auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
-    auto *CI1 = cast<CmpInst>(V1);
-    auto *CI2 = cast<CmpInst>(V2);
-    if (R.isDeleted(CI2) || !isValidElementType(CI2->getType()))
-      return false;
-    if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType())
-      return false;
-    return CI1->getPredicate() == CI2->getPredicate() ||
-           CI1->getPredicate() ==
-               CmpInst::getSwappedPredicate(CI2->getPredicate());
+    return compareCmp<true>(V1, V2,
+                            [&R](Instruction *I) { return R.isDeleted(I); });
  };
  auto Limit = [&R](Value *V) {
    unsigned EltSize = R.getVectorElementSize(V);
@@ -9592,10 +9815,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
+    Optional<bool> ConstOrder;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
-      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) {
+        if (!ConstOrder)
+          ConstOrder =
+              !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]);
        continue;
+      }
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
@@ -9614,14 +9842,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
-      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) {
+        if (!ConstOrder)
+          ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID();
        continue;
+      }
      if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
        return true;
      if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
        return false;
    }
-    return false;
+    return ConstOrder && *ConstOrder;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
    if (V1 == V2)
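The first hunk (around line 631) is easier to follow outside the diff: positions of Order that hold an out-of-range marker are filled with whichever indices are not already used, walking the "masked" and "unused" sets in lockstep. The sketch below is a minimal standalone rendition of that idea, not the LLVM code itself; it substitutes std::vector<bool> for SmallBitVector, and the input values are made up for illustration.

```cpp
// Standalone sketch of the reworked fixupOrderingIndices logic: out-of-range
// ("masked") slots are filled with the indices that no in-range entry uses,
// scanning both sets in lockstep. Illustration only, with std::vector<bool>
// standing in for SmallBitVector.
#include <cassert>
#include <cstdio>
#include <vector>

static void fixupOrderingIndicesSketch(std::vector<unsigned> &Order) {
  const unsigned Sz = Order.size();
  std::vector<bool> Unused(Sz, true);  // indices not yet claimed by Order
  std::vector<bool> Masked(Sz, false); // positions holding an out-of-range value
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      Unused[Order[I]] = false;
    else
      Masked[I] = true;
  }
  unsigned Idx = 0; // next unused index to hand out
  for (unsigned MIdx = 0; MIdx < Sz; ++MIdx) {
    if (!Masked[MIdx])
      continue;
    while (Idx < Sz && !Unused[Idx])
      ++Idx;
    assert(Idx < Sz && "Indices must be synced.");
    Order[MIdx] = Idx++;
  }
}

int main() {
  // Hypothetical input: positions 1, 3 and 5 carry out-of-range markers (99).
  std::vector<unsigned> Order = {2, 99, 0, 99, 4, 99};
  fixupOrderingIndicesSketch(Order);
  for (unsigned V : Order)
    std::printf("%u ", V); // prints: 2 1 0 3 4 5
  std::printf("\n");
}
```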
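The getShallowScore hunks replace the old all-or-nothing load check (distance must be exactly +1) with a graded score based on the distance between two loads. Below is a hedged sketch of just that grading; the plain integer distance stands in for the real getPointersDiff result, the consecutive/reversed values mirror the hunk, and the remaining constants are illustrative assumptions.

```cpp
// Graded load scoring, as in the new getShallowScore: consecutive forward,
// reversed, or "far apart but maybe still worth a masked load/gather".
// Sketch only; not the LLVM implementation.
#include <cstdio>
#include <cstdlib>
#include <optional>

enum Score { Fail = 0, AltOpcodes = 1, ReversedLoads = 3, ConsecutiveLoads = 4 };

static int scoreLoadPair(std::optional<int> Dist, int NumLanes) {
  if (!Dist)
    return Fail; // the two pointers do not relate to each other
  if (std::abs(*Dist) > NumLanes / 2)
    return AltOpcodes; // too far apart, but possibly a masked load/gather
  return (*Dist > 0) ? ConsecutiveLoads : ReversedLoads;
}

int main() {
  std::printf("%d %d %d %d\n",
              scoreLoadPair(1, 8),             // consecutive: 4
              scoreLoadPair(-1, 8),            // reversed: 3
              scoreLoadPair(7, 8),             // far apart: 1
              scoreLoadPair(std::nullopt, 8)); // unrelated: 0
}
```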
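The new getMaxNumOperandsThatCanBeReordered counts, per lane, how many operands share one opcode and parent block using Boyer-Moore majority voting: a single pass keeps one candidate and a counter, incrementing on a match and otherwise decrementing or replacing the candidate. The sketch below shows only the voting scheme; plain integers stand in for the (opcode, parent) pairs the real code compares, and the sample lane is invented.

```cpp
// Boyer-Moore majority vote in isolation, as used by the hunk to find the
// dominant opcode/parent in a lane. Illustration only.
#include <cstdio>
#include <vector>

static int majorityCandidate(const std::vector<int> &Ops, unsigned &Count) {
  int Candidate = 0;
  Count = 0;
  for (int Op : Ops) {
    if (Count == 0) { // adopt a new candidate
      Candidate = Op;
      Count = 1;
    } else if (Op == Candidate) {
      ++Count; // vote for the current candidate
    } else {
      --Count; // vote against it
    }
  }
  return Candidate;
}

int main() {
  // Hypothetical lane where "opcode" 7 dominates.
  std::vector<int> Lane = {7, 3, 7, 7, 5, 7};
  unsigned Count = 0;
  int Winner = majorityCandidate(Lane, Count);
  std::printf("candidate %d, residual count %u\n", Winner, Count);
}
```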
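The added compareCmp<IsCompatibility> template folds the old CompareSorter and AreCompatibleCompares lambdas into one comparator: instantiated with false it is a strict weak ordering used for sorting candidates next to each other, with true it answers whether two candidates are compatible. A simplified analogue over a hypothetical key struct (stand-ins for type ID and canonical predicate) shows the pattern and how both instantiations are used together.

```cpp
// Dual-purpose comparator sketch in the spirit of compareCmp<IsCompatibility>.
// CmpKey and its fields are hypothetical; this is not the LLVM code.
#include <algorithm>
#include <cstdio>
#include <vector>

struct CmpKey {
  int TypeID;   // stand-in for the operand type's TypeID
  int BasePred; // stand-in for min(Pred, SwappedPred)
};

template <bool IsCompatibility>
static bool compareKeys(const CmpKey &A, const CmpKey &B) {
  if (A.TypeID < B.TypeID)
    return !IsCompatibility; // "less" when sorting, "incompatible" when grouping
  if (A.TypeID > B.TypeID)
    return false;
  if (A.BasePred < B.BasePred)
    return !IsCompatibility;
  if (A.BasePred > B.BasePred)
    return false;
  // Equal keys: not "less" for sorting purposes, compatible for grouping.
  return IsCompatibility;
}

int main() {
  std::vector<CmpKey> Keys = {{2, 40}, {1, 33}, {2, 32}, {1, 33}};
  // Strict weak ordering: sort so that compatible candidates end up adjacent.
  std::stable_sort(Keys.begin(), Keys.end(), compareKeys<false>);
  for (size_t I = 0; I + 1 < Keys.size(); ++I)
    std::printf("keys %zu and %zu compatible: %d\n", I, I + 1,
                compareKeys<true>(Keys[I], Keys[I + 1]));
}
```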