Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 371 |
1 file changed, 353 insertions, 18 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 258f6c67e54d..90598937affc 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -103,11 +103,13 @@ private:
   bool foldSingleElementStore(Instruction &I);
   bool scalarizeLoadExtract(Instruction &I);
   bool foldShuffleOfBinops(Instruction &I);
+  bool foldShuffleFromReductions(Instruction &I);
+  bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
 
   void replaceValue(Value &Old, Value &New) {
     Old.replaceAllUsesWith(&New);
-    New.takeName(&Old);
     if (auto *NewI = dyn_cast<Instruction>(&New)) {
+      New.takeName(&Old);
       Worklist.pushUsersToWorkList(*NewI);
       Worklist.pushValue(NewI);
     }
@@ -255,12 +257,12 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
 ExtractElementInst *VectorCombine::getShuffleExtract(
     ExtractElementInst *Ext0, ExtractElementInst *Ext1,
     unsigned PreferredExtractIndex = InvalidIndex) const {
-  assert(isa<ConstantInt>(Ext0->getIndexOperand()) &&
-         isa<ConstantInt>(Ext1->getIndexOperand()) &&
-         "Expected constant extract indexes");
+  auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
+  auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
+  assert(Index0C && Index1C && "Expected constant extract indexes");
 
-  unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue();
-  unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue();
+  unsigned Index0 = Index0C->getZExtValue();
+  unsigned Index1 = Index1C->getZExtValue();
 
   // If the extract indexes are identical, no shuffle is needed.
   if (Index0 == Index1)
@@ -306,9 +308,10 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
                                           const Instruction &I,
                                           ExtractElementInst *&ConvertToShuffle,
                                           unsigned PreferredExtractIndex) {
-  assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
-         isa<ConstantInt>(Ext1->getOperand(1)) &&
-         "Expected constant extract indexes");
+  auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getOperand(1));
+  auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getOperand(1));
+  assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
+
   unsigned Opcode = I.getOpcode();
   Type *ScalarTy = Ext0->getType();
   auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
@@ -331,8 +334,8 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
 
   // Get cost estimates for the extract elements. These costs will factor into
   // both sequences.
-  unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
-  unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
+  unsigned Ext0Index = Ext0IndexC->getZExtValue();
+  unsigned Ext1Index = Ext1IndexC->getZExtValue();
 
   InstructionCost Extract0Cost =
       TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
@@ -694,8 +697,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
     ScalarInst->copyIRFlags(&I);
 
   // Fold the vector constants in the original vectors into a new base vector.
-  Constant *NewVecC = IsCmp ? ConstantExpr::getCompare(Pred, VecC0, VecC1)
-                            : ConstantExpr::get(Opcode, VecC0, VecC1);
+  Value *NewVecC =
+      IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1)
+            : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
   Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
   replaceValue(I, *Insert);
   return true;
@@ -1015,12 +1019,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
           return false;
         NumInstChecked++;
       }
-    }
-
-    if (!LastCheckedInst)
-      LastCheckedInst = UI;
-    else if (LastCheckedInst->comesBefore(UI))
       LastCheckedInst = UI;
+    }
 
     auto ScalarIdx = canScalarizeAccess(FixedVT, UI->getOperand(1), &I, AC, DT);
     if (!ScalarIdx.isSafe()) {
@@ -1117,6 +1117,339 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
   return true;
 }
 
+/// Given a commutative reduction, the order of the input lanes does not alter
+/// the results. We can use this to remove certain shuffles feeding the
+/// reduction, removing the need to shuffle at all.
+bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
+  auto *II = dyn_cast<IntrinsicInst>(&I);
+  if (!II)
+    return false;
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::vector_reduce_add:
+  case Intrinsic::vector_reduce_mul:
+  case Intrinsic::vector_reduce_and:
+  case Intrinsic::vector_reduce_or:
+  case Intrinsic::vector_reduce_xor:
+  case Intrinsic::vector_reduce_smin:
+  case Intrinsic::vector_reduce_smax:
+  case Intrinsic::vector_reduce_umin:
+  case Intrinsic::vector_reduce_umax:
+    break;
+  default:
+    return false;
+  }
+
+  // Find all the inputs when looking through operations that do not alter the
+  // lane order (binops, for example). Currently we look for a single shuffle,
+  // and can ignore splat values.
+  std::queue<Value *> Worklist;
+  SmallPtrSet<Value *, 4> Visited;
+  ShuffleVectorInst *Shuffle = nullptr;
+  if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
+    Worklist.push(Op);
+
+  while (!Worklist.empty()) {
+    Value *CV = Worklist.front();
+    Worklist.pop();
+    if (Visited.contains(CV))
+      continue;
+
+    // Splats don't change the order, so can be safely ignored.
+    if (isSplatValue(CV))
+      continue;
+
+    Visited.insert(CV);
+
+    if (auto *CI = dyn_cast<Instruction>(CV)) {
+      if (CI->isBinaryOp()) {
+        for (auto *Op : CI->operand_values())
+          Worklist.push(Op);
+        continue;
+      } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
+        if (Shuffle && Shuffle != SV)
+          return false;
+        Shuffle = SV;
+        continue;
+      }
+    }
+
+    // Anything else is currently an unknown node.
+    return false;
+  }
+
+  if (!Shuffle)
+    return false;
+
+  // Check all uses of the binary ops and shuffles are also included in the
+  // lane-invariant operations (Visited should be the list of lanewise
+  // instructions, including the shuffle that we found).
+  for (auto *V : Visited)
+    for (auto *U : V->users())
+      if (!Visited.contains(U) && U != &I)
+        return false;
+
+  FixedVectorType *VecType =
+      dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
+  if (!VecType)
+    return false;
+  FixedVectorType *ShuffleInputType =
+      dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
+  if (!ShuffleInputType)
+    return false;
+  int NumInputElts = ShuffleInputType->getNumElements();
+
+  // Find the mask from sorting the lanes into order. This is most likely to
+  // become an identity or concat mask. Undef elements are pushed to the end.
+  SmallVector<int> ConcatMask;
+  Shuffle->getShuffleMask(ConcatMask);
+  sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
+  bool UsesSecondVec =
+      any_of(ConcatMask, [&](int M) { return M >= NumInputElts; });
+  InstructionCost OldCost = TTI.getShuffleCost(
+      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
+      Shuffle->getShuffleMask());
+  InstructionCost NewCost = TTI.getShuffleCost(
+      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
+      ConcatMask);
+
+  LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
+                    << "\n");
+  LLVM_DEBUG(dbgs() << "  OldCost: " << OldCost << " vs NewCost: " << NewCost
+                    << "\n");
+  if (NewCost < OldCost) {
+    Builder.SetInsertPoint(Shuffle);
+    Value *NewShuffle = Builder.CreateShuffleVector(
+        Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
+    LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
+    replaceValue(*Shuffle, *NewShuffle);
+  }
+
+  // See if we can re-use foldSelectShuffle, getting it to reduce the size of
+  // the shuffle into a nicer order, as it can ignore the order of the
+  // shuffles.
+  return foldSelectShuffle(*Shuffle, true);
+}
+
+/// This method looks for groups of shuffles acting on binops, of the form:
+///  %x = shuffle ...
+///  %y = shuffle ...
+///  %a = binop %x, %y
+///  %b = binop %x, %y
+///  shuffle %a, %b, selectmask
+/// We may, especially if the shuffle is wider than legal, be able to convert
+/// the shuffle to a form where only parts of a and b need to be computed. On
+/// architectures with no obvious "select" shuffle, this can reduce the total
+/// number of operations if the target reports them as cheaper.
+bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
+  auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
+  auto *VT = dyn_cast<FixedVectorType>(I.getType());
+  if (!SVI || !VT)
+    return false;
+  auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
+  auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
+  if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
+      VT != Op0->getType())
+    return false;
+  auto *SVI0A = dyn_cast<ShuffleVectorInst>(Op0->getOperand(0));
+  auto *SVI0B = dyn_cast<ShuffleVectorInst>(Op0->getOperand(1));
+  auto *SVI1A = dyn_cast<ShuffleVectorInst>(Op1->getOperand(0));
+  auto *SVI1B = dyn_cast<ShuffleVectorInst>(Op1->getOperand(1));
+  auto checkSVNonOpUses = [&](Instruction *I) {
+    if (!I || I->getOperand(0)->getType() != VT)
+      return true;
+    return any_of(I->users(), [&](User *U) { return U != Op0 && U != Op1; });
+  };
+  if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
+      checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
+    return false;
+
+  // Collect all the uses that are shuffles that we can transform together. We
+  // may not have a single shuffle, but a group that can all be transformed
+  // together profitably.
+  SmallVector<ShuffleVectorInst *> Shuffles;
+  auto collectShuffles = [&](Instruction *I) {
+    for (auto *U : I->users()) {
+      auto *SV = dyn_cast<ShuffleVectorInst>(U);
+      if (!SV || SV->getType() != VT)
+        return false;
+      if (!llvm::is_contained(Shuffles, SV))
+        Shuffles.push_back(SV);
+    }
+    return true;
+  };
+  if (!collectShuffles(Op0) || !collectShuffles(Op1))
+    return false;
+  // From a reduction, we need to be processing a single shuffle, otherwise the
+  // other uses will not be lane-invariant.
+  if (FromReduction && Shuffles.size() > 1)
+    return false;
+
+  // For each of the output shuffles, we try to sort all the first vector
+  // elements to the beginning, followed by the second vector elements at the
+  // end. If the binops are legalized to smaller vectors, this may reduce the
+  // total number of binops. We compute the ReconstructMask needed to convert
+  // back to the original lane order.
+  SmallVector<int> V1, V2;
+  SmallVector<SmallVector<int>> ReconstructMasks;
+  int MaxV1Elt = 0, MaxV2Elt = 0;
+  unsigned NumElts = VT->getNumElements();
+  for (ShuffleVectorInst *SVN : Shuffles) {
+    SmallVector<int> Mask;
+    SVN->getShuffleMask(Mask);
+
+    // Check the operands are the same as the original, or reversed (in which
+    // case we need to commute the mask).
+    Value *SVOp0 = SVN->getOperand(0);
+    Value *SVOp1 = SVN->getOperand(1);
+    if (SVOp0 == Op1 && SVOp1 == Op0) {
+      std::swap(SVOp0, SVOp1);
+      ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
+    }
+    if (SVOp0 != Op0 || SVOp1 != Op1)
+      return false;
+
+    // Calculate the reconstruction mask for this shuffle, as the mask needed
+    // to take the packed values from Op0/Op1 and reconstruct the original
+    // lane order.
+    SmallVector<int> ReconstructMask;
+    for (unsigned I = 0; I < Mask.size(); I++) {
+      if (Mask[I] < 0) {
+        ReconstructMask.push_back(-1);
+      } else if (Mask[I] < static_cast<int>(NumElts)) {
+        MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
+        auto It = find(V1, Mask[I]);
+        if (It != V1.end())
+          ReconstructMask.push_back(It - V1.begin());
+        else {
+          ReconstructMask.push_back(V1.size());
+          V1.push_back(Mask[I]);
+        }
+      } else {
+        MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
+        auto It = find(V2, Mask[I] - NumElts);
+        if (It != V2.end())
+          ReconstructMask.push_back(NumElts + It - V2.begin());
+        else {
+          ReconstructMask.push_back(NumElts + V2.size());
+          V2.push_back(Mask[I] - NumElts);
+        }
+      }
+    }
+
+    // For reductions, we know that the output lane ordering doesn't alter the
+    // result. An in-order mask can help simplify the shuffle away.
+    if (FromReduction)
+      sort(ReconstructMask);
+    ReconstructMasks.push_back(ReconstructMask);
+  }
+
+  // If the maximum elements used from V1 and V2 are not larger than the new
+  // vectors, the vectors are already packed and performing the optimization
+  // again will likely not help any further. This also prevents us from getting
+  // stuck in a cycle in case the costs do not also rule it out.
+  if (V1.empty() || V2.empty() ||
+      (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
+       MaxV2Elt == static_cast<int>(V2.size()) - 1))
+    return false;
+
+  // Calculate the masks needed for the new input shuffles, which get padded
+  // with undef.
+  SmallVector<int> V1A, V1B, V2A, V2B;
+  for (unsigned I = 0; I < V1.size(); I++) {
+    V1A.push_back(SVI0A->getMaskValue(V1[I]));
+    V1B.push_back(SVI0B->getMaskValue(V1[I]));
+  }
+  for (unsigned I = 0; I < V2.size(); I++) {
+    V2A.push_back(SVI1A->getMaskValue(V2[I]));
+    V2B.push_back(SVI1B->getMaskValue(V2[I]));
+  }
+  while (V1A.size() < NumElts) {
+    V1A.push_back(UndefMaskElem);
+    V1B.push_back(UndefMaskElem);
+  }
+  while (V2A.size() < NumElts) {
+    V2A.push_back(UndefMaskElem);
+    V2B.push_back(UndefMaskElem);
+  }
+
+  auto AddShuffleCost = [&](InstructionCost C, ShuffleVectorInst *SV) {
+    return C +
+           TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, SV->getShuffleMask());
+  };
+  auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
+    return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask);
+  };
+
+  // Get the costs of the shuffles + binops before and after with the new
+  // shuffle masks.
+  InstructionCost CostBefore =
+      TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) +
+      TTI.getArithmeticInstrCost(Op1->getOpcode(), VT);
+  CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
+                                InstructionCost(0), AddShuffleCost);
+  // This set helps us only cost each unique shuffle once.
+  SmallPtrSet<ShuffleVectorInst *, 4> InputShuffles(
+      {SVI0A, SVI0B, SVI1A, SVI1B});
+  CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
+                                InstructionCost(0), AddShuffleCost);
+
+  // The new binops will be unused for lanes past the used shuffle lengths.
+  // These types attempt to get the correct cost for that from the target.
+  FixedVectorType *Op0SmallVT =
+      FixedVectorType::get(VT->getScalarType(), V1.size());
+  FixedVectorType *Op1SmallVT =
+      FixedVectorType::get(VT->getScalarType(), V2.size());
+  InstructionCost CostAfter =
+      TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) +
+      TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT);
+  CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
+                               InstructionCost(0), AddShuffleMaskCost);
+  std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
+  CostAfter +=
+      std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
+                      InstructionCost(0), AddShuffleMaskCost);
+
+  if (CostBefore <= CostAfter)
+    return false;
+
+  // The cost model has passed, create the new instructions.
+  Builder.SetInsertPoint(SVI0A);
+  Value *NSV0A = Builder.CreateShuffleVector(SVI0A->getOperand(0),
+                                             SVI0A->getOperand(1), V1A);
+  Builder.SetInsertPoint(SVI0B);
+  Value *NSV0B = Builder.CreateShuffleVector(SVI0B->getOperand(0),
+                                             SVI0B->getOperand(1), V1B);
+  Builder.SetInsertPoint(SVI1A);
+  Value *NSV1A = Builder.CreateShuffleVector(SVI1A->getOperand(0),
+                                             SVI1A->getOperand(1), V2A);
+  Builder.SetInsertPoint(SVI1B);
+  Value *NSV1B = Builder.CreateShuffleVector(SVI1B->getOperand(0),
+                                             SVI1B->getOperand(1), V2B);
+  Builder.SetInsertPoint(Op0);
+  Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
+                                    NSV0A, NSV0B);
+  if (auto *I = dyn_cast<Instruction>(NOp0))
+    I->copyIRFlags(Op0, true);
+  Builder.SetInsertPoint(Op1);
+  Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
+                                    NSV1A, NSV1B);
+  if (auto *I = dyn_cast<Instruction>(NOp1))
+    I->copyIRFlags(Op1, true);
+
+  for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
+    Builder.SetInsertPoint(Shuffles[S]);
+    Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
+    replaceValue(*Shuffles[S], *NSV);
+  }
+
+  Worklist.pushValue(NSV0A);
+  Worklist.pushValue(NSV0B);
+  Worklist.pushValue(NSV1A);
+  Worklist.pushValue(NSV1B);
+  for (auto *S : Shuffles)
+    Worklist.add(S);
+  return true;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 bool VectorCombine::run() {
@@ -1136,6 +1469,8 @@ bool VectorCombine::run() {
       MadeChange |= foldBitcastShuf(I);
      MadeChange |= foldExtractedCmps(I);
       MadeChange |= foldShuffleOfBinops(I);
+      MadeChange |= foldShuffleFromReductions(I);
+      MadeChange |= foldSelectShuffle(I);
     }
     MadeChange |= scalarizeBinopOrCmp(I);
     MadeChange |= scalarizeLoadExtract(I);
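Editor's note (not part of the patch): foldShuffleFromReductions relies on commutative reductions being invariant to input lane order. A minimal C++ sketch of the source pattern it targets, assuming Clang's vector extensions and a Clang recent enough to provide __builtin_reduce_add; the function name and mask are hypothetical:

typedef int v4i32 __attribute__((vector_size(16)));

// Reversing the lanes cannot change the sum, so the shufflevector feeding
// llvm.vector.reduce.add can be dropped, which is exactly the rewrite the
// pass performs when its TTI cost comparison says the sorted mask is cheaper.
int reduce_shuffle_demo(v4i32 A) {
  v4i32 Rev = __builtin_shufflevector(A, A, 3, 2, 1, 0); // hypothetical mask
  return __builtin_reduce_add(Rev); // equals __builtin_reduce_add(A)
}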
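Likewise, a sketch of the shuffle(binop, binop) pattern that foldSelectShuffle handles, again with hypothetical names and masks. The final "select" mask keeps only lanes 0-1 of X and lanes 2-3 of Y, so after the transform each binop only needs to compute the lanes its output shuffle actually uses:

typedef int v4i32 __attribute__((vector_size(16)));

v4i32 select_shuffle_demo(v4i32 A, v4i32 B) {
  v4i32 SA = __builtin_shufflevector(A, B, 0, 2, 4, 6); // %x = shuffle ...
  v4i32 SB = __builtin_shufflevector(A, B, 1, 3, 5, 7); // %y = shuffle ...
  v4i32 X = SA + SB;                                    // %a = binop %x, %y
  v4i32 Y = SA - SB;                                    // %b = binop %x, %y
  // selectmask: lanes 0-1 from X, lanes 2-3 from Y (6, 7 = NumElts + 2, 3).
  return __builtin_shufflevector(X, Y, 0, 1, 6, 7);
}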