Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 371
1 file changed, 353 insertions(+), 18 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 258f6c67e54d..90598937affc 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -103,11 +103,13 @@ private:
bool foldSingleElementStore(Instruction &I);
bool scalarizeLoadExtract(Instruction &I);
bool foldShuffleOfBinops(Instruction &I);
+ bool foldShuffleFromReductions(Instruction &I);
+ bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
void replaceValue(Value &Old, Value &New) {
Old.replaceAllUsesWith(&New);
- New.takeName(&Old);
if (auto *NewI = dyn_cast<Instruction>(&New)) {
+ New.takeName(&Old);
Worklist.pushUsersToWorkList(*NewI);
Worklist.pushValue(NewI);
}
@@ -255,12 +257,12 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
ExtractElementInst *VectorCombine::getShuffleExtract(
ExtractElementInst *Ext0, ExtractElementInst *Ext1,
unsigned PreferredExtractIndex = InvalidIndex) const {
- assert(isa<ConstantInt>(Ext0->getIndexOperand()) &&
- isa<ConstantInt>(Ext1->getIndexOperand()) &&
- "Expected constant extract indexes");
+ auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
+ auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
+ assert(Index0C && Index1C && "Expected constant extract indexes");
- unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue();
- unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue();
+ unsigned Index0 = Index0C->getZExtValue();
+ unsigned Index1 = Index1C->getZExtValue();
// If the extract indexes are identical, no shuffle is needed.
if (Index0 == Index1)
@@ -306,9 +308,10 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
const Instruction &I,
ExtractElementInst *&ConvertToShuffle,
unsigned PreferredExtractIndex) {
- assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
- isa<ConstantInt>(Ext1->getOperand(1)) &&
- "Expected constant extract indexes");
+ auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getOperand(1));
+ auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getOperand(1));
+ assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
+
unsigned Opcode = I.getOpcode();
Type *ScalarTy = Ext0->getType();
auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
@@ -331,8 +334,8 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
// Get cost estimates for the extract elements. These costs will factor into
// both sequences.
- unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
- unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
+ unsigned Ext0Index = Ext0IndexC->getZExtValue();
+ unsigned Ext1Index = Ext1IndexC->getZExtValue();
InstructionCost Extract0Cost =
TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
@@ -694,8 +697,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
ScalarInst->copyIRFlags(&I);
// Fold the vector constants in the original vectors into a new base vector.
- Constant *NewVecC = IsCmp ? ConstantExpr::getCompare(Pred, VecC0, VecC1)
- : ConstantExpr::get(Opcode, VecC0, VecC1);
+ Value *NewVecC =
+ IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1)
+ : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
replaceValue(I, *Insert);
return true;
@@ -1015,12 +1019,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
return false;
NumInstChecked++;
}
- }
-
- if (!LastCheckedInst)
- LastCheckedInst = UI;
- else if (LastCheckedInst->comesBefore(UI))
LastCheckedInst = UI;
+ }
auto ScalarIdx = canScalarizeAccess(FixedVT, UI->getOperand(1), &I, AC, DT);
if (!ScalarIdx.isSafe()) {
@@ -1117,6 +1117,339 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
return true;
}
+/// Given a commutative reduction, the order of the input lanes does not alter
+/// the results. We can use this to remove certain shuffles feeding the
+/// reduction, removing the need to shuffle at all.
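+/// For example, a reversing shuffle feeding an integer add reduction:
+///   %r = shufflevector <4 x i32> %a, <4 x i32> undef,
+///                      <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+///   %s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %r)
+/// sums the same lanes in a different order, so it can become:
+///   %s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)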
+bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
+ auto *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ return false;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::vector_reduce_add:
+ case Intrinsic::vector_reduce_mul:
+ case Intrinsic::vector_reduce_and:
+ case Intrinsic::vector_reduce_or:
+ case Intrinsic::vector_reduce_xor:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_smax:
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_umax:
+ break;
+ default:
+ return false;
+ }
+
+ // Find all the inputs when looking through operations that do not alter the
+ // lane order (binops, for example). Currently we look for a single shuffle,
+ // and can ignore splat values.
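+ // e.g. for reduce(add(shuffle(%a), splat(%b))), the binop applies lane-wise
+ // and the splat is the same in every lane, so only the shuffle can affect
+ // which value contributes to each lane of the reduction input.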
+ std::queue<Value *> Worklist;
+ SmallPtrSet<Value *, 4> Visited;
+ ShuffleVectorInst *Shuffle = nullptr;
+ if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
+ Worklist.push(Op);
+
+ while (!Worklist.empty()) {
+ Value *CV = Worklist.front();
+ Worklist.pop();
+ if (Visited.contains(CV))
+ continue;
+
+ // Splats don't change the order, so they can be safely ignored.
+ if (isSplatValue(CV))
+ continue;
+
+ Visited.insert(CV);
+
+ if (auto *CI = dyn_cast<Instruction>(CV)) {
+ if (CI->isBinaryOp()) {
+ for (auto *Op : CI->operand_values())
+ Worklist.push(Op);
+ continue;
+ } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
+ if (Shuffle && Shuffle != SV)
+ return false;
+ Shuffle = SV;
+ continue;
+ }
+ }
+
+ // Anything else is currently an unknown node.
+ return false;
+ }
+
+ if (!Shuffle)
+ return false;
+
+ // Check that all uses of the binary ops and shuffles are also included in
+ // the lane-invariant operations (Visited should be the list of lanewise
+ // instructions, including the shuffle that we found).
+ for (auto *V : Visited)
+ for (auto *U : V->users())
+ if (!Visited.contains(U) && U != &I)
+ return false;
+
+ FixedVectorType *VecType =
+ dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
+ if (!VecType)
+ return false;
+ FixedVectorType *ShuffleInputType =
+ dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
+ if (!ShuffleInputType)
+ return false;
+ int NumInputElts = ShuffleInputType->getNumElements();
+
+ // Find the mask from sorting the lanes into order. This is most likely to
+ // become an identity or concat mask. Undef elements are pushed to the end.
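+ // e.g. a mask of <3, 1, -1, 0> sorts into <0, 1, 3, -1>, and a mask such as
+ // <2, 3, 0, 1> becomes the identity mask <0, 1, 2, 3>.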
+ SmallVector<int> ConcatMask;
+ Shuffle->getShuffleMask(ConcatMask);
+ sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
+ bool UsesSecondVec =
+ any_of(ConcatMask, [&](int M) { return M >= NumInputElts; });
+ InstructionCost OldCost = TTI.getShuffleCost(
+ UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
+ Shuffle->getShuffleMask());
+ InstructionCost NewCost = TTI.getShuffleCost(
+ UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
+ ConcatMask);
+
+ LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
+ << "\n");
+ LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
+ << "\n");
+ if (NewCost < OldCost) {
+ Builder.SetInsertPoint(Shuffle);
+ Value *NewShuffle = Builder.CreateShuffleVector(
+ Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
+ LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
+ replaceValue(*Shuffle, *NewShuffle);
+ }
+
+ // See if we can re-use foldSelectShuffle, getting it to reduce the size of
+ // the shuffle or sort it into a nicer order, as it can ignore the order of
+ // the lanes.
+ return foldSelectShuffle(*Shuffle, true);
+}
+
+/// This method looks for groups of shuffles acting on binops, of the form:
+/// %x = shuffle ...
+/// %y = shuffle ...
+/// %a = binop %x, %y
+/// %b = binop %x, %y
+/// shuffle %a, %b, selectmask
+/// We may, especially if the shuffle is wider than legal, be able to convert
+/// the shuffle to a form where only parts of a and b need to be computed. On
+/// architectures with no obvious "select" shuffle, this can reduce the total
+/// number of operations if the target reports them as cheaper.
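+/// For example, with a select mask of <0, 5, 2, 7> only lanes 0 and 2 of %a
+/// and lanes 1 and 3 of %b are used, so each binop only needs to compute
+/// half of its lanes.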
+bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
+ auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
+ auto *VT = dyn_cast<FixedVectorType>(I.getType());
+ if (!SVI || !VT)
+ return false;
+ auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
+ if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
+ VT != Op0->getType())
+ return false;
+ auto *SVI0A = dyn_cast<ShuffleVectorInst>(Op0->getOperand(0));
+ auto *SVI0B = dyn_cast<ShuffleVectorInst>(Op0->getOperand(1));
+ auto *SVI1A = dyn_cast<ShuffleVectorInst>(Op1->getOperand(0));
+ auto *SVI1B = dyn_cast<ShuffleVectorInst>(Op1->getOperand(1));
+ auto checkSVNonOpUses = [&](Instruction *I) {
+ if (!I || I->getOperand(0)->getType() != VT)
+ return true;
+ return any_of(I->users(), [&](User *U) { return U != Op0 && U != Op1; });
+ };
+ if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
+ checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
+ return false;
+
+ // Collect all the uses that are shuffles that we can transform together. We
+ // may not have a single shuffle, but a group that can all be transformed
+ // together profitably.
+ SmallVector<ShuffleVectorInst *> Shuffles;
+ auto collectShuffles = [&](Instruction *I) {
+ for (auto *U : I->users()) {
+ auto *SV = dyn_cast<ShuffleVectorInst>(U);
+ if (!SV || SV->getType() != VT)
+ return false;
+ if (!llvm::is_contained(Shuffles, SV))
+ Shuffles.push_back(SV);
+ }
+ return true;
+ };
+ if (!collectShuffles(Op0) || !collectShuffles(Op1))
+ return false;
+ // When called from a reduction we need to be processing a single shuffle,
+ // otherwise the other uses will not be lane-invariant.
+ if (FromReduction && Shuffles.size() > 1)
+ return false;
+
+ // For each of the output shuffles, we try to sort all the first vector
+ // elements to the beginning, followed by the second vector's elements at the
+ // end. If the binops are legalized to smaller vectors, this may reduce the
+ // total number of binops. We compute the ReconstructMask needed to convert
+ // back to the original lane order.
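+ // e.g. with NumElts = 4, a mask of <6, 4, 2, 0> packs to V1 = <2, 0> and
+ // V2 = <2, 0>, with ReconstructMask = <4, 5, 0, 1> to restore the original
+ // lane order from the packed values.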
+ SmallVector<int> V1, V2;
+ SmallVector<SmallVector<int>> ReconstructMasks;
+ int MaxV1Elt = 0, MaxV2Elt = 0;
+ unsigned NumElts = VT->getNumElements();
+ for (ShuffleVectorInst *SVN : Shuffles) {
+ SmallVector<int> Mask;
+ SVN->getShuffleMask(Mask);
+
+ // Check the operands are the same as the original, or reversed (in which
+ // case we need to commute the mask).
+ Value *SVOp0 = SVN->getOperand(0);
+ Value *SVOp1 = SVN->getOperand(1);
+ if (SVOp0 == Op1 && SVOp1 == Op0) {
+ std::swap(SVOp0, SVOp1);
+ ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
+ }
+ if (SVOp0 != Op0 || SVOp1 != Op1)
+ return false;
+
+ // Calculate the reconstruction mask for this shuffle, as the mask needed to
+ // take the packed values from Op0/Op1 and reconstruct the original lane
+ // order.
+ SmallVector<int> ReconstructMask;
+ for (unsigned I = 0; I < Mask.size(); I++) {
+ if (Mask[I] < 0) {
+ ReconstructMask.push_back(-1);
+ } else if (Mask[I] < static_cast<int>(NumElts)) {
+ MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
+ auto It = find(V1, Mask[I]);
+ if (It != V1.end())
+ ReconstructMask.push_back(It - V1.begin());
+ else {
+ ReconstructMask.push_back(V1.size());
+ V1.push_back(Mask[I]);
+ }
+ } else {
+ MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
+ auto It = find(V2, Mask[I] - NumElts);
+ if (It != V2.end())
+ ReconstructMask.push_back(NumElts + It - V2.begin());
+ else {
+ ReconstructMask.push_back(NumElts + V2.size());
+ V2.push_back(Mask[I] - NumElts);
+ }
+ }
+ }
+
+ // For reductions, we know that the lane ordering of the output doesn't
+ // alter the result, and sorting the mask in-order can help simplify the
+ // shuffle away.
+ if (FromReduction)
+ sort(ReconstructMask);
+ ReconstructMasks.push_back(ReconstructMask);
+ }
+
+ // If the maximum elements used from V1 and V2 are not larger than the new
+ // vectors, the vectors are already packed and performing the optimization
+ // again will likely not help any further. This also prevents us from getting
+ // stuck in a cycle in case the costs do not also rule it out.
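+ // e.g. V1 = <1, 0> and V2 = <0, 1> both have a maximum used element of 1
+ // with only two elements used, so they are as packed as they can get.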
+ if (V1.empty() || V2.empty() ||
+ (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
+ MaxV2Elt == static_cast<int>(V2.size()) - 1))
+ return false;
+
+ // Calculate the masks needed for the new input shuffles, which get padded
+ // with undef.
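+ // e.g. with V1 = <2, 0>, V1A takes SVI0A's mask values at positions 2 and 0
+ // and is then padded to NumElts with undef, keeping the full vector width.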
+ SmallVector<int> V1A, V1B, V2A, V2B;
+ for (unsigned I = 0; I < V1.size(); I++) {
+ V1A.push_back(SVI0A->getMaskValue(V1[I]));
+ V1B.push_back(SVI0B->getMaskValue(V1[I]));
+ }
+ for (unsigned I = 0; I < V2.size(); I++) {
+ V2A.push_back(SVI1A->getMaskValue(V2[I]));
+ V2B.push_back(SVI1B->getMaskValue(V2[I]));
+ }
+ while (V1A.size() < NumElts) {
+ V1A.push_back(UndefMaskElem);
+ V1B.push_back(UndefMaskElem);
+ }
+ while (V2A.size() < NumElts) {
+ V2A.push_back(UndefMaskElem);
+ V2B.push_back(UndefMaskElem);
+ }
+
+ auto AddShuffleCost = [&](InstructionCost C, ShuffleVectorInst *SV) {
+ return C +
+ TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, SV->getShuffleMask());
+ };
+ auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
+ return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask);
+ };
+
+ // Get the costs of the shuffles + binops before and after with the new
+ // shuffle masks.
+ InstructionCost CostBefore =
+ TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) +
+ TTI.getArithmeticInstrCost(Op1->getOpcode(), VT);
+ CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
+ InstructionCost(0), AddShuffleCost);
+ // This set helps us only cost each unique shuffle once.
+ SmallPtrSet<ShuffleVectorInst *, 4> InputShuffles(
+ {SVI0A, SVI0B, SVI1A, SVI1B});
+ CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
+ InstructionCost(0), AddShuffleCost);
+
+ // The lanes of the new binops past the used shuffle lengths will be unused.
+ // These narrower types attempt to get the correct cost for that from the
+ // target.
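+ // e.g. if only 3 of 8 lanes are used, the binop is costed as a <3 x ty>
+ // operation, which the target may be able to legalize more cheaply.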
+ FixedVectorType *Op0SmallVT =
+ FixedVectorType::get(VT->getScalarType(), V1.size());
+ FixedVectorType *Op1SmallVT =
+ FixedVectorType::get(VT->getScalarType(), V2.size());
+ InstructionCost CostAfter =
+ TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) +
+ TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT);
+ CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
+ InstructionCost(0), AddShuffleMaskCost);
+ std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
+ CostAfter +=
+ std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
+ InstructionCost(0), AddShuffleMaskCost);
+
+ if (CostBefore <= CostAfter)
+ return false;
+
+ // The cost model has passed, create the new instructions.
+ Builder.SetInsertPoint(SVI0A);
+ Value *NSV0A = Builder.CreateShuffleVector(SVI0A->getOperand(0),
+ SVI0A->getOperand(1), V1A);
+ Builder.SetInsertPoint(SVI0B);
+ Value *NSV0B = Builder.CreateShuffleVector(SVI0B->getOperand(0),
+ SVI0B->getOperand(1), V1B);
+ Builder.SetInsertPoint(SVI1A);
+ Value *NSV1A = Builder.CreateShuffleVector(SVI1A->getOperand(0),
+ SVI1A->getOperand(1), V2A);
+ Builder.SetInsertPoint(SVI1B);
+ Value *NSV1B = Builder.CreateShuffleVector(SVI1B->getOperand(0),
+ SVI1B->getOperand(1), V2B);
+ Builder.SetInsertPoint(Op0);
+ Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
+ NSV0A, NSV0B);
+ if (auto *I = dyn_cast<Instruction>(NOp0))
+ I->copyIRFlags(Op0, true);
+ Builder.SetInsertPoint(Op1);
+ Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
+ NSV1A, NSV1B);
+ if (auto *I = dyn_cast<Instruction>(NOp1))
+ I->copyIRFlags(Op1, true);
+
+ for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
+ Builder.SetInsertPoint(Shuffles[S]);
+ Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
+ replaceValue(*Shuffles[S], *NSV);
+ }
+
+ Worklist.pushValue(NSV0A);
+ Worklist.pushValue(NSV0B);
+ Worklist.pushValue(NSV1A);
+ Worklist.pushValue(NSV1B);
+ for (auto *S : Shuffles)
+ Worklist.add(S);
+ return true;
+}
+
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
@@ -1136,6 +1469,8 @@ bool VectorCombine::run() {
MadeChange |= foldBitcastShuf(I);
MadeChange |= foldExtractedCmps(I);
MadeChange |= foldShuffleOfBinops(I);
+ MadeChange |= foldShuffleFromReductions(I);
+ MadeChange |= foldSelectShuffle(I);
}
MadeChange |= scalarizeBinopOrCmp(I);
MadeChange |= scalarizeLoadExtract(I);