diff options
Diffstat (limited to 'lib/Transforms/InstCombine/InstCombineVectorOps.cpp')
-rw-r--r-- | lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 538 |
1 files changed, 389 insertions, 149 deletions
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 1c2de6352fa5..0ad1fc0e791f 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -46,40 +46,34 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" /// Return true if the value is cheaper to scalarize than it is to leave as a -/// vector operation. isConstant indicates whether we're extracting one known -/// element. If false we're extracting a variable index. -static bool cheapToScalarize(Value *V, bool isConstant) { - if (Constant *C = dyn_cast<Constant>(V)) { - if (isConstant) return true; +/// vector operation. IsConstantExtractIndex indicates whether we are extracting +/// one known element from a vector constant. +/// +/// FIXME: It's possible to create more instructions than previously existed. +static bool cheapToScalarize(Value *V, bool IsConstantExtractIndex) { + // If we can pick a scalar constant value out of a vector, that is free. + if (auto *C = dyn_cast<Constant>(V)) + return IsConstantExtractIndex || C->getSplatValue(); + + // An insertelement to the same constant index as our extract will simplify + // to the scalar inserted element. An insertelement to a different constant + // index is irrelevant to our extract. + if (match(V, m_InsertElement(m_Value(), m_Value(), m_ConstantInt()))) + return IsConstantExtractIndex; + + if (match(V, m_OneUse(m_Load(m_Value())))) + return true; - // If all elts are the same, we can extract it and use any of the values. - if (Constant *Op0 = C->getAggregateElement(0U)) { - for (unsigned i = 1, e = V->getType()->getVectorNumElements(); i != e; - ++i) - if (C->getAggregateElement(i) != Op0) - return false; + Value *V0, *V1; + if (match(V, m_OneUse(m_BinOp(m_Value(V0), m_Value(V1))))) + if (cheapToScalarize(V0, IsConstantExtractIndex) || + cheapToScalarize(V1, IsConstantExtractIndex)) return true; - } - } - Instruction *I = dyn_cast<Instruction>(V); - if (!I) return false; - // Insert element gets simplified to the inserted element or is deleted if - // this is constant idx extract element and its a constant idx insertelt. - if (I->getOpcode() == Instruction::InsertElement && isConstant && - isa<ConstantInt>(I->getOperand(2))) - return true; - if (I->getOpcode() == Instruction::Load && I->hasOneUse()) - return true; - if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) - if (BO->hasOneUse() && - (cheapToScalarize(BO->getOperand(0), isConstant) || - cheapToScalarize(BO->getOperand(1), isConstant))) - return true; - if (CmpInst *CI = dyn_cast<CmpInst>(I)) - if (CI->hasOneUse() && - (cheapToScalarize(CI->getOperand(0), isConstant) || - cheapToScalarize(CI->getOperand(1), isConstant))) + CmpInst::Predicate UnusedPred; + if (match(V, m_OneUse(m_Cmp(UnusedPred, m_Value(V0), m_Value(V1))))) + if (cheapToScalarize(V0, IsConstantExtractIndex) || + cheapToScalarize(V1, IsConstantExtractIndex)) return true; return false; @@ -166,92 +160,176 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { return &EI; } +static Instruction *foldBitcastExtElt(ExtractElementInst &Ext, + InstCombiner::BuilderTy &Builder, + bool IsBigEndian) { + Value *X; + uint64_t ExtIndexC; + if (!match(Ext.getVectorOperand(), m_BitCast(m_Value(X))) || + !X->getType()->isVectorTy() || + !match(Ext.getIndexOperand(), m_ConstantInt(ExtIndexC))) + return nullptr; + + // If this extractelement is using a bitcast from a vector of the same number + // of elements, see if we can find the source element from the source vector: + // extelt (bitcast VecX), IndexC --> bitcast X[IndexC] + Type *SrcTy = X->getType(); + Type *DestTy = Ext.getType(); + unsigned NumSrcElts = SrcTy->getVectorNumElements(); + unsigned NumElts = Ext.getVectorOperandType()->getNumElements(); + if (NumSrcElts == NumElts) + if (Value *Elt = findScalarElement(X, ExtIndexC)) + return new BitCastInst(Elt, DestTy); + + // If the source elements are wider than the destination, try to shift and + // truncate a subset of scalar bits of an insert op. + if (NumSrcElts < NumElts) { + Value *Scalar; + uint64_t InsIndexC; + if (!match(X, m_InsertElement(m_Value(), m_Value(Scalar), + m_ConstantInt(InsIndexC)))) + return nullptr; + + // The extract must be from the subset of vector elements that we inserted + // into. Example: if we inserted element 1 of a <2 x i64> and we are + // extracting an i16 (narrowing ratio = 4), then this extract must be from 1 + // of elements 4-7 of the bitcasted vector. + unsigned NarrowingRatio = NumElts / NumSrcElts; + if (ExtIndexC / NarrowingRatio != InsIndexC) + return nullptr; + + // We are extracting part of the original scalar. How that scalar is + // inserted into the vector depends on the endian-ness. Example: + // Vector Byte Elt Index: 0 1 2 3 4 5 6 7 + // +--+--+--+--+--+--+--+--+ + // inselt <2 x i32> V, <i32> S, 1: |V0|V1|V2|V3|S0|S1|S2|S3| + // extelt <4 x i16> V', 3: | |S2|S3| + // +--+--+--+--+--+--+--+--+ + // If this is little-endian, S2|S3 are the MSB of the 32-bit 'S' value. + // If this is big-endian, S2|S3 are the LSB of the 32-bit 'S' value. + // In this example, we must right-shift little-endian. Big-endian is just a + // truncate. + unsigned Chunk = ExtIndexC % NarrowingRatio; + if (IsBigEndian) + Chunk = NarrowingRatio - 1 - Chunk; + + // Bail out if this is an FP vector to FP vector sequence. That would take + // more instructions than we started with unless there is no shift, and it + // may not be handled as well in the backend. + bool NeedSrcBitcast = SrcTy->getScalarType()->isFloatingPointTy(); + bool NeedDestBitcast = DestTy->isFloatingPointTy(); + if (NeedSrcBitcast && NeedDestBitcast) + return nullptr; + + unsigned SrcWidth = SrcTy->getScalarSizeInBits(); + unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); + unsigned ShAmt = Chunk * DestWidth; + + // TODO: This limitation is more strict than necessary. We could sum the + // number of new instructions and subtract the number eliminated to know if + // we can proceed. + if (!X->hasOneUse() || !Ext.getVectorOperand()->hasOneUse()) + if (NeedSrcBitcast || NeedDestBitcast) + return nullptr; + + if (NeedSrcBitcast) { + Type *SrcIntTy = IntegerType::getIntNTy(Scalar->getContext(), SrcWidth); + Scalar = Builder.CreateBitCast(Scalar, SrcIntTy); + } + + if (ShAmt) { + // Bail out if we could end with more instructions than we started with. + if (!Ext.getVectorOperand()->hasOneUse()) + return nullptr; + Scalar = Builder.CreateLShr(Scalar, ShAmt); + } + + if (NeedDestBitcast) { + Type *DestIntTy = IntegerType::getIntNTy(Scalar->getContext(), DestWidth); + return new BitCastInst(Builder.CreateTrunc(Scalar, DestIntTy), DestTy); + } + return new TruncInst(Scalar, DestTy); + } + + return nullptr; +} + Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { - if (Value *V = SimplifyExtractElementInst(EI.getVectorOperand(), - EI.getIndexOperand(), + Value *SrcVec = EI.getVectorOperand(); + Value *Index = EI.getIndexOperand(); + if (Value *V = SimplifyExtractElementInst(SrcVec, Index, SQ.getWithInstruction(&EI))) return replaceInstUsesWith(EI, V); - // If vector val is constant with all elements the same, replace EI with - // that element. We handle a known element # below. - if (Constant *C = dyn_cast<Constant>(EI.getOperand(0))) - if (cheapToScalarize(C, false)) - return replaceInstUsesWith(EI, C->getAggregateElement(0U)); - // If extracting a specified index from the vector, see if we can recursively // find a previously computed scalar that was inserted into the vector. - if (ConstantInt *IdxC = dyn_cast<ConstantInt>(EI.getOperand(1))) { - unsigned VectorWidth = EI.getVectorOperandType()->getNumElements(); + auto *IndexC = dyn_cast<ConstantInt>(Index); + if (IndexC) { + unsigned NumElts = EI.getVectorOperandType()->getNumElements(); // InstSimplify should handle cases where the index is invalid. - if (!IdxC->getValue().ule(VectorWidth)) + if (!IndexC->getValue().ule(NumElts)) return nullptr; - unsigned IndexVal = IdxC->getZExtValue(); - // This instruction only demands the single element from the input vector. // If the input vector has a single use, simplify it based on this use // property. - if (EI.getOperand(0)->hasOneUse() && VectorWidth != 1) { - APInt UndefElts(VectorWidth, 0); - APInt DemandedMask(VectorWidth, 0); - DemandedMask.setBit(IndexVal); - if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0), DemandedMask, + if (SrcVec->hasOneUse() && NumElts != 1) { + APInt UndefElts(NumElts, 0); + APInt DemandedElts(NumElts, 0); + DemandedElts.setBit(IndexC->getZExtValue()); + if (Value *V = SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts)) { EI.setOperand(0, V); return &EI; } } - // If this extractelement is directly using a bitcast from a vector of - // the same number of elements, see if we can find the source element from - // it. In this case, we will end up needing to bitcast the scalars. - if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) { - if (VectorType *VT = dyn_cast<VectorType>(BCI->getOperand(0)->getType())) - if (VT->getNumElements() == VectorWidth) - if (Value *Elt = findScalarElement(BCI->getOperand(0), IndexVal)) - return new BitCastInst(Elt, EI.getType()); - } + if (Instruction *I = foldBitcastExtElt(EI, Builder, DL.isBigEndian())) + return I; // If there's a vector PHI feeding a scalar use through this extractelement // instruction, try to scalarize the PHI. - if (PHINode *PN = dyn_cast<PHINode>(EI.getOperand(0))) { - Instruction *scalarPHI = scalarizePHI(EI, PN); - if (scalarPHI) - return scalarPHI; - } + if (auto *Phi = dyn_cast<PHINode>(SrcVec)) + if (Instruction *ScalarPHI = scalarizePHI(EI, Phi)) + return ScalarPHI; } - if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) { - // Push extractelement into predecessor operation if legal and - // profitable to do so. - if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { - if (I->hasOneUse() && - cheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) { - Value *newEI0 = - Builder.CreateExtractElement(BO->getOperand(0), EI.getOperand(1), - EI.getName()+".lhs"); - Value *newEI1 = - Builder.CreateExtractElement(BO->getOperand(1), EI.getOperand(1), - EI.getName()+".rhs"); - return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(), - newEI0, newEI1, BO); - } - } else if (InsertElementInst *IE = dyn_cast<InsertElementInst>(I)) { + BinaryOperator *BO; + if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, IndexC)) { + // extelt (binop X, Y), Index --> binop (extelt X, Index), (extelt Y, Index) + Value *X = BO->getOperand(0), *Y = BO->getOperand(1); + Value *E0 = Builder.CreateExtractElement(X, Index); + Value *E1 = Builder.CreateExtractElement(Y, Index); + return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(), E0, E1, BO); + } + + Value *X, *Y; + CmpInst::Predicate Pred; + if (match(SrcVec, m_Cmp(Pred, m_Value(X), m_Value(Y))) && + cheapToScalarize(SrcVec, IndexC)) { + // extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index) + Value *E0 = Builder.CreateExtractElement(X, Index); + Value *E1 = Builder.CreateExtractElement(Y, Index); + return CmpInst::Create(cast<CmpInst>(SrcVec)->getOpcode(), Pred, E0, E1); + } + + if (auto *I = dyn_cast<Instruction>(SrcVec)) { + if (auto *IE = dyn_cast<InsertElementInst>(I)) { // Extracting the inserted element? - if (IE->getOperand(2) == EI.getOperand(1)) + if (IE->getOperand(2) == Index) return replaceInstUsesWith(EI, IE->getOperand(1)); // If the inserted and extracted elements are constants, they must not // be the same value, extract from the pre-inserted value instead. - if (isa<Constant>(IE->getOperand(2)) && isa<Constant>(EI.getOperand(1))) { - Worklist.AddValue(EI.getOperand(0)); + if (isa<Constant>(IE->getOperand(2)) && IndexC) { + Worklist.AddValue(SrcVec); EI.setOperand(0, IE->getOperand(0)); return &EI; } - } else if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I)) { + } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) { // If this is extracting an element from a shufflevector, figure out where // it came from and extract from the appropriate input element instead. - if (ConstantInt *Elt = dyn_cast<ConstantInt>(EI.getOperand(1))) { + if (auto *Elt = dyn_cast<ConstantInt>(Index)) { int SrcIdx = SVI->getMaskValue(Elt->getZExtValue()); Value *Src; unsigned LHSWidth = @@ -270,13 +348,12 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { ConstantInt::get(Int32Ty, SrcIdx, false)); } - } else if (CastInst *CI = dyn_cast<CastInst>(I)) { + } else if (auto *CI = dyn_cast<CastInst>(I)) { // Canonicalize extractelement(cast) -> cast(extractelement). // Bitcasts can change the number of vector elements, and they cost // nothing. if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) { - Value *EE = Builder.CreateExtractElement(CI->getOperand(0), - EI.getIndexOperand()); + Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index); Worklist.AddValue(EE); return CastInst::Create(CI->getOpcode(), EE, EI.getType()); } @@ -791,43 +868,62 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { if (isa<UndefValue>(ScalarOp) || isa<UndefValue>(IdxOp)) replaceInstUsesWith(IE, VecOp); - // If the inserted element was extracted from some other vector, and if the - // indexes are constant, try to turn this into a shufflevector operation. - if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) { - if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) { - unsigned NumInsertVectorElts = IE.getType()->getNumElements(); - unsigned NumExtractVectorElts = - EI->getOperand(0)->getType()->getVectorNumElements(); - unsigned ExtractedIdx = - cast<ConstantInt>(EI->getOperand(1))->getZExtValue(); - unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue(); - - if (ExtractedIdx >= NumExtractVectorElts) // Out of range extract. - return replaceInstUsesWith(IE, VecOp); - - if (InsertedIdx >= NumInsertVectorElts) // Out of range insert. - return replaceInstUsesWith(IE, UndefValue::get(IE.getType())); - - // If we are extracting a value from a vector, then inserting it right - // back into the same place, just use the input vector. - if (EI->getOperand(0) == VecOp && ExtractedIdx == InsertedIdx) - return replaceInstUsesWith(IE, VecOp); - - // If this insertelement isn't used by some other insertelement, turn it - // (and any insertelements it points to), into one big shuffle. - if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.user_back())) { - SmallVector<Constant*, 16> Mask; - ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this); - - // The proposed shuffle may be trivial, in which case we shouldn't - // perform the combine. - if (LR.first != &IE && LR.second != &IE) { - // We now have a shuffle of LHS, RHS, Mask. - if (LR.second == nullptr) - LR.second = UndefValue::get(LR.first->getType()); - return new ShuffleVectorInst(LR.first, LR.second, - ConstantVector::get(Mask)); - } + // If the inserted element was extracted from some other vector and both + // indexes are constant, try to turn this into a shuffle. + uint64_t InsertedIdx, ExtractedIdx; + Value *ExtVecOp; + if (match(IdxOp, m_ConstantInt(InsertedIdx)) && + match(ScalarOp, m_ExtractElement(m_Value(ExtVecOp), + m_ConstantInt(ExtractedIdx)))) { + unsigned NumInsertVectorElts = IE.getType()->getNumElements(); + unsigned NumExtractVectorElts = ExtVecOp->getType()->getVectorNumElements(); + if (ExtractedIdx >= NumExtractVectorElts) // Out of range extract. + return replaceInstUsesWith(IE, VecOp); + + if (InsertedIdx >= NumInsertVectorElts) // Out of range insert. + return replaceInstUsesWith(IE, UndefValue::get(IE.getType())); + + // If we are extracting a value from a vector, then inserting it right + // back into the same place, just use the input vector. + if (ExtVecOp == VecOp && ExtractedIdx == InsertedIdx) + return replaceInstUsesWith(IE, VecOp); + + // TODO: Looking at the user(s) to determine if this insert is a + // fold-to-shuffle opportunity does not match the usual instcombine + // constraints. We should decide if the transform is worthy based only + // on this instruction and its operands, but that may not work currently. + // + // Here, we are trying to avoid creating shuffles before reaching + // the end of a chain of extract-insert pairs. This is complicated because + // we do not generally form arbitrary shuffle masks in instcombine + // (because those may codegen poorly), but collectShuffleElements() does + // exactly that. + // + // The rules for determining what is an acceptable target-independent + // shuffle mask are fuzzy because they evolve based on the backend's + // capabilities and real-world impact. + auto isShuffleRootCandidate = [](InsertElementInst &Insert) { + if (!Insert.hasOneUse()) + return true; + auto *InsertUser = dyn_cast<InsertElementInst>(Insert.user_back()); + if (!InsertUser) + return true; + return false; + }; + + // Try to form a shuffle from a chain of extract-insert ops. + if (isShuffleRootCandidate(IE)) { + SmallVector<Constant*, 16> Mask; + ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this); + + // The proposed shuffle may be trivial, in which case we shouldn't + // perform the combine. + if (LR.first != &IE && LR.second != &IE) { + // We now have a shuffle of LHS, RHS, Mask. + if (LR.second == nullptr) + LR.second = UndefValue::get(LR.first->getType()); + return new ShuffleVectorInst(LR.first, LR.second, + ConstantVector::get(Mask)); } } } @@ -857,7 +953,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { /// Return true if we can evaluate the specified expression tree if the vector /// elements were shuffled in a different order. -static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask, +static bool canEvaluateShuffled(Value *V, ArrayRef<int> Mask, unsigned Depth = 5) { // We can always reorder the elements of a constant. if (isa<Constant>(V)) @@ -904,8 +1000,15 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask, case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::GetElementPtr: { + // Bail out if we would create longer vector ops. We could allow creating + // longer vector ops, but that may result in more expensive codegen. We + // would also need to limit the transform to avoid undefined behavior for + // integer div/rem. + Type *ITy = I->getType(); + if (ITy->isVectorTy() && Mask.size() > ITy->getVectorNumElements()) + return false; for (Value *Operand : I->operands()) { - if (!CanEvaluateShuffled(Operand, Mask, Depth-1)) + if (!canEvaluateShuffled(Operand, Mask, Depth - 1)) return false; } return true; @@ -925,7 +1028,7 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask, SeenOnce = true; } } - return CanEvaluateShuffled(I->getOperand(0), Mask, Depth-1); + return canEvaluateShuffled(I->getOperand(0), Mask, Depth - 1); } } return false; @@ -1009,12 +1112,12 @@ static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) { llvm_unreachable("failed to rebuild vector instructions"); } -Value * -InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { +static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { // Mask.size() does not need to be equal to the number of vector elements. assert(V->getType()->isVectorTy() && "can't reorder non-vector elements"); Type *EltTy = V->getType()->getScalarType(); + Type *I32Ty = IntegerType::getInt32Ty(V->getContext()); if (isa<UndefValue>(V)) return UndefValue::get(VectorType::get(EltTy, Mask.size())); @@ -1025,9 +1128,9 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { SmallVector<Constant *, 16> MaskValues; for (int i = 0, e = Mask.size(); i != e; ++i) { if (Mask[i] == -1) - MaskValues.push_back(UndefValue::get(Builder.getInt32Ty())); + MaskValues.push_back(UndefValue::get(I32Ty)); else - MaskValues.push_back(Builder.getInt32(Mask[i])); + MaskValues.push_back(ConstantInt::get(I32Ty, Mask[i])); } return ConstantExpr::getShuffleVector(C, UndefValue::get(C->getType()), ConstantVector::get(MaskValues)); @@ -1069,7 +1172,7 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { SmallVector<Value*, 8> NewOps; bool NeedsRebuild = (Mask.size() != I->getType()->getVectorNumElements()); for (int i = 0, e = I->getNumOperands(); i != e; ++i) { - Value *V = EvaluateInDifferentElementOrder(I->getOperand(i), Mask); + Value *V = evaluateInDifferentElementOrder(I->getOperand(i), Mask); NewOps.push_back(V); NeedsRebuild |= (V != I->getOperand(i)); } @@ -1096,11 +1199,11 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { // If element is not in Mask, no need to handle the operand 1 (element to // be inserted). Just evaluate values in operand 0 according to Mask. if (!Found) - return EvaluateInDifferentElementOrder(I->getOperand(0), Mask); + return evaluateInDifferentElementOrder(I->getOperand(0), Mask); - Value *V = EvaluateInDifferentElementOrder(I->getOperand(0), Mask); + Value *V = evaluateInDifferentElementOrder(I->getOperand(0), Mask); return InsertElementInst::Create(V, I->getOperand(1), - Builder.getInt32(Index), "", I); + ConstantInt::get(I32Ty, Index), "", I); } } llvm_unreachable("failed to reorder elements of vector instruction!"); @@ -1350,12 +1453,144 @@ static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf, return NewBO; } +/// Match a shuffle-select-shuffle pattern where the shuffles are widening and +/// narrowing (concatenating with undef and extracting back to the original +/// length). This allows replacing the wide select with a narrow select. +static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf, + InstCombiner::BuilderTy &Builder) { + // This must be a narrowing identity shuffle. It extracts the 1st N elements + // of the 1st vector operand of a shuffle. + if (!match(Shuf.getOperand(1), m_Undef()) || !Shuf.isIdentityWithExtract()) + return nullptr; + + // The vector being shuffled must be a vector select that we can eliminate. + // TODO: The one-use requirement could be eased if X and/or Y are constants. + Value *Cond, *X, *Y; + if (!match(Shuf.getOperand(0), + m_OneUse(m_Select(m_Value(Cond), m_Value(X), m_Value(Y))))) + return nullptr; + + // We need a narrow condition value. It must be extended with undef elements + // and have the same number of elements as this shuffle. + unsigned NarrowNumElts = Shuf.getType()->getVectorNumElements(); + Value *NarrowCond; + if (!match(Cond, m_OneUse(m_ShuffleVector(m_Value(NarrowCond), m_Undef(), + m_Constant()))) || + NarrowCond->getType()->getVectorNumElements() != NarrowNumElts || + !cast<ShuffleVectorInst>(Cond)->isIdentityWithPadding()) + return nullptr; + + // shuf (sel (shuf NarrowCond, undef, WideMask), X, Y), undef, NarrowMask) --> + // sel NarrowCond, (shuf X, undef, NarrowMask), (shuf Y, undef, NarrowMask) + Value *Undef = UndefValue::get(X->getType()); + Value *NarrowX = Builder.CreateShuffleVector(X, Undef, Shuf.getMask()); + Value *NarrowY = Builder.CreateShuffleVector(Y, Undef, Shuf.getMask()); + return SelectInst::Create(NarrowCond, NarrowX, NarrowY); +} + +/// Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask. +static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) { + Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1); + if (!Shuf.isIdentityWithExtract() || !isa<UndefValue>(Op1)) + return nullptr; + + Value *X, *Y; + Constant *Mask; + if (!match(Op0, m_ShuffleVector(m_Value(X), m_Value(Y), m_Constant(Mask)))) + return nullptr; + + // We are extracting a subvector from a shuffle. Remove excess elements from + // the 1st shuffle mask to eliminate the extract. + // + // This transform is conservatively limited to identity extracts because we do + // not allow arbitrary shuffle mask creation as a target-independent transform + // (because we can't guarantee that will lower efficiently). + // + // If the extracting shuffle has an undef mask element, it transfers to the + // new shuffle mask. Otherwise, copy the original mask element. Example: + // shuf (shuf X, Y, <C0, C1, C2, undef, C4>), undef, <0, undef, 2, 3> --> + // shuf X, Y, <C0, undef, C2, undef> + unsigned NumElts = Shuf.getType()->getVectorNumElements(); + SmallVector<Constant *, 16> NewMask(NumElts); + assert(NumElts < Mask->getType()->getVectorNumElements() && + "Identity with extract must have less elements than its inputs"); + + for (unsigned i = 0; i != NumElts; ++i) { + Constant *ExtractMaskElt = Shuf.getMask()->getAggregateElement(i); + Constant *MaskElt = Mask->getAggregateElement(i); + NewMask[i] = isa<UndefValue>(ExtractMaskElt) ? ExtractMaskElt : MaskElt; + } + return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask)); +} + +/// Try to replace a shuffle with an insertelement. +static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf) { + Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1); + SmallVector<int, 16> Mask = Shuf.getShuffleMask(); + + // The shuffle must not change vector sizes. + // TODO: This restriction could be removed if the insert has only one use + // (because the transform would require a new length-changing shuffle). + int NumElts = Mask.size(); + if (NumElts != (int)(V0->getType()->getVectorNumElements())) + return nullptr; + + // shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC' + auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) { + // We need an insertelement with a constant index. + if (!match(V0, m_InsertElement(m_Value(), m_Value(Scalar), + m_ConstantInt(IndexC)))) + return false; + + // Test the shuffle mask to see if it splices the inserted scalar into the + // operand 1 vector of the shuffle. + int NewInsIndex = -1; + for (int i = 0; i != NumElts; ++i) { + // Ignore undef mask elements. + if (Mask[i] == -1) + continue; + + // The shuffle takes elements of operand 1 without lane changes. + if (Mask[i] == NumElts + i) + continue; + + // The shuffle must choose the inserted scalar exactly once. + if (NewInsIndex != -1 || Mask[i] != IndexC->getSExtValue()) + return false; + + // The shuffle is placing the inserted scalar into element i. + NewInsIndex = i; + } + + assert(NewInsIndex != -1 && "Did not fold shuffle with unused operand?"); + + // Index is updated to the potentially translated insertion lane. + IndexC = ConstantInt::get(IndexC->getType(), NewInsIndex); + return true; + }; + + // If the shuffle is unnecessary, insert the scalar operand directly into + // operand 1 of the shuffle. Example: + // shuffle (insert ?, S, 1), V1, <1, 5, 6, 7> --> insert V1, S, 0 + Value *Scalar; + ConstantInt *IndexC; + if (isShufflingScalarIntoOp1(Scalar, IndexC)) + return InsertElementInst::Create(V1, Scalar, IndexC); + + // Try again after commuting shuffle. Example: + // shuffle V0, (insert ?, S, 0), <0, 1, 2, 4> --> + // shuffle (insert ?, S, 0), V0, <4, 5, 6, 0> --> insert V0, S, 3 + std::swap(V0, V1); + ShuffleVectorInst::commuteShuffleMask(Mask, NumElts); + if (isShufflingScalarIntoOp1(Scalar, IndexC)) + return InsertElementInst::Create(V1, Scalar, IndexC); + + return nullptr; +} + Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); Value *RHS = SVI.getOperand(1); - SmallVector<int, 16> Mask = SVI.getShuffleMask(); - Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); - if (auto *V = SimplifyShuffleVectorInst( LHS, RHS, SVI.getMask(), SVI.getType(), SQ.getWithInstruction(&SVI))) return replaceInstUsesWith(SVI, V); @@ -1363,9 +1598,10 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (Instruction *I = foldSelectShuffle(SVI, Builder, DL)) return I; - bool MadeChange = false; - unsigned VWidth = SVI.getType()->getVectorNumElements(); + if (Instruction *I = narrowVectorSelect(SVI, Builder)) + return I; + unsigned VWidth = SVI.getType()->getVectorNumElements(); APInt UndefElts(VWidth, 0); APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) { @@ -1374,18 +1610,22 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { return &SVI; } + if (Instruction *I = foldIdentityExtractShuffle(SVI)) + return I; + + // This transform has the potential to lose undef knowledge, so it is + // intentionally placed after SimplifyDemandedVectorElts(). + if (Instruction *I = foldShuffleWithInsert(SVI)) + return I; + + SmallVector<int, 16> Mask = SVI.getShuffleMask(); + Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); unsigned LHSWidth = LHS->getType()->getVectorNumElements(); + bool MadeChange = false; // Canonicalize shuffle(x ,x,mask) -> shuffle(x, undef,mask') // Canonicalize shuffle(undef,x,mask) -> shuffle(x, undef,mask'). if (LHS == RHS || isa<UndefValue>(LHS)) { - if (isa<UndefValue>(LHS) && LHS == RHS) { - // shuffle(undef,undef,mask) -> undef. - Value *Result = (VWidth == LHSWidth) - ? LHS : UndefValue::get(SVI.getType()); - return replaceInstUsesWith(SVI, Result); - } - // Remap any references to RHS to use LHS. SmallVector<Constant*, 16> Elts; for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) { @@ -1421,8 +1661,8 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (isRHSID) return replaceInstUsesWith(SVI, RHS); } - if (isa<UndefValue>(RHS) && CanEvaluateShuffled(LHS, Mask)) { - Value *V = EvaluateInDifferentElementOrder(LHS, Mask); + if (isa<UndefValue>(RHS) && canEvaluateShuffled(LHS, Mask)) { + Value *V = evaluateInDifferentElementOrder(LHS, Mask); return replaceInstUsesWith(SVI, V); } |