diff options
Diffstat (limited to 'llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp')
| -rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 312 | 
1 files changed, 199 insertions, 113 deletions
| diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 47ce83974c8d..7cfe4c8b5892 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -87,7 +87,10 @@ bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo,    Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known,                                            Depth, I);    if (!NewVal) return false; -  U = NewVal; +  if (Instruction* OpInst = dyn_cast<Instruction>(U)) +    salvageDebugInfo(*OpInst); +     +  replaceUse(U, NewVal);    return true;  } @@ -173,15 +176,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,      assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");      assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); -    // Output known-0 are known to be clear if zero in either the LHS | RHS. -    APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero; -    // Output known-1 bits are only known if set in both the LHS & RHS. -    APInt IKnownOne = RHSKnown.One & LHSKnown.One; +    Known = LHSKnown & RHSKnown;      // If the client is only demanding bits that we know, return the known      // constant. -    if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) -      return Constant::getIntegerValue(VTy, IKnownOne); +    if (DemandedMask.isSubsetOf(Known.Zero | Known.One)) +      return Constant::getIntegerValue(VTy, Known.One);      // If all of the demanded bits are known 1 on one side, return the other.      // These bits cannot contribute to the result of the 'and'. @@ -194,8 +194,6 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,      if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnown.Zero))        return I; -    Known.Zero = std::move(IKnownZero); -    Known.One  = std::move(IKnownOne);      break;    }    case Instruction::Or: { @@ -207,15 +205,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,      assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");      assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); -    // Output known-0 bits are only known if clear in both the LHS & RHS. -    APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero; -    // Output known-1 are known. to be set if s.et in either the LHS | RHS. -    APInt IKnownOne = RHSKnown.One | LHSKnown.One; +    Known = LHSKnown | RHSKnown;      // If the client is only demanding bits that we know, return the known      // constant. -    if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) -      return Constant::getIntegerValue(VTy, IKnownOne); +    if (DemandedMask.isSubsetOf(Known.Zero | Known.One)) +      return Constant::getIntegerValue(VTy, Known.One);      // If all of the demanded bits are known zero on one side, return the other.      // These bits cannot contribute to the result of the 'or'. @@ -228,8 +223,6 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,      if (ShrinkDemandedConstant(I, 1, DemandedMask))        return I; -    Known.Zero = std::move(IKnownZero); -    Known.One  = std::move(IKnownOne);      break;    }    case Instruction::Xor: { @@ -239,17 +232,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,      assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");      assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); -    // Output known-0 bits are known if clear or set in both the LHS & RHS. -    APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) | -                       (RHSKnown.One & LHSKnown.One); -    // Output known-1 are known to be set if set in only one of the LHS, RHS. -    APInt IKnownOne =  (RHSKnown.Zero & LHSKnown.One) | -                       (RHSKnown.One & LHSKnown.Zero); +    Known = LHSKnown ^ RHSKnown;      // If the client is only demanding bits that we know, return the known      // constant. -    if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) -      return Constant::getIntegerValue(VTy, IKnownOne); +    if (DemandedMask.isSubsetOf(Known.Zero | Known.One)) +      return Constant::getIntegerValue(VTy, Known.One);      // If all of the demanded bits are known zero on one side, return the other.      // These bits cannot contribute to the result of the 'xor'. @@ -309,10 +297,6 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,          return InsertNewInstWith(NewXor, *I);        } -    // Output known-0 bits are known if clear or set in both the LHS & RHS. -    Known.Zero = std::move(IKnownZero); -    // Output known-1 are known to be set if set in only one of the LHS, RHS. -    Known.One  = std::move(IKnownOne);      break;    }    case Instruction::Select: { @@ -396,8 +380,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,      if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1))        return I;      assert(InputKnown.getBitWidth() == SrcBitWidth && "Src width changed?"); -    Known = InputKnown.zextOrTrunc(BitWidth, -                                   true /* ExtendedBitsAreKnownZero */); +    Known = InputKnown.zextOrTrunc(BitWidth);      assert(!Known.hasConflict() && "Bits known to be one AND zero?");      break;    } @@ -453,6 +436,43 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,      break;    }    case Instruction::Add: +    if ((DemandedMask & 1) == 0) { +      // If we do not need the low bit, try to convert bool math to logic: +      // add iN (zext i1 X), (sext i1 Y) --> sext (~X & Y) to iN +      Value *X, *Y; +      if (match(I, m_c_Add(m_OneUse(m_ZExt(m_Value(X))), +                           m_OneUse(m_SExt(m_Value(Y))))) && +          X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) { +        // Truth table for inputs and output signbits: +        //       X:0 | X:1 +        //      ---------- +        // Y:0  |  0 | 0 | +        // Y:1  | -1 | 0 | +        //      ---------- +        IRBuilderBase::InsertPointGuard Guard(Builder); +        Builder.SetInsertPoint(I); +        Value *AndNot = Builder.CreateAnd(Builder.CreateNot(X), Y); +        return Builder.CreateSExt(AndNot, VTy); +      } + +      // add iN (sext i1 X), (sext i1 Y) --> sext (X | Y) to iN +      // TODO: Relax the one-use checks because we are removing an instruction? +      if (match(I, m_Add(m_OneUse(m_SExt(m_Value(X))), +                         m_OneUse(m_SExt(m_Value(Y))))) && +          X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) { +        // Truth table for inputs and output signbits: +        //       X:0 | X:1 +        //      ----------- +        // Y:0  | -1 | -1 | +        // Y:1  | -1 |  0 | +        //      ----------- +        IRBuilderBase::InsertPointGuard Guard(Builder); +        Builder.SetInsertPoint(I); +        Value *Or = Builder.CreateOr(X, Y); +        return Builder.CreateSExt(Or, VTy); +      } +    } +    LLVM_FALLTHROUGH;    case Instruction::Sub: {      /// If the high-bits of an ADD/SUB are not demanded, then we do not care      /// about the high bits of the operands. @@ -515,11 +535,27 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,        if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))          return I;        assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + +      bool SignBitZero = Known.Zero.isSignBitSet(); +      bool SignBitOne = Known.One.isSignBitSet();        Known.Zero <<= ShiftAmt;        Known.One  <<= ShiftAmt;        // low bits known zero.        if (ShiftAmt)          Known.Zero.setLowBits(ShiftAmt); + +      // If this shift has "nsw" keyword, then the result is either a poison +      // value or has the same sign bit as the first operand. +      if (IOp->hasNoSignedWrap()) { +        if (SignBitZero) +          Known.Zero.setSignBit(); +        else if (SignBitOne) +          Known.One.setSignBit(); +        if (Known.hasConflict()) +          return UndefValue::get(I->getType()); +      } +    } else { +      computeKnownBits(I, Known, Depth, CxtI);      }      break;    } @@ -543,6 +579,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,        Known.One.lshrInPlace(ShiftAmt);        if (ShiftAmt)          Known.Zero.setHighBits(ShiftAmt);  // high bits known zero. +    } else { +      computeKnownBits(I, Known, Depth, CxtI);      }      break;    } @@ -603,6 +641,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,        } else if (Known.One[BitWidth-ShiftAmt-1]) { // New bits are known one.          Known.One |= HighBits;        } +    } else { +      computeKnownBits(I, Known, Depth, CxtI);      }      break;    } @@ -624,6 +664,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,        // Propagate zero bits from the input.        Known.Zero.setHighBits(std::min(            BitWidth, LHSKnown.Zero.countLeadingOnes() + RHSTrailingZeros)); +    } else { +      computeKnownBits(I, Known, Depth, CxtI);      }      break;    } @@ -682,7 +724,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,      Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;      break;    } -  case Instruction::Call: +  case Instruction::Call: { +    bool KnownBitsComputed = false;      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {        switch (II->getIntrinsicID()) {        default: break; @@ -714,8 +757,6 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,            NewVal->takeName(I);            return InsertNewInstWith(NewVal, *I);          } - -        // TODO: Could compute known zero/one bits based on the input.          break;        }        case Intrinsic::fshr: @@ -740,6 +781,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,                       RHSKnown.Zero.lshr(BitWidth - ShiftAmt);          Known.One = LHSKnown.One.shl(ShiftAmt) |                      RHSKnown.One.lshr(BitWidth - ShiftAmt); +        KnownBitsComputed = true;          break;        }        case Intrinsic::x86_mmx_pmovmskb: @@ -768,16 +810,21 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,          // We know that the upper bits are set to zero.          Known.Zero.setBitsFrom(ArgWidth); -        return nullptr; +        KnownBitsComputed = true; +        break;        }        case Intrinsic::x86_sse42_crc32_64_64:          Known.Zero.setBitsFrom(32); -        return nullptr; +        KnownBitsComputed = true; +        break;        }      } -    computeKnownBits(V, Known, Depth, CxtI); + +    if (!KnownBitsComputed) +      computeKnownBits(V, Known, Depth, CxtI);      break;    } +  }    // If the client is only demanding bits that we know, return the known    // constant. @@ -811,15 +858,12 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,      computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,                       CxtI); -    // Output known-0 are known to be clear if zero in either the LHS | RHS. -    APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero; -    // Output known-1 bits are only known if set in both the LHS & RHS. -    APInt IKnownOne = RHSKnown.One & LHSKnown.One; +    Known = LHSKnown & RHSKnown;      // If the client is only demanding bits that we know, return the known      // constant. -    if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) -      return Constant::getIntegerValue(ITy, IKnownOne); +    if (DemandedMask.isSubsetOf(Known.Zero | Known.One)) +      return Constant::getIntegerValue(ITy, Known.One);      // If all of the demanded bits are known 1 on one side, return the other.      // These bits cannot contribute to the result of the 'and' in this @@ -829,8 +873,6 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,      if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))        return I->getOperand(1); -    Known.Zero = std::move(IKnownZero); -    Known.One  = std::move(IKnownOne);      break;    }    case Instruction::Or: { @@ -842,15 +884,12 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,      computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,                       CxtI); -    // Output known-0 bits are only known if clear in both the LHS & RHS. -    APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero; -    // Output known-1 are known to be set if set in either the LHS | RHS. -    APInt IKnownOne = RHSKnown.One | LHSKnown.One; +    Known = LHSKnown | RHSKnown;      // If the client is only demanding bits that we know, return the known      // constant. -    if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) -      return Constant::getIntegerValue(ITy, IKnownOne); +    if (DemandedMask.isSubsetOf(Known.Zero | Known.One)) +      return Constant::getIntegerValue(ITy, Known.One);      // If all of the demanded bits are known zero on one side, return the      // other.  These bits cannot contribute to the result of the 'or' in this @@ -860,8 +899,6 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,      if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))        return I->getOperand(1); -    Known.Zero = std::move(IKnownZero); -    Known.One  = std::move(IKnownOne);      break;    }    case Instruction::Xor: { @@ -872,17 +909,12 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,      computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,                       CxtI); -    // Output known-0 bits are known if clear or set in both the LHS & RHS. -    APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) | -                       (RHSKnown.One & LHSKnown.One); -    // Output known-1 are known to be set if set in only one of the LHS, RHS. -    APInt IKnownOne =  (RHSKnown.Zero & LHSKnown.One) | -                       (RHSKnown.One & LHSKnown.Zero); +    Known = LHSKnown ^ RHSKnown;      // If the client is only demanding bits that we know, return the known      // constant. -    if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) -      return Constant::getIntegerValue(ITy, IKnownOne); +    if (DemandedMask.isSubsetOf(Known.Zero | Known.One)) +      return Constant::getIntegerValue(ITy, Known.One);      // If all of the demanded bits are known zero on one side, return the      // other. @@ -891,10 +923,6 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,      if (DemandedMask.isSubsetOf(LHSKnown.Zero))        return I->getOperand(1); -    // Output known-0 bits are known if clear or set in both the LHS & RHS. -    Known.Zero = std::move(IKnownZero); -    // Output known-1 are known to be set if set in only one of the LHS, RHS. -    Known.One  = std::move(IKnownOne);      break;    }    default: @@ -1008,17 +1036,69 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,        DemandedElts.getActiveBits() == 3)      return nullptr; -  unsigned VWidth = II->getType()->getVectorNumElements(); +  auto *IIVTy = cast<VectorType>(II->getType()); +  unsigned VWidth = IIVTy->getNumElements();    if (VWidth == 1)      return nullptr; -  ConstantInt *NewDMask = nullptr; +  IRBuilderBase::InsertPointGuard Guard(Builder); +  Builder.SetInsertPoint(II); + +  // Assume the arguments are unchanged and later override them, if needed. +  SmallVector<Value *, 16> Args(II->arg_begin(), II->arg_end());    if (DMaskIdx < 0) { -    // Pretend that a prefix of elements is demanded to simplify the code -    // below. -    DemandedElts = (1 << DemandedElts.getActiveBits()) - 1; +    // Buffer case. + +    const unsigned ActiveBits = DemandedElts.getActiveBits(); +    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros(); + +    // Start assuming the prefix of elements is demanded, but possibly clear +    // some other bits if there are trailing zeros (unused components at front) +    // and update offset. +    DemandedElts = (1 << ActiveBits) - 1; + +    if (UnusedComponentsAtFront > 0) { +      static const unsigned InvalidOffsetIdx = 0xf; + +      unsigned OffsetIdx; +      switch (II->getIntrinsicID()) { +      case Intrinsic::amdgcn_raw_buffer_load: +        OffsetIdx = 1; +        break; +      case Intrinsic::amdgcn_s_buffer_load: +        // If resulting type is vec3, there is no point in trimming the +        // load with updated offset, as the vec3 would most likely be widened to +        // vec4 anyway during lowering. +        if (ActiveBits == 4 && UnusedComponentsAtFront == 1) +          OffsetIdx = InvalidOffsetIdx; +        else +          OffsetIdx = 1; +        break; +      case Intrinsic::amdgcn_struct_buffer_load: +        OffsetIdx = 2; +        break; +      default: +        // TODO: handle tbuffer* intrinsics. +        OffsetIdx = InvalidOffsetIdx; +        break; +      } + +      if (OffsetIdx != InvalidOffsetIdx) { +        // Clear demanded bits and update the offset. +        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1); +        auto *Offset = II->getArgOperand(OffsetIdx); +        unsigned SingleComponentSizeInBits = +            getDataLayout().getTypeSizeInBits(II->getType()->getScalarType()); +        unsigned OffsetAdd = +            UnusedComponentsAtFront * SingleComponentSizeInBits / 8; +        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd); +        Args[OffsetIdx] = Builder.CreateAdd(Offset, OffsetAddVal); +      } +    }    } else { +    // Image case. +      ConstantInt *DMask = cast<ConstantInt>(II->getArgOperand(DMaskIdx));      unsigned DMaskVal = DMask->getZExtValue() & 0xf; @@ -1037,7 +1117,7 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,      }      if (DMaskVal != NewDMaskVal) -      NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal); +      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);    }    unsigned NewNumElts = DemandedElts.countPopulation(); @@ -1045,39 +1125,25 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,      return UndefValue::get(II->getType());    if (NewNumElts >= VWidth && DemandedElts.isMask()) { -    if (NewDMask) -      II->setArgOperand(DMaskIdx, NewDMask); +    if (DMaskIdx >= 0) +      II->setArgOperand(DMaskIdx, Args[DMaskIdx]);      return nullptr;    } -  // Determine the overload types of the original intrinsic. -  auto IID = II->getIntrinsicID(); -  SmallVector<Intrinsic::IITDescriptor, 16> Table; -  getIntrinsicInfoTableEntries(IID, Table); -  ArrayRef<Intrinsic::IITDescriptor> TableRef = Table; -    // Validate function argument and return types, extracting overloaded types    // along the way. -  FunctionType *FTy = II->getCalledFunction()->getFunctionType();    SmallVector<Type *, 6> OverloadTys; -  Intrinsic::matchIntrinsicSignature(FTy, TableRef, OverloadTys); +  if (!Intrinsic::getIntrinsicSignature(II->getCalledFunction(), OverloadTys)) +    return nullptr;    Module *M = II->getParent()->getParent()->getParent(); -  Type *EltTy = II->getType()->getVectorElementType(); -  Type *NewTy = (NewNumElts == 1) ? EltTy : VectorType::get(EltTy, NewNumElts); +  Type *EltTy = IIVTy->getElementType(); +  Type *NewTy = +      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);    OverloadTys[0] = NewTy; -  Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys); - -  SmallVector<Value *, 16> Args; -  for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I) -    Args.push_back(II->getArgOperand(I)); - -  if (NewDMask) -    Args[DMaskIdx] = NewDMask; - -  IRBuilderBase::InsertPointGuard Guard(Builder); -  Builder.SetInsertPoint(II); +  Function *NewIntrin = +      Intrinsic::getDeclaration(M, II->getIntrinsicID(), OverloadTys);    CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);    NewCall->takeName(II); @@ -1088,7 +1154,7 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,                                         DemandedElts.countTrailingZeros());    } -  SmallVector<uint32_t, 8> EltMask; +  SmallVector<int, 8> EltMask;    unsigned NewLoadIdx = 0;    for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {      if (!!DemandedElts[OrigLoadIdx]) @@ -1120,7 +1186,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,                                                  APInt &UndefElts,                                                  unsigned Depth,                                                  bool AllowMultipleUsers) { -  unsigned VWidth = V->getType()->getVectorNumElements(); +  // Cannot analyze scalable type. The number of vector elements is not a +  // compile-time constant. +  if (isa<ScalableVectorType>(V->getType())) +    return nullptr; + +  unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();    APInt EltMask(APInt::getAllOnesValue(VWidth));    assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!"); @@ -1199,10 +1270,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,      auto *II = dyn_cast<IntrinsicInst>(Inst);      Value *Op = II ? II->getArgOperand(OpNum) : Inst->getOperand(OpNum);      if (Value *V = SimplifyDemandedVectorElts(Op, Demanded, Undef, Depth + 1)) { -      if (II) -        II->setArgOperand(OpNum, V); -      else -        Inst->setOperand(OpNum, V); +      replaceOperand(*Inst, OpNum, V);        MadeChange = true;      }    }; @@ -1268,7 +1336,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,      // If this is inserting an element that isn't demanded, remove this      // insertelement.      if (IdxNo >= VWidth || !DemandedElts[IdxNo]) { -      Worklist.Add(I); +      Worklist.push(I);        return I->getOperand(0);      } @@ -1282,7 +1350,25 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,             Shuffle->getOperand(1)->getType() &&             "Expected shuffle operands to have same type");      unsigned OpWidth = -        Shuffle->getOperand(0)->getType()->getVectorNumElements(); +        cast<VectorType>(Shuffle->getOperand(0)->getType())->getNumElements(); +    // Handle trivial case of a splat. Only check the first element of LHS +    // operand. +    if (all_of(Shuffle->getShuffleMask(), [](int Elt) { return Elt == 0; }) && +        DemandedElts.isAllOnesValue()) { +      if (!isa<UndefValue>(I->getOperand(1))) { +        I->setOperand(1, UndefValue::get(I->getOperand(1)->getType())); +        MadeChange = true; +      } +      APInt LeftDemanded(OpWidth, 1); +      APInt LHSUndefElts(OpWidth, 0); +      simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts); +      if (LHSUndefElts[0]) +        UndefElts = EltMask; +      else +        UndefElts.clearAllBits(); +      break; +    } +      APInt LeftDemanded(OpWidth, 0), RightDemanded(OpWidth, 0);      for (unsigned i = 0; i < VWidth; i++) {        if (DemandedElts[i]) { @@ -1396,15 +1482,14 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,      }      if (NewUndefElts) {        // Add additional discovered undefs. -      SmallVector<Constant*, 16> Elts; +      SmallVector<int, 16> Elts;        for (unsigned i = 0; i < VWidth; ++i) {          if (UndefElts[i]) -          Elts.push_back(UndefValue::get(Type::getInt32Ty(I->getContext()))); +          Elts.push_back(UndefMaskElem);          else -          Elts.push_back(ConstantInt::get(Type::getInt32Ty(I->getContext()), -                                          Shuffle->getMaskValue(i))); +          Elts.push_back(Shuffle->getMaskValue(i));        } -      I->setOperand(2, ConstantVector::get(Elts)); +      Shuffle->setShuffleMask(Elts);        MadeChange = true;      }      break; @@ -1549,7 +1634,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,        // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.        // Instead we should return a zero vector.        if (!DemandedElts[0]) { -        Worklist.Add(II); +        Worklist.push(II);          return ConstantAggregateZero::get(II->getType());        } @@ -1568,7 +1653,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,        // If lowest element of a scalar op isn't used then use Arg0.        if (!DemandedElts[0]) { -        Worklist.Add(II); +        Worklist.push(II);          return II->getArgOperand(0);        }        // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions @@ -1588,7 +1673,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,        // If lowest element of a scalar op isn't used then use Arg0.        if (!DemandedElts[0]) { -        Worklist.Add(II); +        Worklist.push(II);          return II->getArgOperand(0);        } @@ -1615,7 +1700,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,        // If lowest element of a scalar op isn't used then use Arg0.        if (!DemandedElts[0]) { -        Worklist.Add(II); +        Worklist.push(II);          return II->getArgOperand(0);        } @@ -1649,7 +1734,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,        // If lowest element of a scalar op isn't used then use Arg0.        if (!DemandedElts[0]) { -        Worklist.Add(II); +        Worklist.push(II);          return II->getArgOperand(0);        } @@ -1678,7 +1763,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,      case Intrinsic::x86_avx512_packusdw_512:      case Intrinsic::x86_avx512_packuswb_512: {        auto *Ty0 = II->getArgOperand(0)->getType(); -      unsigned InnerVWidth = Ty0->getVectorNumElements(); +      unsigned InnerVWidth = cast<VectorType>(Ty0)->getNumElements();        assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");        unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; @@ -1747,6 +1832,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,      case Intrinsic::amdgcn_raw_buffer_load:      case Intrinsic::amdgcn_raw_buffer_load_format:      case Intrinsic::amdgcn_raw_tbuffer_load: +    case Intrinsic::amdgcn_s_buffer_load:      case Intrinsic::amdgcn_struct_buffer_load:      case Intrinsic::amdgcn_struct_buffer_load_format:      case Intrinsic::amdgcn_struct_tbuffer_load: | 
