diff options
Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUISelLowering.cpp')
| -rw-r--r-- | lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 555 | 
1 files changed, 445 insertions, 110 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 258b1737deb3..49929441ef21 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -13,6 +13,10 @@  //  //===----------------------------------------------------------------------===// +#define AMDGPU_LOG2E_F     1.44269504088896340735992468100189214f +#define AMDGPU_LN2_F       0.693147180559945309417232121458176568f +#define AMDGPU_LN10_F      2.30258509299404568401799145468436421f +  #include "AMDGPUISelLowering.h"  #include "AMDGPU.h"  #include "AMDGPUCallLowering.h" @@ -20,6 +24,7 @@  #include "AMDGPUIntrinsicInfo.h"  #include "AMDGPURegisterInfo.h"  #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h"  #include "R600MachineFunctionInfo.h"  #include "SIInstrInfo.h"  #include "SIMachineFunctionInfo.h" @@ -127,27 +132,20 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {    return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);  } -bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op) -{ -  assert(Op.getOpcode() == ISD::OR); - -  SDValue N0 = Op->getOperand(0); -  SDValue N1 = Op->getOperand(1); -  EVT VT = N0.getValueType(); - -  if (VT.isInteger() && !VT.isVector()) { -    KnownBits LHSKnown, RHSKnown; -    DAG.computeKnownBits(N0, LHSKnown); +unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) { +  KnownBits Known; +  EVT VT = Op.getValueType(); +  DAG.computeKnownBits(Op, Known); -    if (LHSKnown.Zero.getBoolValue()) { -      DAG.computeKnownBits(N1, RHSKnown); +  return VT.getSizeInBits() - Known.countMinLeadingZeros(); +} -      if (!(~RHSKnown.Zero & ~LHSKnown.Zero)) -        return true; -    } -  } +unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) { +  EVT VT = Op.getValueType(); -  return false; +  // In order for this to be a signed 24-bit value, bit 23, must +  // be a sign bit. 
+  return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);  }  AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, @@ -323,6 +321,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,    setOperationAction(ISD::FROUND, MVT::f32, Custom);    setOperationAction(ISD::FROUND, MVT::f64, Custom); +  setOperationAction(ISD::FLOG, MVT::f32, Custom); +  setOperationAction(ISD::FLOG10, MVT::f32, Custom); + +  if (Subtarget->has16BitInsts()) { +    setOperationAction(ISD::FLOG, MVT::f16, Custom); +    setOperationAction(ISD::FLOG10, MVT::f16, Custom); +  } +    setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);    setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); @@ -399,8 +405,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,    setOperationAction(ISD::MUL, MVT::i64, Expand);    setOperationAction(ISD::MULHU, MVT::i64, Expand);    setOperationAction(ISD::MULHS, MVT::i64, Expand); -  setOperationAction(ISD::UDIV, MVT::i32, Expand); -  setOperationAction(ISD::UREM, MVT::i32, Expand);    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); @@ -416,8 +420,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);    if (Subtarget->hasFFBL()) -    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal); +    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); +  setOperationAction(ISD::CTTZ, MVT::i64, Custom); +  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);    setOperationAction(ISD::CTLZ, MVT::i64, Custom);    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); @@ -475,6 +481,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,      setOperationAction(ISD::CTTZ, VT, Expand);      setOperationAction(ISD::CTLZ, VT, Expand);      setOperationAction(ISD::VECTOR_SHUFFLE, 
VT, Expand); +    setOperationAction(ISD::SETCC, VT, Expand);    }    static const MVT::SimpleValueType FloatVectorTypes[] = { @@ -492,6 +499,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,      setOperationAction(ISD::FEXP2, VT, Expand);      setOperationAction(ISD::FLOG2, VT, Expand);      setOperationAction(ISD::FREM, VT, Expand); +    setOperationAction(ISD::FLOG, VT, Expand); +    setOperationAction(ISD::FLOG10, VT, Expand);      setOperationAction(ISD::FPOW, VT, Expand);      setOperationAction(ISD::FFLOOR, VT, Expand);      setOperationAction(ISD::FTRUNC, VT, Expand); @@ -507,6 +516,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,      setOperationAction(ISD::SELECT_CC, VT, Expand);      setOperationAction(ISD::FCOPYSIGN, VT, Expand);      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); +    setOperationAction(ISD::SETCC, VT, Expand);    }    // This causes using an unrolled select operation rather than expansion with @@ -822,6 +832,17 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {    return isZExtFree(Val.getValueType(), VT2);  } +// v_mad_mix* support a conversion from f16 to f32. +// +// There is only one special case, when denormals are enabled, where this is +// OK to use, and we don't currently handle it. +bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode, +                                           EVT DestVT, EVT SrcVT) const { +  return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() && +         DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() && +         SrcVT.getScalarType() == MVT::f16; +} +  bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {    // There aren't really 64-bit registers, but pairs of 32-bit ones and only a    // limited number of native 64-bit operations. 
Shrinking an operation to fit @@ -847,9 +868,12 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,    case CallingConv::AMDGPU_PS:    case CallingConv::AMDGPU_CS:    case CallingConv::AMDGPU_HS: +  case CallingConv::AMDGPU_ES: +  case CallingConv::AMDGPU_LS:      return CC_AMDGPU;    case CallingConv::C:    case CallingConv::Fast: +  case CallingConv::Cold:      return CC_AMDGPU_Func;    default:      report_fatal_error("Unsupported calling convention."); @@ -867,9 +891,12 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,    case CallingConv::AMDGPU_PS:    case CallingConv::AMDGPU_CS:    case CallingConv::AMDGPU_HS: +  case CallingConv::AMDGPU_ES: +  case CallingConv::AMDGPU_LS:      return RetCC_SI_Shader;    case CallingConv::C:    case CallingConv::Fast: +  case CallingConv::Cold:      return RetCC_AMDGPU_Func;    default:      report_fatal_error("Unsupported calling convention."); @@ -1000,12 +1027,49 @@ CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,    return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);  } -SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, -                                        SmallVectorImpl<SDValue> &InVals) const { +SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain, +                                                  SelectionDAG &DAG, +                                                  MachineFrameInfo &MFI, +                                                  int ClobberedFI) const { +  SmallVector<SDValue, 8> ArgChains; +  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); +  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; + +  // Include the original chain at the beginning of the list. When this is +  // used by target LowerCall hooks, this helps legalize find the +  // CALLSEQ_BEGIN node. 
+  ArgChains.push_back(Chain); + +  // Add a chain value for each stack argument load that overlaps ClobberedFI. +  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), +                            UE = DAG.getEntryNode().getNode()->use_end(); +       U != UE; ++U) { +    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) { +      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) { +        if (FI->getIndex() < 0) { +          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); +          int64_t InLastByte = InFirstByte; +          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; + +          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || +              (FirstByte <= InFirstByte && InFirstByte <= LastByte)) +            ArgChains.push_back(SDValue(L, 1)); +        } +      } +    } +  } + +  // Build a tokenfactor for all the chains. +  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); +} + +SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI, +                                                 SmallVectorImpl<SDValue> &InVals, +                                                 StringRef Reason) const {    SDValue Callee = CLI.Callee;    SelectionDAG &DAG = CLI.DAG; -  const Function &Fn = *DAG.getMachineFunction().getFunction(); +  const Function &Fn = DAG.getMachineFunction().getFunction();    StringRef FuncName("<unknown>"); @@ -1015,7 +1079,7 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,      FuncName = G->getGlobal()->getName();    DiagnosticInfoUnsupported NoCalls( -      Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc()); +    Fn, Reason + FuncName, CLI.DL.getDebugLoc());    DAG.getContext()->diagnose(NoCalls);    if (!CLI.IsTailCall) { @@ -1026,9 +1090,14 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,    return DAG.getEntryNode();  } +SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, +                   
                     SmallVectorImpl<SDValue> &InVals) const { +  return lowerUnhandledCall(CLI, InVals, "unsupported call to function "); +} +  SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,                                                        SelectionDAG &DAG) const { -  const Function &Fn = *DAG.getMachineFunction().getFunction(); +  const Function &Fn = DAG.getMachineFunction().getFunction();    DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",                                              SDLoc(Op).getDebugLoc()); @@ -1057,14 +1126,20 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,    case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);    case ISD::FROUND: return LowerFROUND(Op, DAG);    case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); +  case ISD::FLOG: +    return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F); +  case ISD::FLOG10: +    return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);    case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);    case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);    case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);    case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);    case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); +  case ISD::CTTZ: +  case ISD::CTTZ_ZERO_UNDEF:    case ISD::CTLZ:    case ISD::CTLZ_ZERO_UNDEF: -    return LowerCTLZ(Op, DAG); +    return LowerCTLZ_CTTZ(Op, DAG);    case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);    }    return Op; @@ -1115,7 +1190,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,      }    } -  const Function &Fn = *DAG.getMachineFunction().getFunction(); +  const Function &Fn = DAG.getMachineFunction().getFunction();    DiagnosticInfoUnsupported BadInit(        Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());    DAG.getContext()->diagnose(BadInit); @@ -1261,7 +1336,6 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue 
Op,      return scalarizeVectorLoad(Load, DAG);    SDValue BasePtr = Load->getBasePtr(); -  EVT PtrVT = BasePtr.getValueType();    EVT MemVT = Load->getMemoryVT();    SDLoc SL(Op); @@ -1282,8 +1356,7 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,    SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,                                    Load->getChain(), BasePtr, SrcValue, LoMemVT,                                    BaseAlign, Load->getMemOperand()->getFlags()); -  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, -                              DAG.getConstant(Size, SL, PtrVT)); +  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);    SDValue HiLoad =        DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),                       HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), @@ -1322,10 +1395,7 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,    std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);    std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); -  EVT PtrVT = BasePtr.getValueType(); -  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, -                              DAG.getConstant(LoMemVT.getStoreSize(), SL, -                                              PtrVT)); +  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());    const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();    unsigned BaseAlign = Store->getAlignment(); @@ -1454,49 +1524,181 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,  void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,                                        SelectionDAG &DAG,                                        SmallVectorImpl<SDValue> &Results) const { -  assert(Op.getValueType() == MVT::i64); -    SDLoc DL(Op);    EVT VT = Op.getValueType(); + +  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64"); +    EVT HalfVT = 
VT.getHalfSizedIntegerVT(*DAG.getContext()); -  SDValue one = DAG.getConstant(1, DL, HalfVT); -  SDValue zero = DAG.getConstant(0, DL, HalfVT); +  SDValue One = DAG.getConstant(1, DL, HalfVT); +  SDValue Zero = DAG.getConstant(0, DL, HalfVT);    //HiLo split    SDValue LHS = Op.getOperand(0); -  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); -  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); +  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); +  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);    SDValue RHS = Op.getOperand(1); -  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); -  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); +  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); +  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One); -  if (VT == MVT::i64 && -    DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && -    DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { +  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && +      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {      SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),                                LHS_Lo, RHS_Lo); -    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero}); -    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero}); +    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero}); +    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});      Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));      Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));      return;    } +  if (isTypeLegal(MVT::i64)) { +    // Compute denominator reciprocal. +    unsigned FMAD = Subtarget->hasFP32Denormals() ? 
+                    (unsigned)AMDGPUISD::FMAD_FTZ : +                    (unsigned)ISD::FMAD; + +    SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo); +    SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi); +    SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi, +      DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32), +      Cvt_Lo); +    SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1); +    SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp, +      DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32)); +    SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1, +      DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32)); +    SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2); +    SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc, +      DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32), +      Mul1); +    SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2); +    SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc); +    SDValue Rcp64 = DAG.getBitcast(VT, +                        DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi})); + +    SDValue Zero64 = DAG.getConstant(0, DL, VT); +    SDValue One64  = DAG.getConstant(1, DL, VT); +    SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1); +    SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1); + +    SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS); +    SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64); +    SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1); +    SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, +                                    Zero); +    SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, +                                    One); + +    SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo, +                            
      Mulhi1_Lo, Zero1); +    SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi, +                                  Mulhi1_Hi, Add1_Lo.getValue(1)); +    SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi); +    SDValue Add1 = DAG.getBitcast(VT, +                        DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi})); + +    SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1); +    SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2); +    SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, +                                    Zero); +    SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, +                                    One); + +    SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo, +                                  Mulhi2_Lo, Zero1); +    SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc, +                                   Mulhi2_Hi, Add1_Lo.getValue(1)); +    SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC, +                                  Zero, Add2_Lo.getValue(1)); +    SDValue Add2 = DAG.getBitcast(VT, +                        DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi})); +    SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2); + +    SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3); + +    SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero); +    SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One); +    SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo, +                                  Mul3_Lo, Zero1); +    SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi, +                                  Mul3_Hi, Sub1_Lo.getValue(1)); +    SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi); +    SDValue Sub1 = DAG.getBitcast(VT, +                        
DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi})); + +    SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT); +    SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero, +                                 ISD::SETUGE); +    SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero, +                                 ISD::SETUGE); +    SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ); + +    // TODO: Here and below portions of the code can be enclosed into if/endif. +    // Currently control flow is unconditional and we have 4 selects after +    // potential endif to substitute PHIs. + +    // if C3 != 0 ... +    SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo, +                                  RHS_Lo, Zero1); +    SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi, +                                  RHS_Hi, Sub1_Lo.getValue(1)); +    SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi, +                                  Zero, Sub2_Lo.getValue(1)); +    SDValue Sub2 = DAG.getBitcast(VT, +                        DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi})); + +    SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64); + +    SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero, +                                 ISD::SETUGE); +    SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero, +                                 ISD::SETUGE); +    SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ); + +    // if (C6 != 0) +    SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64); + +    SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo, +                                  RHS_Lo, Zero1); +    SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi, +                                  RHS_Hi, Sub2_Lo.getValue(1)); +    SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, 
HalfCarryVT, Sub3_Mi, +                                  Zero, Sub3_Lo.getValue(1)); +    SDValue Sub3 = DAG.getBitcast(VT, +                        DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi})); + +    // endif C6 +    // endif C3 + +    SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE); +    SDValue Div  = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE); + +    SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE); +    SDValue Rem  = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE); + +    Results.push_back(Div); +    Results.push_back(Rem); + +    return; +  } + +  // r600 expansion.    // Get Speculative values    SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);    SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); -  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); -  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero}); +  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ); +  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});    REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM); -  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); -  SDValue DIV_Lo = zero; +  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ); +  SDValue DIV_Lo = Zero;    const unsigned halfBitWidth = HalfVT.getSizeInBits(); @@ -1505,7 +1707,7 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,      SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);      // Get value of high bit      SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); -    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); +    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);      HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);      // Shift @@ -1514,7 +1716,7 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,      REM = DAG.getNode(ISD::OR, DL, VT, 
REM, HBit);      SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT); -    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); +    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);      DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); @@ -1971,13 +2173,45 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {    return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);  } -SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { +SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG, +                                        double Log2BaseInverted) const { +  EVT VT = Op.getValueType(); + +  SDLoc SL(Op); +  SDValue Operand = Op.getOperand(0); +  SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand); +  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT); + +  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand); +} + +static bool isCtlzOpc(unsigned Opc) { +  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; +} + +static bool isCttzOpc(unsigned Opc) { +  return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; +} + +SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {    SDLoc SL(Op);    SDValue Src = Op.getOperand(0); -  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; +  bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF || +                   Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; + +  unsigned ISDOpc, NewOpc; +  if (isCtlzOpc(Op.getOpcode())) { +    ISDOpc = ISD::CTLZ_ZERO_UNDEF; +    NewOpc = AMDGPUISD::FFBH_U32; +  } else if (isCttzOpc(Op.getOpcode())) { +    ISDOpc = ISD::CTTZ_ZERO_UNDEF; +    NewOpc = AMDGPUISD::FFBL_B32; +  } else +    llvm_unreachable("Unexpected OPCode!!!"); +    if (ZeroUndef && Src.getValueType() == MVT::i32) -    return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src); +    return DAG.getNode(NewOpc, 
SL, MVT::i32, Src);    SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); @@ -1990,24 +2224,32 @@ SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {    EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),                                     *DAG.getContext(), MVT::i32); -  SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ); +  SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo; +  SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ); -  SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo); -  SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi); +  SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo); +  SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);    const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32); -  SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32); - -  // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) -  SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi); +  SDValue Add, NewOpr; +  if (isCtlzOpc(Op.getOpcode())) { +    Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32); +    // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) +    NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi); +  } else { +    Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32); +    // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x)) +    NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo); +  }    if (!ZeroUndef) {      // Test if the full 64-bit input is zero.      // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,      // which we probably don't want. -    SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ); -    SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0); +    SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? 
Lo : Hi; +    SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ); +    SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);      // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction      // with the same cycles, otherwise it is slower. @@ -2018,11 +2260,11 @@ SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {      // The instruction returns -1 for 0 input, but the defined intrinsic      // behavior is to return the number of bits. -    NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, -                          SrcIsZero, Bits32, NewCtlz); +    NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, +                         SrcIsZero, Bits32, NewOpr);    } -  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz); +  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);  }  SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, @@ -2389,21 +2631,14 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,  //===----------------------------------------------------------------------===//  static bool isU24(SDValue Op, SelectionDAG &DAG) { -  KnownBits Known; -  EVT VT = Op.getValueType(); -  DAG.computeKnownBits(Op, Known); - -  return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24; +  return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;  }  static bool isI24(SDValue Op, SelectionDAG &DAG) {    EVT VT = Op.getValueType(); - -  // In order for this to be a signed 24-bit value, bit 23, must -  // be a sign bit.    return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated                                       // as unsigned 24-bit values. 
-         (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; +    AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;  }  static bool simplifyI24(SDNode *Node24, unsigned OpIdx, @@ -2665,11 +2900,21 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,    case ISD::ZERO_EXTEND:    case ISD::SIGN_EXTEND:    case ISD::ANY_EXTEND: { +    SDValue X = LHS->getOperand(0); + +    if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 && +        isTypeLegal(MVT::v2i16)) { +      // Prefer build_vector as the canonical form if packed types are legal. +      // (shl ([asz]ext i16:x), 16 -> build_vector 0, x +      SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL, +       { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) }); +      return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); +    } +      // shl (ext x) => zext (shl x), if shift does not overflow int      if (VT != MVT::i64)        break;      KnownBits Known; -    SDValue X = LHS->getOperand(0);      DAG.computeKnownBits(X, Known);      unsigned LZ = Known.countMinLeadingZeros();      if (LZ < RHSVal) @@ -2678,21 +2923,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,      SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));      return DAG.getZExtOrTrunc(Shl, SL, VT);    } -  case ISD::OR: -    if (!isOrEquivalentToAdd(DAG, LHS)) -      break; -    LLVM_FALLTHROUGH; -  case ISD::ADD: { -    // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1) -    if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { -      SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0), -                                SDValue(RHS, 0)); -      SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal, -                                    SDLoc(C2), VT); -      return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V); -    } -    break; -  }    }    if (VT != MVT::i64) @@ -2924,13 +3154,10 @@ static bool isNegativeOne(SDValue Val) {    return false;  } 
-static bool isCtlzOpc(unsigned Opc) { -  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; -} - -SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG, +SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,                                            SDValue Op, -                                          const SDLoc &DL) const { +                                          const SDLoc &DL, +                                          unsigned Opc) const {    EVT VT = Op.getValueType();    EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);    if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() && @@ -2940,11 +3167,11 @@ SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,    if (VT != MVT::i32)      Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); -  SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op); +  SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);    if (VT != MVT::i32) -    FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH); +    FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX); -  return FFBH; +  return FFBX;  }  // The native instructions return -1 on 0 input. Optimize out a select that @@ -2954,7 +3181,7 @@ SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,  // against the bitwidth.  //  // TODO: Should probably combine against FFBH_U32 instead of ctlz directly. 
-SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
+SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
                                                  SDValue LHS, SDValue RHS,
                                                  DAGCombinerInfo &DCI) const {
   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
@@ -2965,20 +3192,25 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   SDValue CmpLHS = Cond.getOperand(0);
 
+  // The count op is on whichever select arm is not the -1 constant.
+  bool IsCttz = isCttzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode());
+  unsigned Opc = IsCttz ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
+  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_b32 x
   if (CCOpcode == ISD::SETEQ &&
-      isCtlzOpc(RHS.getOpcode()) &&
+      (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
       RHS.getOperand(0) == CmpLHS &&
       isNegativeOne(LHS)) {
-    return getFFBH_U32(DAG, CmpLHS, SL);
+    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
   }
 
   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
+  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_b32 x
   if (CCOpcode == ISD::SETNE &&
-      isCtlzOpc(LHS.getOpcode()) &&
+      (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
       LHS.getOperand(0) == CmpLHS &&
       isNegativeOne(RHS)) {
-    return getFFBH_U32(DAG, CmpLHS, SL);
+    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
   }
 
   return SDValue();
@@ -3111,7 +3343,7 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
   }
 
   // There's no reason to not do this if the condition has other uses.
-  return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); +  return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);  }  static bool isConstantFPZero(SDValue N) { @@ -3581,6 +3813,48 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,    return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);  } +SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG, +                                                  EVT VT, +                                                  const SDLoc &SL, +                                                  int64_t Offset) const { +  MachineFunction &MF = DAG.getMachineFunction(); +  MachineFrameInfo &MFI = MF.getFrameInfo(); + +  int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true); +  auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset); +  SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32); + +  return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4, +                     MachineMemOperand::MODereferenceable | +                     MachineMemOperand::MOInvariant); +} + +SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG, +                                                   const SDLoc &SL, +                                                   SDValue Chain, +                                                   SDValue StackPtr, +                                                   SDValue ArgVal, +                                                   int64_t Offset) const { +  MachineFunction &MF = DAG.getMachineFunction(); +  MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset); + +  SDValue Ptr = DAG.getObjectPtrOffset(SL, StackPtr, Offset); +  SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4, +                               MachineMemOperand::MODereferenceable); +  return Store; +} + +SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, +                                             const TargetRegisterClass *RC, + 
                                            EVT VT, const SDLoc &SL, +                                             const ArgDescriptor &Arg) const { +  assert(Arg && "Attempting to load missing argument"); + +  if (Arg.isRegister()) +    return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL); +  return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); +} +  uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(      const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {    unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr(); @@ -3608,6 +3882,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {    NODE_NAME_CASE(ELSE)    NODE_NAME_CASE(LOOP)    NODE_NAME_CASE(CALL) +  NODE_NAME_CASE(TC_RETURN)    NODE_NAME_CASE(TRAP)    NODE_NAME_CASE(RET_FLAG)    NODE_NAME_CASE(RETURN_TO_EPILOG) @@ -3655,6 +3930,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {    NODE_NAME_CASE(BFM)    NODE_NAME_CASE(FFBH_U32)    NODE_NAME_CASE(FFBH_I32) +  NODE_NAME_CASE(FFBL_B32)    NODE_NAME_CASE(MUL_U24)    NODE_NAME_CASE(MUL_I24)    NODE_NAME_CASE(MULHI_U24) @@ -3663,6 +3939,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {    NODE_NAME_CASE(MUL_LOHI_I24)    NODE_NAME_CASE(MAD_U24)    NODE_NAME_CASE(MAD_I24) +  NODE_NAME_CASE(MAD_I64_I32) +  NODE_NAME_CASE(MAD_U64_U32)    NODE_NAME_CASE(TEXTURE_FETCH)    NODE_NAME_CASE(EXPORT)    NODE_NAME_CASE(EXPORT_DONE) @@ -3704,6 +3982,19 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {    NODE_NAME_CASE(ATOMIC_DEC)    NODE_NAME_CASE(BUFFER_LOAD)    NODE_NAME_CASE(BUFFER_LOAD_FORMAT) +  NODE_NAME_CASE(BUFFER_STORE) +  NODE_NAME_CASE(BUFFER_STORE_FORMAT) +  NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) +  NODE_NAME_CASE(BUFFER_ATOMIC_ADD) +  NODE_NAME_CASE(BUFFER_ATOMIC_SUB) +  NODE_NAME_CASE(BUFFER_ATOMIC_SMIN) +  NODE_NAME_CASE(BUFFER_ATOMIC_UMIN) +  NODE_NAME_CASE(BUFFER_ATOMIC_SMAX) +  
NODE_NAME_CASE(BUFFER_ATOMIC_UMAX) +  NODE_NAME_CASE(BUFFER_ATOMIC_AND) +  NODE_NAME_CASE(BUFFER_ATOMIC_OR) +  NODE_NAME_CASE(BUFFER_ATOMIC_XOR) +  NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)    case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;    }    return nullptr; @@ -3754,7 +4045,6 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(    Known.resetAll(); // Don't know anything. -  KnownBits Known2;    unsigned Opc = Op.getOpcode();    switch (Opc) { @@ -3787,6 +4077,51 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(      Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);      break;    } +  case AMDGPUISD::MUL_U24: +  case AMDGPUISD::MUL_I24: { +    KnownBits LHSKnown, RHSKnown; +    DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1); +    DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1); + +    unsigned TrailZ = LHSKnown.countMinTrailingZeros() + +                      RHSKnown.countMinTrailingZeros(); +    Known.Zero.setLowBits(std::min(TrailZ, 32u)); + +    unsigned LHSValBits = 32 - std::max(LHSKnown.countMinSignBits(), 8u); +    unsigned RHSValBits = 32 - std::max(RHSKnown.countMinSignBits(), 8u); +    unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u); +    if (MaxValBits >= 32) +      break; +    bool Negative = false; +    if (Opc == AMDGPUISD::MUL_I24) { +      bool LHSNegative = !!(LHSKnown.One  & (1 << 23)); +      bool LHSPositive = !!(LHSKnown.Zero & (1 << 23)); +      bool RHSNegative = !!(RHSKnown.One  & (1 << 23)); +      bool RHSPositive = !!(RHSKnown.Zero & (1 << 23)); +      if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive)) +        break; +      Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative); +    } +    if (Negative) +      Known.One.setHighBits(32 - MaxValBits); +    else +      Known.Zero.setHighBits(32 - MaxValBits); +    break; +  } +  case ISD::INTRINSIC_WO_CHAIN: { +    unsigned IID = 
cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); +    switch (IID) { +    case Intrinsic::amdgcn_mbcnt_lo: +    case Intrinsic::amdgcn_mbcnt_hi: { +      // These return at most the wavefront size - 1. +      unsigned Size = Op.getValueType().getSizeInBits(); +      Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2()); +      break; +    } +    default: +      break; +    } +  }    }  }  | 
