Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
| -rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2070 | 
1 file changed, 1693 insertions(+), 377 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b7b90e23e895..34826809c1a6 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15,6 +15,7 @@  #include "AMDGPU.h"  #include "AMDGPUInstrInfo.h"  #include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h"  #include "MCTargetDesc/AMDGPUMCTargetDesc.h"  #include "SIMachineFunctionInfo.h"  #include "SIRegisterInfo.h" @@ -28,6 +29,7 @@  #include "llvm/CodeGen/ByteProvider.h"  #include "llvm/CodeGen/FunctionLoweringInfo.h"  #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"  #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"  #include "llvm/CodeGen/MachineFrameInfo.h"  #include "llvm/CodeGen/MachineFunction.h" @@ -146,8 +148,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,    addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));    if (Subtarget->has16BitInsts()) { -    addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); -    addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); +    if (Subtarget->useRealTrue16Insts()) { +      addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass); +      addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass); +    } else { +      addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); +      addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); +    }      // Unless there are also VOP3P operations, not operations are really legal.      addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass); @@ -158,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,      addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);      addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);      addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass); +    addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass); +    addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);    }    addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); @@ -219,7 +228,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,    setOperationAction(ISD::SELECT, MVT::f64, Promote);    AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); -  setOperationAction(ISD::FSQRT, MVT::f64, Custom); +  setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);    setOperationAction(ISD::SELECT_CC,                       {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand); @@ -262,13 +271,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,    // We only support LOAD/STORE and vector manipulation ops for vectors    // with > 4 elements.    
for (MVT VT : -       {MVT::v8i32,  MVT::v8f32,  MVT::v9i32,   MVT::v9f32,  MVT::v10i32, -        MVT::v10f32, MVT::v11i32, MVT::v11f32,  MVT::v12i32, MVT::v12f32, -        MVT::v16i32, MVT::v16f32, MVT::v2i64,   MVT::v2f64,  MVT::v4i16, -        MVT::v4f16,  MVT::v3i64,  MVT::v3f64,   MVT::v6i32,  MVT::v6f32, -        MVT::v4i64,  MVT::v4f64,  MVT::v8i64,   MVT::v8f64,  MVT::v8i16, -        MVT::v8f16,  MVT::v16i16, MVT::v16f16,  MVT::v16i64, MVT::v16f64, -        MVT::v32i32, MVT::v32f32}) { +       {MVT::v8i32,  MVT::v8f32,  MVT::v9i32,  MVT::v9f32,  MVT::v10i32, +        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, +        MVT::v16i32, MVT::v16f32, MVT::v2i64,  MVT::v2f64,  MVT::v4i16, +        MVT::v4f16,  MVT::v3i64,  MVT::v3f64,  MVT::v6i32,  MVT::v6f32, +        MVT::v4i64,  MVT::v4f64,  MVT::v8i64,  MVT::v8f64,  MVT::v8i16, +        MVT::v8f16,  MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64, +        MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) {      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {        switch (Op) {        case ISD::LOAD: @@ -420,6 +429,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,    if (Subtarget->has16BitInsts()) {      setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);      setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom); +  } else { +    setOperationAction(ISD::FSQRT, MVT::f16, Custom);    }    if (Subtarget->hasMadMacF32Insts()) @@ -470,9 +481,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,                       {MVT::f32, MVT::f64}, Legal);    if (Subtarget->haveRoundOpsF64()) -    setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FRINT}, MVT::f64, Legal); +    setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64, +                       Legal);    else -    setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR}, +    setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},                         MVT::f64, Custom);    setOperationAction(ISD::FFLOOR, MVT::f64, Legal); @@ -544,8 +556,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,      if (STI.hasMadF16())        setOperationAction(ISD::FMAD, MVT::f16, Legal); -    for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, -                   MVT::v8f16, MVT::v16i16, MVT::v16f16}) { +    for (MVT VT : +         {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, +          MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16, MVT::v32f16}) {        for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {          switch (Op) {          case ISD::LOAD: @@ -631,6 +644,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,      setOperationAction(ISD::STORE, MVT::v16f16, Promote);      AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32); +    setOperationAction(ISD::LOAD, MVT::v32i16, Promote); +    AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32); +    setOperationAction(ISD::LOAD, MVT::v32f16, Promote); +    AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32); + +    setOperationAction(ISD::STORE, MVT::v32i16, Promote); +    AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32); +    setOperationAction(ISD::STORE, MVT::v32f16, Promote); +    AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32); +      setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},                         MVT::v2i32, Expand);      setOperationAction(ISD::FP_EXTEND, MVT::v2f32, 
Expand); @@ -653,12 +676,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,      setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);      setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE}, -                       {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom); +                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, +                       Custom);      setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, -                       {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand); +                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, +                       Expand); -    for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) { +    for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, +                      MVT::v32i16, MVT::v32f16}) {        setOperationAction(            {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},            Vec16, Custom); @@ -681,10 +707,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,      setOperationAction(ISD::VECTOR_SHUFFLE,                         {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16, -                        MVT::v16f16, MVT::v16i16}, +                        MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},                         Custom); -    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16}) +    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})        // Split vector operations.        setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,                            ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, @@ -692,7 +718,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,                            ISD::SSUBSAT},                           VT, Custom); -    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16}) +    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})        // Split vector operations.        
setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},                           VT, Custom); @@ -728,7 +754,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,    setOperationAction(ISD::SELECT,                       {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, -                      MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}, +                      MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, +                      MVT::v32i16, MVT::v32f16},                       Custom);    setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); @@ -736,6 +763,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,    if (Subtarget->hasMad64_32())      setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); +  if (Subtarget->hasPrefetch()) +    setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + +  if (Subtarget->hasIEEEMinMax()) +    setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, +                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal); +    setOperationAction(ISD::INTRINSIC_WO_CHAIN,                       {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,                        MVT::v2i16, MVT::v2f16, MVT::i128}, @@ -753,16 +787,28 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,                        MVT::i8, MVT::i128},                       Custom); +  setOperationAction(ISD::STACKSAVE, MVT::Other, Custom); +  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); + +  // TODO: Could move this to custom lowering, could benefit from combines on +  // extract of relevant bits. +  setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal); + +  setOperationAction(ISD::MUL, MVT::i1, Promote); +    setTargetDAGCombine({ISD::ADD,                         ISD::UADDO_CARRY,                         ISD::SUB,                         ISD::USUBO_CARRY,                         ISD::FADD,                         ISD::FSUB, +                       ISD::FDIV,                         ISD::FMINNUM,                         ISD::FMAXNUM,                         ISD::FMINNUM_IEEE,                         ISD::FMAXNUM_IEEE, +                       ISD::FMINIMUM, +                       ISD::FMAXIMUM,                         ISD::FMA,                         ISD::SMIN,                         ISD::SMAX, @@ -772,6 +818,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,                         ISD::AND,                         ISD::OR,                         ISD::XOR, +                       ISD::FSHR,                         ISD::SINT_TO_FP,                         ISD::UINT_TO_FP,                         ISD::FCANONICALIZE, @@ -1002,12 +1049,20 @@ static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {  MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {    if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)      return MVT::v5i32; +  if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS && +      DL.getPointerSizeInBits(AS) == 192) +    return MVT::v6i32;    return AMDGPUTargetLowering::getPointerTy(DL, AS);  }  /// Similarly, the in-memory representation of a p7 is {p8, i32}, aka  /// v8i32 when padding is added. +/// The in-memory representation of a p9 is {p8, i32, i32}, which is +/// also v8i32 with padding.  
MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const { -  if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160) +  if ((AMDGPUAS::BUFFER_FAT_POINTER == AS && +       DL.getPointerSizeInBits(AS) == 160) || +      (AMDGPUAS::BUFFER_STRIDED_POINTER == AS && +       DL.getPointerSizeInBits(AS) == 192))      return MVT::v8i32;    return AMDGPUTargetLowering::getPointerMemTy(DL, AS);  } @@ -1186,9 +1241,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,    case Intrinsic::amdgcn_global_atomic_fadd:    case Intrinsic::amdgcn_global_atomic_fmin:    case Intrinsic::amdgcn_global_atomic_fmax: +  case Intrinsic::amdgcn_global_atomic_fmin_num: +  case Intrinsic::amdgcn_global_atomic_fmax_num:    case Intrinsic::amdgcn_flat_atomic_fadd:    case Intrinsic::amdgcn_flat_atomic_fmin:    case Intrinsic::amdgcn_flat_atomic_fmax: +  case Intrinsic::amdgcn_flat_atomic_fmin_num: +  case Intrinsic::amdgcn_flat_atomic_fmax_num:    case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:    case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {      Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1271,6 +1330,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,    case Intrinsic::amdgcn_flat_atomic_fadd:    case Intrinsic::amdgcn_flat_atomic_fmin:    case Intrinsic::amdgcn_flat_atomic_fmax: +  case Intrinsic::amdgcn_flat_atomic_fmin_num: +  case Intrinsic::amdgcn_flat_atomic_fmax_num:    case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:    case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:    case Intrinsic::amdgcn_global_atomic_csub: { @@ -1284,7 +1345,9 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,    }  } -bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { +bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM, +                                                 unsigned AddrSpace, +                                                 uint64_t FlatVariant) const {    if (!Subtarget->hasFlatInstOffsets()) {      // Flat instructions do not have offsets, and only have the register      // address. @@ -1292,29 +1355,27 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {    }    return AM.Scale == 0 && -         (AM.BaseOffs == 0 || -          Subtarget->getInstrInfo()->isLegalFLATOffset( -              AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, SIInstrFlags::FLAT)); +         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( +                                  AM.BaseOffs, AddrSpace, FlatVariant));  }  bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {    if (Subtarget->hasFlatGlobalInsts()) -    return AM.Scale == 0 && -           (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( -                                    AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS, -                                    SIInstrFlags::FlatGlobal)); +    return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS, +                                     SIInstrFlags::FlatGlobal);    if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { -      // Assume the we will use FLAT for all global memory accesses -      // on VI. -      // FIXME: This assumption is currently wrong.  On VI we still use -      // MUBUF instructions for the r + i addressing mode.  As currently -      // implemented, the MUBUF instructions only work on buffer < 4GB. 
-      // It may be possible to support > 4GB buffers with MUBUF instructions, -      // by setting the stride value in the resource descriptor which would -      // increase the size limit to (stride * 4GB).  However, this is risky, -      // because it has never been validated. -    return isLegalFlatAddressingMode(AM); +    // Assume the we will use FLAT for all global memory accesses +    // on VI. +    // FIXME: This assumption is currently wrong.  On VI we still use +    // MUBUF instructions for the r + i addressing mode.  As currently +    // implemented, the MUBUF instructions only work on buffer < 4GB. +    // It may be possible to support > 4GB buffers with MUBUF instructions, +    // by setting the stride value in the resource descriptor which would +    // increase the size limit to (stride * 4GB).  However, this is risky, +    // because it has never been validated. +    return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS, +                                     SIInstrFlags::FLAT);    }    return isLegalMUBUFAddressingMode(AM); @@ -1330,7 +1391,8 @@ bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {    // assume those use MUBUF instructions. Scratch loads / stores are currently    // implemented as mubuf instructions with offen bit set, so slightly    // different than the normal addr64. -  if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs)) +  const SIInstrInfo *TII = Subtarget->getInstrInfo(); +  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))      return false;    // FIXME: Since we can split immediate into soffset and immediate offset, @@ -1367,7 +1429,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,    if (AS == AMDGPUAS::CONSTANT_ADDRESS ||        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || -      AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE) { +      AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE || +      AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {      // If the offset isn't a multiple of 4, it probably isn't going to be      // correctly aligned.      // FIXME: Can we get the real alignment here? @@ -1394,11 +1457,15 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,        // On VI, these use the SMEM format and the offset is 20-bit in bytes.        if (!isUInt<20>(AM.BaseOffs))          return false; -    } else { +    } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {        // On GFX9 the offset is signed 21-bit in bytes (but must not be negative        // for S_BUFFER_* instructions).        if (!isInt<21>(AM.BaseOffs))          return false; +    } else { +      // On GFX12, all offsets are signed 24-bit in bytes. +      if (!isInt<24>(AM.BaseOffs)) +        return false;      }      if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. @@ -1411,9 +1478,13 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,    }    if (AS == AMDGPUAS::PRIVATE_ADDRESS) -    return isLegalMUBUFAddressingMode(AM); +    return Subtarget->enableFlatScratch() +               ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS, +                                           SIInstrFlags::FlatScratch) +               : isLegalMUBUFAddressingMode(AM); -  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { +  if (AS == AMDGPUAS::LOCAL_ADDRESS || +      (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {      // Basic, single offset DS instructions allow a 16-bit unsigned immediate      // field.    
  // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have @@ -1436,7 +1507,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,      // computation. We don't have instructions that compute pointers with any      // addressing modes, so treat them as having no offset like flat      // instructions. -    return isLegalFlatAddressingMode(AM); +    return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS, +                                     SIInstrFlags::FLAT);    }    // Assume a user alias of global for unknown address spaces. @@ -1748,13 +1820,13 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,    // We may not have the kernarg segment argument if we have no kernel    // arguments.    if (!InputPtrReg) -    return DAG.getConstant(0, SL, PtrVT); +    return DAG.getConstant(Offset, SL, PtrVT);    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();    SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,      MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); -  return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset)); +  return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));  }  SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, @@ -2133,13 +2205,14 @@ void SITargetLowering::allocateSpecialInputSGPRs(    const SIRegisterInfo &TRI,    SIMachineFunctionInfo &Info) const {    auto &ArgInfo = Info.getArgInfo(); +  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();    // TODO: Unify handling with private memory pointers. -  if (Info.hasDispatchPtr()) +  if (UserSGPRInfo.hasDispatchPtr())      allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);    const Module *M = MF.getFunction().getParent(); -  if (Info.hasQueuePtr() && +  if (UserSGPRInfo.hasQueuePtr() &&        AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5)      allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); @@ -2148,7 +2221,7 @@ void SITargetLowering::allocateSpecialInputSGPRs(    if (Info.hasImplicitArgPtr())      allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr); -  if (Info.hasDispatchID()) +  if (UserSGPRInfo.hasDispatchID())      allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);    // flat_scratch_init is not applicable for non-kernel functions. @@ -2171,34 +2244,35 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,                                              MachineFunction &MF,                                              const SIRegisterInfo &TRI,                                              SIMachineFunctionInfo &Info) const { -  if (Info.hasImplicitBufferPtr()) { +  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo(); +  if (UserSGPRInfo.hasImplicitBufferPtr()) {      Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);      MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);      CCInfo.AllocateReg(ImplicitBufferPtrReg);    }    // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
-  if (Info.hasPrivateSegmentBuffer()) { +  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {      Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);      MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);      CCInfo.AllocateReg(PrivateSegmentBufferReg);    } -  if (Info.hasDispatchPtr()) { +  if (UserSGPRInfo.hasDispatchPtr()) {      Register DispatchPtrReg = Info.addDispatchPtr(TRI);      MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);      CCInfo.AllocateReg(DispatchPtrReg);    }    const Module *M = MF.getFunction().getParent(); -  if (Info.hasQueuePtr() && +  if (UserSGPRInfo.hasQueuePtr() &&        AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {      Register QueuePtrReg = Info.addQueuePtr(TRI);      MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);      CCInfo.AllocateReg(QueuePtrReg);    } -  if (Info.hasKernargSegmentPtr()) { +  if (UserSGPRInfo.hasKernargSegmentPtr()) {      MachineRegisterInfo &MRI = MF.getRegInfo();      Register InputPtrReg = Info.addKernargSegmentPtr(TRI);      CCInfo.AllocateReg(InputPtrReg); @@ -2207,26 +2281,100 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,      MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));    } -  if (Info.hasDispatchID()) { +  if (UserSGPRInfo.hasDispatchID()) {      Register DispatchIDReg = Info.addDispatchID(TRI);      MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);      CCInfo.AllocateReg(DispatchIDReg);    } -  if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) { +  if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {      Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);      MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);      CCInfo.AllocateReg(FlatScratchInitReg);    } +  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read +  // these from the dispatch pointer. +} + +// Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be +// sequential starting from the first argument. +void SITargetLowering::allocatePreloadKernArgSGPRs( +    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, +    const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF, +    const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { +  Function &F = MF.getFunction(); +  unsigned LastExplicitArgOffset = +      MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset(); +  GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo(); +  bool InPreloadSequence = true; +  unsigned InIdx = 0; +  for (auto &Arg : F.args()) { +    if (!InPreloadSequence || !Arg.hasInRegAttr()) +      break; + +    int ArgIdx = Arg.getArgNo(); +    // Don't preload non-original args or parts not in the current preload +    // sequence. +    if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() || +                               (int)Ins[InIdx].getOrigArgIndex() != ArgIdx)) +      break; + +    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() && +           (int)Ins[InIdx].getOrigArgIndex() == ArgIdx; +         InIdx++) { +      assert(ArgLocs[ArgIdx].isMemLoc()); +      auto &ArgLoc = ArgLocs[InIdx]; +      const Align KernelArgBaseAlign = Align(16); +      unsigned ArgOffset = ArgLoc.getLocMemOffset(); +      Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset); +      unsigned NumAllocSGPRs = +          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32; + +      // Arg is preloaded into the previous SGPR. 
+      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) { +        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back( +            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]); +        continue; +      } + +      unsigned Padding = ArgOffset - LastExplicitArgOffset; +      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; +      // Check for free user SGPRs for preloading. +      if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ > +          SGPRInfo.getNumFreeUserSGPRs()) { +        InPreloadSequence = false; +        break; +      } + +      // Preload this argument. +      const TargetRegisterClass *RC = +          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32); +      SmallVectorImpl<MCRegister> *PreloadRegs = +          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs); + +      if (PreloadRegs->size() > 1) +        RC = &AMDGPU::SGPR_32RegClass; +      for (auto &Reg : *PreloadRegs) { +        assert(Reg); +        MF.addLiveIn(Reg, RC); +        CCInfo.AllocateReg(Reg); +      } + +      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset; +    } +  } +} + +void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, +                                           const SIRegisterInfo &TRI, +                                           SIMachineFunctionInfo &Info) const { +  // Always allocate this last since it is a synthetic preload.    if (Info.hasLDSKernelId()) {      Register Reg = Info.addLDSKernelId();      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);      CCInfo.AllocateReg(Reg);    } - -  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read -  // these from the dispatch pointer.  }  // Allocate special input registers that are initialized per-wave. @@ -2331,7 +2479,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,    // Everything live out of a block is spilled with fast regalloc, so it's    // almost certain that spilling will be required. 
-  if (TM.getOptLevel() == CodeGenOpt::None) +  if (TM.getOptLevel() == CodeGenOptLevel::None)      HasStackObjects = true;    // For now assume stack access is needed in any callee functions, so we need @@ -2477,12 +2625,14 @@ SDValue SITargetLowering::LowerFormalArguments(    bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);    if (IsGraphics) { -    assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && -           !Info->hasWorkGroupInfo() && !Info->hasLDSKernelId() && -           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && -           !Info->hasWorkItemIDZ()); +    const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo(); +    assert(!UserSGPRInfo.hasDispatchPtr() && +           !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() && +           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() && +           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); +    (void)UserSGPRInfo;      if (!Subtarget->enableFlatScratch()) -      assert(!Info->hasFlatScratchInit()); +      assert(!UserSGPRInfo.hasFlatScratchInit());      if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs())        assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&               !Info->hasWorkGroupIDZ()); @@ -2531,18 +2681,29 @@ SDValue SITargetLowering::LowerFormalArguments(      Splits.append(Ins.begin(), Ins.end());    } +  if (IsKernel) +    analyzeFormalArgumentsCompute(CCInfo, Ins); +    if (IsEntryFunc) {      allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);      allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); +    if (IsKernel && Subtarget->hasKernargPreload() && +        !Subtarget->needsKernargPreloadBackwardsCompatibility()) +      allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info); + +    allocateLDSKernelId(CCInfo, MF, *TRI, *Info);    } else if (!IsGraphics) {      // For the fixed ABI, pass workitem IDs in the last argument register.      allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);    } -  if (IsKernel) { -    analyzeFormalArgumentsCompute(CCInfo, Ins); -  } else { +  if (!IsKernel) {      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); +    if (!IsGraphics && !Subtarget->enableFlatScratch()) { +      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1, +                                                  AMDGPU::SGPR2, AMDGPU::SGPR3}, +                              4); +    }      CCInfo.AnalyzeFormalArguments(Splits, AssignFn);    } @@ -2587,9 +2748,81 @@ SDValue SITargetLowering::LowerFormalArguments(          continue;        } -      SDValue Arg = lowerKernargMemParameter( -        DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]); -      Chains.push_back(Arg.getValue(1)); +      SDValue NewArg; +      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) { +        if (MemVT.getStoreSize() < 4 && Alignment < 4) { +          // In this case the argument is packed into the previous preload SGPR. 
+          int64_t AlignDownOffset = alignDown(Offset, 4); +          int64_t OffsetDiff = Offset - AlignDownOffset; +          EVT IntVT = MemVT.changeTypeToInteger(); + +          const SIMachineFunctionInfo *Info = +              MF.getInfo<SIMachineFunctionInfo>(); +          MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); +          Register Reg = +              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0]; + +          assert(Reg); +          Register VReg = MRI.getLiveInVirtReg(Reg); +          SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); + +          SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32); +          SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt); + +          SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract); +          ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal); +          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal, +                                  Ins[i].Flags.isSExt(), &Ins[i]); + +          NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL); +        } else { +          const SIMachineFunctionInfo *Info = +              MF.getInfo<SIMachineFunctionInfo>(); +          MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); +          const SmallVectorImpl<MCRegister> &PreloadRegs = +              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs; + +          SDValue Copy; +          if (PreloadRegs.size() == 1) { +            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]); +            const TargetRegisterClass *RC = MRI.getRegClass(VReg); +            NewArg = DAG.getCopyFromReg( +                Chain, DL, VReg, +                EVT::getIntegerVT(*DAG.getContext(), +                                  TRI->getRegSizeInBits(*RC))); + +          } else { +            // If the kernarg alignment does not match the alignment of the SGPR +            // tuple RC that can accommodate this argument, it will be built up +            // via copies from from the individual SGPRs that the argument was +            // preloaded to. 
+            SmallVector<SDValue, 4> Elts; +            for (auto Reg : PreloadRegs) { +              Register VReg = MRI.getLiveInVirtReg(Reg); +              Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); +              Elts.push_back(Copy); +            } +            NewArg = +                DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32, +                                                    PreloadRegs.size()), +                                   DL, Elts); +          } + +          SDValue CMemVT; +          if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType())) +            CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg); +          else +            CMemVT = DAG.getBitcast(MemVT, NewArg); +          NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT, +                                  Ins[i].Flags.isSExt(), &Ins[i]); +          NewArg = DAG.getMergeValues({NewArg, Chain}, DL); +        } +      } else { +        NewArg = +            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, +                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]); +      } +      Chains.push_back(NewArg.getValue(1));        auto *ParamTy =          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); @@ -2599,11 +2832,11 @@ SDValue SITargetLowering::LowerFormalArguments(          // On SI local pointers are just offsets into LDS, so they are always          // less than 16-bits.  On CI and newer they could potentially be          // real pointers, so we can't guarantee their size. -        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, -                          DAG.getValueType(MVT::i16)); +        NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg, +                             DAG.getValueType(MVT::i16));        } -      InVals.push_back(Arg); +      InVals.push_back(NewArg);        continue;      } else if (!IsEntryFunc && VA.isMemLoc()) {        SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg); @@ -3084,6 +3317,9 @@ bool SITargetLowering::isEligibleForTailCallOptimization(      const SmallVectorImpl<ISD::OutputArg> &Outs,      const SmallVectorImpl<SDValue> &OutVals,      const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { +  if (AMDGPU::isChainCC(CalleeCC)) +    return true; +    if (!mayTailCallThisCC(CalleeCC))      return false; @@ -3168,7 +3404,36 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {  // The wave scratch offset register is used as the global base pointer.  SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,                                      SmallVectorImpl<SDValue> &InVals) const { +  CallingConv::ID CallConv = CLI.CallConv; +  bool IsChainCallConv = AMDGPU::isChainCC(CallConv); +    SelectionDAG &DAG = CLI.DAG; + +  TargetLowering::ArgListEntry RequestedExec; +  if (IsChainCallConv) { +    // The last argument should be the value that we need to put in EXEC. +    // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we +    // don't treat it like the rest of the arguments. 
+    RequestedExec = CLI.Args.back(); +    assert(RequestedExec.Node && "No node for EXEC"); + +    if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize())) +      return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC"); + +    assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg"); +    CLI.Outs.pop_back(); +    CLI.OutVals.pop_back(); + +    if (RequestedExec.Ty->isIntegerTy(64)) { +      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up"); +      CLI.Outs.pop_back(); +      CLI.OutVals.pop_back(); +    } + +    assert(CLI.Outs.back().OrigArgIndex != 2 && +           "Haven't popped all the pieces of the EXEC mask"); +  } +    const SDLoc &DL = CLI.DL;    SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;    SmallVector<SDValue, 32> &OutVals = CLI.OutVals; @@ -3176,7 +3441,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,    SDValue Chain = CLI.Chain;    SDValue Callee = CLI.Callee;    bool &IsTailCall = CLI.IsTailCall; -  CallingConv::ID CallConv = CLI.CallConv;    bool IsVarArg = CLI.IsVarArg;    bool IsSibCall = false;    bool IsThisReturn = false; @@ -3207,9 +3471,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,    if (IsTailCall) {      IsTailCall = isEligibleForTailCallOptimization(        Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); -    if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) { +    if (!IsTailCall && +        ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {        report_fatal_error("failed to perform tail call elimination on a call " -                         "site marked musttail"); +                         "site marked musttail or on llvm.amdgcn.cs.chain");      }      bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; @@ -3232,7 +3497,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,    CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); -  if (CallConv != CallingConv::AMDGPU_Gfx) { +  if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {      // With a fixed ABI, allocate fixed registers before user arguments.      passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);    } @@ -3258,16 +3523,20 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,    // Adjust the stack pointer for the new arguments...    // These operations are automatically eliminated by the prolog/epilog pass -  if (!IsSibCall) { +  if (!IsSibCall)      Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); +  if (!IsSibCall || IsChainCallConv) {      if (!Subtarget->enableFlatScratch()) {        SmallVector<SDValue, 4> CopyFromChains;        // In the HSA case, this should be an identity copy.        SDValue ScratchRSrcReg          = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); -      RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); +      RegsToPass.emplace_back(IsChainCallConv +                                  ? 
AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51 +                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, +                              ScratchRSrcReg);        CopyFromChains.push_back(ScratchRSrcReg.getValue(1));        Chain = DAG.getTokenFactor(DL, CopyFromChains);      } @@ -3412,6 +3681,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,      Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));    } +  if (IsChainCallConv) +    Ops.push_back(RequestedExec.Node); +    // Add argument registers to the end of the list so that they are known live    // into the call.    for (auto &RegToPass : RegsToPass) { @@ -3420,8 +3692,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,    }    // Add a register mask operand representing the call-preserved registers. - -  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); +  auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());    const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);    assert(Mask && "Missing call preserved mask for calling convention");    Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3435,8 +3706,17 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,    // actual call instruction.    if (IsTailCall) {      MFI.setHasTailCall(); -    unsigned OPC = CallConv == CallingConv::AMDGPU_Gfx ? -                   AMDGPUISD::TC_RETURN_GFX : AMDGPUISD::TC_RETURN; +    unsigned OPC = AMDGPUISD::TC_RETURN; +    switch (CallConv) { +    case CallingConv::AMDGPU_Gfx: +      OPC = AMDGPUISD::TC_RETURN_GFX; +      break; +    case CallingConv::AMDGPU_CS_Chain: +    case CallingConv::AMDGPU_CS_ChainPreserve: +      OPC = AMDGPUISD::TC_RETURN_CHAIN; +      break; +    } +      return DAG.getNode(OPC, DL, NodeTys, Ops);    } @@ -3481,22 +3761,21 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);    Chain = SP.getValue(1);    MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue(); -  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); -  const TargetFrameLowering *TFL = ST.getFrameLowering(); +  const TargetFrameLowering *TFL = Subtarget->getFrameLowering();    unsigned Opc =      TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?      ISD::ADD : ISD::SUB;    SDValue ScaledSize = DAG.getNode(        ISD::SHL, dl, VT, Size, -      DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32)); +      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));    Align StackAlign = TFL->getStackAlign();    Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value    if (Alignment && *Alignment > StackAlign) {      Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,                         DAG.getConstant(-(uint64_t)Alignment->value() -                                           << ST.getWavefrontSizeLog2(), +                                           << Subtarget->getWavefrontSizeLog2(),                                         dl, VT));    } @@ -3520,6 +3799,111 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,    return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);  } +SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const { +  if (Op.getValueType() != MVT::i32) +    return Op; // Defer to cannot select error. 
+ +  Register SP = getStackPointerRegisterToSaveRestore(); +  SDLoc SL(Op); + +  SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32); + +  // Convert from wave uniform to swizzled vector address. This should protect +  // from any edge cases where the stacksave result isn't directly used with +  // stackrestore. +  SDValue VectorAddress = +      DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP); +  return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL); +} + +SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op, +                                            SelectionDAG &DAG) const { +  SDLoc SL(Op); +  assert(Op.getValueType() == MVT::i32); + +  uint32_t BothRoundHwReg = +      AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4); +  SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32); + +  SDValue IntrinID = +      DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32); +  SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(), +                               Op.getOperand(0), IntrinID, GetRoundBothImm); + +  // There are two rounding modes, one for f32 and one for f64/f16. We only +  // report in the standard value range if both are the same. +  // +  // The raw values also differ from the expected FLT_ROUNDS values. Nearest +  // ties away from zero is not supported, and the other values are rotated by +  // 1. +  // +  // If the two rounding modes are not the same, report a target defined value. + +  // Mode register rounding mode fields: +  // +  // [1:0] Single-precision round mode. +  // [3:2] Double/Half-precision round mode. +  // +  // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero. +  // +  //             Hardware   Spec +  // Toward-0        3        0 +  // Nearest Even    0        1 +  // +Inf            1        2 +  // -Inf            2        3 +  //  NearestAway0  N/A       4 +  // +  // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit +  // table we can index by the raw hardware mode. +  // +  // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf + +  SDValue BitTable = +      DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64); + +  SDValue Two = DAG.getConstant(2, SL, MVT::i32); +  SDValue RoundModeTimesNumBits = +      DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two); + +  // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we +  // knew only one mode was demanded. +  SDValue TableValue = +      DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits); +  SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue); + +  SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32); +  SDValue TableEntry = +      DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask); + +  // There's a gap in the 4-bit encoded table and actual enum values, so offset +  // if it's an extended value. 
+  SDValue Four = DAG.getConstant(4, SL, MVT::i32); +  SDValue IsStandardValue = +      DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT); +  SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four); +  SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue, +                               TableEntry, EnumOffset); + +  return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL); +} + +SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { +  if (Op->isDivergent()) +    return SDValue(); + +  switch (cast<MemSDNode>(Op)->getAddressSpace()) { +  case AMDGPUAS::FLAT_ADDRESS: +  case AMDGPUAS::GLOBAL_ADDRESS: +  case AMDGPUAS::CONSTANT_ADDRESS: +  case AMDGPUAS::CONSTANT_ADDRESS_32BIT: +    break; +  default: +    return SDValue(); +  } + +  return Op; +} +  Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,                                               const MachineFunction &MF) const {    Register Reg = StringSwitch<Register>(RegName) @@ -4217,40 +4601,51 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(    }    case AMDGPU::S_ADD_U64_PSEUDO:    case AMDGPU::S_SUB_U64_PSEUDO: { -    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); +    // For targets older than GFX12, we emit a sequence of 32-bit operations. +    // For GFX12, we emit s_add_u64 and s_sub_u64.      const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); -    const SIRegisterInfo *TRI = ST.getRegisterInfo(); -    const TargetRegisterClass *BoolRC = TRI->getBoolRC(); +    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();      const DebugLoc &DL = MI.getDebugLoc(); -      MachineOperand &Dest = MI.getOperand(0);      MachineOperand &Src0 = MI.getOperand(1);      MachineOperand &Src1 = MI.getOperand(2); - -    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); -    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - -    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( -        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); -    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( -        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); - -    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( -        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); -    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( -        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); -      bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); - -    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; -    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; -    BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0); -    BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1); -    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) -        .addReg(DestSub0) -        .addImm(AMDGPU::sub0) -        .addReg(DestSub1) -        .addImm(AMDGPU::sub1); +    if (Subtarget->hasScalarAddSub64()) { +      unsigned Opc = IsAdd ? 
AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64; +      BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg()) +          .addReg(Src0.getReg()) +          .addReg(Src1.getReg()); +    } else { +      const SIRegisterInfo *TRI = ST.getRegisterInfo(); +      const TargetRegisterClass *BoolRC = TRI->getBoolRC(); + +      Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); +      Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + +      MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( +          MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); +      MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( +          MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); + +      MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( +          MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); +      MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( +          MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); + +      unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; +      unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; +      BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) +          .add(Src0Sub0) +          .add(Src1Sub0); +      BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) +          .add(Src0Sub1) +          .add(Src1Sub1); +      BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) +          .addReg(DestSub0) +          .addImm(AMDGPU::sub0) +          .addReg(DestSub1) +          .addImm(AMDGPU::sub1); +    }      MI.eraseFromParent();      return BB;    } @@ -4463,8 +4858,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(      const SIRegisterInfo *TRI = ST.getRegisterInfo();      Register Dst = MI.getOperand(0).getReg(); -    Register Src0 = MI.getOperand(1).getReg(); -    Register Src1 = MI.getOperand(2).getReg(); +    const MachineOperand &Src0 = MI.getOperand(1); +    const MachineOperand &Src1 = MI.getOperand(2);      const DebugLoc &DL = MI.getDebugLoc();      Register SrcCond = MI.getOperand(3).getReg(); @@ -4473,20 +4868,42 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(      const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);      Register SrcCondCopy = MRI.createVirtualRegister(CondRC); +    const TargetRegisterClass *Src0RC = Src0.isReg() +                                            ? MRI.getRegClass(Src0.getReg()) +                                            : &AMDGPU::VReg_64RegClass; +    const TargetRegisterClass *Src1RC = Src1.isReg() +                                            ? 
MRI.getRegClass(Src1.getReg()) +                                            : &AMDGPU::VReg_64RegClass; + +    const TargetRegisterClass *Src0SubRC = +        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); +    const TargetRegisterClass *Src1SubRC = +        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1); + +    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( +        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); +    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( +        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); + +    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( +        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); +    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( +        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); +      BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)        .addReg(SrcCond);      BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo) -      .addImm(0) -      .addReg(Src0, 0, AMDGPU::sub0) -      .addImm(0) -      .addReg(Src1, 0, AMDGPU::sub0) -      .addReg(SrcCondCopy); +        .addImm(0) +        .add(Src0Sub0) +        .addImm(0) +        .add(Src1Sub0) +        .addReg(SrcCondCopy);      BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi) -      .addImm(0) -      .addReg(Src0, 0, AMDGPU::sub1) -      .addImm(0) -      .addReg(Src1, 0, AMDGPU::sub1) -      .addReg(SrcCondCopy); +        .addImm(0) +        .add(Src0Sub1) +        .addImm(0) +        .add(Src1Sub1) +        .addReg(SrcCondCopy);      BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)        .addReg(DstLo) @@ -4843,7 +5260,7 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,    assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||           VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||           VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || -         VT == MVT::v32f32); +         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);    SDValue Lo, Hi;    std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4866,7 +5283,7 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,    assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||           VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||           VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || -         VT == MVT::v32f32); +         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);    SDValue Lo0, Hi0;    std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4926,10 +5343,14 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {             "Load should return a value and a chain");      return Result;    } -  case ISD::FSQRT: -    if (Op.getValueType() == MVT::f64) +  case ISD::FSQRT: { +    EVT VT = Op.getValueType(); +    if (VT == MVT::f32) +      return lowerFSQRTF32(Op, DAG); +    if (VT == MVT::f64)        return lowerFSQRTF64(Op, DAG);      return SDValue(); +  }    case ISD::FSIN:    case ISD::FCOS:      return LowerTrig(Op, DAG); @@ -5027,6 +5448,12 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {      return lowerXMUL_LOHI(Op, DAG);    case ISD::DYNAMIC_STACKALLOC:      return LowerDYNAMIC_STACKALLOC(Op, DAG); +  case ISD::STACKSAVE: +    return LowerSTACKSAVE(Op, DAG); +  case ISD::GET_ROUNDING: +    return lowerGET_ROUNDING(Op, DAG); +  case ISD::PREFETCH: +    return lowerPREFETCH(Op, DAG);    }    return SDValue();  } @@ -5382,6 
+5809,12 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));      return;    } +  case ISD::FSQRT: { +    if (N->getValueType(0) != MVT::f16) +      break; +    Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG)); +    break; +  }    default:      AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);      break; @@ -5433,6 +5866,9 @@ bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {  }  bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { +  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) +    return false; +    // FIXME: Either avoid relying on address space here or change the default    // address space for functions to avoid the explicit check.    return (GV->getValueType()->isFunctionTy() || @@ -5616,7 +6052,8 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,    if (IsIEEEMode)      return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); -  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16) +  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 || +      VT == MVT::v16f16)      return splitBinaryVectorOp(Op, DAG);    return Op;  } @@ -5711,11 +6148,6 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {        Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)      return lowerTrapEndpgm(Op, DAG); -  const Module *M = DAG.getMachineFunction().getFunction().getParent(); -  unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M); -  if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3) -    return lowerTrapHsaQueuePtr(Op, DAG); -    return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :           lowerTrapHsaQueuePtr(Op, DAG);  } @@ -5873,7 +6305,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,    uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;    SDValue Ptr = -      DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::Fixed(StructOffset)); +      DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));    // TODO: Use custom target PseudoSourceValue.    
// TODO: We should use the value from the IR intrinsic call, but it might not @@ -6134,7 +6566,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,    if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))      return Combined; -  if (VecSize == 128 || VecSize == 256) { +  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {      SDValue Lo, Hi;      EVT LoVT, HiVT;      std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT); @@ -6147,9 +6579,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,        Hi = DAG.getBitcast(HiVT,                            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,                                        DAG.getConstant(1, SL, MVT::i32))); -    } else { -      assert(VecSize == 256); - +    } else if (VecSize == 256) {        SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);        SDValue Parts[4];        for (unsigned P = 0; P < 4; ++P) { @@ -6161,6 +6591,22 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,                                              Parts[0], Parts[1]));        Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,                                              Parts[2], Parts[3])); +    } else { +      assert(VecSize == 512); + +      SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec); +      SDValue Parts[8]; +      for (unsigned P = 0; P < 8; ++P) { +        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, +                               DAG.getConstant(P, SL, MVT::i32)); +      } + +      Lo = DAG.getBitcast(LoVT, +                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, +                                      Parts[0], Parts[1], Parts[2], Parts[3])); +      Hi = DAG.getBitcast(HiVT, +                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, +                                      Parts[4], Parts[5],Parts[6], Parts[7]));      }      EVT IdxVT = Idx.getValueType(); @@ -6326,6 +6772,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,      return DAG.getNode(ISD::BITCAST, SL, VT, Blend);    } +  if (VT == MVT::v32i16 || VT == MVT::v32f16) { +    EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), +                                     VT.getVectorNumElements() / 8); +    MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits()); + +    SmallVector<SDValue, 8> Parts[8]; +    for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) { +      for (unsigned P = 0; P < 8; ++P) +        Parts[P].push_back(Op.getOperand(I + P * E)); +    } +    SDValue Casts[8]; +    for (unsigned P = 0; P < 8; ++P) { +      SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]); +      Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec); +    } + +    SDValue Blend = +        DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts); +    return DAG.getNode(ISD::BITCAST, SL, VT, Blend); +  } +    assert(VT == MVT::v2f16 || VT == MVT::v2i16);    assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); @@ -6391,24 +6858,12 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,    //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,    //   which is a 64-bit pc-relative offset from the encoding of the $symbol    //   operand to the global variable. 
-  // -  // What we want here is an offset from the value returned by s_getpc -  // (which is the address of the s_add_u32 instruction) to the global -  // variable, but since the encoding of $symbol starts 4 bytes after the start -  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too -  // small. This requires us to add 4 to the global variable offset in order to -  // compute the correct address. Similarly for the s_addc_u32 instruction, the -  // encoding of $symbol starts 12 bytes after the start of the s_add_u32 -  // instruction. -  SDValue PtrLo = -      DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags); +  SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);    SDValue PtrHi; -  if (GAFlags == SIInstrInfo::MO_NONE) { +  if (GAFlags == SIInstrInfo::MO_NONE)      PtrHi = DAG.getTargetConstant(0, DL, MVT::i32); -  } else { -    PtrHi = -        DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 12, GAFlags + 1); -  } +  else +    PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);    return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);  } @@ -6450,9 +6905,22 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,      return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);    } +  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) { +    SDValue AddrLo = DAG.getTargetGlobalAddress( +        GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO); +    AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0}; + +    SDValue AddrHi = DAG.getTargetGlobalAddress( +        GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI); +    AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0}; + +    return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi); +  } +    if (shouldEmitFixup(GV))      return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); -  else if (shouldEmitPCReloc(GV)) + +  if (shouldEmitPCReloc(GV))      return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,                                     SIInstrInfo::MO_REL32); @@ -6699,6 +7167,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,    unsigned IntrOpcode = Intr->BaseOpcode;    bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);    bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); +  bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);    SmallVector<EVT, 3> ResultTypes(Op->values());    SmallVector<EVT, 3> OrigResultTypes(Op->values()); @@ -6718,7 +7187,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,    if (BaseOpcode->Atomic) {      VData = Op.getOperand(2); -    bool Is64Bit = VData.getValueType() == MVT::i64; +    bool Is64Bit = VData.getValueSizeInBits() == 64;      if (BaseOpcode->AtomicX2) {        SDValue VData2 = Op.getOperand(3);        VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL, @@ -6878,9 +7347,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,    // SIShrinkInstructions will convert NSA encodings to non-NSA after register    // allocation when possible.    // -  // Partial NSA is allowed on GFX11 where the final register is a contiguous +  // Partial NSA is allowed on GFX11+ where the final register is a contiguous    // set of the remaining addresses. 
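// For example, assuming the partial-NSA rule stated above: with an NSA limit
// of 5 and 7 address dwords, the leading addresses can use scattered VGPRs
// while the trailing ones are folded into a single contiguous register run
// covering the remainder.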
-  const unsigned NSAMaxSize = ST->getNSAMaxSize(); +  const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);    const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();    const bool UseNSA = ST->hasNSAEncoding() &&                        VAddrs.size() >= ST->getNSAThreshold(MF) && @@ -6957,7 +7426,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,        Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue();    if (BaseOpcode->Atomic)      CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization -  if (CPol & ~AMDGPU::CPol::ALL) +  if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12))      return Op;    SmallVector<SDValue, 26> Ops; @@ -6977,7 +7446,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,    Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));    if (IsGFX10Plus)      Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32)); -  Ops.push_back(Unorm); +  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) +    Ops.push_back(Unorm);    Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));    Ops.push_back(IsA16 &&  // r128, a16 for gfx9                  ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False); @@ -6988,7 +7458,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,    } else if (cast<ConstantSDNode>(TFE)->getZExtValue()) {      report_fatal_error("TFE is not supported on this GPU");    } -  Ops.push_back(LWE); // lwe +  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) +    Ops.push_back(LWE); // lwe    if (!IsGFX10Plus)      Ops.push_back(DimInfo->DA ? True : False);    if (BaseOpcode->HasD16) @@ -7000,7 +7471,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,        UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;    int Opcode = -1; -  if (IsGFX11Plus) { +  if (IsGFX12Plus) { +    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12, +                                   NumVDataDwords, NumVAddrDwords); +  } else if (IsGFX11Plus) {      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,                                     UseNSA ? AMDGPU::MIMGEncGfx11NSA                                            : AMDGPU::MIMGEncGfx11Default, @@ -7071,7 +7545,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,      };      // Widen vec3 load to vec4. -    if (VT.isVector() && VT.getVectorNumElements() == 3) { +    if (VT.isVector() && VT.getVectorNumElements() == 3 && +        !Subtarget->hasScalarDwordx3Loads()) {        EVT WidenedVT =            EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);        auto WidenedOp = DAG.getMemIntrinsicNode( @@ -7317,7 +7792,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,                             SDLoc(Op), MVT::i32);    case Intrinsic::amdgcn_s_buffer_load: {      unsigned CPol = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); -    if (CPol & ~AMDGPU::CPol::ALL) +    if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12) +                     ? 
AMDGPU::CPol::ALL +                     : AMDGPU::CPol::ALL_pregfx12))        return Op;      return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),                          DAG); @@ -7341,9 +7818,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,      return emitRemovedIntrinsicError(DAG, DL, VT);    } -  case Intrinsic::amdgcn_ldexp: -    return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(1), Op.getOperand(2)); -    case Intrinsic::amdgcn_fract:      return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); @@ -7490,6 +7964,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,    }  } +// On targets not supporting constant in soffset field, turn zero to +// SGPR_NULL to avoid generating an extra s_mov with zero. +static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, +                             const GCNSubtarget *Subtarget) { +  if (Subtarget->hasRestrictedSOffset()) +    if (auto SOffsetConst = dyn_cast<ConstantSDNode>(SOffset)) { +      if (SOffsetConst->isZero()) { +        return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32); +      } +    } +  return SOffset; +} +  SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,                                                       SelectionDAG &DAG,                                                       unsigned NewOpcode) const { @@ -7498,13 +7985,14 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,    SDValue VData = Op.getOperand(2);    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); +  auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);    SDValue Ops[] = {        Op.getOperand(0),                      // Chain        VData,                                 // vdata        Rsrc,                                  // rsrc        DAG.getConstant(0, DL, MVT::i32),      // vindex        Offsets.first,                         // voffset -      Op.getOperand(5),                      // soffset +      SOffset,                               // soffset        Offsets.second,                        // offset        Op.getOperand(6),                      // cachepolicy        DAG.getTargetConstant(0, DL, MVT::i1), // idxen @@ -7531,13 +8019,14 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,    SDValue VData = Op.getOperand(2);    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); +  auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);    SDValue Ops[] = {        Op.getOperand(0),                      // Chain        VData,                                 // vdata        Rsrc,                                  // rsrc        Op.getOperand(4),                      // vindex        Offsets.first,                         // voffset -      Op.getOperand(6),                      // soffset +      SOffset,                               // soffset        Offsets.second,                        // offset        Op.getOperand(7),                      // cachepolicy        DAG.getTargetConstant(1, DL, MVT::i1), // idxen @@ -7693,12 +8182,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,      SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);      auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); +    auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);      SDValue Ops[] = {          Op.getOperand(0),                  
    // Chain          Rsrc,                                  // rsrc          DAG.getConstant(0, DL, MVT::i32),      // vindex          Offsets.first,                         // voffset -        Op.getOperand(4),                      // soffset +        SOffset,                               // soffset          Offsets.second,                        // offset          Op.getOperand(5),                      // cachepolicy, swizzled buffer          DAG.getTargetConstant(0, DL, MVT::i1), // idxen @@ -7717,12 +8207,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,      SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);      auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); +    auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);      SDValue Ops[] = {          Op.getOperand(0),                      // Chain          Rsrc,                                  // rsrc          Op.getOperand(3),                      // vindex          Offsets.first,                         // voffset -        Op.getOperand(5),                      // soffset +        SOffset,                               // soffset          Offsets.second,                        // offset          Op.getOperand(6),                      // cachepolicy, swizzled buffer          DAG.getTargetConstant(1, DL, MVT::i1), // idxen @@ -7734,21 +8225,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,      MemSDNode *M = cast<MemSDNode>(Op);      EVT LoadVT = Op.getValueType(); +    auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);      unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();      unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();      unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();      unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();      unsigned IdxEn = getIdxEn(Op.getOperand(3));      SDValue Ops[] = { -      Op.getOperand(0),  // Chain -      Op.getOperand(2),  // rsrc -      Op.getOperand(3),  // vindex -      Op.getOperand(4),  // voffset -      Op.getOperand(5),  // soffset -      Op.getOperand(6),  // offset -      DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format -      DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy -      DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen +        Op.getOperand(0),                                        // Chain +        Op.getOperand(2),                                        // rsrc +        Op.getOperand(3),                                        // vindex +        Op.getOperand(4),                                        // voffset +        SOffset,                                                 // soffset +        Op.getOperand(6),                                        // offset +        DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format +        DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32),   // cachepolicy +        DAG.getTargetConstant(IdxEn, DL, MVT::i1)                // idxen      };      if (LoadVT.getScalarType() == MVT::f16) @@ -7764,13 +8256,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,      EVT LoadVT = Op.getValueType();      SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);      auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); +    auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);      SDValue Ops[] = {          Op.getOperand(0),                      // Chain          Rsrc,                 
                 // rsrc          DAG.getConstant(0, DL, MVT::i32),      // vindex          Offsets.first,                         // voffset -        Op.getOperand(4),                      // soffset +        SOffset,                               // soffset          Offsets.second,                        // offset          Op.getOperand(5),                      // format          Op.getOperand(6),                      // cachepolicy, swizzled buffer @@ -7790,13 +8283,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,      EVT LoadVT = Op.getValueType();      SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);      auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); +    auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);      SDValue Ops[] = {          Op.getOperand(0),                      // Chain          Rsrc,                                  // rsrc          Op.getOperand(3),                      // vindex          Offsets.first,                         // voffset -        Op.getOperand(5),                      // soffset +        SOffset,                               // soffset          Offsets.second,                        // offset          Op.getOperand(6),                      // format          Op.getOperand(7),                      // cachepolicy, swizzled buffer @@ -8009,6 +8503,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,    case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {      SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);      auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); +    auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);      SDValue Ops[] = {          Op.getOperand(0),                      // Chain          Op.getOperand(2),                      // src @@ -8016,7 +8511,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,          Rsrc,                                  // rsrc          DAG.getConstant(0, DL, MVT::i32),      // vindex          Offsets.first,                         // voffset -        Op.getOperand(6),                      // soffset +        SOffset,                               // soffset          Offsets.second,                        // offset          Op.getOperand(7),                      // cachepolicy          DAG.getTargetConstant(0, DL, MVT::i1), // idxen @@ -8031,6 +8526,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,    case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {      SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);      auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG); +    auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);      SDValue Ops[] = {          Op.getOperand(0),                      // Chain          Op.getOperand(2),                      // src @@ -8038,7 +8534,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,          Rsrc,                                  // rsrc          Op.getOperand(5),                      // vindex          Offsets.first,                         // voffset -        Op.getOperand(7),                      // soffset +        SOffset,                               // soffset          Offsets.second,                        // offset          Op.getOperand(8),                      // cachepolicy          DAG.getTargetConstant(1, DL, MVT::i1), // idxen @@ -8068,14 +8564,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,        return SDValue();      } +    const bool IsGFX11 = 
AMDGPU::isGFX11(*Subtarget);      const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); +    const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);      const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;      const bool Is64 = NodePtr.getValueType() == MVT::i64;      const unsigned NumVDataDwords = 4;      const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);      const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; -    const bool UseNSA = -        Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize(); +    const bool UseNSA = (Subtarget->hasNSAEncoding() && +                         NumVAddrs <= Subtarget->getNSAMaxSize()) || +                        IsGFX12Plus;      const unsigned BaseOpcodes[2][2] = {          {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},          {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, @@ -8083,15 +8582,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,      int Opcode;      if (UseNSA) {        Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], -                                     IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA +                                     IsGFX12Plus ? AMDGPU::MIMGEncGfx12 +                                     : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA                                                   : AMDGPU::MIMGEncGfx10NSA,                                       NumVDataDwords, NumVAddrDwords);      } else { -      Opcode = -          AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], -                                IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default -                                            : AMDGPU::MIMGEncGfx10Default, -                                NumVDataDwords, NumVAddrDwords); +      assert(!IsGFX12Plus); +      Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], +                                     IsGFX11 ? 
AMDGPU::MIMGEncGfx11Default +                                             : AMDGPU::MIMGEncGfx10Default, +                                     NumVDataDwords, NumVAddrDwords);      }      assert(Opcode != -1); @@ -8179,8 +8679,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,    }    case Intrinsic::amdgcn_global_atomic_fmin:    case Intrinsic::amdgcn_global_atomic_fmax: +  case Intrinsic::amdgcn_global_atomic_fmin_num: +  case Intrinsic::amdgcn_global_atomic_fmax_num:    case Intrinsic::amdgcn_flat_atomic_fmin: -  case Intrinsic::amdgcn_flat_atomic_fmax: { +  case Intrinsic::amdgcn_flat_atomic_fmax: +  case Intrinsic::amdgcn_flat_atomic_fmin_num: +  case Intrinsic::amdgcn_flat_atomic_fmax_num: {      MemSDNode *M = cast<MemSDNode>(Op);      SDValue Ops[] = {        M->getOperand(0), // Chain @@ -8190,12 +8694,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,      unsigned Opcode = 0;      switch (IntrID) {      case Intrinsic::amdgcn_global_atomic_fmin: -    case Intrinsic::amdgcn_flat_atomic_fmin: { +    case Intrinsic::amdgcn_global_atomic_fmin_num: +    case Intrinsic::amdgcn_flat_atomic_fmin: +    case Intrinsic::amdgcn_flat_atomic_fmin_num: {        Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;        break;      }      case Intrinsic::amdgcn_global_atomic_fmax: -    case Intrinsic::amdgcn_flat_atomic_fmax: { +    case Intrinsic::amdgcn_global_atomic_fmax_num: +    case Intrinsic::amdgcn_flat_atomic_fmax: +    case Intrinsic::amdgcn_flat_atomic_fmax_num: {        Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;        break;      } @@ -8206,6 +8714,31 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,                                     M->getVTList(), Ops, M->getMemoryVT(),                                     M->getMemOperand());    } +  case Intrinsic::amdgcn_s_get_barrier_state: { +    SDValue Chain = Op->getOperand(0); +    SmallVector<SDValue, 2> Ops; +    unsigned Opc; +    bool IsInlinableBarID = false; +    int64_t BarID; + +    if (isa<ConstantSDNode>(Op->getOperand(2))) { +      BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); +      IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID); +    } + +    if (IsInlinableBarID) { +      Opc = AMDGPU::S_GET_BARRIER_STATE_IMM; +      SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); +      Ops.push_back(K); +    } else { +      Opc = AMDGPU::S_GET_BARRIER_STATE_M0; +      SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2)); +      Ops.push_back(M0Val.getValue(0)); +    } + +    auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); +    return SDValue(NewMI, 0); +  }    default:      if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = @@ -8383,13 +8916,29 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,      return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);    }    case Intrinsic::amdgcn_s_barrier: { -    if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { -      const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); +    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); +    if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {        unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;        if (WGSize <= ST.getWavefrontSize())          return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,                                            Op.getOperand(0)), 0);      } + +    // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait +    if 
(ST.hasSplitBarriers()) { +      SDValue K = +          DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32); +      SDValue BarSignal = +          SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL, +                                     MVT::Other, K, Op.getOperand(0)), +                  0); +      SDValue BarWait = +          SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K, +                                     BarSignal.getValue(0)), +                  0); +      return BarWait; +    } +      return SDValue();    };    case Intrinsic::amdgcn_tbuffer_store: { @@ -8429,13 +8978,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,        VData = handleD16VData(VData, DAG);      SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);      auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); +    auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);      SDValue Ops[] = {          Chain,          VData,                                 // vdata          Rsrc,                                  // rsrc          Op.getOperand(4),                      // vindex          Offsets.first,                         // voffset -        Op.getOperand(6),                      // soffset +        SOffset,                               // soffset          Offsets.second,                        // offset          Op.getOperand(7),                      // format          Op.getOperand(8),                      // cachepolicy, swizzled buffer @@ -8456,13 +9006,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,        VData = handleD16VData(VData, DAG);      SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);      auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); +    auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);      SDValue Ops[] = {          Chain,          VData,                                 // vdata          Rsrc,                                  // rsrc          DAG.getConstant(0, DL, MVT::i32),      // vindex          Offsets.first,                         // voffset -        Op.getOperand(5),                      // soffset +        SOffset,                               // soffset          Offsets.second,                        // offset          Op.getOperand(6),                      // format          Op.getOperand(7),                      // cachepolicy, swizzled buffer @@ -8536,13 +9087,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,      SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);      auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); +    auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);      SDValue Ops[] = {          Chain,          VData,          Rsrc,          DAG.getConstant(0, DL, MVT::i32),      // vindex          Offsets.first,                         // voffset -        Op.getOperand(5),                      // soffset +        SOffset,                               // soffset          Offsets.second,                        // offset          Op.getOperand(6),                      // cachepolicy, swizzled buffer          DAG.getTargetConstant(0, DL, MVT::i1), // idxen @@ -8586,13 +9138,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,      auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);      auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); +    auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);      SDValue Ops[] = {          Chain,          VData,          Rsrc,       
   Op.getOperand(4),                      // vindex          Offsets.first,                         // voffset -        Op.getOperand(6),                      // soffset +        SOffset,                               // soffset          Offsets.second,                        // offset          Op.getOperand(7),                      // cachepolicy, swizzled buffer          DAG.getTargetConstant(1, DL, MVT::i1), // idxen @@ -8620,8 +9173,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,          IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;      unsigned OpOffset = HasVIndex ? 1 : 0;      SDValue VOffset = Op.getOperand(5 + OpOffset); -    auto CVOffset = dyn_cast<ConstantSDNode>(VOffset); -    bool HasVOffset = !CVOffset || !CVOffset->isZero(); +    bool HasVOffset = !isNullConstant(VOffset);      unsigned Size = Op->getConstantOperandVal(4);      switch (Size) { @@ -8684,12 +9236,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,      auto F = LoadMMO->getFlags() &               ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); -    LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, -                                      Size, LoadMMO->getBaseAlign()); +    LoadMMO = +        MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, +                                LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); -    MachineMemOperand *StoreMMO = -        MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, -                                sizeof(int32_t), LoadMMO->getBaseAlign()); +    MachineMemOperand *StoreMMO = MF.getMachineMemOperand( +        StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), +        LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());      auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);      DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); @@ -8760,11 +9313,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,      StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;      auto F = LoadMMO->getFlags() &               ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); -    LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, -                                      Size, LoadMMO->getBaseAlign()); -    MachineMemOperand *StoreMMO = -        MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, -                                sizeof(int32_t), Align(4)); +    LoadMMO = +        MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, +                                LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); +    MachineMemOperand *StoreMMO = MF.getMachineMemOperand( +        StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4), +        LoadMMO->getAAInfo());      auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);      DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); @@ -8774,7 +9328,76 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,    case Intrinsic::amdgcn_end_cf:      return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,                                        Op->getOperand(2), Chain), 0); +  case Intrinsic::amdgcn_s_barrier_init: +  case Intrinsic::amdgcn_s_barrier_join: +  case Intrinsic::amdgcn_s_wakeup_barrier: { +    SDValue Chain = Op->getOperand(0); +    SmallVector<SDValue, 2> Ops; +    SDValue BarOp = Op->getOperand(2); +    unsigned Opc; +    bool IsInlinableBarID = false; +    int64_t BarVal; + +    if 
(isa<ConstantSDNode>(BarOp)) { +      BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue(); +      IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal); +    } + +    if (IsInlinableBarID) { +      switch (IntrinsicID) { +      default: +        return SDValue(); +      case Intrinsic::amdgcn_s_barrier_init: +        Opc = AMDGPU::S_BARRIER_INIT_IMM; +        break; +      case Intrinsic::amdgcn_s_barrier_join: +        Opc = AMDGPU::S_BARRIER_JOIN_IMM; +        break; +      case Intrinsic::amdgcn_s_wakeup_barrier: +        Opc = AMDGPU::S_WAKEUP_BARRIER_IMM; +        break; +      } + +      SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32); +      Ops.push_back(K); +    } else { +      switch (IntrinsicID) { +      default: +        return SDValue(); +      case Intrinsic::amdgcn_s_barrier_init: +        Opc = AMDGPU::S_BARRIER_INIT_M0; +        break; +      case Intrinsic::amdgcn_s_barrier_join: +        Opc = AMDGPU::S_BARRIER_JOIN_M0; +        break; +      case Intrinsic::amdgcn_s_wakeup_barrier: +        Opc = AMDGPU::S_WAKEUP_BARRIER_M0; +        break; +      } +    } + +    if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) { +      SDValue M0Val; +      // Member count will be read from M0[16:22] +      M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3), +                          DAG.getShiftAmountConstant(16, MVT::i32, DL)); +      if (!IsInlinableBarID) { +        // If reference to barrier id is not an inline constant then it must be +        // referenced with M0[4:0]. Perform an OR with the member count to +        // include it in M0. +        M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, +                                           Op.getOperand(2), M0Val), +                        0); +      } +      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); +    } else if (!IsInlinableBarID) { +      Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0)); +    } + +    auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); +    return SDValue(NewMI, 0); +  }    default: {      if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =              AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -8794,7 +9417,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,  std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(      SDValue Offset, SelectionDAG &DAG) const {    SDLoc DL(Offset); -  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(); +  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);    SDValue N0 = Offset;    ConstantSDNode *C1 = nullptr; @@ -8870,8 +9493,13 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,        return;      }    } + +  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset() +                            ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32) +                            : DAG.getConstant(0, DL, MVT::i32); +    Offsets[0] = CombinedOffset; -  Offsets[1] = DAG.getConstant(0, DL, MVT::i32); +  Offsets[1] = SOffsetZero;    Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);  } @@ -9051,7 +9679,7 @@ static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,                                            const SIMachineFunctionInfo &Info) {    // TODO: Should check if the address can definitely not access stack.    
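A minimal sketch of how the s_barrier_init lowering above packs M0, written as a plain integer model rather than the SHL/S_OR_B32 DAG nodes it actually emits; the 0x1f mask is inferred from the M0[4:0] comment and the helper name is hypothetical:

#include <cstdint>

// Member count goes to M0[16:22]; a barrier id that is not an inline constant
// is OR'd into M0[4:0] (an inline id is carried as an instruction immediate).
static uint32_t packBarrierInitM0(uint32_t BarrierId, uint32_t MemberCount,
                                  bool IdIsInline) {
  uint32_t M0 = MemberCount << 16;
  if (!IdIsInline)
    M0 |= (BarrierId & 0x1f); // mask inferred from the M0[4:0] comment
  return M0;
}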
if (Info.isEntryFunction()) -    return Info.hasFlatScratchInit(); +    return Info.getUserSGPRInfo().hasFlatScratchInit();    return true;  } @@ -9129,7 +9757,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {    if (AS == AMDGPUAS::CONSTANT_ADDRESS ||        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {      if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) { -      if (MemVT.isPow2VectorType()) +      if (MemVT.isPow2VectorType() || +          (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))          return SDValue();        return WidenOrSplitVectorLoad(Op, DAG);      } @@ -9145,7 +9774,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {      if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&          Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&          Alignment >= Align(4) && NumElements < 32) { -      if (MemVT.isPow2VectorType()) +      if (MemVT.isPow2VectorType() || +          (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))          return SDValue();        return WidenOrSplitVectorLoad(Op, DAG);      } @@ -9217,7 +9847,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {  SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {    EVT VT = Op.getValueType(); -  if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) +  if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 || +      VT.getSizeInBits() == 512)      return splitTernaryVectorOp(Op, DAG);    assert(VT.getSizeInBits() == 64); @@ -9277,11 +9908,6 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,        // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP        // error seems really high at 2^29 ULP. - -      // XXX - do we need afn for this or is arcp sufficent? -      if (RHS.getOpcode() == ISD::FSQRT) -        return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); -        // 1.0 / x -> rcp(x)        return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);      } @@ -9294,8 +9920,8 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,      }    } -  // For f16 require arcp only. -  // For f32 require afn+arcp. +  // For f16 require afn or arcp. +  // For f32 require afn.    if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))      return SDValue(); @@ -9480,28 +10106,44 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();    const DenormalMode DenormMode = Info->getMode().FP32Denormals; -  const bool HasFP32Denormals = DenormMode == DenormalMode::getIEEE(); +  const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE(); +  const bool HasDynamicDenormals = +      (DenormMode.Input == DenormalMode::Dynamic) || +      (DenormMode.Output == DenormalMode::Dynamic); + +  SDValue SavedDenormMode; -  if (!HasFP32Denormals) { +  if (!PreservesDenormals) {      // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV      // lowering. The chain dependence is insufficient, and we need glue. We do      // not need the glue variants in a strictfp function.      
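// Note on the dynamic-denormal path added below: when the f32 denormal mode
// is DenormalMode::Dynamic, the incoming MODE value is not known at compile
// time, so it is first read with S_GETREG_B32 (kept in SavedDenormMode),
// IEEE denormals are enabled around the FMA refinement, and the saved value
// is written back afterwards instead of assuming a flush-to-zero default.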
SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); +    SDValue Glue = DAG.getEntryNode(); +    if (HasDynamicDenormals) { +      SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL, +                                          DAG.getVTList(MVT::i32, MVT::Glue), +                                          {BitField, Glue}); +      SavedDenormMode = SDValue(GetReg, 0); + +      Glue = DAG.getMergeValues( +          {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL); +    } +      SDNode *EnableDenorm;      if (Subtarget->hasDenormModeInst()) {        const SDValue EnableDenormValue =            getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget); -      EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, -                                 DAG.getEntryNode(), EnableDenormValue).getNode(); +      EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue, +                                 EnableDenormValue) +                         .getNode();      } else {        const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,                                                          SL, MVT::i32); -      EnableDenorm = -          DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs, -                             {EnableDenormValue, BitField, DAG.getEntryNode()}); +      EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs, +                                        {EnableDenormValue, BitField, Glue});      }      SDValue Ops[3] = { @@ -9531,12 +10173,9 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {    SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,                               NumeratorScaled, Fma3, Flags); -  if (!HasFP32Denormals) { -    // FIXME: This mishandles dynamic denormal mode. We need to query the -    // current mode and restore the original. - +  if (!PreservesDenormals) {      SDNode *DisableDenorm; -    if (Subtarget->hasDenormModeInst()) { +    if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {        const SDValue DisableDenormValue = getSPDenormModeValue(            FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget); @@ -9544,8 +10183,11 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {                                    Fma4.getValue(1), DisableDenormValue,                                    Fma4.getValue(2)).getNode();      } else { +      assert(HasDynamicDenormals == (bool)SavedDenormMode);        const SDValue DisableDenormValue = -          DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); +          HasDynamicDenormals +              ? SavedDenormMode +              : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);        DisableDenorm = DAG.getMachineNode(            AMDGPU::S_SETREG_B32, SL, MVT::Other, @@ -9754,6 +10396,111 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {    return SDValue();  } +// Avoid the full correct expansion for f32 sqrt when promoting from f16. 
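The f16 lowering defined next widens to f32 and uses the hardware sqrt intrinsic; the f32 expansion that follows it additionally scales tiny inputs out of the denormal range, relying on sqrt(x * 2^32) == sqrt(x) * 2^16. A minimal host-side model of that scaling idea, with std::sqrt standing in for v_sqrt_f32 and the thresholds taken from the constants in the code below (illustrative sketch only, not the DAG sequence):

#include <cmath>

static float sqrtScaledModel(float X) {
  bool NeedScale = X < 0x1.0p-96f;              // tiny input, possibly denormal
  float SqrtX = NeedScale ? X * 0x1.0p+32f : X; // exact: scaling by a power of two
  float S = std::sqrt(SqrtX);                   // stands in for the hardware sqrt
  return NeedScale ? S * 0x1.0p-16f : S;        // undo the scale: sqrt(2^32) == 2^16
}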
+SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const { +  SDLoc SL(Op); +  assert(!Subtarget->has16BitInsts()); +  SDNodeFlags Flags = Op->getFlags(); +  SDValue Ext = +      DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags); + +  SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32); +  SDValue Sqrt = +      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags); + +  return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt, +                     DAG.getTargetConstant(0, SL, MVT::i32), Flags); +} + +SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const { +  SDLoc DL(Op); +  SDNodeFlags Flags = Op->getFlags(); +  MVT VT = Op.getValueType().getSimpleVT(); +  const SDValue X = Op.getOperand(0); + +  if (allowApproxFunc(DAG, Flags)) { +    // Instruction is 1ulp but ignores denormals. +    return DAG.getNode( +        ISD::INTRINSIC_WO_CHAIN, DL, VT, +        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags); +  } + +  SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT); +  SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT); + +  SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT); + +  SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags); + +  SDValue SqrtX = +      DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags); + +  SDValue SqrtS; +  if (needsDenormHandlingF32(DAG, X, Flags)) { +    SDValue SqrtID = +        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32); +    SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags); + +    SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS); +    SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, +                                           DAG.getConstant(-1, DL, MVT::i32)); +    SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt); + +    SDValue NegSqrtSNextDown = +        DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags); + +    SDValue SqrtVP = +        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags); + +    SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, +                                         DAG.getConstant(1, DL, MVT::i32)); +    SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt); + +    SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags); +    SDValue SqrtVS = +        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags); + +    SDValue Zero = DAG.getConstantFP(0.0f, DL, VT); +    SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE); + +    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS, +                        Flags); + +    SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT); +    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS, +                        Flags); +  } else { +    SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags); + +    SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags); + +    SDValue Half = DAG.getConstantFP(0.5f, DL, VT); +    SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags); +    SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags); + +    SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags); +    SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, 
Flags); +    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags); + +    SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags); +    SDValue SqrtD = +        DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags); +    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags); +  } + +  SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT); + +  SDValue ScaledDown = +      DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags); + +  SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags); +  SDValue IsZeroOrInf = +      DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX, +                  DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32)); + +  return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags); +} +  SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {    // For double type, the SQRT and RSQ instructions don't have required    // precision, we apply Goldschmidt's algorithm to improve the result: @@ -10111,9 +10858,7 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(    return SDValue();  } -// Returns true if argument is a boolean value which is not serialized into -// memory or argument and does not require v_cndmask_b32 to be deserialized. -static bool isBoolSGPR(SDValue V) { +bool llvm::isBoolSGPR(SDValue V) {    if (V.getValueType() != MVT::i1)      return false;    switch (V.getOpcode()) { @@ -10427,13 +11172,34 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,    if (Depth >= 6)      return std::nullopt; +  auto ValueSize = Op.getValueSizeInBits(); +  if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32) +    return std::nullopt; +    switch (Op->getOpcode()) {    case ISD::TRUNCATE: { -    if (Op->getOperand(0).getScalarValueSizeInBits() != 32) +    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); +  } + +  case ISD::SIGN_EXTEND: +  case ISD::ZERO_EXTEND: +  case ISD::SIGN_EXTEND_INREG: { +    SDValue NarrowOp = Op->getOperand(0); +    auto NarrowVT = NarrowOp.getValueType(); +    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) { +      auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); +      NarrowVT = VTSign->getVT(); +    } +    if (!NarrowVT.isByteSized()) +      return std::nullopt; +    uint64_t NarrowByteWidth = NarrowVT.getStoreSize(); + +    if (SrcIndex >= NarrowByteWidth)        return std::nullopt;      return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);    } +  case ISD::SRA:    case ISD::SRL: {      auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));      if (!ShiftOp) @@ -10450,9 +11216,6 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,    }    default: { -    if (Op.getScalarValueSizeInBits() != 32) -      return std::nullopt; -      return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);    }    } @@ -10476,7 +11239,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,    unsigned BitWidth = Op.getScalarValueSizeInBits();    if (BitWidth % 8 != 0)      return std::nullopt; -  assert(Index < BitWidth / 8 && "invalid index requested"); +  if (Index > BitWidth / 8 - 1) +    return std::nullopt;    switch (Op.getOpcode()) {    case ISD::OR: { @@ -10519,6 +11283,31 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,      return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);    } +  case ISD::FSHR: { +    // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % 
BW)) +    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2)); +    if (!ShiftOp || Op.getValueType().isVector()) +      return std::nullopt; + +    uint64_t BitsProvided = Op.getValueSizeInBits(); +    if (BitsProvided % 8 != 0) +      return std::nullopt; + +    uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided); +    if (BitShift % 8) +      return std::nullopt; + +    uint64_t ConcatSizeInBytes = BitsProvided / 4; +    uint64_t ByteShift = BitShift / 8; + +    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes; +    uint64_t BytesProvided = BitsProvided / 8; +    SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1); +    NewIndex %= BytesProvided; +    return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex); +  } + +  case ISD::SRA:    case ISD::SRL: {      auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));      if (!ShiftOp) @@ -10565,9 +11354,18 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,    }    case ISD::ANY_EXTEND:    case ISD::SIGN_EXTEND: -  case ISD::ZERO_EXTEND: { +  case ISD::ZERO_EXTEND: +  case ISD::SIGN_EXTEND_INREG: +  case ISD::AssertZext: +  case ISD::AssertSext: {      SDValue NarrowOp = Op->getOperand(0); -    unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); +    unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits(); +    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG || +        Op->getOpcode() == ISD::AssertZext || +        Op->getOpcode() == ISD::AssertSext) { +      auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); +      NarrowBitWidth = VTSign->getVT().getSizeInBits(); +    }      if (NarrowBitWidth % 8 != 0)        return std::nullopt;      uint64_t NarrowByteWidth = NarrowBitWidth / 8; @@ -10581,10 +11379,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,    }    case ISD::TRUNCATE: { -    unsigned NarrowBitWidth = Op.getScalarValueSizeInBits(); -    if (NarrowBitWidth % 8 != 0) -      return std::nullopt; -    uint64_t NarrowByteWidth = NarrowBitWidth / 8; +    uint64_t NarrowByteWidth = BitWidth / 8;      if (NarrowByteWidth >= Index) {        return calculateByteProvider(Op.getOperand(0), Index, Depth + 1, @@ -10594,8 +11389,16 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,      return std::nullopt;    } +  case ISD::CopyFromReg: { +    if (BitWidth / 8 > Index) +      return calculateSrcByte(Op, StartingIndex, Index); + +    return std::nullopt; +  } +    case ISD::LOAD: {      auto L = cast<LoadSDNode>(Op.getNode()); +      unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();      if (NarrowBitWidth % 8 != 0)        return std::nullopt; @@ -10621,6 +11424,41 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,    case ISD::BSWAP:      return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,                                   Depth + 1, StartingIndex); + +  case ISD::EXTRACT_VECTOR_ELT: { +    auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); +    if (!IdxOp) +      return std::nullopt; +    auto VecIdx = IdxOp->getZExtValue(); +    auto ScalarSize = Op.getScalarValueSizeInBits(); +    if (ScalarSize != 32) { +      if ((VecIdx + 1) * ScalarSize > 32) +        return std::nullopt; +      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index; +    } + +    return calculateSrcByte(ScalarSize == 32 ? 
Op : Op.getOperand(0), +                            StartingIndex, Index); +  } + +  case AMDGPUISD::PERM: { +    auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2)); +    if (!PermMask) +      return std::nullopt; + +    auto IdxMask = +        (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8); +    if (IdxMask > 0x07 && IdxMask != 0x0c) +      return std::nullopt; + +    auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1); +    auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask; + +    return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex) +                           : ByteProvider<SDValue>( +                                 ByteProvider<SDValue>::getConstantZero()); +  } +    default: {      return std::nullopt;    } @@ -10630,7 +11468,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,  }  // Returns true if the Operand is a scalar and is 16 bits -static bool is16BitScalarOp(SDValue &Operand) { +static bool isExtendedFrom16Bits(SDValue &Operand) { +    switch (Operand.getOpcode()) {    case ISD::ANY_EXTEND:    case ISD::SIGN_EXTEND: @@ -10646,7 +11485,7 @@ static bool is16BitScalarOp(SDValue &Operand) {        auto MemVT = L->getMemoryVT();        return !MemVT.isVector() && MemVT.getSizeInBits() == 16;      } -    return false; +    return L->getMemoryVT().getSizeInBits() == 16;    }    default:      return false; @@ -10674,29 +11513,118 @@ static bool addresses16Bits(int Mask) {  // Do not lower into v_perm if the operands are actually 16 bit  // and the selected bits (based on PermMask) correspond with two  // easily addressable 16 bit operands. -static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op, +static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,                                  SDValue &OtherOp) {    int Low16 = PermMask & 0xffff;    int Hi16 = (PermMask & 0xffff0000) >> 16; -  // ByteProvider only accepts 32 bit operands -  assert(Op.getValueType().getSizeInBits() == 32); -  assert(OtherOp.getValueType().getSizeInBits() == 32); +  assert(Op.getValueType().isByteSized()); +  assert(OtherOp.getValueType().isByteSized()); -  auto OpIs16Bit = is16BitScalarOp(Op); -  auto OtherOpIs16Bit = is16BitScalarOp(Op); +  auto TempOp = peekThroughBitcasts(Op); +  auto TempOtherOp = peekThroughBitcasts(OtherOp); -  // If there is a size mismatch, then we must use masking on at least one -  // operand -  if (OpIs16Bit != OtherOpIs16Bit) +  auto OpIs16Bit = +      TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp); +  if (!OpIs16Bit)      return true; -  // If both operands are 16 bit, return whether or not we cleanly address both -  if (is16BitScalarOp(Op) && is16BitScalarOp(OtherOp)) -    return !addresses16Bits(Low16) || !addresses16Bits(Hi16); +  auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 || +                        isExtendedFrom16Bits(TempOtherOp); +  if (!OtherOpIs16Bit) +    return true; -  // Both are 32 bit operands -  return true; +  // Do we cleanly address both +  return !addresses16Bits(Low16) || !addresses16Bits(Hi16); +} + +static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { +  SelectionDAG &DAG = DCI.DAG; +  EVT VT = N->getValueType(0); + +  if (VT != MVT::i32) +    return SDValue(); + +  // VT is known to be MVT::i32, so we need to provide 4 bytes. 
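// Mask encoding used when building PermMask below, as modeled by the PERM
// handling in calculateByteProvider above: each byte of the 32-bit selector
// picks one result byte; selector values 0-3 take bytes from the second
// operand, 4-7 take bytes from the first operand (hence SrcByteAdjust = 4),
// and 0x0c yields a constant zero byte. For example, a mask of 0x0c0c0504
// would keep the two low bytes of the first operand and zero the upper half.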
+  SmallVector<ByteProvider<SDValue>, 8> PermNodes; +  for (int i = 0; i < 4; i++) { +    // Find the ByteProvider that provides the ith byte of the result of OR +    std::optional<ByteProvider<SDValue>> P = +        calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i); +    // TODO support constantZero +    if (!P || P->isConstantZero()) +      return SDValue(); + +    PermNodes.push_back(*P); +  } +  if (PermNodes.size() != 4) +    return SDValue(); + +  int FirstSrc = 0; +  std::optional<int> SecondSrc; +  uint64_t PermMask = 0x00000000; +  for (size_t i = 0; i < PermNodes.size(); i++) { +    auto PermOp = PermNodes[i]; +    // Since the mask is applied to Src1:Src2, Src1 bytes must be offset +    // by sizeof(Src2) = 4 +    int SrcByteAdjust = 4; + +    if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) { +      if (SecondSrc.has_value()) +        if (!PermOp.hasSameSrc(PermNodes[*SecondSrc])) +          return SDValue(); + +      // Set the index of the second distinct Src node +      SecondSrc = i; +      assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8)); +      SrcByteAdjust = 0; +    } +    assert(PermOp.SrcOffset + SrcByteAdjust < 8); +    assert(!DAG.getDataLayout().isBigEndian()); +    PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8); +  } + +  SDValue Op = *PermNodes[FirstSrc].Src; +  SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src +                                          : *PermNodes[FirstSrc].Src; + +  // Check that we haven't just recreated the same FSHR node. +  if (N->getOpcode() == ISD::FSHR && +      (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) && +      (N->getOperand(1) == Op || N->getOperand(1) == OtherOp)) +    return SDValue(); + +  // Check that we are not just extracting the bytes in order from an op +  if (Op == OtherOp && Op.getValueSizeInBits() == 32) { +    int Low16 = PermMask & 0xffff; +    int Hi16 = (PermMask & 0xffff0000) >> 16; + +    bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100); +    bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302); + +    // The perm op would really just produce Op. So combine into Op +    if (WellFormedLow && WellFormedHi) +      return DAG.getBitcast(MVT::getIntegerVT(32), Op); +  } + +  if (hasNon16BitAccesses(PermMask, Op, OtherOp)) { +    SDLoc DL(N); +    assert(Op.getValueType().isByteSized() && +           OtherOp.getValueType().isByteSized()); + +    // If the ultimate src is less than 32 bits, then we will only be +    // using bytes 0: Op.getValueSizeInBytes() - 1 in the or. +    // CalculateByteProvider would not have returned Op as source if we +    // used a byte that is outside its ValueType. Thus, we are free to +    // ANY_EXTEND as the extended bits are dont-cares. +    Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32); +    OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32); + +    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp, +                       DAG.getConstant(PermMask, DL, MVT::i32)); +  } + +  return SDValue();  }  SDValue SITargetLowering::performOrCombine(SDNode *N, @@ -10812,69 +11740,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,        }      }      if (LHSMask == ~0u || RHSMask == ~0u) { -      SmallVector<ByteProvider<SDValue>, 8> PermNodes; - -      // VT is known to be MVT::i32, so we need to provide 4 bytes. 
-      assert(VT == MVT::i32); -      for (int i = 0; i < 4; i++) { -        // Find the ByteProvider that provides the ith byte of the result of OR -        std::optional<ByteProvider<SDValue>> P = -            calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i); -        // TODO support constantZero -        if (!P || P->isConstantZero()) -          return SDValue(); - -        PermNodes.push_back(*P); -      } -      if (PermNodes.size() != 4) -        return SDValue(); - -      int FirstSrc = 0; -      std::optional<int> SecondSrc; -      uint64_t permMask = 0x00000000; -      for (size_t i = 0; i < PermNodes.size(); i++) { -        auto PermOp = PermNodes[i]; -        // Since the mask is applied to Src1:Src2, Src1 bytes must be offset -        // by sizeof(Src2) = 4 -        int SrcByteAdjust = 4; - -        if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) { -          if (SecondSrc.has_value()) -            if (!PermOp.hasSameSrc(PermNodes[*SecondSrc])) -              return SDValue(); -          // Set the index of the second distinct Src node -          SecondSrc = i; -          assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() == -                 32); -          SrcByteAdjust = 0; -        } -        assert(PermOp.SrcOffset + SrcByteAdjust < 8); -        assert(!DAG.getDataLayout().isBigEndian()); -        permMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8); -      } - -      SDValue Op = *PermNodes[FirstSrc].Src; -      SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src -                                              : *PermNodes[FirstSrc].Src; - -      // Check that we are not just extracting the bytes in order from an op -      if (Op == OtherOp) { -        int Low16 = permMask & 0xffff; -        int Hi16 = (permMask & 0xffff0000) >> 16; - -        bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100); -        bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302); - -        // The perm op would really just produce Op. So combine into Op -        if (WellFormedLow && WellFormedHi) -          return Op; -      } - -      if (hasEightBitAccesses(permMask, Op, OtherOp)) { -        SDLoc DL(N); -        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp, -                           DAG.getConstant(permMask, DL, MVT::i32)); -      } +      if (SDValue Perm = matchPERM(N, DCI)) +        return Perm;      }    } @@ -11021,10 +11888,8 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,    SDValue Mask = N->getOperand(1);    // fp_class x, 0 -> false -  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { -    if (CMask->isZero()) -      return DAG.getConstant(0, SDLoc(N), MVT::i1); -  } +  if (isNullConstant(Mask)) +    return DAG.getConstant(0, SDLoc(N), MVT::i1);    if (N->getOperand(0).isUndef())      return DAG.getUNDEF(MVT::i1); @@ -11049,7 +11914,9 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N,                             N->getFlags());    } -  if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) { +  // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here. 
+  if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) && +      N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {      return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,                             N0.getOperand(0), N->getFlags());    } @@ -11131,10 +11998,14 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,    case ISD::FMAXNUM:    case ISD::FMINNUM_IEEE:    case ISD::FMAXNUM_IEEE: +  case ISD::FMINIMUM: +  case ISD::FMAXIMUM:    case AMDGPUISD::CLAMP:    case AMDGPUISD::FMED3:    case AMDGPUISD::FMAX3: -  case AMDGPUISD::FMIN3: { +  case AMDGPUISD::FMIN3: +  case AMDGPUISD::FMAXIMUM3: +  case AMDGPUISD::FMINIMUM3: {      // FIXME: Shouldn't treat the generic operations different based these.      // However, we aren't really required to flush the result from      // minnum/maxnum.. @@ -11288,7 +12159,9 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,    case AMDGPU::G_FMINNUM:    case AMDGPU::G_FMAXNUM:    case AMDGPU::G_FMINNUM_IEEE: -  case AMDGPU::G_FMAXNUM_IEEE: { +  case AMDGPU::G_FMAXNUM_IEEE: +  case AMDGPU::G_FMINIMUM: +  case AMDGPU::G_FMAXIMUM: {      if (Subtarget->supportsMinMaxDenormModes() ||          // FIXME: denormalsEnabledForType is broken for dynamic          denormalsEnabledForType(MRI.getType(Reg), MF)) @@ -11302,7 +12175,8 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,          return false;      return true;    case AMDGPU::G_INTRINSIC: -    switch (MI->getIntrinsicID()) { +  case AMDGPU::G_INTRINSIC_CONVERGENT: +    switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {      case Intrinsic::amdgcn_fmul_legacy:      case Intrinsic::amdgcn_fmad_ftz:      case Intrinsic::amdgcn_sqrt: @@ -11321,7 +12195,6 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,      case Intrinsic::amdgcn_div_fmas:      case Intrinsic::amdgcn_div_fixup:      case Intrinsic::amdgcn_fract: -    case Intrinsic::amdgcn_ldexp:      case Intrinsic::amdgcn_cvt_pkrtz:      case Intrinsic::amdgcn_cubeid:      case Intrinsic::amdgcn_cubema: @@ -11476,6 +12349,8 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {    case ISD::FMAXNUM:    case ISD::FMAXNUM_IEEE:      return AMDGPUISD::FMAX3; +  case ISD::FMAXIMUM: +    return AMDGPUISD::FMAXIMUM3;    case ISD::SMAX:      return AMDGPUISD::SMAX3;    case ISD::UMAX: @@ -11483,6 +12358,8 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {    case ISD::FMINNUM:    case ISD::FMINNUM_IEEE:      return AMDGPUISD::FMIN3; +  case ISD::FMINIMUM: +    return AMDGPUISD::FMINIMUM3;    case ISD::SMIN:      return AMDGPUISD::SMIN3;    case ISD::UMIN: @@ -11842,7 +12719,9 @@ SDValue SITargetLowering::performExtractVectorEltCombine(      case ISD::FMAXNUM:      case ISD::FMINNUM:      case ISD::FMAXNUM_IEEE: -    case ISD::FMINNUM_IEEE: { +    case ISD::FMINNUM_IEEE: +    case ISD::FMAXIMUM: +    case ISD::FMINIMUM: {        SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,                                   Vec.getOperand(0), Idx);        SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, @@ -12203,6 +13082,256 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,    return Accum;  } +// Collect the ultimate src of each of the mul node's operands, and confirm +// each operand is 8 bytes. 
+static std::optional<ByteProvider<SDValue>> +handleMulOperand(const SDValue &MulOperand) { +  auto Byte0 = calculateByteProvider(MulOperand, 0, 0); +  if (!Byte0 || Byte0->isConstantZero()) { +    return std::nullopt; +  } +  auto Byte1 = calculateByteProvider(MulOperand, 1, 0); +  if (Byte1 && !Byte1->isConstantZero()) { +    return std::nullopt; +  } +  return Byte0; +} + +static unsigned addPermMasks(unsigned First, unsigned Second) { +  unsigned FirstCs = First & 0x0c0c0c0c; +  unsigned SecondCs = Second & 0x0c0c0c0c; +  unsigned FirstNoCs = First & ~0x0c0c0c0c; +  unsigned SecondNoCs = Second & ~0x0c0c0c0c; + +  assert((FirstCs & 0xFF) | (SecondCs & 0xFF)); +  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00)); +  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000)); +  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000)); + +  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs); +} + +static void placeSources(ByteProvider<SDValue> &Src0, +                         ByteProvider<SDValue> &Src1, +                         SmallVectorImpl<std::pair<SDValue, unsigned>> &Src0s, +                         SmallVectorImpl<std::pair<SDValue, unsigned>> &Src1s, +                         int Step) { + +  assert(Src0.Src.has_value() && Src1.Src.has_value()); +  // Src0s and Src1s are empty, just place arbitrarily. +  if (Step == 0) { +    Src0s.push_back({*Src0.Src, (Src0.SrcOffset << 24) + 0x0c0c0c}); +    Src1s.push_back({*Src1.Src, (Src1.SrcOffset << 24) + 0x0c0c0c}); +    return; +  } + +  for (int BPI = 0; BPI < 2; BPI++) { +    std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1}; +    if (BPI == 1) { +      BPP = {Src1, Src0}; +    } +    unsigned ZeroMask = 0x0c0c0c0c; +    unsigned FMask = 0xFF << (8 * (3 - Step)); + +    unsigned FirstMask = +        BPP.first.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask); +    unsigned SecondMask = +        BPP.second.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask); +    // Attempt to find Src vector which contains our SDValue, if so, add our +    // perm mask to the existing one. If we are unable to find a match for the +    // first SDValue, attempt to find match for the second. +    int FirstGroup = -1; +    for (int I = 0; I < 2; I++) { +      SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs = +          I == 0 ? Src0s : Src1s; +      auto MatchesFirst = [&BPP](std::pair<SDValue, unsigned> IterElt) { +        return IterElt.first == *BPP.first.Src; +      }; + +      auto Match = llvm::find_if(Srcs, MatchesFirst); +      if (Match != Srcs.end()) { +        Match->second = addPermMasks(FirstMask, Match->second); +        FirstGroup = I; +        break; +      } +    } +    if (FirstGroup != -1) { +      SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs = +          FirstGroup == 1 ? Src0s : Src1s; +      auto MatchesSecond = [&BPP](std::pair<SDValue, unsigned> IterElt) { +        return IterElt.first == *BPP.second.Src; +      }; +      auto Match = llvm::find_if(Srcs, MatchesSecond); +      if (Match != Srcs.end()) { +        Match->second = addPermMasks(SecondMask, Match->second); +      } else +        Srcs.push_back({*BPP.second.Src, SecondMask}); +      return; +    } +  } + +  // If we have made it here, then we could not find a match in Src0s or Src1s +  // for either Src0 or Src1, so just place them arbitrarily. 
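// A standalone sketch of the selector-mask convention used above (assuming
// the usual V_PERM_B32 byte-select encoding this combine relies on): each
// byte of a mask names one byte of the {first src, second src} pair, with
// values 4..7 taken from the first source, 0..3 from the second, and 0x0c
// producing a constant zero byte. addPermMasks() merges two such masks under
// the invariant that every lane is a real selector in at most one of them and
// 0x0c in the other; mergeSelectors() below is a hypothetical plain-integer
// restatement of that rule, not code from this patch.

#include <cassert>
#include <cstdint>

static uint32_t mergeSelectors(uint32_t First, uint32_t Second) {
  uint32_t FirstCs = First & 0x0c0c0c0c;    // bits overlapping the 0x0c pattern
  uint32_t SecondCs = Second & 0x0c0c0c0c;
  uint32_t FirstNoCs = First & ~0x0c0c0c0c; // selector bits outside that pattern
  uint32_t SecondNoCs = Second & ~0x0c0c0c0c;
  // A real selector (0..7) paired with 0x0c reassembles from the two terms;
  // a lane comes out as 0x0c only when both inputs marked it constant zero.
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}

int main() {
  // One mask contributed byte offset 5 in lane 3, the other offset 7 in
  // lane 2; the remaining lanes stay constant zero.
  assert(mergeSelectors(0x050c0c0c, 0x0c070c0c) == 0x05070c0c);
  return 0;
}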
+ +  unsigned ZeroMask = 0x0c0c0c0c; +  unsigned FMask = 0xFF << (8 * (3 - Step)); + +  Src0s.push_back( +      {*Src0.Src, (Src0.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))}); +  Src1s.push_back( +      {*Src1.Src, (Src1.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))}); + +  return; +} + +static SDValue +resolveSources(SelectionDAG &DAG, SDLoc SL, +               SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs, +               bool IsSigned, bool IsAny) { + +  // If we just have one source, just permute it accordingly. +  if (Srcs.size() == 1) { +    auto Elt = Srcs.begin(); +    auto EltVal = DAG.getBitcastedAnyExtOrTrunc(Elt->first, SL, MVT::i32); + +    // v_perm will produce the original value. +    if (Elt->second == 0x3020100) +      return EltVal; + +    return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal, +                       DAG.getConstant(Elt->second, SL, MVT::i32)); +  } + +  auto FirstElt = Srcs.begin(); +  auto SecondElt = std::next(FirstElt); + +  SmallVector<SDValue, 2> Perms; + +  // If we have multiple sources in the chain, combine them via perms (using +  // calculated perm mask) and Ors. +  while (true) { +    auto FirstMask = FirstElt->second; +    auto SecondMask = SecondElt->second; + +    unsigned FirstCs = FirstMask & 0x0c0c0c0c; +    unsigned FirstPlusFour = FirstMask | 0x04040404; +    // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any +    // original 0x0C. +    FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs; + +    auto PermMask = addPermMasks(FirstMask, SecondMask); +    auto FirstVal = +        DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32); +    auto SecondVal = +        DAG.getBitcastedAnyExtOrTrunc(SecondElt->first, SL, MVT::i32); + +    Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal, +                                SecondVal, +                                DAG.getConstant(PermMask, SL, MVT::i32))); + +    FirstElt = std::next(SecondElt); +    if (FirstElt == Srcs.end()) +      break; + +    SecondElt = std::next(FirstElt); +    // If we only have a FirstElt, then just combine that into the cumulative +    // source node. +    if (SecondElt == Srcs.end()) { +      auto EltVal = +          DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32); + +      Perms.push_back( +          DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal, +                      DAG.getConstant(FirstElt->second, SL, MVT::i32))); +      break; +    } +  } + +  assert(Perms.size() == 1 || Perms.size() == 2); +  return Perms.size() == 2 +             ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1]) +             : Perms[0]; +} + +static void fixMasks(SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs, +                     unsigned ChainLength) { +  for (auto &[EntryVal, EntryMask] : Srcs) { +    EntryMask = EntryMask >> ((4 - ChainLength) * 8); +    auto ZeroMask = ChainLength == 2 ? 
0x0c0c0000 : 0x0c000000; +    EntryMask += ZeroMask; +  } +} + +static bool isMul(const SDValue Op) { +  auto Opcode = Op.getOpcode(); + +  return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 || +          Opcode == AMDGPUISD::MUL_I24); +} + +static std::optional<bool> +checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0, +                       ByteProvider<SDValue> &Src1, const SDValue &S0Op, +                       const SDValue &S1Op, const SelectionDAG &DAG) { +  // If both ops are i8s (pre legalize-dag), then the signedness semantics +  // of the dot4 are irrelevant. +  if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8) +    return false; + +  auto Known0 = DAG.computeKnownBits(S0Op, 0); +  bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0; +  bool S0IsSigned = Known0.countMinLeadingOnes() > 0; +  auto Known1 = DAG.computeKnownBits(S1Op, 0); +  bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0; +  bool S1IsSigned = Known1.countMinLeadingOnes() > 0; + +  assert(!(S0IsUnsigned && S0IsSigned)); +  assert(!(S1IsUnsigned && S1IsSigned)); + +  // There are 9 possible permutations of +  // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned} + +  // In two permutations, the sign bits are known to be the same for both Ops, +  // so simply return Signed / Unsigned corresponding to the MSB + +  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned)) +    return S0IsSigned; + +  // In another two permutations, the sign bits are known to be opposite. In +  // this case return std::nullopt to indicate a bad match. + +  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned)) +    return std::nullopt; + +  // In the remaining five permutations, we don't know the value of the sign +  // bit for at least one Op. Since we have a valid ByteProvider, we know that +  // the upper bits must be extension bits. Thus, the only ways for the sign +  // bit to be unknown are if it was sign extended from an unknown value, or +  // if it was any extended. In either case, it is correct to use the signed +  // version of the dot4 signedness semantics. + +  // In two such permutations, we know the sign bit is set for +  // one op, and the other is unknown. It is okay to use the signed version of +  // dot4. +  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) || +      ((S1IsSigned && !(S0IsSigned || S0IsUnsigned)))) +    return true; + +  // In one such permutation, we don't know either of the sign bits. It is okay +  // to use the signed version of dot4. +  if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned))) +    return true; + +  // In two such permutations, we know the sign bit is unset for +  // one op, and the other is unknown. Return std::nullopt to indicate a +  // bad match. 
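// To make the signedness cases above concrete, here is a rough scalar model
// (not part of the patch) of what the two dot4 intrinsics referenced by this
// combine compute: both multiply corresponding byte lanes and accumulate into
// a third operand, sdot4 treating the lanes as signed i8 and udot4 as
// unsigned. Mixing a provably negative multiplicand with a provably
// non-negative one matches neither form, which is why that case bails out.

#include <cassert>
#include <cstdint>

static int32_t sdot4(uint32_t A, uint32_t B, int32_t Acc) {
  for (int Lane = 0; Lane < 4; ++Lane)
    Acc += int8_t(A >> (8 * Lane)) * int8_t(B >> (8 * Lane));
  return Acc;
}

static uint32_t udot4(uint32_t A, uint32_t B, uint32_t Acc) {
  for (int Lane = 0; Lane < 4; ++Lane)
    Acc += uint8_t(A >> (8 * Lane)) * uint8_t(B >> (8 * Lane));
  return Acc;
}

int main() {
  // The same bit pattern gives very different results under the two
  // interpretations, so the combine must pick the variant that matches the
  // known sign of the original operands.
  assert(sdot4(0xffffffff, 0x01010101, 0) == -4);   // four lanes of (-1 * 1)
  assert(udot4(0xffffffff, 0x01010101, 0) == 1020); // four lanes of (255 * 1)
  return 0;
}

// The final check below handles the last of these cases (known-unsigned
// paired with unknown).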
+  if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) || +      ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned)))) +    return std::nullopt; + +  llvm_unreachable("Fully covered condition"); +} +  SDValue SITargetLowering::performAddCombine(SDNode *N,                                              DAGCombinerInfo &DCI) const {    SelectionDAG &DAG = DCI.DAG; @@ -12216,14 +13345,146 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,        if (SDValue Folded = tryFoldToMad64_32(N, DCI))          return Folded;      } - -    return SDValue();    }    if (SDValue V = reassociateScalarOps(N, DAG)) {      return V;    } +  if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() && +      (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) { +    SDValue TempNode(N, 0); +    std::optional<bool> IsSigned; +    SmallVector<std::pair<SDValue, unsigned>, 4> Src0s; +    SmallVector<std::pair<SDValue, unsigned>, 4> Src1s; +    SmallVector<SDValue, 4> Src2s; + +    // Match the v_dot4 tree, while collecting src nodes. +    int ChainLength = 0; +    for (int I = 0; I < 4; I++) { +      auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1; +      if (MulIdx == -1) +        break; +      auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0)); +      if (!Src0) +        break; +      auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1)); +      if (!Src1) +        break; + +      auto IterIsSigned = checkDot4MulSignedness( +          TempNode->getOperand(MulIdx), *Src0, *Src1, +          TempNode->getOperand(MulIdx)->getOperand(0), +          TempNode->getOperand(MulIdx)->getOperand(1), DAG); +      if (!IterIsSigned) +        break; +      if (!IsSigned) +        IsSigned = *IterIsSigned; +      if (*IterIsSigned != *IsSigned) +        break; +      placeSources(*Src0, *Src1, Src0s, Src1s, I); +      auto AddIdx = 1 - MulIdx; +      // Allow the special case where add (add (mul24, 0), mul24) became -> +      // add (mul24, mul24). +      if (I == 2 && isMul(TempNode->getOperand(AddIdx))) { +        Src2s.push_back(TempNode->getOperand(AddIdx)); +        auto Src0 = +            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0)); +        if (!Src0) +          break; +        auto Src1 = +            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1)); +        if (!Src1) +          break; +        auto IterIsSigned = checkDot4MulSignedness( +            TempNode->getOperand(AddIdx), *Src0, *Src1, +            TempNode->getOperand(AddIdx)->getOperand(0), +            TempNode->getOperand(AddIdx)->getOperand(1), DAG); +        if (!IterIsSigned) +          break; +        assert(IsSigned); +        if (*IterIsSigned != *IsSigned) +          break; +        placeSources(*Src0, *Src1, Src0s, Src1s, I + 1); +        Src2s.push_back(DAG.getConstant(0, SL, MVT::i32)); +        ChainLength = I + 2; +        break; +      } + +      TempNode = TempNode->getOperand(AddIdx); +      Src2s.push_back(TempNode); +      ChainLength = I + 1; +      if (TempNode->getNumOperands() < 2) +        break; +      LHS = TempNode->getOperand(0); +      RHS = TempNode->getOperand(1); +    } + +    if (ChainLength < 2) +      return SDValue(); + +    // Masks were constructed with assumption that we would find a chain of +    // length 4. If not, then we need to 0 out the MSB bits (via perm mask of +    // 0x0c) so they do not affect dot calculation. 
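// Worked example (illustrative only, not code from this patch) of the mask
// repair that fixMasks() performs when the matched chain is shorter than 4.
// The selectors were written into the high mask bytes first, so for a chain
// of length 2 only lanes 3 and 2 hold real byte offsets; the shift moves them
// into lanes 1 and 0, and the 0x0c fill marks the now-unused high lanes as
// constant zero so they contribute nothing to the dot product.
// fixMaskForChain2() is a hypothetical standalone restatement for that case.

#include <cassert>
#include <cstdint>

static uint32_t fixMaskForChain2(uint32_t Mask) {
  Mask >>= (4 - 2) * 8;     // shift the two real selectors into the low lanes
  return Mask + 0x0c0c0000; // fill the vacated high lanes with constant zero
}

int main() {
  // A two-element chain that selected byte 3 and byte 1 of its source.
  assert(fixMaskForChain2(0x03010c0c) == 0x0c0c0301);
  return 0;
}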
+    if (ChainLength < 4) { +      fixMasks(Src0s, ChainLength); +      fixMasks(Src1s, ChainLength); +    } + +    SDValue Src0, Src1; + +    // If we are just using a single source for both, and have permuted the +    // bytes consistently, we can just use the sources without permuting +    // (commutation). +    bool UseOriginalSrc = false; +    if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 && +        Src0s.begin()->second == Src1s.begin()->second && +        Src0s.begin()->first.getValueSizeInBits() == 32 && +        Src1s.begin()->first.getValueSizeInBits() == 32) { +      SmallVector<unsigned, 4> SrcBytes; +      auto Src0Mask = Src0s.begin()->second; +      SrcBytes.push_back(Src0Mask & 0xFF000000); +      bool UniqueEntries = true; +      for (auto I = 1; I < 4; I++) { +        auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8)); + +        if (is_contained(SrcBytes, NextByte)) { +          UniqueEntries = false; +          break; +        } +        SrcBytes.push_back(NextByte); +      } + +      if (UniqueEntries) { +        UseOriginalSrc = true; +        // Must be 32 bits to enter above conditional. +        assert(Src0s.begin()->first.getValueSizeInBits() == 32); +        assert(Src1s.begin()->first.getValueSizeInBits() == 32); +        Src0 = DAG.getBitcast(MVT::getIntegerVT(32), Src0s.begin()->first); +        Src1 = DAG.getBitcast(MVT::getIntegerVT(32), Src1s.begin()->first); +      } +    } + +    if (!UseOriginalSrc) { +      Src0 = resolveSources(DAG, SL, Src0s, false, true); +      Src1 = resolveSources(DAG, SL, Src1s, false, true); +    } + +    assert(IsSigned); +    SDValue Src2 = +        DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32); + +    SDValue IID = DAG.getTargetConstant(*IsSigned ? 
Intrinsic::amdgcn_sdot4 +                                                  : Intrinsic::amdgcn_udot4, +                                        SL, MVT::i64); + +    assert(!VT.isVector()); +    auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0, +                           Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1)); + +    return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT); +  } +    if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())      return SDValue(); @@ -12295,8 +13556,7 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,    if (LHS.getOpcode() == ISD::USUBO_CARRY) {      // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc -    auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); -    if (!C || !C->isZero()) +    if (!isNullConstant(LHS.getOperand(1)))        return SDValue();      SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };      return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args); @@ -12417,6 +13677,41 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N,    return SDValue();  } +SDValue SITargetLowering::performFDivCombine(SDNode *N, +                                             DAGCombinerInfo &DCI) const { +  SelectionDAG &DAG = DCI.DAG; +  SDLoc SL(N); +  EVT VT = N->getValueType(0); +  if (VT != MVT::f16 || !Subtarget->has16BitInsts()) +    return SDValue(); + +  SDValue LHS = N->getOperand(0); +  SDValue RHS = N->getOperand(1); + +  SDNodeFlags Flags = N->getFlags(); +  SDNodeFlags RHSFlags = RHS->getFlags(); +  if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() || +      !RHS->hasOneUse()) +    return SDValue(); + +  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { +    bool IsNegative = false; +    if (CLHS->isExactlyValue(1.0) || +        (IsNegative = CLHS->isExactlyValue(-1.0))) { +      // fdiv contract 1.0, (sqrt contract x) -> rsq for f16 +      // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16 +      if (RHS.getOpcode() == ISD::FSQRT) { +        // TODO: Or in RHS flags, somehow missing from SDNodeFlags +        SDValue Rsq = +            DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags); +        return IsNegative ? 
DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq; +      } +    } +  } + +  return SDValue(); +} +  SDValue SITargetLowering::performFMACombine(SDNode *N,                                              DAGCombinerInfo &DCI) const {    SelectionDAG &DAG = DCI.DAG; @@ -12666,7 +13961,7 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,  SDValue SITargetLowering::PerformDAGCombine(SDNode *N,                                              DAGCombinerInfo &DCI) const { -  if (getTargetMachine().getOptLevel() == CodeGenOpt::None) +  if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)      return SDValue();    switch (N->getOpcode()) {    case ISD::ADD: @@ -12680,12 +13975,16 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,      return performFAddCombine(N, DCI);    case ISD::FSUB:      return performFSubCombine(N, DCI); +  case ISD::FDIV: +    return performFDivCombine(N, DCI);    case ISD::SETCC:      return performSetCCCombine(N, DCI);    case ISD::FMAXNUM:    case ISD::FMINNUM:    case ISD::FMAXNUM_IEEE:    case ISD::FMINNUM_IEEE: +  case ISD::FMAXIMUM: +  case ISD::FMINIMUM:    case ISD::SMAX:    case ISD::SMIN:    case ISD::UMAX: @@ -12699,6 +13998,14 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,      return performAndCombine(N, DCI);    case ISD::OR:      return performOrCombine(N, DCI); +  case ISD::FSHR: { +    const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); +    if (N->getValueType(0) == MVT::i32 && N->isDivergent() && +        TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { +      return matchPERM(N, DCI); +    } +    break; +  }    case ISD::XOR:      return performXorCombine(N, DCI);    case ISD::ZERO_EXTEND: @@ -12793,7 +14100,7 @@ static unsigned SubIdx2Lane(unsigned Idx) {    }  } -/// Adjust the writemask of MIMG instructions +/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions  SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,                                            SelectionDAG &DAG) const {    unsigned Opcode = Node->getMachineOpcode(); @@ -12811,7 +14118,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,    unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;    unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;    bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) || -                  Node->getConstantOperandVal(LWEIdx)) +                  (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))                       ? 
true                       : false;    unsigned TFCLane = 0; @@ -12943,7 +14250,11 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,          continue;      } else {        SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); -      DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op); +      SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op); +      if (NewUser != User) { +        DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0)); +        DAG.RemoveDeadNode(User); +      }      }      switch (Idx) { @@ -13019,7 +14330,7 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();    unsigned Opcode = Node->getMachineOpcode(); -  if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && +  if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&        !TII->isGather4(Opcode) &&        AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {      return adjustWritemask(Node, DAG); @@ -13106,7 +14417,7 @@ void SITargetLowering::AddIMGInit(MachineInstr &MI) const {      return;    unsigned TFEVal = TFE ? TFE->getImm() : 0; -  unsigned LWEVal = LWE->getImm(); +  unsigned LWEVal = LWE ? LWE->getImm() : 0;    unsigned D16Val = D16 ? D16->getImm() : 0;    if (!TFEVal && !LWEVal) @@ -13183,7 +14494,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,                                                       SDNode *Node) const {    const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); -  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); +  MachineFunction *MF = MI.getParent()->getParent(); +  MachineRegisterInfo &MRI = MF->getRegInfo(); +  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();    if (TII->isVOP3(MI.getOpcode())) {      // Make sure constant bus requirements are respected. @@ -13194,11 +14507,16 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,      // use between vgpr and agpr as agpr tuples tend to be big.      if (!MI.getDesc().operands().empty()) {        unsigned Opc = MI.getOpcode(); +      bool HasAGPRs = Info->mayNeedAGPRs();        const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); -      for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), -                      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) { +      int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); +      for (auto I : +           {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), +            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {          if (I == -1)            break; +        if ((I == Src2Idx) && (HasAGPRs)) +          break;          MachineOperand &Op = MI.getOperand(I);          if (!Op.isReg() || !Op.getReg().isVirtual())            continue; @@ -13216,6 +14534,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,          MRI.setRegClass(Op.getReg(), NewRC);        } +      if (!HasAGPRs) +        return; +        // Resolve the rest of AV operands to AGPRs.        
if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {          if (Src2->isReg() && Src2->getReg().isVirtual()) { @@ -13233,7 +14554,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,      return;    } -  if (TII->isMIMG(MI)) { +  if (TII->isImage(MI)) {      if (!MI.mayStore())        AddIMGInit(MI);      TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr); @@ -13377,7 +14698,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,        return std::pair(0U, RC);    } -  if (Constraint.startswith("{") && Constraint.endswith("}")) { +  if (Constraint.starts_with("{") && Constraint.ends_with("}")) {      StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);      if (RegName.consume_front("v")) {        RC = &AMDGPU::VGPR_32RegClass; @@ -13467,7 +14788,7 @@ static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {  }  void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op, -                                                    std::string &Constraint, +                                                    StringRef Constraint,                                                      std::vector<SDValue> &Ops,                                                      SelectionDAG &DAG) const {    if (isImmConstraint(Constraint)) { @@ -13516,8 +14837,7 @@ bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {    return false;  } -bool SITargetLowering::checkAsmConstraintVal(SDValue Op, -                                             const std::string &Constraint, +bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,                                               uint64_t Val) const {    if (Constraint.size() == 1) {      switch (Constraint[0]) { @@ -13735,8 +15055,9 @@ void SITargetLowering::computeKnownBitsForTargetInstr(      const MachineRegisterInfo &MRI, unsigned Depth) const {    const MachineInstr *MI = MRI.getVRegDef(R);    switch (MI->getOpcode()) { -  case AMDGPU::G_INTRINSIC: { -    switch (MI->getIntrinsicID()) { +  case AMDGPU::G_INTRINSIC: +  case AMDGPU::G_INTRINSIC_CONVERGENT: { +    switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {      case Intrinsic::amdgcn_workitem_id_x:        knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);        break; @@ -13801,21 +15122,16 @@ Align SITargetLowering::computeKnownAlignForTargetInstr(    GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,    unsigned Depth) const {    const MachineInstr *MI = MRI.getVRegDef(R); -  switch (MI->getOpcode()) { -  case AMDGPU::G_INTRINSIC: -  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { +  if (auto *GI = dyn_cast<GIntrinsic>(MI)) {      // FIXME: Can this move to generic code? What about the case where the call      // site specifies a lower alignment? -    Intrinsic::ID IID = MI->getIntrinsicID(); +    Intrinsic::ID IID = GI->getIntrinsicID();      LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();      AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);      if (MaybeAlign RetAlign = Attrs.getRetAlignment())        return *RetAlign; -    return Align(1); -  } -  default: -    return Align(1);    } +  return Align(1);  }  Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {  | 
