Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 484
1 file changed, 279 insertions, 205 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index bd577a6fb8c5..323aaaf70cd4 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -18,6 +18,7 @@
 #include "AMDGPURegisterBankInfo.h"
 #include "AMDGPUTargetMachine.h"
 #include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
@@ -59,11 +60,13 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector(
 
 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
 
-void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
-                                        CodeGenCoverage &CoverageInfo) {
+void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
+                                        CodeGenCoverage &CoverageInfo,
+                                        ProfileSummaryInfo *PSI,
+                                        BlockFrequencyInfo *BFI) {
   MRI = &MF.getRegInfo();
   Subtarget = &MF.getSubtarget<GCNSubtarget>();
-  InstructionSelector::setupMF(MF, KB, CoverageInfo);
+  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
 }
 
 bool AMDGPUInstructionSelector::isVCC(Register Reg,
@@ -136,20 +139,29 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
       const TargetRegisterClass *SrcRC
         = TRI.getConstrainedRegClassForOperand(Src, *MRI);
 
-      Register MaskedReg = MRI->createVirtualRegister(SrcRC);
+      Optional<ValueAndVReg> ConstVal =
+          getConstantVRegValWithLookThrough(SrcReg, *MRI, true, true);
+      if (ConstVal) {
+        unsigned MovOpc =
+            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
+        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
+            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
+      } else {
+        Register MaskedReg = MRI->createVirtualRegister(SrcRC);
 
-      // We can't trust the high bits at this point, so clear them.
+        // We can't trust the high bits at this point, so clear them.
 
-      // TODO: Skip masking high bits if def is known boolean.
+        // TODO: Skip masking high bits if def is known boolean.
 
-      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
-        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
-      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
-        .addImm(1)
-        .addReg(SrcReg);
-      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
-        .addImm(0)
-        .addReg(MaskedReg);
+        unsigned AndOpc =
+            TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
+        BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
+            .addImm(1)
+            .addReg(SrcReg);
+        BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
+            .addImm(0)
+            .addReg(MaskedReg);
+      }
 
       if (!MRI->getRegClassOrNull(SrcReg))
         MRI->setRegClass(SrcReg, SrcRC);
@@ -578,7 +590,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
     return true;
 
   const LLT S32 = LLT::scalar(32);
-  const LLT V2S16 = LLT::vector(2, 16);
+  const LLT V2S16 = LLT::fixed_vector(2, 16);
 
   Register Dst = MI.getOperand(0).getReg();
   if (MRI->getType(Dst) != V2S16)
@@ -743,6 +755,30 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
   return true;
 }
 
+bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  Register OffsetReg = MI.getOperand(2).getReg();
+  Register WidthReg = MI.getOperand(3).getReg();
+
+  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
+         "scalar BFX instructions are expanded in regbankselect");
+  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
+         "64-bit vector BFX instructions are expanded in regbankselect");
+
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock *MBB = MI.getParent();
+
+  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
+  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
+  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
+                 .addReg(SrcReg)
+                 .addReg(OffsetReg)
+                 .addReg(WidthReg);
+  MI.eraseFromParent();
+  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
   if (STI.getLDSBankCount() != 16)
     return selectImpl(MI, *CoverageInfo);
@@ -916,8 +952,11 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
     return constrainCopyLikeIntrin(I, AMDGPU::WQM);
   case Intrinsic::amdgcn_softwqm:
     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
+  case Intrinsic::amdgcn_strict_wwm:
   case Intrinsic::amdgcn_wwm:
-    return constrainCopyLikeIntrin(I, AMDGPU::WWM);
+    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
+  case Intrinsic::amdgcn_strict_wqm:
+    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
   case Intrinsic::amdgcn_writelane:
     return selectWritelane(I);
   case Intrinsic::amdgcn_div_scale:
@@ -1375,7 +1414,24 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
 
   if (HasVSrc) {
     Register VSrc = MI.getOperand(1).getReg();
-    MIB.addReg(VSrc);
+
+    if (STI.needsAlignedVGPRs()) {
+      // Add implicit aligned super-reg to force alignment on the data operand.
+      Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+      Register NewVR =
+          MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
+      BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR)
+          .addReg(VSrc, 0, MI.getOperand(1).getSubReg())
+          .addImm(AMDGPU::sub0)
+          .addReg(Undef)
+          .addImm(AMDGPU::sub1);
+      MIB.addReg(NewVR, 0, AMDGPU::sub0);
+      MIB.addReg(NewVR, RegState::Implicit);
+    } else {
+      MIB.addReg(VSrc);
+    }
+
     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
       return false;
   }
@@ -1446,24 +1502,6 @@ static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
   return TexFailCtrl == 0;
 }
 
-static bool parseCachePolicy(uint64_t Value,
-                             bool *GLC, bool *SLC, bool *DLC) {
-  if (GLC) {
-    *GLC = (Value & 0x1) ? 1 : 0;
-    Value &= ~(uint64_t)0x1;
-  }
-  if (SLC) {
-    *SLC = (Value & 0x2) ? 1 : 0;
-    Value &= ~(uint64_t)0x2;
-  }
-  if (DLC) {
-    *DLC = (Value & 0x4) ? 1 : 0;
-    Value &= ~(uint64_t)0x4;
-  }
-
-  return Value == 0;
-}
-
 bool AMDGPUInstructionSelector::selectImageIntrinsic(
   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
   MachineBasicBlock *MBB = MI.getParent();
@@ -1504,8 +1542,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   const bool IsA16 = (Flags & 1) != 0;
   const bool IsG16 = (Flags & 2) != 0;
 
-  // A16 implies 16 bit gradients
-  if (IsA16 && !IsG16)
+  // A16 implies 16 bit gradients if subtarget doesn't support G16
+  if (IsA16 && !STI.hasG16() && !IsG16)
     return false;
 
   unsigned DMask = 0;
@@ -1589,21 +1627,11 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   // TODO: Check this in verifier.
   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
 
-  bool GLC = false;
-  bool SLC = false;
-  bool DLC = false;
-  if (BaseOpcode->Atomic) {
-    GLC = true; // TODO no-return optimization
-    if (!parseCachePolicy(
-            MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), nullptr,
-            &SLC, IsGFX10Plus ? &DLC : nullptr))
-      return false;
-  } else {
-    if (!parseCachePolicy(
-            MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), &GLC,
-            &SLC, IsGFX10Plus ? &DLC : nullptr))
-      return false;
-  }
+  unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
+  if (BaseOpcode->Atomic)
+    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+  if (CPol & ~AMDGPU::CPol::ALL)
+    return false;
 
   int NumVAddrRegs = 0;
   int NumVAddrDwords = 0;
@@ -1661,8 +1689,10 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
       unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
 
       MIB.addDef(TmpReg);
-      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
-        .addReg(TmpReg, RegState::Kill, SubReg);
+      if (!MRI->use_empty(VDataOut)) {
+        BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
+            .addReg(TmpReg, RegState::Kill, SubReg);
+      }
     } else {
       MIB.addDef(VDataOut); // vdata output
@@ -1689,11 +1719,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   if (IsGFX10Plus)
     MIB.addImm(DimInfo->Encoding);
   MIB.addImm(Unorm);
-  if (IsGFX10Plus)
-    MIB.addImm(DLC);
 
-  MIB.addImm(GLC);
-  MIB.addImm(SLC);
+  MIB.addImm(CPol);
   MIB.addImm(IsA16 &&  // a16 or r128
              STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
   if (IsGFX10Plus)
@@ -1706,6 +1733,38 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   if (BaseOpcode->HasD16)
     MIB.addImm(IsD16 ? -1 : 0);
 
+  if (IsTexFail) {
+    // An image load instruction with TFE/LWE only conditionally writes to its
+    // result registers. Initialize them to zero so that we always get well
+    // defined result values.
+    assert(VDataOut && !VDataIn);
+    Register Tied = MRI->cloneVirtualRegister(VDataOut);
+    Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
+      .addImm(0);
+    auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
+    if (STI.usePRTStrictNull()) {
+      // With enable-prt-strict-null enabled, initialize all result registers to
+      // zero.
+      auto RegSeq =
+          BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
+      for (auto Sub : Parts)
+        RegSeq.addReg(Zero).addImm(Sub);
+    } else {
+      // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
+      // result register.
+      Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+      auto RegSeq =
+          BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
+      for (auto Sub : Parts.drop_back(1))
+        RegSeq.addReg(Undef).addImm(Sub);
+      RegSeq.addReg(Zero).addImm(Parts.back());
+    }
+    MIB.addReg(Tied, RegState::Implicit);
+    MIB->tieOperands(0, MIB->getNumOperands() - 1);
+  }
+
   MI.eraseFromParent();
   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
 }
@@ -1733,7 +1792,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
   case Intrinsic::amdgcn_s_barrier:
     return selectSBarrier(I);
   case Intrinsic::amdgcn_global_atomic_fadd:
-    return selectGlobalAtomicFaddIntrinsic(I);
+    return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
   default: {
     return selectImpl(I, *CoverageInfo);
   }
@@ -1848,7 +1907,7 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
     return false;
   }
 
-  if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
+  if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
     MachineBasicBlock *MBB = I.getParent();
     const DebugLoc &DL = I.getDebugLoc();
 
@@ -2336,6 +2395,13 @@ void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
 
 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
   MachineInstr &I) const {
+  if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) {
+    const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
+    unsigned AS = PtrTy.getAddressSpace();
+    if (AS == AMDGPUAS::GLOBAL_ADDRESS)
+      return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2));
+  }
+
   initM0(I);
   return selectImpl(I, *CoverageInfo);
 }
@@ -2386,8 +2452,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
     MIB.addImm(0);
 
   MIB.addImm(Offset);
-  MIB.addImm(1); // glc
-  MIB.addImm(0); // slc
+  MIB.addImm(AMDGPU::CPol::GLC);
   MIB.cloneMemRefs(MI);
 
   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
@@ -2772,7 +2837,7 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
   Register Src1Reg = MI.getOperand(2).getReg();
   ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
 
-  const LLT V2S16 = LLT::vector(2, 16);
+  const LLT V2S16 = LLT::fixed_vector(2, 16);
   if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
     return false;
 
@@ -2895,6 +2960,8 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
 
 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
   MachineInstr &MI) const {
+  if (STI.hasGFX90AInsts())
+    return selectImpl(MI, *CoverageInfo);
   MachineBasicBlock *MBB = MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
 
@@ -2951,7 +3018,7 @@ bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
 
   if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
       Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
-    Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+    Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
     BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
       .addReg(VIndex.getReg())
       .addImm(AMDGPU::sub0)
@@ -2968,7 +3035,7 @@ bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
   I.add(MI.getOperand(2)); // rsrc
   I.add(SOffset);
   I.addImm(Offset);
-  renderExtractSLC(I, MI, 7);
+  I.addImm(MI.getOperand(7).getImm()); // cpol
   I.cloneMemRefs(MI);
 
   MI.eraseFromParent();
@@ -2976,8 +3043,14 @@
   return true;
 }
 
-bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
-  MachineInstr &MI) const{
+bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
+  MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const {
+
+  if (STI.hasGFX90AInsts()) {
+    // gfx90a adds return versions of the global atomic fadd instructions so no
+    // special handling is required.
+    return selectImpl(MI, *CoverageInfo);
+  }
   MachineBasicBlock *MBB = MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
 
@@ -2994,16 +3067,16 @@ bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
   // FIXME: This is only needed because tablegen requires number of dst operands
   // in match and replace pattern to be the same. Otherwise patterns can be
   // exported from SDag path.
-  auto Addr = selectFlatOffsetImpl<true>(MI.getOperand(2));
+  auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal);
 
-  Register Data = MI.getOperand(3).getReg();
+  Register Data = DataOp.getReg();
   const unsigned Opc = MRI->getType(Data).isVector() ?
     AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
     .addReg(Addr.first)
    .addReg(Data)
    .addImm(Addr.second)
-    .addImm(0) // SLC
+    .addImm(0) // cpol
     .cloneMemRefs(MI);
 
   MI.eraseFromParent();
@@ -3140,6 +3213,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
     return selectBVHIntrinsic(I);
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
     return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
+  case AMDGPU::G_SBFX:
+  case AMDGPU::G_UBFX:
+    return selectG_SBFX_UBFX(I);
   default:
     return selectImpl(I, *CoverageInfo);
   }
@@ -3282,7 +3358,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(
   if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
       // It's possible to see an f32 fneg here, but unlikely.
      // TODO: Treat f32 fneg as only high bit.
-      MRI.getType(Src) == LLT::vector(2, 16)) {
+      MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
     Src = MI->getOperand(1).getReg();
     MI = MRI.getVRegDef(Src);
@@ -3408,9 +3484,9 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
   }};
 }
 
-template <bool Signed>
 std::pair<Register, int>
-AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
+AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
+                                                uint64_t FlatVariant) const {
   MachineInstr *MI = Root.getParent();
 
   auto Default = std::make_pair(Root.getReg(), 0);
@@ -3426,7 +3502,7 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
     return Default;
 
   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
-  if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, Signed))
+  if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
    return Default;
 
   return std::make_pair(PtrBase, ConstOffset);
@@ -3434,7 +3510,7 @@
 
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
-  auto PtrWithOffset = selectFlatOffsetImpl<false>(Root);
+  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
 
   return {{
       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
@@ -3443,8 +3519,18 @@ AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
 }
 
 InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
-  auto PtrWithOffset = selectFlatOffsetImpl<true>(Root);
+AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
+  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
+  }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
+  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
 
   return {{
       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
@@ -3483,39 +3569,56 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
 
   if (ConstOffset != 0) {
-    if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) {
+    if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
+                              SIInstrFlags::FlatGlobal)) {
       Addr = PtrBase;
       ImmOffset = ConstOffset;
-    } else if (ConstOffset > 0) {
+    } else {
       auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
       if (!PtrBaseDef)
        return None;
 
       if (isSGPR(PtrBaseDef->Reg)) {
-        // Offset is too large.
-        //
-        // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset)
-        //                         + (large_offset & MaxOffset);
-        int64_t SplitImmOffset, RemainderOffset;
-        std::tie(SplitImmOffset, RemainderOffset)
-          = TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true);
-
-        if (isUInt<32>(RemainderOffset)) {
-          MachineInstr *MI = Root.getParent();
-          MachineBasicBlock *MBB = MI->getParent();
-          Register HighBits
-            = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
-          BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
-                  HighBits)
-            .addImm(RemainderOffset);
-
-          return {{
-            [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); },  // saddr
-            [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset
-            [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
-          }};
+        if (ConstOffset > 0) {
+          // Offset is too large.
+          //
+          // saddr + large_offset -> saddr +
+          //                         (voffset = large_offset & ~MaxOffset) +
+          //                         (large_offset & MaxOffset);
+          int64_t SplitImmOffset, RemainderOffset;
+          std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
+              ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+
+          if (isUInt<32>(RemainderOffset)) {
+            MachineInstr *MI = Root.getParent();
+            MachineBasicBlock *MBB = MI->getParent();
+            Register HighBits =
+                MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+            BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
+                    HighBits)
+                .addImm(RemainderOffset);
+
+            return {{
+                [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
+                [=](MachineInstrBuilder &MIB) {
+                  MIB.addReg(HighBits);
+                }, // voffset
+                [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
+            }};
+          }
        }
+
+        // We are adding a 64 bit SGPR and a constant. If constant bus limit
+        // is 1 we would need to perform 1 or 2 extra moves for each half of
+        // the constant and it is better to do a scalar add and then issue a
+        // single VALU instruction to materialize zero. Otherwise it is less
+        // instructions to perform VALU adds with immediates or inline literals.
+        unsigned NumLiterals =
+            !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
+            !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
+        if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
+          return None;
       }
     }
   }
@@ -3525,57 +3628,50 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
     return None;
 
   // Match the variable offset.
-  if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) {
-    // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
-    // drop this.
-    if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
-        AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT)
-      return None;
-
-    // It's cheaper to materialize a single 32-bit zero for vaddr than the two
-    // moves required to copy a 64-bit SGPR to VGPR.
-    const Register SAddr = AddrDef->Reg;
-    if (!isSGPR(SAddr))
-      return None;
-
-    MachineInstr *MI = Root.getParent();
-    MachineBasicBlock *MBB = MI->getParent();
-    Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
-    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
-            VOffset)
-      .addImm(0);
-
-    return {{
-        [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); },    // saddr
-        [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },  // voffset
-        [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
-    }};
+  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
+    // Look through the SGPR->VGPR copy.
+    Register SAddr =
+        getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
+
+    if (SAddr && isSGPR(SAddr)) {
+      Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
+
+      // It's possible voffset is an SGPR here, but the copy to VGPR will be
+      // inserted later.
+      if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+        return {{[=](MachineInstrBuilder &MIB) { // saddr
+                   MIB.addReg(SAddr);
+                 },
+                 [=](MachineInstrBuilder &MIB) { // voffset
+                   MIB.addReg(VOffset);
+                 },
+                 [=](MachineInstrBuilder &MIB) { // offset
+                   MIB.addImm(ImmOffset);
+                 }}};
+      }
+    }
   }
 
-  // Look through the SGPR->VGPR copy.
-  Register SAddr =
-    getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
-  if (!SAddr || !isSGPR(SAddr))
+  // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
+  // drop this.
+  if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
+      AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
    return None;
 
-  Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
+  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
+  // moves required to copy a 64-bit SGPR to VGPR.
+  MachineInstr *MI = Root.getParent();
+  MachineBasicBlock *MBB = MI->getParent();
+  Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
 
-  // It's possible voffset is an SGPR here, but the copy to VGPR will be
-  // inserted later.
-  Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset);
-  if (!VOffset)
-    return None;
+  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
+      .addImm(0);
 
-  return {{[=](MachineInstrBuilder &MIB) { // saddr
-             MIB.addReg(SAddr);
-           },
-           [=](MachineInstrBuilder &MIB) { // voffset
-             MIB.addReg(VOffset);
-           },
-           [=](MachineInstrBuilder &MIB) { // offset
-             MIB.addImm(ImmOffset);
-           }}};
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },      // voffset
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }     // offset
+  }};
 }
 
 InstructionSelector::ComplexRendererFns
@@ -3590,7 +3686,8 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
 
   if (ConstOffset != 0 &&
-      TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+      TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
+                            SIInstrFlags::FlatScratch)) {
     Addr = PtrBase;
     ImmOffset = ConstOffset;
   }
@@ -3624,9 +3721,9 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
       const DebugLoc &DL = I.getDebugLoc();
       SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
 
-      BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), SAddr)
-        .addFrameIndex(FI)
-        .addReg(RHSDef->Reg);
+      BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
+          .addFrameIndex(FI)
+          .addReg(RHSDef->Reg);
     }
   }
 
@@ -3639,11 +3736,6 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
   }};
 }
 
-static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
-  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
-  return PSV && PSV->isStack();
-}
-
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
   MachineInstr *MI = Root.getParent();
@@ -3685,23 +3777,19 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
   Optional<int> FI;
   Register VAddr = Root.getReg();
   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
-    if (isBaseWithConstantOffset(Root, *MRI)) {
-      const MachineOperand &LHS = RootDef->getOperand(1);
-      const MachineOperand &RHS = RootDef->getOperand(2);
-      const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
-      const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
-      if (LHSDef && RHSDef) {
-        int64_t PossibleOffset =
-            RHSDef->getOperand(1).getCImm()->getSExtValue();
-        if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
-            (!STI.privateMemoryResourceIsRangeChecked() ||
-             KnownBits->signBitIsZero(LHS.getReg()))) {
-          if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
-            FI = LHSDef->getOperand(1).getIndex();
-          else
-            VAddr = LHS.getReg();
-          Offset = PossibleOffset;
-        }
+    Register PtrBase;
+    int64_t ConstOffset;
+    std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
+    if (ConstOffset != 0) {
+      if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
+          (!STI.privateMemoryResourceIsRangeChecked() ||
+           KnownBits->signBitIsZero(PtrBase))) {
+        const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
+        if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
+          FI = PtrBaseDef->getOperand(1).getIndex();
+        else
+          VAddr = PtrBase;
+        Offset = ConstOffset;
      }
     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
       FI = RootDef->getOperand(1).getIndex();
@@ -3769,18 +3857,13 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset(
   const MachineFunction *MF = MBB->getParent();
   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
-  const MachineMemOperand *MMO = *MI->memoperands_begin();
-  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
 
   return {{
       [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(Info->getScratchRSrcReg());
      },
      [=](MachineInstrBuilder &MIB) { // soffset
-        if (isStackPtrRelative(PtrInfo))
-          MIB.addReg(Info->getStackPtrOffsetReg());
-        else
-          MIB.addImm(0);
+        MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
   }};
 }
@@ -4130,10 +4213,8 @@ AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
-      addZeroImm, //  glc
-      addZeroImm, //  slc
+      addZeroImm, //  cpol
      addZeroImm, //  tfe
-      addZeroImm, //  dlc
      addZeroImm  //  swz
    }};
 }
@@ -4158,11 +4239,9 @@ AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
-      addZeroImm, //  glc
-      addZeroImm, //  slc
+      addZeroImm, //  cpol
      addZeroImm, //  tfe
-      addZeroImm, //  dlc
-      addZeroImm  //  swz
+      addZeroImm, //  swz
    }};
 }
 
@@ -4194,7 +4273,9 @@ AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
-      addZeroImm //  slc
+      [=](MachineInstrBuilder &MIB) {
+        MIB.addImm(AMDGPU::CPol::GLC); // cpol
+      }
    }};
 }
 
@@ -4218,7 +4299,7 @@ AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
-      addZeroImm //  slc
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol
    }};
 }
 
@@ -4308,32 +4389,25 @@ void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
   MIB.addImm(MI.getOperand(OpIdx).getImm());
 }
 
-void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
-                                                 const MachineInstr &MI,
-                                                 int OpIdx) const {
+void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
+                                                  const MachineInstr &MI,
+                                                  int OpIdx) const {
   assert(OpIdx >= 0 && "expected to match an immediate operand");
-  MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
+  MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL);
 }
 
-void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
-                                                 const MachineInstr &MI,
-                                                 int OpIdx) const {
-  assert(OpIdx >= 0 && "expected to match an immediate operand");
-  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
-}
-
-void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
+void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
   assert(OpIdx >= 0 && "expected to match an immediate operand");
-  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
+  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
 }
 
-void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
-                                                 const MachineInstr &MI,
-                                                 int OpIdx) const {
+void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
+                                             const MachineInstr &MI,
+                                             int OpIdx) const {
   assert(OpIdx >= 0 && "expected to match an immediate operand");
-  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
+  MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
 }
 
 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
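
Note (not part of the diff): a recurring change across the hunks above is that the separate glc/slc/dlc immediates, previously unpacked by the now-deleted parseCachePolicy() helper, are collapsed into a single cpol bitfield operand. The standalone C++ sketch below is not LLVM code; it uses locally defined constants that mirror the 0x1/0x2/0x4 bits visible in the removed helper, only to illustrate how the two representations relate.

// Illustrative sketch only (not LLVM code). The bit values mirror the removed
// parseCachePolicy() helper above: 0x1 = glc, 0x2 = slc, 0x4 = dlc.
#include <cassert>
#include <cstdint>

namespace sketch {
constexpr uint64_t GLC = 0x1;
constexpr uint64_t SLC = 0x2;
constexpr uint64_t DLC = 0x4;
constexpr uint64_t ALL = GLC | SLC | DLC;

// Old style: unpack one immediate into three flags and reject unknown bits,
// roughly what the deleted parseCachePolicy() did.
bool unpackCachePolicy(uint64_t Value, bool &Glc, bool &Slc, bool &Dlc) {
  Glc = (Value & GLC) != 0;
  Slc = (Value & SLC) != 0;
  Dlc = (Value & DLC) != 0;
  return (Value & ~ALL) == 0;
}

// New style: keep the bits together as one "cpol" word and validate it in
// place, as the selectImageIntrinsic() hunk now does (atomics force glc).
bool isValidCPol(uint64_t CPol, bool IsAtomic) {
  if (IsAtomic)
    CPol |= GLC;
  return (CPol & ~ALL) == 0;
}
} // namespace sketch

int main() {
  bool Glc = false, Slc = false, Dlc = false;
  assert(sketch::unpackCachePolicy(sketch::GLC | sketch::SLC, Glc, Slc, Dlc));
  assert(Glc && Slc && !Dlc);
  assert(sketch::isValidCPol(sketch::SLC, /*IsAtomic=*/true));
  return 0;
}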
