Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp  484
1 file changed, 279 insertions(+), 205 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index bd577a6fb8c5..323aaaf70cd4 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -18,6 +18,7 @@
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
@@ -59,11 +60,13 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector(
const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
-void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
- CodeGenCoverage &CoverageInfo) {
+void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
+ CodeGenCoverage &CoverageInfo,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) {
MRI = &MF.getRegInfo();
Subtarget = &MF.getSubtarget<GCNSubtarget>();
- InstructionSelector::setupMF(MF, KB, CoverageInfo);
+ InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}
bool AMDGPUInstructionSelector::isVCC(Register Reg,
@@ -136,20 +139,29 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
const TargetRegisterClass *SrcRC
= TRI.getConstrainedRegClassForOperand(Src, *MRI);
- Register MaskedReg = MRI->createVirtualRegister(SrcRC);
+ Optional<ValueAndVReg> ConstVal =
+ getConstantVRegValWithLookThrough(SrcReg, *MRI, true, true);
+ if (ConstVal) {
+ unsigned MovOpc =
+ STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
+ BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
+ .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
+ } else {
+ Register MaskedReg = MRI->createVirtualRegister(SrcRC);
- // We can't trust the high bits at this point, so clear them.
+ // We can't trust the high bits at this point, so clear them.
- // TODO: Skip masking high bits if def is known boolean.
+ // TODO: Skip masking high bits if def is known boolean.
- unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
- AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
- BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
- .addImm(1)
- .addReg(SrcReg);
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
- .addImm(0)
- .addReg(MaskedReg);
+ unsigned AndOpc =
+ TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
+ BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
+ .addImm(1)
+ .addReg(SrcReg);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
+ .addImm(0)
+ .addReg(MaskedReg);
+ }
if (!MRI->getRegClassOrNull(SrcReg))
MRI->setRegClass(SrcReg, SrcRC);
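
Side note (not part of the patch): the ConstVal branch added above folds a copy from a constant boolean into a single S_MOV of a wave-wide lane mask. A minimal sketch of the immediate it selects, assuming the usual convention that a true divergent i1 is an all-ones mask:

#include <cstdint>

// Sketch only: immediate used when a boolean copy source is a known constant.
// -1 sign-extends to an all-ones 32- or 64-bit exec mask (all lanes true).
inline int64_t boolToLaneMaskImm(bool Val) {
  return Val ? -1 : 0;
}
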
@@ -578,7 +590,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
return true;
const LLT S32 = LLT::scalar(32);
- const LLT V2S16 = LLT::vector(2, 16);
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
Register Dst = MI.getOperand(0).getReg();
if (MRI->getType(Dst) != V2S16)
@@ -743,6 +755,30 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
return true;
}
+bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register OffsetReg = MI.getOperand(2).getReg();
+ Register WidthReg = MI.getOperand(3).getReg();
+
+ assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
+ "scalar BFX instructions are expanded in regbankselect");
+ assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
+ "64-bit vector BFX instructions are expanded in regbankselect");
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
+ unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
+ .addReg(SrcReg)
+ .addReg(OffsetReg)
+ .addReg(WidthReg);
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
if (STI.getLDSBankCount() != 16)
return selectImpl(MI, *CoverageInfo);
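
Side note (not part of the patch): selectG_SBFX_UBFX, added in the hunk above, lowers the generic G_SBFX/G_UBFX opcodes straight to V_BFE_I32/V_BFE_U32. A sketch of the bitfield-extract semantics being selected, assuming Offset + Width <= 32 and Width >= 1 (the hardware clamps out-of-range fields):

#include <cstdint>

// ubfx: extract Width bits of Src starting at bit Offset, zero-extended.
inline uint32_t ubfx(uint32_t Src, unsigned Offset, unsigned Width) {
  uint32_t Mask = Width < 32 ? (1u << Width) - 1u : ~0u;
  return (Src >> Offset) & Mask;
}

// sbfx: same field, but sign-extended from bit Width - 1.
inline int32_t sbfx(uint32_t Src, unsigned Offset, unsigned Width) {
  uint32_t Field = ubfx(Src, Offset, Width);
  uint32_t Sign = 1u << (Width - 1);
  return (int32_t)((Field ^ Sign) - Sign);
}
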
@@ -916,8 +952,11 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return constrainCopyLikeIntrin(I, AMDGPU::WQM);
case Intrinsic::amdgcn_softwqm:
return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
+ case Intrinsic::amdgcn_strict_wwm:
case Intrinsic::amdgcn_wwm:
- return constrainCopyLikeIntrin(I, AMDGPU::WWM);
+ return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
+ case Intrinsic::amdgcn_strict_wqm:
+ return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
case Intrinsic::amdgcn_writelane:
return selectWritelane(I);
case Intrinsic::amdgcn_div_scale:
@@ -1375,7 +1414,24 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
if (HasVSrc) {
Register VSrc = MI.getOperand(1).getReg();
- MIB.addReg(VSrc);
+
+ if (STI.needsAlignedVGPRs()) {
+ // Add implicit aligned super-reg to force alignment on the data operand.
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+ Register NewVR =
+ MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
+ BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR)
+ .addReg(VSrc, 0, MI.getOperand(1).getSubReg())
+ .addImm(AMDGPU::sub0)
+ .addReg(Undef)
+ .addImm(AMDGPU::sub1);
+ MIB.addReg(NewVR, 0, AMDGPU::sub0);
+ MIB.addReg(NewVR, RegState::Implicit);
+ } else {
+ MIB.addReg(VSrc);
+ }
+
if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
return false;
}
@@ -1446,24 +1502,6 @@ static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
return TexFailCtrl == 0;
}
-static bool parseCachePolicy(uint64_t Value,
- bool *GLC, bool *SLC, bool *DLC) {
- if (GLC) {
- *GLC = (Value & 0x1) ? 1 : 0;
- Value &= ~(uint64_t)0x1;
- }
- if (SLC) {
- *SLC = (Value & 0x2) ? 1 : 0;
- Value &= ~(uint64_t)0x2;
- }
- if (DLC) {
- *DLC = (Value & 0x4) ? 1 : 0;
- Value &= ~(uint64_t)0x4;
- }
-
- return Value == 0;
-}
-
bool AMDGPUInstructionSelector::selectImageIntrinsic(
MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
MachineBasicBlock *MBB = MI.getParent();
@@ -1504,8 +1542,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
const bool IsA16 = (Flags & 1) != 0;
const bool IsG16 = (Flags & 2) != 0;
- // A16 implies 16 bit gradients
- if (IsA16 && !IsG16)
+ // A16 implies 16 bit gradients if subtarget doesn't support G16
+ if (IsA16 && !STI.hasG16() && !IsG16)
return false;
unsigned DMask = 0;
@@ -1589,21 +1627,11 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
// TODO: Check this in verifier.
assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
- bool GLC = false;
- bool SLC = false;
- bool DLC = false;
- if (BaseOpcode->Atomic) {
- GLC = true; // TODO no-return optimization
- if (!parseCachePolicy(
- MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), nullptr,
- &SLC, IsGFX10Plus ? &DLC : nullptr))
- return false;
- } else {
- if (!parseCachePolicy(
- MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), &GLC,
- &SLC, IsGFX10Plus ? &DLC : nullptr))
- return false;
- }
+ unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
+ if (BaseOpcode->Atomic)
+ CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+ if (CPol & ~AMDGPU::CPol::ALL)
+ return false;
int NumVAddrRegs = 0;
int NumVAddrDwords = 0;
@@ -1661,8 +1689,10 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
MIB.addDef(TmpReg);
- BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
- .addReg(TmpReg, RegState::Kill, SubReg);
+ if (!MRI->use_empty(VDataOut)) {
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
+ .addReg(TmpReg, RegState::Kill, SubReg);
+ }
} else {
MIB.addDef(VDataOut); // vdata output
@@ -1689,11 +1719,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
if (IsGFX10Plus)
MIB.addImm(DimInfo->Encoding);
MIB.addImm(Unorm);
- if (IsGFX10Plus)
- MIB.addImm(DLC);
- MIB.addImm(GLC);
- MIB.addImm(SLC);
+ MIB.addImm(CPol);
MIB.addImm(IsA16 && // a16 or r128
STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
if (IsGFX10Plus)
@@ -1706,6 +1733,38 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
if (BaseOpcode->HasD16)
MIB.addImm(IsD16 ? -1 : 0);
+ if (IsTexFail) {
+ // An image load instruction with TFE/LWE only conditionally writes to its
+ // result registers. Initialize them to zero so that we always get well
+ // defined result values.
+ assert(VDataOut && !VDataIn);
+ Register Tied = MRI->cloneVirtualRegister(VDataOut);
+ Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
+ .addImm(0);
+ auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
+ if (STI.usePRTStrictNull()) {
+ // With enable-prt-strict-null enabled, initialize all result registers to
+ // zero.
+ auto RegSeq =
+ BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
+ for (auto Sub : Parts)
+ RegSeq.addReg(Zero).addImm(Sub);
+ } else {
+ // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
+ // result register.
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+ auto RegSeq =
+ BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
+ for (auto Sub : Parts.drop_back(1))
+ RegSeq.addReg(Undef).addImm(Sub);
+ RegSeq.addReg(Zero).addImm(Parts.back());
+ }
+ MIB.addReg(Tied, RegState::Implicit);
+ MIB->tieOperands(0, MIB->getNumOperands() - 1);
+ }
+
MI.eraseFromParent();
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
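
Side note (not part of the patch): with TFE/LWE the image load returns one extra status dword and may leave the data dwords unwritten on a failed fetch, which is why the block above pre-initializes the tied result. A loose sketch of the result layout (field names and the dword count are illustrative):

#include <cstdint>

// Conceptual result of an image load issued with TFE/LWE set.
struct TfeImageResult {
  uint32_t Data[4]; // data dwords; may be left unwritten if the fetch fails
  uint32_t Status;  // extra dword carrying the texture-fail/LWE status
};
// Zero-initializing the tuple up front (the REG_SEQUENCE above) keeps any
// unwritten dwords well defined for later users.
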
@@ -1733,7 +1792,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_s_barrier:
return selectSBarrier(I);
case Intrinsic::amdgcn_global_atomic_fadd:
- return selectGlobalAtomicFaddIntrinsic(I);
+ return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
default: {
return selectImpl(I, *CoverageInfo);
}
@@ -1848,7 +1907,7 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
return false;
}
- if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
+ if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
MachineBasicBlock *MBB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
@@ -2336,6 +2395,13 @@ void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
MachineInstr &I) const {
+ if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) {
+ const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
+ unsigned AS = PtrTy.getAddressSpace();
+ if (AS == AMDGPUAS::GLOBAL_ADDRESS)
+ return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2));
+ }
+
initM0(I);
return selectImpl(I, *CoverageInfo);
}
@@ -2386,8 +2452,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
MIB.addImm(0);
MIB.addImm(Offset);
- MIB.addImm(1); // glc
- MIB.addImm(0); // slc
+ MIB.addImm(AMDGPU::CPol::GLC);
MIB.cloneMemRefs(MI);
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
@@ -2772,7 +2837,7 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
Register Src1Reg = MI.getOperand(2).getReg();
ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
- const LLT V2S16 = LLT::vector(2, 16);
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
return false;
@@ -2895,6 +2960,8 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
MachineInstr &MI) const {
+ if (STI.hasGFX90AInsts())
+ return selectImpl(MI, *CoverageInfo);
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
@@ -2951,7 +3018,7 @@ bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
- Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
.addReg(VIndex.getReg())
.addImm(AMDGPU::sub0)
@@ -2968,7 +3035,7 @@ bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
I.add(MI.getOperand(2)); // rsrc
I.add(SOffset);
I.addImm(Offset);
- renderExtractSLC(I, MI, 7);
+ I.addImm(MI.getOperand(7).getImm()); // cpol
I.cloneMemRefs(MI);
MI.eraseFromParent();
@@ -2976,8 +3043,14 @@ bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
return true;
}
-bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
- MachineInstr &MI) const{
+bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
+ MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const {
+
+ if (STI.hasGFX90AInsts()) {
+ // gfx90a adds return versions of the global atomic fadd instructions so no
+ // special handling is required.
+ return selectImpl(MI, *CoverageInfo);
+ }
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
@@ -2994,16 +3067,16 @@ bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
// FIXME: This is only needed because tablegen requires number of dst operands
// in match and replace pattern to be the same. Otherwise patterns can be
// exported from SDag path.
- auto Addr = selectFlatOffsetImpl<true>(MI.getOperand(2));
+ auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal);
- Register Data = MI.getOperand(3).getReg();
+ Register Data = DataOp.getReg();
const unsigned Opc = MRI->getType(Data).isVector() ?
AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
.addReg(Addr.first)
.addReg(Data)
.addImm(Addr.second)
- .addImm(0) // SLC
+ .addImm(0) // cpol
.cloneMemRefs(MI);
MI.eraseFromParent();
@@ -3140,6 +3213,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectBVHIntrinsic(I);
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
+ case AMDGPU::G_SBFX:
+ case AMDGPU::G_UBFX:
+ return selectG_SBFX_UBFX(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -3282,7 +3358,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(
if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
// It's possible to see an f32 fneg here, but unlikely.
// TODO: Treat f32 fneg as only high bit.
- MRI.getType(Src) == LLT::vector(2, 16)) {
+ MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
Src = MI->getOperand(1).getReg();
MI = MRI.getVRegDef(Src);
@@ -3408,9 +3484,9 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
}};
}
-template <bool Signed>
std::pair<Register, int>
-AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
+AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
+ uint64_t FlatVariant) const {
MachineInstr *MI = Root.getParent();
auto Default = std::make_pair(Root.getReg(), 0);
@@ -3426,7 +3502,7 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
return Default;
unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
- if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, Signed))
+ if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
return Default;
return std::make_pair(PtrBase, ConstOffset);
@@ -3434,7 +3510,7 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
- auto PtrWithOffset = selectFlatOffsetImpl<false>(Root);
+ auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
@@ -3443,8 +3519,18 @@ AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
- auto PtrWithOffset = selectFlatOffsetImpl<true>(Root);
+AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
+ auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
+ auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
@@ -3483,39 +3569,56 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
if (ConstOffset != 0) {
- if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) {
+ if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
+ SIInstrFlags::FlatGlobal)) {
Addr = PtrBase;
ImmOffset = ConstOffset;
- } else if (ConstOffset > 0) {
+ } else {
auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
if (!PtrBaseDef)
return None;
if (isSGPR(PtrBaseDef->Reg)) {
- // Offset is too large.
- //
- // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset)
- // + (large_offset & MaxOffset);
- int64_t SplitImmOffset, RemainderOffset;
- std::tie(SplitImmOffset, RemainderOffset)
- = TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true);
-
- if (isUInt<32>(RemainderOffset)) {
- MachineInstr *MI = Root.getParent();
- MachineBasicBlock *MBB = MI->getParent();
- Register HighBits
- = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
- HighBits)
- .addImm(RemainderOffset);
-
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
- [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
- }};
+ if (ConstOffset > 0) {
+ // Offset is too large.
+ //
+ // saddr + large_offset -> saddr +
+ // (voffset = large_offset & ~MaxOffset) +
+ // (large_offset & MaxOffset);
+ int64_t SplitImmOffset, RemainderOffset;
+ std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
+ ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+
+ if (isUInt<32>(RemainderOffset)) {
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ Register HighBits =
+ MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
+ HighBits)
+ .addImm(RemainderOffset);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addReg(HighBits);
+ }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
+ }};
+ }
}
+
+ // We are adding a 64 bit SGPR and a constant. If constant bus limit
+ // is 1 we would need to perform 1 or 2 extra moves for each half of
+ // the constant and it is better to do a scalar add and then issue a
+ // single VALU instruction to materialize zero. Otherwise it is less
+ // instructions to perform VALU adds with immediates or inline literals.
+ unsigned NumLiterals =
+ !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
+ !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
+ if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
+ return None;
}
}
}
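
Side note (not part of the patch): the comment above describes splitting a too-large constant into a part that still fits the instruction's immediate field and a remainder that is materialized into voffset. A worked sketch, assuming an 11-bit unsigned MaxOffset of 0x7FF purely for illustration (the real bound comes from TII.splitFlatOffset and is subtarget-dependent):

#include <cassert>
#include <cstdint>
#include <utility>

// {imm, remainder}: imm stays in the instruction, remainder goes to voffset.
inline std::pair<int64_t, int64_t> splitLargeOffset(int64_t Off) {
  const int64_t MaxOffset = 0x7FF; // illustrative only
  int64_t Imm = Off & MaxOffset;
  int64_t Remainder = Off & ~MaxOffset;
  assert(Imm + Remainder == Off); // holds for the positive offsets handled here
  return {Imm, Remainder};
}
// Example: Off = 0x12345 -> Imm = 0x345, Remainder = 0x12000.
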
@@ -3525,57 +3628,50 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
return None;
// Match the variable offset.
- if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) {
- // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
- // drop this.
- if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
- AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT)
- return None;
-
- // It's cheaper to materialize a single 32-bit zero for vaddr than the two
- // moves required to copy a 64-bit SGPR to VGPR.
- const Register SAddr = AddrDef->Reg;
- if (!isSGPR(SAddr))
- return None;
-
- MachineInstr *MI = Root.getParent();
- MachineBasicBlock *MBB = MI->getParent();
- Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
- VOffset)
- .addImm(0);
-
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
- [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
- }};
+ if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
+ // Look through the SGPR->VGPR copy.
+ Register SAddr =
+ getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
+
+ if (SAddr && isSGPR(SAddr)) {
+ Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
+
+ // It's possible voffset is an SGPR here, but the copy to VGPR will be
+ // inserted later.
+ if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+ return {{[=](MachineInstrBuilder &MIB) { // saddr
+ MIB.addReg(SAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // voffset
+ MIB.addReg(VOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(ImmOffset);
+ }}};
+ }
+ }
}
- // Look through the SGPR->VGPR copy.
- Register SAddr =
- getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
- if (!SAddr || !isSGPR(SAddr))
+ // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
+ // drop this.
+ if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
+ AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
return None;
- Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
+ // It's cheaper to materialize a single 32-bit zero for vaddr than the two
+ // moves required to copy a 64-bit SGPR to VGPR.
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- // It's possible voffset is an SGPR here, but the copy to VGPR will be
- // inserted later.
- Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset);
- if (!VOffset)
- return None;
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
+ .addImm(0);
- return {{[=](MachineInstrBuilder &MIB) { // saddr
- MIB.addReg(SAddr);
- },
- [=](MachineInstrBuilder &MIB) { // voffset
- MIB.addReg(VOffset);
- },
- [=](MachineInstrBuilder &MIB) { // offset
- MIB.addImm(ImmOffset);
- }}};
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ }};
}
InstructionSelector::ComplexRendererFns
@@ -3590,7 +3686,8 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
if (ConstOffset != 0 &&
- TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+ TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch)) {
Addr = PtrBase;
ImmOffset = ConstOffset;
}
@@ -3624,9 +3721,9 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
const DebugLoc &DL = I.getDebugLoc();
SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), SAddr)
- .addFrameIndex(FI)
- .addReg(RHSDef->Reg);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
+ .addFrameIndex(FI)
+ .addReg(RHSDef->Reg);
}
}
@@ -3639,11 +3736,6 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
}};
}
-static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
- auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
- return PSV && PSV->isStack();
-}
-
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
MachineInstr *MI = Root.getParent();
@@ -3685,23 +3777,19 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
Optional<int> FI;
Register VAddr = Root.getReg();
if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
- if (isBaseWithConstantOffset(Root, *MRI)) {
- const MachineOperand &LHS = RootDef->getOperand(1);
- const MachineOperand &RHS = RootDef->getOperand(2);
- const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
- const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
- if (LHSDef && RHSDef) {
- int64_t PossibleOffset =
- RHSDef->getOperand(1).getCImm()->getSExtValue();
- if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
- (!STI.privateMemoryResourceIsRangeChecked() ||
- KnownBits->signBitIsZero(LHS.getReg()))) {
- if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
- FI = LHSDef->getOperand(1).getIndex();
- else
- VAddr = LHS.getReg();
- Offset = PossibleOffset;
- }
+ Register PtrBase;
+ int64_t ConstOffset;
+ std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
+ if (ConstOffset != 0) {
+ if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
+ (!STI.privateMemoryResourceIsRangeChecked() ||
+ KnownBits->signBitIsZero(PtrBase))) {
+ const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
+ if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
+ FI = PtrBaseDef->getOperand(1).getIndex();
+ else
+ VAddr = PtrBase;
+ Offset = ConstOffset;
}
} else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
FI = RootDef->getOperand(1).getIndex();
@@ -3769,18 +3857,13 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset(
const MachineFunction *MF = MBB->getParent();
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
- const MachineMemOperand *MMO = *MI->memoperands_begin();
- const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
return {{
[=](MachineInstrBuilder &MIB) { // rsrc
MIB.addReg(Info->getScratchRSrcReg());
},
[=](MachineInstrBuilder &MIB) { // soffset
- if (isStackPtrRelative(PtrInfo))
- MIB.addReg(Info->getStackPtrOffsetReg());
- else
- MIB.addImm(0);
+ MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
}};
@@ -4130,10 +4213,8 @@ AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(Offset);
},
- addZeroImm, // glc
- addZeroImm, // slc
+ addZeroImm, // cpol
addZeroImm, // tfe
- addZeroImm, // dlc
addZeroImm // swz
}};
}
@@ -4158,11 +4239,9 @@ AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
- addZeroImm, // glc
- addZeroImm, // slc
+ addZeroImm, // cpol
addZeroImm, // tfe
- addZeroImm, // dlc
- addZeroImm // swz
+ addZeroImm, // swz
}};
}
@@ -4194,7 +4273,9 @@ AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(Offset);
},
- addZeroImm // slc
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addImm(AMDGPU::CPol::GLC); // cpol
+ }
}};
}
@@ -4218,7 +4299,7 @@ AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
- addZeroImm // slc
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol
}};
}
@@ -4308,32 +4389,25 @@ void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
MIB.addImm(MI.getOperand(OpIdx).getImm());
}
-void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
- const MachineInstr &MI,
- int OpIdx) const {
+void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
+ MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL);
}
-void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
- const MachineInstr &MI,
- int OpIdx) const {
- assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
-}
-
-void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
+void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
+ MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
}
-void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
- const MachineInstr &MI,
- int OpIdx) const {
+void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
+ MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
}
void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,