Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 2341
1 file changed, 1929 insertions(+), 412 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index c0ea35817ec8e..2025c0fa5d21b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -15,7 +15,6 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegisterBankInfo.h"
-#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -40,6 +39,12 @@
using namespace llvm;
using namespace MIPatternMatch;
+static cl::opt<bool> AllowRiskySelect(
+ "amdgpu-global-isel-risky-select",
+ cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
+ cl::init(false),
+ cl::ReallyHidden);
+
#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
@@ -88,6 +93,30 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
return RB->getID() == AMDGPU::VCCRegBankID;
}
+bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
+ unsigned NewOpc) const {
+ MI.setDesc(TII.get(NewOpc));
+ MI.RemoveOperand(1); // Remove intrinsic ID.
+ MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+
+ MachineOperand &Dst = MI.getOperand(0);
+ MachineOperand &Src = MI.getOperand(1);
+
+ // TODO: This should be legalized to s32 if needed
+ if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
+ return false;
+
+ const TargetRegisterClass *DstRC
+ = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
+ const TargetRegisterClass *SrcRC
+ = TRI.getConstrainedRegClassForOperand(Src, *MRI);
+ if (!DstRC || DstRC != SrcRC)
+ return false;
+
+ return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
+ RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
+}
+
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock *BB = I.getParent();
@@ -173,6 +202,14 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
const Register DefReg = I.getOperand(0).getReg();
const LLT DefTy = MRI->getType(DefReg);
+ if (DefTy == LLT::scalar(1)) {
+ if (!AllowRiskySelect) {
+ LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
+ }
// TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
@@ -261,6 +298,11 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(),
RC == &AMDGPU::SReg_64RegClass);
I.setDesc(TII.get(InstOpc));
+ // Dead implicit-def of scc
+ I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
+ true, // isImp
+ false, // isKill
+ true)); // isDead
// FIXME: Hack to avoid turning the register bank into a register class.
// The selector for G_ICMP relies on seeing the register bank for the result
@@ -295,7 +337,11 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
MachineFunction *MF = BB->getParent();
Register DstReg = I.getOperand(0).getReg();
const DebugLoc &DL = I.getDebugLoc();
- unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
+ LLT Ty = MRI->getType(DstReg);
+ if (Ty.isVector())
+ return false;
+
+ unsigned Size = Ty.getSizeInBits();
const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
@@ -445,6 +491,7 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
return true;
}
+// TODO: We should probably legalize these to use only 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
Register DstReg = I.getOperand(0).getReg();
@@ -452,11 +499,21 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
LLT DstTy = MRI->getType(DstReg);
LLT SrcTy = MRI->getType(SrcReg);
const unsigned SrcSize = SrcTy.getSizeInBits();
- const unsigned DstSize = DstTy.getSizeInBits();
+ unsigned DstSize = DstTy.getSizeInBits();
// TODO: Should handle any multiple of 32 offset.
unsigned Offset = I.getOperand(2).getImm();
- if (Offset % DstSize != 0)
+ if (Offset % 32 != 0 || DstSize > 128)
+ return false;
+
+ // 16-bit operations really use 32-bit registers.
+ // FIXME: Probably should not allow 16-bit G_EXTRACT results.
+ if (DstSize == 16)
+ DstSize = 32;
+
+ const TargetRegisterClass *DstRC =
+ TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
+ if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
return false;
const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
@@ -464,20 +521,18 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
if (!SrcRC)
return false;
+ unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
+ DstSize / 32);
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
+ if (!SrcRC)
+ return false;
- ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
-
+ SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
+ *SrcRC, I.getOperand(1));
const DebugLoc &DL = I.getDebugLoc();
- MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
- .addReg(SrcReg, 0, SubRegs[Offset / DstSize]);
+ BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
+ .addReg(SrcReg, 0, SubReg);
- for (const MachineOperand &MO : Copy->operands()) {
- const TargetRegisterClass *RC =
- TRI.getConstrainedRegClassForOperand(MO, *MRI);
- if (!RC)
- continue;
- RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
- }
I.eraseFromParent();
return true;
}
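For reference, the offset/size checks in selectG_EXTRACT reduce to a small amount of arithmetic. Below is a rough standalone sketch in plain C++ (not the SIRegisterInfo API); the channel/dword pair is only a stand-in for the sub-register index that getSubRegFromChannel would return.

#include <cstdio>
#include <optional>
#include <utility>

// Returns {first 32-bit channel, number of 32-bit channels}, or nothing if the
// extract cannot be handled by this selection path.
std::optional<std::pair<unsigned, unsigned>>
extractChannels(unsigned OffsetBits, unsigned DstSizeBits) {
  if (OffsetBits % 32 != 0 || DstSizeBits > 128)
    return std::nullopt;   // same bail-out conditions as selectG_EXTRACT
  if (DstSizeBits == 16)
    DstSizeBits = 32;      // 16-bit operations really use 32-bit registers
  return std::make_pair(OffsetBits / 32, DstSizeBits / 32);
}

int main() {
  if (auto C = extractChannels(64, 64))
    std::printf("channel %u, %u dwords\n", C->first, C->second); // channel 2, 2 dwords
}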
@@ -563,6 +618,90 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
return true;
}
+static bool isZero(Register Reg, const MachineRegisterInfo &MRI) {
+ int64_t Val;
+ return mi_match(Reg, MRI, m_ICst(Val)) && Val == 0;
+}
+
+bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
+ MachineInstr &MI) const {
+ if (selectImpl(MI, *CoverageInfo))
+ return true;
+
+ const LLT S32 = LLT::scalar(32);
+ const LLT V2S16 = LLT::vector(2, 16);
+
+ Register Dst = MI.getOperand(0).getReg();
+ if (MRI->getType(Dst) != V2S16)
+ return false;
+
+ const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
+ if (DstBank->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
+ if (MRI->getType(Src0) != S32)
+ return false;
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *BB = MI.getParent();
+
+ // TODO: This should probably be a combine somewhere
+ // (build_vector_trunc $src0, undef) -> copy $src0
+ MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
+ if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
+ MI.setDesc(TII.get(AMDGPU::COPY));
+ MI.RemoveOperand(2);
+ return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
+ RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
+ }
+
+ Register ShiftSrc0;
+ Register ShiftSrc1;
+ int64_t ShiftAmt;
+
+ // With multiple uses of the shift, this will duplicate the shift and
+ // increase register pressure.
+ //
+ // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
+ // => (S_PACK_HH_B32_B16 $src0, $src1)
+ // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
+ // => (S_PACK_LH_B32_B16 $src0, $src1)
+ // (build_vector_trunc $src0, $src1)
+ // => (S_PACK_LL_B32_B16 $src0, $src1)
+
+ // FIXME: This is an inconvenient way to check a specific value
+ bool Shift0 = mi_match(
+ Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
+ ShiftAmt == 16;
+
+ bool Shift1 = mi_match(
+ Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
+ ShiftAmt == 16;
+
+ unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
+ if (Shift0 && Shift1) {
+ Opc = AMDGPU::S_PACK_HH_B32_B16;
+ MI.getOperand(1).setReg(ShiftSrc0);
+ MI.getOperand(2).setReg(ShiftSrc1);
+ } else if (Shift1) {
+ Opc = AMDGPU::S_PACK_LH_B32_B16;
+ MI.getOperand(2).setReg(ShiftSrc1);
+ } else if (Shift0 && isZero(Src1, *MRI)) {
+ // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
+ auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
+ .addReg(ShiftSrc0)
+ .addImm(16);
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ }
+
+ MI.setDesc(TII.get(Opc));
+ return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+}
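The S_PACK_* selection above comes down to two booleans: is each source a single-use 16-bit right shift? A rough standalone model of that decision and of the value each pack variant produces from the pre-shift sources (the S_LSHR_B32 special case for a constant-zero second source is left out):

#include <cstdint>

enum class Pack { LL, LH, HH };

// Shift0/Shift1: whether src0/src1 matched (lshr_oneuse x, 16).
Pack choosePack(bool Shift0, bool Shift1) {
  if (Shift0 && Shift1) return Pack::HH; // high halves of both pre-shift sources
  if (Shift1)           return Pack::LH; // low half of src0, high half of src1
  return Pack::LL;                       // low halves of both
}

// Src0/Src1 are the pre-shift registers, as rewritten into the operands above.
uint32_t applyPack(Pack P, uint32_t Src0, uint32_t Src1) {
  uint32_t Lo = P == Pack::HH ? Src0 >> 16 : Src0 & 0xffff;
  uint32_t Hi = P == Pack::LL ? Src1 & 0xffff : Src1 >> 16;
  return (Lo & 0xffff) | (Hi << 16);
}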
+
bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
return selectG_ADD_SUB(I);
}
@@ -594,7 +733,9 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
unsigned InsSize = Src1Ty.getSizeInBits();
int64_t Offset = I.getOperand(3).getImm();
- if (Offset % 32 != 0)
+
+ // FIXME: These cases should have been illegal and unnecessary to check here.
+ if (Offset % 32 != 0 || InsSize % 32 != 0)
return false;
unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
@@ -617,7 +758,7 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
// Deal with weird cases where the class only partially supports the subreg
// index.
Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
- if (!Src0RC)
+ if (!Src0RC || !Src1RC)
return false;
if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
@@ -635,6 +776,85 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
return true;
}
+bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
+ if (STI.getLDSBankCount() != 16)
+ return selectImpl(MI, *CoverageInfo);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(2).getReg();
+ Register M0Val = MI.getOperand(6).getReg();
+ if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
+ !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
+ !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
+ return false;
+
+ // This requires 2 instructions. It is possible to write a pattern to support
+ // this, but the generated isel emitter doesn't correctly deal with multiple
+ // output instructions using the same physical register input. The copy to m0
+ // is incorrectly placed before the second instruction.
+ //
+ // TODO: Match source modifiers.
+
+ Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(M0Val);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
+ .addImm(2)
+ .addImm(MI.getOperand(4).getImm()) // $attr
+ .addImm(MI.getOperand(3).getImm()); // $attrchan
+
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
+ .addImm(0) // $src0_modifiers
+ .addReg(Src0) // $src0
+ .addImm(MI.getOperand(4).getImm()) // $attr
+ .addImm(MI.getOperand(3).getImm()) // $attrchan
+ .addImm(0) // $src2_modifiers
+ .addReg(InterpMov) // $src2 - 2 f16 values selected by high
+ .addImm(MI.getOperand(5).getImm()) // $high
+ .addImm(0) // $clamp
+ .addImm(0); // $omod
+
+ MI.eraseFromParent();
+ return true;
+}
+
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
+bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
+ Register Dst0 = MI.getOperand(0).getReg();
+ Register Dst1 = MI.getOperand(1).getReg();
+
+ LLT Ty = MRI->getType(Dst0);
+ unsigned Opc;
+ if (Ty == LLT::scalar(32))
+ Opc = AMDGPU::V_DIV_SCALE_F32;
+ else if (Ty == LLT::scalar(64))
+ Opc = AMDGPU::V_DIV_SCALE_F64;
+ else
+ return false;
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ Register Numer = MI.getOperand(3).getReg();
+ Register Denom = MI.getOperand(4).getReg();
+ unsigned ChooseDenom = MI.getOperand(5).getImm();
+
+ Register Src0 = ChooseDenom != 0 ? Numer : Denom;
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
+ .addDef(Dst1)
+ .addUse(Src0)
+ .addUse(Denom)
+ .addUse(Numer);
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
unsigned IntrinsicID = I.getIntrinsicID();
switch (IntrinsicID) {
@@ -659,6 +879,20 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return true;
}
+ case Intrinsic::amdgcn_interp_p1_f16:
+ return selectInterpP1F16(I);
+ case Intrinsic::amdgcn_wqm:
+ return constrainCopyLikeIntrin(I, AMDGPU::WQM);
+ case Intrinsic::amdgcn_softwqm:
+ return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
+ case Intrinsic::amdgcn_wwm:
+ return constrainCopyLikeIntrin(I, AMDGPU::WWM);
+ case Intrinsic::amdgcn_div_scale:
+ return selectDivScale(I);
+ case Intrinsic::amdgcn_icmp:
+ return selectIntrinsicIcmp(I);
+ case Intrinsic::amdgcn_ballot:
+ return selectBallot(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -779,247 +1013,79 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
return Ret;
}
-static MachineInstr *
-buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt,
- unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3,
- unsigned VM, bool Compr, unsigned Enabled, bool Done) {
- const DebugLoc &DL = Insert->getDebugLoc();
- MachineBasicBlock &BB = *Insert->getParent();
- unsigned Opcode = Done ? AMDGPU::EXP_DONE : AMDGPU::EXP;
- return BuildMI(BB, Insert, DL, TII.get(Opcode))
- .addImm(Tgt)
- .addReg(Reg0)
- .addReg(Reg1)
- .addReg(Reg2)
- .addReg(Reg3)
- .addImm(VM)
- .addImm(Compr)
- .addImm(Enabled);
-}
-
-static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
- int64_t C;
- if (mi_match(Reg, MRI, m_ICst(C)) && C == 0)
- return true;
-
- // FIXME: matcher should ignore copies
- return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0;
-}
+bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
+ Register Dst = I.getOperand(0).getReg();
+ if (isVCC(Dst, *MRI))
+ return false;
-static unsigned extractGLC(unsigned AuxiliaryData) {
- return AuxiliaryData & 1;
-}
+ if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
+ return false;
-static unsigned extractSLC(unsigned AuxiliaryData) {
- return (AuxiliaryData >> 1) & 1;
-}
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register SrcReg = I.getOperand(2).getReg();
+ unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
+ auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
-static unsigned extractDLC(unsigned AuxiliaryData) {
- return (AuxiliaryData >> 2) & 1;
-}
+ int Opcode = getV_CMPOpcode(Pred, Size);
+ if (Opcode == -1)
+ return false;
-static unsigned extractSWZ(unsigned AuxiliaryData) {
- return (AuxiliaryData >> 3) & 1;
+ MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
+ .add(I.getOperand(2))
+ .add(I.getOperand(3));
+ RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
+ *MRI);
+ bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
+ I.eraseFromParent();
+ return Ret;
}
-static unsigned getBufferStoreOpcode(LLT Ty,
- const unsigned MemSize,
- const bool Offen) {
- const int Size = Ty.getSizeInBits();
- switch (8 * MemSize) {
- case 8:
- return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
- AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
- case 16:
- return Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
- AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
- default:
- unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
- AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
- if (Size > 32)
- Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
- return Opc;
- }
-}
-
-static unsigned getBufferStoreFormatOpcode(LLT Ty,
- const unsigned MemSize,
- const bool Offen) {
- bool IsD16Packed = Ty.getScalarSizeInBits() == 16;
- bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits();
- int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
-
- if (IsD16Packed) {
- switch (NumElts) {
- case 1:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
- case 2:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact;
- case 3:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact;
- case 4:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact;
- default:
- return -1;
- }
- }
-
- if (IsD16Unpacked) {
- switch (NumElts) {
- case 1:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
- case 2:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact;
- case 3:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact;
- case 4:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact;
- default:
- return -1;
- }
- }
-
- switch (NumElts) {
- case 1:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact;
- case 2:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact;
- case 3:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact;
- case 4:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact;
- default:
- return -1;
- }
+bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register DstReg = I.getOperand(0).getReg();
+ const unsigned Size = MRI->getType(DstReg).getSizeInBits();
+ const bool Is64 = Size == 64;
- llvm_unreachable("unhandled buffer store");
-}
-
-// TODO: Move this to combiner
-// Returns base register, imm offset, total constant offset.
-std::tuple<Register, unsigned, unsigned>
-AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B,
- Register OrigOffset) const {
- const unsigned MaxImm = 4095;
- Register BaseReg;
- unsigned TotalConstOffset;
- MachineInstr *OffsetDef;
-
- std::tie(BaseReg, TotalConstOffset, OffsetDef)
- = AMDGPU::getBaseWithConstantOffset(*MRI, OrigOffset);
-
- unsigned ImmOffset = TotalConstOffset;
-
- // If the immediate value is too big for the immoffset field, put the value
- // and -4096 into the immoffset field so that the value that is copied/added
- // for the voffset field is a multiple of 4096, and it stands more chance
- // of being CSEd with the copy/add for another similar load/store.f
- // However, do not do that rounding down to a multiple of 4096 if that is a
- // negative number, as it appears to be illegal to have a negative offset
- // in the vgpr, even if adding the immediate offset makes it positive.
- unsigned Overflow = ImmOffset & ~MaxImm;
- ImmOffset -= Overflow;
- if ((int32_t)Overflow < 0) {
- Overflow += ImmOffset;
- ImmOffset = 0;
- }
-
- if (Overflow != 0) {
- // In case this is in a waterfall loop, insert offset code at the def point
- // of the offset, not inside the loop.
- MachineBasicBlock::iterator OldInsPt = B.getInsertPt();
- MachineBasicBlock &OldMBB = B.getMBB();
- B.setInstr(*OffsetDef);
-
- if (!BaseReg) {
- BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- B.buildInstr(AMDGPU::V_MOV_B32_e32)
- .addDef(BaseReg)
- .addImm(Overflow);
- } else {
- Register OverflowVal = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- B.buildInstr(AMDGPU::V_MOV_B32_e32)
- .addDef(OverflowVal)
- .addImm(Overflow);
-
- Register NewBaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg)
- .addReg(BaseReg)
- .addReg(OverflowVal, RegState::Kill)
- .addImm(0);
- BaseReg = NewBaseReg;
- }
+ if (Size != STI.getWavefrontSize())
+ return false;
- B.setInsertPt(OldMBB, OldInsPt);
+ Optional<ValueAndVReg> Arg =
+ getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
+
+ if (Arg.hasValue()) {
+ const int64_t Value = Arg.getValue().Value;
+ if (Value == 0) {
+ unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
+ BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
+ } else if (Value == -1) { // all ones
+ Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
+ } else
+ return false;
+ } else {
+ Register SrcReg = I.getOperand(2).getReg();
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
}
- return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
+ I.eraseFromParent();
+ return true;
}
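selectBallot only folds the two constant arguments it can prove; anything else is either a plain copy of the already-computed lane mask or a selection failure. A minimal sketch, modeling the wave mask as a plain integer:

#include <cstdint>
#include <optional>

// Returns the value the ballot materializes, or nothing if selection is
// rejected (a constant other than 0 or -1), mirroring selectBallot above.
std::optional<uint64_t> foldBallot(std::optional<int64_t> ConstArg,
                                   uint64_t Exec, uint64_t LaneMask) {
  if (!ConstArg)
    return LaneMask;   // non-constant: plain COPY of the comparison result
  if (*ConstArg == 0)
    return 0;          // S_MOV_B32 / S_MOV_B64 0
  if (*ConstArg == -1)
    return Exec;       // all lanes requested: COPY of EXEC / EXEC_LO
  return std::nullopt; // any other constant is not selected here
}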
-bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI,
- bool IsFormat) const {
- MachineIRBuilder B(MI);
- MachineFunction &MF = B.getMF();
- Register VData = MI.getOperand(1).getReg();
- LLT Ty = MRI->getType(VData);
-
- int Size = Ty.getSizeInBits();
- if (Size % 32 != 0)
- return false;
-
- // FIXME: Verifier should enforce 1 MMO for these intrinsics.
- MachineMemOperand *MMO = *MI.memoperands_begin();
- const int MemSize = MMO->getSize();
-
- Register RSrc = MI.getOperand(2).getReg();
- Register VOffset = MI.getOperand(3).getReg();
- Register SOffset = MI.getOperand(4).getReg();
- unsigned AuxiliaryData = MI.getOperand(5).getImm();
- unsigned ImmOffset;
- unsigned TotalOffset;
-
- std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
- if (TotalOffset != 0)
- MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize);
-
- const bool Offen = !isZero(VOffset, *MRI);
-
- int Opc = IsFormat ? getBufferStoreFormatOpcode(Ty, MemSize, Offen) :
- getBufferStoreOpcode(Ty, MemSize, Offen);
- if (Opc == -1)
- return false;
-
- MachineInstrBuilder MIB = B.buildInstr(Opc)
- .addUse(VData);
-
- if (Offen)
- MIB.addUse(VOffset);
-
- MIB.addUse(RSrc)
- .addUse(SOffset)
- .addImm(ImmOffset)
- .addImm(extractGLC(AuxiliaryData))
- .addImm(extractSLC(AuxiliaryData))
- .addImm(0) // tfe: FIXME: Remove from inst
- .addImm(extractDLC(AuxiliaryData))
- .addImm(extractSWZ(AuxiliaryData))
- .addMemOperand(MMO);
+bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
+ // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
+ // SelectionDAG uses for wave32 vs wave64.
+ MachineBasicBlock *BB = MI.getParent();
+ BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
+ .add(MI.getOperand(1));
+ Register Reg = MI.getOperand(1).getReg();
MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ if (!MRI->getRegClassOrNull(Reg))
+ MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
+ return true;
}
static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
@@ -1106,70 +1172,458 @@ bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
return Ret;
}
-bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
- MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- unsigned IntrinsicID = I.getIntrinsicID();
- switch (IntrinsicID) {
- case Intrinsic::amdgcn_exp: {
- int64_t Tgt = I.getOperand(1).getImm();
- int64_t Enabled = I.getOperand(2).getImm();
- int64_t Done = I.getOperand(7).getImm();
- int64_t VM = I.getOperand(8).getImm();
-
- MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(),
- I.getOperand(4).getReg(),
- I.getOperand(5).getReg(),
- I.getOperand(6).getReg(),
- VM, false, Enabled, Done);
+static unsigned gwsIntrinToOpcode(unsigned IntrID) {
+ switch (IntrID) {
+ case Intrinsic::amdgcn_ds_gws_init:
+ return AMDGPU::DS_GWS_INIT;
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ return AMDGPU::DS_GWS_BARRIER;
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ return AMDGPU::DS_GWS_SEMA_V;
+ case Intrinsic::amdgcn_ds_gws_sema_br:
+ return AMDGPU::DS_GWS_SEMA_BR;
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ return AMDGPU::DS_GWS_SEMA_P;
+ case Intrinsic::amdgcn_ds_gws_sema_release_all:
+ return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
+ default:
+ llvm_unreachable("not a gws intrinsic");
+ }
+}
- I.eraseFromParent();
- return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
+bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
+ Intrinsic::ID IID) const {
+ if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
+ !STI.hasGWSSemaReleaseAll())
+ return false;
+
+ // intrinsic ID, vsrc, offset
+ const bool HasVSrc = MI.getNumOperands() == 3;
+ assert(HasVSrc || MI.getNumOperands() == 2);
+
+ Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
+ const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
+ if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
+ assert(OffsetDef);
+
+ unsigned ImmOffset;
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ MachineInstr *Readfirstlane = nullptr;
+
+ // If we legalized the VGPR input, strip out the readfirstlane to analyze the
+ // incoming offset, in case there's an add of a constant. We'll have to put it
+ // back later.
+ if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
+ Readfirstlane = OffsetDef;
+ BaseOffset = OffsetDef->getOperand(1).getReg();
+ OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
}
- case Intrinsic::amdgcn_exp_compr: {
- const DebugLoc &DL = I.getDebugLoc();
- int64_t Tgt = I.getOperand(1).getImm();
- int64_t Enabled = I.getOperand(2).getImm();
- Register Reg0 = I.getOperand(3).getReg();
- Register Reg1 = I.getOperand(4).getReg();
- Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- int64_t Done = I.getOperand(5).getImm();
- int64_t VM = I.getOperand(6).getImm();
-
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
- MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM,
- true, Enabled, Done);
- I.eraseFromParent();
- return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
+ if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
+ // If we have a constant offset, try to use the 0 in m0 as the base.
+ // TODO: Look into changing the default m0 initialization value. If the
+ // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
+ // the immediate offset.
+
+ ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addImm(0);
+ } else {
+ std::tie(BaseOffset, ImmOffset, OffsetDef)
+ = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
+
+ if (Readfirstlane) {
+ // We have the constant offset now, so put the readfirstlane back on the
+ // variable component.
+ if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
+ return false;
+
+ Readfirstlane->getOperand(1).setReg(BaseOffset);
+ BaseOffset = Readfirstlane->getOperand(0).getReg();
+ } else {
+ if (!RBI.constrainGenericRegister(BaseOffset,
+ AMDGPU::SReg_32RegClass, *MRI))
+ return false;
+ }
+
+ Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
+ .addReg(BaseOffset)
+ .addImm(16);
+
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(M0Base);
}
- case Intrinsic::amdgcn_end_cf: {
- // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
- // SelectionDAG uses for wave32 vs wave64.
- BuildMI(*BB, &I, I.getDebugLoc(),
- TII.get(AMDGPU::SI_END_CF))
- .add(I.getOperand(1));
- Register Reg = I.getOperand(1).getReg();
- I.eraseFromParent();
+ // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
+ // offset field) % 64. Some versions of the programming guide omit the m0
+ // part, or claim it's from offset 0.
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
- if (!MRI->getRegClassOrNull(Reg))
- MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
- return true;
+ if (HasVSrc) {
+ Register VSrc = MI.getOperand(1).getReg();
+ MIB.addReg(VSrc);
+ if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
+ return false;
+ }
+
+ MIB.addImm(ImmOffset)
+ .addImm(-1) // $gds
+ .cloneMemRefs(MI);
+
+ MI.eraseFromParent();
+ return true;
+}
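The M0 setup above follows the (<isa opaque base> + M0[21:16] + offset field) % 64 formula quoted in the comment. A small illustrative sketch of the two cases, with plain integers standing in for SGPR contents (names here are illustrative only):

#include <cstdint>

struct GWSOffset {
  uint32_t M0;         // value written to m0
  unsigned ImmOffset;  // value placed in the instruction's offset field
};

// Constant offset: m0 is zeroed and the whole constant goes in the offset
// field. Variable offset: the base is shifted into M0[21:16] and the constant
// remainder (from the base-plus-constant split) goes in the offset field.
GWSOffset computeGWSOffset(bool IsConstant, uint32_t BaseOffset,
                           unsigned ConstPart) {
  if (IsConstant)
    return {0u, ConstPart};              // S_MOV_B32 m0, 0
  return {BaseOffset << 16, ConstPart};  // S_LSHL_B32 m0, base, 16
}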
+
+bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
+ bool IsAppend) const {
+ Register PtrBase = MI.getOperand(2).getReg();
+ LLT PtrTy = MRI->getType(PtrBase);
+ bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+
+ unsigned Offset;
+ std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
+
+ // TODO: Should this try to look through readfirstlane like GWS?
+ if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
+ PtrBase = MI.getOperand(2).getReg();
+ Offset = 0;
+ }
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
+
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(PtrBase);
+ BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
+ .addImm(Offset)
+ .addImm(IsGDS ? -1 : 0)
+ .cloneMemRefs(MI);
+ MI.eraseFromParent();
+ return true;
+}
+
+static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
+ bool &IsTexFail) {
+ if (TexFailCtrl)
+ IsTexFail = true;
+
+ TFE = (TexFailCtrl & 0x1) ? 1 : 0;
+ TexFailCtrl &= ~(uint64_t)0x1;
+ LWE = (TexFailCtrl & 0x2) ? 1 : 0;
+ TexFailCtrl &= ~(uint64_t)0x2;
+
+ return TexFailCtrl == 0;
+}
+
+static bool parseCachePolicy(uint64_t Value,
+ bool *GLC, bool *SLC, bool *DLC) {
+ if (GLC) {
+ *GLC = (Value & 0x1) ? 1 : 0;
+ Value &= ~(uint64_t)0x1;
+ }
+ if (SLC) {
+ *SLC = (Value & 0x2) ? 1 : 0;
+ Value &= ~(uint64_t)0x2;
+ }
+ if (DLC) {
+ *DLC = (Value & 0x4) ? 1 : 0;
+ Value &= ~(uint64_t)0x4;
+ }
+
+ return Value == 0;
+}
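The two static helpers above are plain bit-field parsers: each known bit is peeled off and any leftover bit rejects the intrinsic. A simplified standalone rendering (the optional pointer arguments of parseCachePolicy are dropped for brevity):

#include <cstdint>

bool parseTexFailSketch(uint64_t Ctrl, bool &TFE, bool &LWE, bool &IsTexFail) {
  IsTexFail = Ctrl != 0;
  TFE = (Ctrl & 0x1) != 0;  Ctrl &= ~uint64_t(0x1);
  LWE = (Ctrl & 0x2) != 0;  Ctrl &= ~uint64_t(0x2);
  return Ctrl == 0;         // any unknown bit left over rejects the intrinsic
}

bool parseCachePolicySketch(uint64_t V, bool &GLC, bool &SLC, bool &DLC) {
  GLC = (V & 0x1) != 0;  V &= ~uint64_t(0x1);
  SLC = (V & 0x2) != 0;  V &= ~uint64_t(0x2);
  DLC = (V & 0x4) != 0;  V &= ~uint64_t(0x4);
  return V == 0;
}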
+
+bool AMDGPUInstructionSelector::selectImageIntrinsic(
+ MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
+ const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
+ AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
+ const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
+ AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
+ unsigned IntrOpcode = Intr->BaseOpcode;
+ const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
+
+ const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
+ MI.getNumExplicitDefs());
+ int NumVAddr, NumGradients;
+ std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);
+
+ Register VDataIn, VDataOut;
+ LLT VDataTy;
+ int NumVDataDwords = -1;
+ bool IsD16 = false;
+
+ // XXX - Can we just get the second to last argument for ctrl?
+ unsigned CtrlIdx; // Index of texfailctrl argument
+ bool Unorm;
+ if (!BaseOpcode->Sampler) {
+ Unorm = true;
+ CtrlIdx = VAddrIdx + NumVAddr + 1;
+ } else {
+ Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
+ CtrlIdx = VAddrIdx + NumVAddr + 3;
+ }
+
+ bool TFE;
+ bool LWE;
+ bool IsTexFail = false;
+ if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
+ return false;
+
+ const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
+ const bool IsA16 = (Flags & 1) != 0;
+ const bool IsG16 = (Flags & 2) != 0;
+
+ // A16 implies 16 bit gradients
+ if (IsA16 && !IsG16)
+ return false;
+
+ unsigned DMask = 0;
+ unsigned DMaskLanes = 0;
+
+ if (BaseOpcode->Atomic) {
+ VDataOut = MI.getOperand(0).getReg();
+ VDataIn = MI.getOperand(2).getReg();
+ LLT Ty = MRI->getType(VDataIn);
+
+ // Be careful to allow atomic swap on 16-bit element vectors.
+ const bool Is64Bit = BaseOpcode->AtomicX2 ?
+ Ty.getSizeInBits() == 128 :
+ Ty.getSizeInBits() == 64;
+
+ if (BaseOpcode->AtomicX2) {
+ assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
+
+ DMask = Is64Bit ? 0xf : 0x3;
+ NumVDataDwords = Is64Bit ? 4 : 2;
+ } else {
+ DMask = Is64Bit ? 0x3 : 0x1;
+ NumVDataDwords = Is64Bit ? 2 : 1;
+ }
+ } else {
+ const int DMaskIdx = 2; // Input/output + intrinsic ID.
+
+ DMask = MI.getOperand(DMaskIdx).getImm();
+ DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
+
+ if (BaseOpcode->Store) {
+ VDataIn = MI.getOperand(1).getReg();
+ VDataTy = MRI->getType(VDataIn);
+ NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
+ } else {
+ VDataOut = MI.getOperand(0).getReg();
+ VDataTy = MRI->getType(VDataOut);
+ NumVDataDwords = DMaskLanes;
+
+ // One memoperand is mandatory, except for getresinfo.
+ // FIXME: Check this in verifier.
+ if (!MI.memoperands_empty()) {
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+
+ // Infer d16 from the memory size, as the register type will be mangled by
+ // unpacked subtargets, or by TFE.
+ IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
+
+ if (IsD16 && !STI.hasUnpackedD16VMem())
+ NumVDataDwords = (DMaskLanes + 1) / 2;
+ }
+ }
+ }
+
+ // Optimize _L to _LZ when _L is zero
+ if (LZMappingInfo) {
+ // The legalizer replaced the register with an immediate 0 if we need to
+ // change the opcode.
+ const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
+ if (Lod.isImm()) {
+ assert(Lod.getImm() == 0);
+ IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
+ }
+ }
+
+ // Optimize _mip away, when 'lod' is zero
+ if (MIPMappingInfo) {
+ const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
+ if (Lod.isImm()) {
+ assert(Lod.getImm() == 0);
+ IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
+ }
+ }
+
+ // Set G16 opcode
+ if (IsG16 && !IsA16) {
+ const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
+ AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
+ assert(G16MappingInfo);
+ IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
+ }
+
+ // TODO: Check this in verifier.
+ assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
+
+ bool GLC = false;
+ bool SLC = false;
+ bool DLC = false;
+ if (BaseOpcode->Atomic) {
+ GLC = true; // TODO no-return optimization
+ if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
+ IsGFX10 ? &DLC : nullptr))
+ return false;
+ } else {
+ if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
+ IsGFX10 ? &DLC : nullptr))
+ return false;
+ }
+
+ int NumVAddrRegs = 0;
+ int NumVAddrDwords = 0;
+ for (int I = 0; I < NumVAddr; ++I) {
+ // Skip the $noregs and 0s inserted during legalization.
+ MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
+ if (!AddrOp.isReg())
+ continue; // XXX - Break?
+
+ Register Addr = AddrOp.getReg();
+ if (!Addr)
+ break;
+
+ ++NumVAddrRegs;
+ NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
+ }
+
+ // The legalizer preprocessed the intrinsic arguments. If we aren't using
+ // NSA, these should have been packed into a single value in the first
+ // address register
+ const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
+ if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
+ LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
+ return false;
+ }
+
+ if (IsTexFail)
+ ++NumVDataDwords;
+
+ int Opcode = -1;
+ if (IsGFX10) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
+ UseNSA ? AMDGPU::MIMGEncGfx10NSA
+ : AMDGPU::MIMGEncGfx10Default,
+ NumVDataDwords, NumVAddrDwords);
+ } else {
+ if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
+ NumVDataDwords, NumVAddrDwords);
+ if (Opcode == -1)
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
+ NumVDataDwords, NumVAddrDwords);
+ }
+ assert(Opcode != -1);
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
+ .cloneMemRefs(MI);
+
+ if (VDataOut) {
+ if (BaseOpcode->AtomicX2) {
+ const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
+
+ Register TmpReg = MRI->createVirtualRegister(
+ Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
+ unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+
+ MIB.addDef(TmpReg);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
+ .addReg(TmpReg, RegState::Kill, SubReg);
+
+ } else {
+ MIB.addDef(VDataOut); // vdata output
+ }
}
- case Intrinsic::amdgcn_raw_buffer_store:
- return selectStoreIntrinsic(I, false);
- case Intrinsic::amdgcn_raw_buffer_store_format:
- return selectStoreIntrinsic(I, true);
+
+ if (VDataIn)
+ MIB.addReg(VDataIn); // vdata input
+
+ for (int i = 0; i != NumVAddrRegs; ++i) {
+ MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
+ if (SrcOp.isReg()) {
+ assert(SrcOp.getReg() != 0);
+ MIB.addReg(SrcOp.getReg());
+ }
+ }
+
+ MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
+ if (BaseOpcode->Sampler)
+ MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler
+
+ MIB.addImm(DMask); // dmask
+
+ if (IsGFX10)
+ MIB.addImm(DimInfo->Encoding);
+ MIB.addImm(Unorm);
+ if (IsGFX10)
+ MIB.addImm(DLC);
+
+ MIB.addImm(GLC);
+ MIB.addImm(SLC);
+ MIB.addImm(IsA16 && // a16 or r128
+ STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
+ if (IsGFX10)
+ MIB.addImm(IsA16 ? -1 : 0);
+
+ MIB.addImm(TFE); // tfe
+ MIB.addImm(LWE); // lwe
+ if (!IsGFX10)
+ MIB.addImm(DimInfo->DA ? -1 : 0);
+ if (BaseOpcode->HasD16)
+ MIB.addImm(IsD16 ? -1 : 0);
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
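For the non-atomic load path, D16 and the vdata dword count are inferred from the memory operand size and the dmask population count. A standalone sketch of that arithmetic, assuming DMaskLanes > 0 and a target that packs D16 results:

#include <cassert>
#include <utility>

// Returns {IsD16, NumVDataDwords} for a load-style image operation.
std::pair<bool, unsigned> inferD16(unsigned MemSizeBytes, unsigned DMaskLanes,
                                   bool HasUnpackedD16) {
  assert(DMaskLanes > 0);
  // D16 if each enabled lane occupies less than 32 bits of memory.
  bool IsD16 = (8 * MemSizeBytes) / DMaskLanes < 32;
  unsigned NumVDataDwords = DMaskLanes;
  if (IsD16 && !HasUnpackedD16)
    NumVDataDwords = (DMaskLanes + 1) / 2; // two 16-bit lanes per dword
  return {IsD16, NumVDataDwords};
}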
+
+bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
+ MachineInstr &I) const {
+ unsigned IntrinsicID = I.getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_end_cf:
+ return selectEndCfIntrinsic(I);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
return selectDSOrderedIntrinsic(I, IntrinsicID);
- default:
+ case Intrinsic::amdgcn_ds_gws_init:
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ case Intrinsic::amdgcn_ds_gws_sema_br:
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ case Intrinsic::amdgcn_ds_gws_sema_release_all:
+ return selectDSGWSIntrinsic(I, IntrinsicID);
+ case Intrinsic::amdgcn_ds_append:
+ return selectDSAppendConsume(I, true);
+ case Intrinsic::amdgcn_ds_consume:
+ return selectDSAppendConsume(I, false);
+ default: {
return selectImpl(I, *CoverageInfo);
}
+ }
}
bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
+ if (selectImpl(I, *CoverageInfo))
+ return true;
+
MachineBasicBlock *BB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
@@ -1247,9 +1701,6 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI->getType(DstReg);
const LLT SrcTy = MRI->getType(SrcReg);
- if (!DstTy.isScalar())
- return false;
-
const LLT S1 = LLT::scalar(1);
const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
@@ -1264,6 +1715,8 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
return false;
}
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = SrcTy.getSizeInBits();
@@ -1271,6 +1724,73 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
= TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
const TargetRegisterClass *DstRC
= TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
+ if (!SrcRC || !DstRC)
+ return false;
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ return false;
+ }
+
+ if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ Register LoReg = MRI->createVirtualRegister(DstRC);
+ Register HiReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
+ .addReg(SrcReg, 0, AMDGPU::sub0);
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
+ .addReg(SrcReg, 0, AMDGPU::sub1);
+
+ if (IsVALU && STI.hasSDWA()) {
+ // Write the low 16-bits of the high element into the high 16-bits of the
+ // low element.
+ MachineInstr *MovSDWA =
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+ .addImm(0) // $src0_modifiers
+ .addReg(HiReg) // $src0
+ .addImm(0) // $clamp
+ .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
+ .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+ .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
+ .addReg(LoReg, RegState::Implicit);
+ MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ } else {
+ Register TmpReg0 = MRI->createVirtualRegister(DstRC);
+ Register TmpReg1 = MRI->createVirtualRegister(DstRC);
+ Register ImmReg = MRI->createVirtualRegister(DstRC);
+ if (IsVALU) {
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
+ .addImm(16)
+ .addReg(HiReg);
+ } else {
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
+ .addReg(HiReg)
+ .addImm(16);
+ }
+
+ unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
+ unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
+ unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
+
+ BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
+ .addImm(0xffff);
+ BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
+ .addReg(LoReg)
+ .addReg(ImmReg);
+ BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
+ .addReg(TmpReg0)
+ .addReg(TmpReg1);
+ }
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ if (!DstTy.isScalar())
+ return false;
if (SrcSize > 32) {
int SubRegIdx = sizeToSubRegIndex(DstSize);
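Both the SDWA path and the shift/and/or fallback in the <2 x s32> to <2 x s16> truncation above compute the same 32-bit value. A standalone model of that packing:

#include <cstdint>

uint32_t packTrunc2x16(uint32_t Lo32, uint32_t Hi32) {
  uint32_t Tmp0 = Hi32 << 16;     // V_LSHLREV_B32_e64 / S_LSHL_B32
  uint32_t Tmp1 = Lo32 & 0xffff;  // *_AND_B32 with 0xffff
  return Tmp0 | Tmp1;             // *_OR_B32
}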
@@ -1279,17 +1799,17 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
// Deal with weird cases where the class only partially supports the subreg
// index.
- SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
- if (!SrcRC)
+ const TargetRegisterClass *SrcWithSubRC
+ = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
+ if (!SrcWithSubRC)
return false;
- I.getOperand(1).setSubReg(SubRegIdx);
- }
+ if (SrcWithSubRC != SrcRC) {
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
+ return false;
+ }
- if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
- !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
- LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
- return false;
+ I.getOperand(1).setSubReg(SubRegIdx);
}
I.setDesc(TII.get(TargetOpcode::COPY));
@@ -1318,7 +1838,8 @@ const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
}
bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
- bool Signed = I.getOpcode() == AMDGPU::G_SEXT;
+ bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
+ bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock &MBB = *I.getParent();
const Register DstReg = I.getOperand(0).getReg();
@@ -1326,7 +1847,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
const LLT DstTy = MRI->getType(DstReg);
const LLT SrcTy = MRI->getType(SrcReg);
- const unsigned SrcSize = SrcTy.getSizeInBits();
+ const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
+ I.getOperand(2).getImm() : SrcTy.getSizeInBits();
const unsigned DstSize = DstTy.getSizeInBits();
if (!DstTy.isScalar())
return false;
@@ -1362,7 +1884,9 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
}
if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
- if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
+ const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
+ AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
+ if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
return false;
if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
@@ -1378,13 +1902,15 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
// Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
- if (DstSize > 32 && SrcSize <= 32) {
+ if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
// We need a 64-bit register source, but the high bits don't matter.
Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
+
BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
- .addReg(SrcReg)
+ .addReg(SrcReg, 0, SubReg)
.addImm(AMDGPU::sub0)
.addReg(UndefReg)
.addImm(AMDGPU::sub1);
@@ -1487,6 +2013,103 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
}
+bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
+ // Only manually handle the f64 SGPR case.
+ //
+ // FIXME: This is a workaround for 2.5 different tablegen problems. Because
+ // the bit ops theoretically have a second result due to the implicit def of
+ // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
+ // that is easy by disabling the check. The result works, but uses a
+ // nonsensical sreg32orlds_and_sreg_1 regclass.
+ //
+ // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
+ // the variadic REG_SEQUENCE operands.
+
+ Register Dst = MI.getOperand(0).getReg();
+ const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
+ if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
+ MRI->getType(Dst) != LLT::scalar(64))
+ return false;
+
+ Register Src = MI.getOperand(1).getReg();
+ MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
+ if (Fabs)
+ Src = Fabs->getOperand(1).getReg();
+
+ if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
+ !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
+ return false;
+
+ MachineBasicBlock *BB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
+ .addReg(Src, 0, AMDGPU::sub0);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
+ .addReg(Src, 0, AMDGPU::sub1);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
+ .addImm(0x80000000);
+
+ // Set or toggle sign bit.
+ unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
+ BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
+ .addReg(HiReg)
+ .addReg(ConstReg);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
+ .addReg(LoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(OpReg)
+ .addImm(AMDGPU::sub1);
+ MI.eraseFromParent();
+ return true;
+}
+
+// FIXME: This is a workaround for the same tablegen problems as G_FNEG
+bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
+ Register Dst = MI.getOperand(0).getReg();
+ const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
+ if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
+ MRI->getType(Dst) != LLT::scalar(64))
+ return false;
+
+ Register Src = MI.getOperand(1).getReg();
+ MachineBasicBlock *BB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
+ !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
+ return false;
+
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
+ .addReg(Src, 0, AMDGPU::sub0);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
+ .addReg(Src, 0, AMDGPU::sub1);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
+ .addImm(0x7fffffff);
+
+ // Clear sign bit.
+ // TODO: Should this use S_BITSET0_*?
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
+ .addReg(HiReg)
+ .addReg(ConstReg);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
+ .addReg(LoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(OpReg)
+ .addImm(AMDGPU::sub1);
+
+ MI.eraseFromParent();
+ return true;
+}
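Both selectG_FNEG and selectG_FABS only touch the high 32 bits of the 64-bit value. A standalone model of the three sign-bit operations, assuming IEEE-754 binary64 doubles on the host:

#include <cstdint>
#include <cstring>

double editSign(double X, bool Neg /*fneg*/, bool Abs /*fabs*/) {
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  uint32_t Hi = uint32_t(Bits >> 32);
  if (Neg && Abs)
    Hi |= 0x80000000u;  // fneg(fabs(x)): S_OR_B32 with the sign bit
  else if (Neg)
    Hi ^= 0x80000000u;  // fneg(x): S_XOR_B32
  else if (Abs)
    Hi &= 0x7fffffffu;  // fabs(x): S_AND_B32
  Bits = (Bits & 0xffffffffull) | (uint64_t(Hi) << 32); // REG_SEQUENCE
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}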
+
static bool isConstant(const MachineInstr &MI) {
return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}
@@ -1573,6 +2196,65 @@ bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
return selectImpl(I, *CoverageInfo);
}
+// TODO: No rtn optimization.
+bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
+ MachineInstr &MI) const {
+ Register PtrReg = MI.getOperand(1).getReg();
+ const LLT PtrTy = MRI->getType(PtrReg);
+ if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+ STI.useFlatForGlobal())
+ return selectImpl(MI, *CoverageInfo);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ const LLT Ty = MRI->getType(DstReg);
+ const bool Is64 = Ty.getSizeInBits() == 64;
+ const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+ Register TmpReg = MRI->createVirtualRegister(
+ Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *BB = MI.getParent();
+
+ Register VAddr, RSrcReg, SOffset;
+ int64_t Offset = 0;
+
+ unsigned Opcode;
+ if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
+ Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
+ AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
+ } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
+ RSrcReg, SOffset, Offset)) {
+ Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
+ AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
+ } else
+ return selectImpl(MI, *CoverageInfo);
+
+ auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
+ .addReg(MI.getOperand(2).getReg());
+
+ if (VAddr)
+ MIB.addReg(VAddr);
+
+ MIB.addReg(RSrcReg);
+ if (SOffset)
+ MIB.addReg(SOffset);
+ else
+ MIB.addImm(0);
+
+ MIB.addImm(Offset);
+ MIB.addImm(0); // slc
+ MIB.cloneMemRefs(MI);
+
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(TmpReg, RegState::Kill, SubReg);
+
+ MI.eraseFromParent();
+
+ MRI->setRegClass(
+ DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineOperand &CondOp = I.getOperand(0);
@@ -1619,7 +2301,8 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
return true;
}
-bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
+bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE(
+ MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
@@ -1631,67 +2314,134 @@ bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
}
-bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const {
- uint64_t Align = I.getOperand(2).getImm();
- const uint64_t Mask = ~((UINT64_C(1) << Align) - 1);
-
- MachineBasicBlock *BB = I.getParent();
-
+bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(1).getReg();
+ Register MaskReg = I.getOperand(2).getReg();
+ LLT Ty = MRI->getType(DstReg);
+ LLT MaskTy = MRI->getType(MaskReg);
const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
+ const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
+ if (DstRB != SrcRB) // Should only happen for hand written MIR.
+ return false;
+
unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
- unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
const TargetRegisterClass &RegRC
= IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
- LLT Ty = MRI->getType(DstReg);
-
const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
*MRI);
const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
*MRI);
+ const TargetRegisterClass *MaskRC =
+ TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
+
if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
- !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
+ !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
+ !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
return false;
+ MachineBasicBlock *BB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- Register ImmReg = MRI->createVirtualRegister(&RegRC);
- BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg)
- .addImm(Mask);
-
if (Ty.getSizeInBits() == 32) {
+ assert(MaskTy.getSizeInBits() == 32 &&
+ "ptrmask should have been narrowed during legalize");
+
BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
.addReg(SrcReg)
- .addReg(ImmReg);
+ .addReg(MaskReg);
I.eraseFromParent();
return true;
}
Register HiReg = MRI->createVirtualRegister(&RegRC);
Register LoReg = MRI->createVirtualRegister(&RegRC);
- Register MaskLo = MRI->createVirtualRegister(&RegRC);
+ // Extract the subregisters from the source pointer.
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
.addReg(SrcReg, 0, AMDGPU::sub0);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
.addReg(SrcReg, 0, AMDGPU::sub1);
- BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo)
- .addReg(LoReg)
- .addReg(ImmReg);
+ Register MaskedLo, MaskedHi;
+
+ // Try to avoid emitting a bit operation when we only need to touch half of
+ // the 64-bit pointer.
+ APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
+
+ const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
+ const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
+ if ((MaskOnes & MaskLo32) == MaskLo32) {
+ // If all the bits in the low half are 1, we only need a copy for it.
+ MaskedLo = LoReg;
+ } else {
+ // Extract the mask subregister and apply the and.
+ Register MaskLo = MRI->createVirtualRegister(&RegRC);
+ MaskedLo = MRI->createVirtualRegister(&RegRC);
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
+ .addReg(MaskReg, 0, AMDGPU::sub0);
+ BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
+ .addReg(LoReg)
+ .addReg(MaskLo);
+ }
+
+ if ((MaskOnes & MaskHi32) == MaskHi32) {
+ // If all the bits in the high half are 1, we only need a copy for it.
+ MaskedHi = HiReg;
+ } else {
+ Register MaskHi = MRI->createVirtualRegister(&RegRC);
+ MaskedHi = MRI->createVirtualRegister(&RegRC);
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
+ .addReg(MaskReg, 0, AMDGPU::sub1);
+ BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
+ .addReg(HiReg)
+ .addReg(MaskHi);
+ }
+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
- .addReg(MaskLo)
+ .addReg(MaskedLo)
.addImm(AMDGPU::sub0)
- .addReg(HiReg)
+ .addReg(MaskedHi)
.addImm(AMDGPU::sub1);
I.eraseFromParent();
return true;
}
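The ptrmask lowering applies the mask per 32-bit half and skips the AND entirely for a half whose mask bits are known to be all ones. A standalone sketch, where KnownOnes stands in for what KnownBits reports for the mask register:

#include <cstdint>

uint64_t ptrMask64(uint64_t Ptr, uint64_t Mask, uint64_t KnownOnes) {
  uint32_t Lo = uint32_t(Ptr), Hi = uint32_t(Ptr >> 32);
  if ((KnownOnes & 0xffffffffull) != 0xffffffffull)
    Lo &= uint32_t(Mask);        // S_AND_B32 / V_AND_B32 on sub0
  if ((KnownOnes >> 32) != 0xffffffffull)
    Hi &= uint32_t(Mask >> 32);  // ... and on sub1
  return uint64_t(Lo) | (uint64_t(Hi) << 32); // REG_SEQUENCE
}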
+/// Return the register to use for the index value, and the subregister to use
+/// for the indirectly accessed register.
+static std::pair<Register, unsigned>
+computeIndirectRegIndex(MachineRegisterInfo &MRI,
+ const SIRegisterInfo &TRI,
+ const TargetRegisterClass *SuperRC,
+ Register IdxReg,
+ unsigned EltSize) {
+ Register IdxBaseReg;
+ int Offset;
+ MachineInstr *Unused;
+
+ std::tie(IdxBaseReg, Offset, Unused)
+ = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
+ if (IdxBaseReg == AMDGPU::NoRegister) {
+ // This will happen if the index is a known constant. This should ordinarily
+ // be legalized out, but handle it as a register just in case.
+ assert(Offset == 0);
+ IdxBaseReg = IdxReg;
+ }
+
+ ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
+
+ // Skip out of bounds offsets, or else we would end up using an undefined
+ // register.
+ if (static_cast<unsigned>(Offset) >= SubRegs.size())
+ return std::make_pair(IdxReg, SubRegs[0]);
+ return std::make_pair(IdxBaseReg, SubRegs[Offset]);
+}
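+
+// Illustrative example for computeIndirectRegIndex (hypothetical values): for
+// a 256-bit register class split into 32-bit parts (EltSize == 4) and an
+// index defined as (%base + 3), this returns {%base, sub3}, folding the
+// constant part of the index into the subregister. An offset of 8 or more
+// would index past the last split part, so the original index register is
+// returned with sub0 instead.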
+
bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
MachineInstr &MI) const {
Register DstReg = MI.getOperand(0).getReg();
@@ -1714,6 +2464,8 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
*MRI);
const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
*MRI);
+ if (!SrcRC || !DstRC)
+ return false;
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
!RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
@@ -1723,7 +2475,9 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
const DebugLoc &DL = MI.getDebugLoc();
const bool Is64 = DstTy.getSizeInBits() == 64;
- unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+ unsigned SubReg;
+ std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
+ DstTy.getSizeInBits() / 8);
if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
if (DstTy.getSizeInBits() != 32 && !Is64)
@@ -1766,6 +2520,237 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
return true;
}
+// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
+bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
+ MachineInstr &MI) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register VecReg = MI.getOperand(1).getReg();
+ Register ValReg = MI.getOperand(2).getReg();
+ Register IdxReg = MI.getOperand(3).getReg();
+
+ LLT VecTy = MRI->getType(DstReg);
+ LLT ValTy = MRI->getType(ValReg);
+ unsigned VecSize = VecTy.getSizeInBits();
+ unsigned ValSize = ValTy.getSizeInBits();
+
+ const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
+ const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
+ const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
+
+ assert(VecTy.getElementType() == ValTy);
+
+ // The index must be scalar. If it wasn't, RegBankSelect should have moved
+ // this into a waterfall loop.
+ if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
+ *MRI);
+ const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
+ *MRI);
+
+ if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
+ !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
+ !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
+ return false;
+
+ if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
+ return false;
+
+ unsigned SubReg;
+ std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
+ ValSize / 8);
+
+ const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
+ STI.useVGPRIndexMode();
+
+ MachineBasicBlock *BB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ if (IndexMode) {
+ BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addReg(IdxReg)
+ .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
+ } else {
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(IdxReg);
+ }
+
+ const MCInstrDesc &RegWriteOp
+ = TII.getIndirectRegWritePseudo(VecSize, ValSize,
+ VecRB->getID() == AMDGPU::SGPRRegBankID);
+ BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
+ .addReg(VecReg)
+ .addReg(ValReg)
+ .addImm(SubReg);
+
+ if (IndexMode)
+ BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
+
+ MI.eraseFromParent();
+ return true;
+}
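+
+// Illustrative sequence for the selection above (hypothetical vregs), for a
+// VGPR vector on a subtarget that uses VGPR index mode:
+//   S_SET_GPR_IDX_ON %idx, dst_enable
+//   %dst = <indirect reg-write pseudo> %vec, %val, <subreg>
+//   S_SET_GPR_IDX_OFF
+// Without index mode (and for SGPR vectors), the index is copied into $m0
+// before the pseudo instead.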
+
+static bool isZeroOrUndef(int X) {
+ return X == 0 || X == -1;
+}
+
+static bool isOneOrUndef(int X) {
+ return X == 1 || X == -1;
+}
+
+static bool isZeroOrOneOrUndef(int X) {
+ return X == 0 || X == 1 || X == -1;
+}
+
+// Normalize a VOP3P shuffle mask to refer to the low/high half of a single
+// 32-bit register.
+static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
+ ArrayRef<int> Mask) {
+ NewMask[0] = Mask[0];
+ NewMask[1] = Mask[1];
+ if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
+ return Src0;
+
+ assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
+ assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
+
+ // Shift the mask inputs to be 0/1.
+ NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
+ NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
+ return Src1;
+}
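+
+// Illustrative examples: a shuffle mask of <3, 2> reads only Src1, so this
+// returns Src1 with the mask rebased to <1, 0>; a mask of <0, -1> already
+// refers to Src0 and is returned with the mask unchanged.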
+
+// This is only legal with VOP3P instructions as an aid to op_sel matching.
+bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
+ MachineInstr &MI) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src0Reg = MI.getOperand(1).getReg();
+ Register Src1Reg = MI.getOperand(2).getReg();
+ ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
+
+ const LLT V2S16 = LLT::vector(2, 16);
+ if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
+ return false;
+
+ if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
+ return false;
+
+ assert(ShufMask.size() == 2);
+ assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+ const TargetRegisterClass &RC = IsVALU ?
+ AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
+
+ // Handle the degenerate case, which should have been folded out.
+ if (ShufMask[0] == -1 && ShufMask[1] == -1) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
+
+ MI.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, RC, *MRI);
+ }
+
+ // A legal VOP3P mask only reads one of the sources.
+ int Mask[2];
+ Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
+
+ if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
+ !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
+ return false;
+
+ // TODO: This also should have been folded out
+ if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(SrcVec);
+
+ MI.eraseFromParent();
+ return true;
+ }
+
+ if (Mask[0] == 1 && Mask[1] == -1) {
+ if (IsVALU) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
+ .addImm(16)
+ .addReg(SrcVec);
+ } else {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
+ .addReg(SrcVec)
+ .addImm(16);
+ }
+ } else if (Mask[0] == -1 && Mask[1] == 0) {
+ if (IsVALU) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
+ .addImm(16)
+ .addReg(SrcVec);
+ } else {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
+ .addReg(SrcVec)
+ .addImm(16);
+ }
+ } else if (Mask[0] == 0 && Mask[1] == 0) {
+ if (IsVALU) {
+ // Write low half of the register into the high half.
+ MachineInstr *MovSDWA =
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+ .addImm(0) // $src0_modifiers
+ .addReg(SrcVec) // $src0
+ .addImm(0) // $clamp
+ .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
+ .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+ .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
+ .addReg(SrcVec, RegState::Implicit);
+ MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ } else {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
+ .addReg(SrcVec)
+ .addReg(SrcVec);
+ }
+ } else if (Mask[0] == 1 && Mask[1] == 1) {
+ if (IsVALU) {
+ // Write high half of the register into the low half.
+ MachineInstr *MovSDWA =
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+ .addImm(0) // $src0_modifiers
+ .addReg(SrcVec) // $src0
+ .addImm(0) // $clamp
+ .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel
+ .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+ .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel
+ .addReg(SrcVec, RegState::Implicit);
+ MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ } else {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
+ .addReg(SrcVec)
+ .addReg(SrcVec);
+ }
+ } else if (Mask[0] == 1 && Mask[1] == 0) {
+ if (IsVALU) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg)
+ .addReg(SrcVec)
+ .addReg(SrcVec)
+ .addImm(16);
+ } else {
+ Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
+ .addReg(SrcVec)
+ .addImm(16);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
+ .addReg(TmpReg)
+ .addReg(SrcVec);
+ }
+ } else
+ llvm_unreachable("all shuffle masks should be handled");
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
if (I.isPHI())
return selectPHI(I);
@@ -1780,9 +2765,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_AND:
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR:
- if (selectG_AND_OR_XOR(I))
+ if (selectImpl(I, *CoverageInfo))
return true;
- return selectImpl(I, *CoverageInfo);
+ return selectG_AND_OR_XOR(I);
case TargetOpcode::G_ADD:
case TargetOpcode::G_SUB:
if (selectImpl(I, *CoverageInfo))
@@ -1800,6 +2785,14 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_CONSTANT:
case TargetOpcode::G_FCONSTANT:
return selectG_CONSTANT(I);
+ case TargetOpcode::G_FNEG:
+ if (selectImpl(I, *CoverageInfo))
+ return true;
+ return selectG_FNEG(I);
+ case TargetOpcode::G_FABS:
+ if (selectImpl(I, *CoverageInfo))
+ return true;
+ return selectG_FABS(I);
case TargetOpcode::G_EXTRACT:
return selectG_EXTRACT(I);
case TargetOpcode::G_MERGE_VALUES:
@@ -1808,6 +2801,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectG_MERGE_VALUES(I);
case TargetOpcode::G_UNMERGE_VALUES:
return selectG_UNMERGE_VALUES(I);
+ case TargetOpcode::G_BUILD_VECTOR_TRUNC:
+ return selectG_BUILD_VECTOR_TRUNC(I);
case TargetOpcode::G_PTR_ADD:
return selectG_PTR_ADD(I);
case TargetOpcode::G_IMPLICIT_DEF:
@@ -1836,6 +2831,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UMAX:
case TargetOpcode::G_ATOMICRMW_FADD:
return selectG_LOAD_ATOMICRMW(I);
+ case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
+ return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
case TargetOpcode::G_SELECT:
return selectG_SELECT(I);
case TargetOpcode::G_STORE:
@@ -1845,17 +2842,34 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_SEXT:
case TargetOpcode::G_ZEXT:
case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_SEXT_INREG:
if (selectImpl(I, *CoverageInfo))
return true;
return selectG_SZA_EXT(I);
case TargetOpcode::G_BRCOND:
return selectG_BRCOND(I);
case TargetOpcode::G_FRAME_INDEX:
- return selectG_FRAME_INDEX(I);
- case TargetOpcode::G_PTR_MASK:
- return selectG_PTR_MASK(I);
+ case TargetOpcode::G_GLOBAL_VALUE:
+ return selectG_FRAME_INDEX_GLOBAL_VALUE(I);
+ case TargetOpcode::G_PTRMASK:
+ return selectG_PTRMASK(I);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return selectG_EXTRACT_VECTOR_ELT(I);
+ case TargetOpcode::G_INSERT_VECTOR_ELT:
+ return selectG_INSERT_VECTOR_ELT(I);
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ return selectG_SHUFFLE_VECTOR(I);
+ case AMDGPU::G_AMDGPU_ATOMIC_INC:
+ case AMDGPU::G_AMDGPU_ATOMIC_DEC:
+ initM0(I);
+ return selectImpl(I, *CoverageInfo);
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
+ const AMDGPU::ImageDimIntrinsicInfo *Intr
+ = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
+ assert(Intr && "not an image intrinsic with image pseudo");
+ return selectImageIntrinsic(I, Intr);
+ }
default:
return selectImpl(I, *CoverageInfo);
}
@@ -1871,15 +2885,16 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
}
std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3ModsImpl(
- Register Src) const {
+AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const {
+ Register Src = Root.getReg();
+ Register OrigSrc = Src;
unsigned Mods = 0;
- MachineInstr *MI = MRI->getVRegDef(Src);
+ MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
Src = MI->getOperand(1).getReg();
Mods |= SISrcMods::NEG;
- MI = MRI->getVRegDef(Src);
+ MI = getDefIgnoringCopies(Src, *MRI);
}
if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
@@ -1887,6 +2902,20 @@ AMDGPUInstructionSelector::selectVOP3ModsImpl(
Mods |= SISrcMods::ABS;
}
+ if (Mods != 0 &&
+ RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
+ MachineInstr *UseMI = Root.getParent();
+
+ // If we looked through copies to find source modifiers on an SGPR operand,
+ // we now have an SGPR register source. To avoid potentially violating the
+ // constant bus restriction, we need to insert a copy to a VGPR.
+ Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
+ BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
+ TII.get(AMDGPU::COPY), VGPRSrc)
+ .addReg(Src);
+ Src = VGPRSrc;
+ }
+
return std::make_pair(Src, Mods);
}
@@ -1904,7 +2933,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
@@ -1927,7 +2956,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
@@ -1936,12 +2965,48 @@ AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
+AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
+ Register Reg = Root.getReg();
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
+ if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
+ Def->getOpcode() == AMDGPU::G_FABS))
+ return {};
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
+ }};
+}
+
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectVOP3PModsImpl(
+ Register Src, const MachineRegisterInfo &MRI) const {
+ unsigned Mods = 0;
+ MachineInstr *MI = MRI.getVRegDef(Src);
+
+ if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
+ // It's possible to see an f32 fneg here, but unlikely.
+ // TODO: Treat f32 fneg as only high bit.
+ MRI.getType(Src) == LLT::vector(2, 16)) {
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ Src = MI->getOperand(1).getReg();
+ MI = MRI.getVRegDef(Src);
+ }
+
+ // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
+
+ // Packed instructions do not have abs modifiers.
+ Mods |= SISrcMods::OP_SEL_1;
+
+ return std::make_pair(Src, Mods);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI
+ = Root.getParent()->getParent()->getParent()->getRegInfo();
+
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
- if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
- return None;
+ std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
@@ -1950,12 +3015,16 @@ AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const {
- // FIXME: Handle clamp and op_sel
+AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
+ if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
+ return None;
+
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // clamp
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
}};
}
@@ -1977,15 +3046,15 @@ AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
return None;
const GEPInfo &GEPInfo = AddrInfo[0];
-
- if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm))
+ Optional<int64_t> EncodedImm =
+ AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
+ if (!EncodedImm)
return None;
unsigned PtrReg = GEPInfo.SgprParts[0];
- int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
}};
}
@@ -1998,14 +3067,15 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
return None;
const GEPInfo &GEPInfo = AddrInfo[0];
- unsigned PtrReg = GEPInfo.SgprParts[0];
- int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
- if (!isUInt<32>(EncodedImm))
+ Register PtrReg = GEPInfo.SgprParts[0];
+ Optional<int64_t> EncodedImm =
+ AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
+ if (!EncodedImm)
return None;
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
}};
}
@@ -2023,14 +3093,15 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
return None;
const GEPInfo &GEPInfo = AddrInfo[0];
- if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
+ // SGPR offset is unsigned.
+ if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
return None;
// If we make it this far we have a load with a 32-bit immediate offset.
// It is OK to select this using an SGPR offset, because we have already
// failed trying to select this load into one of the _IMM variants since
// the _IMM patterns are considered before the _SGPR patterns.
- unsigned PtrReg = GEPInfo.SgprParts[0];
+ Register PtrReg = GEPInfo.SgprParts[0];
Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
.addImm(GEPInfo.Imm);
@@ -2099,7 +3170,8 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
int64_t Offset = 0;
- if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) {
+ if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
+ Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
// TODO: Should this be inside the render function? The iterator seems to
@@ -2118,17 +3190,17 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
const MachineMemOperand *MMO = *MI->memoperands_begin();
const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
- Register SOffsetReg = isStackPtrRelative(PtrInfo)
- ? Info->getStackPtrOffsetReg()
- : Info->getScratchWaveOffsetReg();
- MIB.addReg(SOffsetReg);
+ if (isStackPtrRelative(PtrInfo))
+ MIB.addReg(Info->getStackPtrOffsetReg());
+ else
+ MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(Offset & 4095);
}}};
}
- assert(Offset == 0);
+ assert(Offset == 0 || Offset == -1);
// Try to fold a frame index directly into the MUBUF vaddr field, and any
// offsets.
@@ -2158,13 +3230,6 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
}
}
- // If we don't know this private access is a local stack object, it needs to
- // be relative to the entry point's scratch wave offset register.
- // TODO: Should split large offsets that don't fit like above.
- // TODO: Don't use scratch wave offset just because the offset didn't fit.
- Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
- : Info->getScratchWaveOffsetReg();
-
return {{[=](MachineInstrBuilder &MIB) { // rsrc
MIB.addReg(Info->getScratchRSrcReg());
},
@@ -2175,15 +3240,22 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
MIB.addReg(VAddr);
},
[=](MachineInstrBuilder &MIB) { // soffset
- MIB.addReg(SOffset);
+ // If we don't know this private access is a local stack object, it
+ // needs to be relative to the entry point's scratch wave offset.
+ // TODO: Should split large offsets that don't fit like above.
+ // TODO: Don't use scratch wave offset just because the offset
+ // didn't fit.
+ if (!Info->isEntryFunction() && FI.hasValue())
+ MIB.addReg(Info->getStackPtrOffsetReg());
+ else
+ MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(Offset);
}}};
}
-bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI,
- const MachineOperand &Base,
+bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
int64_t Offset,
unsigned OffsetBits) const {
if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
@@ -2195,7 +3267,7 @@ bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI,
// On Southern Islands instruction with a negative base value and an offset
// don't seem to work.
- return KnownBits->signBitIsZero(Base.getReg());
+ return KnownBits->signBitIsZero(Base);
}
InstructionSelector::ComplexRendererFns
@@ -2214,68 +3286,485 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset(
const MachineMemOperand *MMO = *MI->memoperands_begin();
const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
- Register SOffsetReg = isStackPtrRelative(PtrInfo)
- ? Info->getStackPtrOffsetReg()
- : Info->getScratchWaveOffsetReg();
return {{
- [=](MachineInstrBuilder &MIB) {
+ [=](MachineInstrBuilder &MIB) { // rsrc
MIB.addReg(Info->getScratchRSrcReg());
- }, // rsrc
- [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ if (isStackPtrRelative(PtrInfo))
+ MIB.addReg(Info->getStackPtrOffsetReg());
+ else
+ MIB.addImm(0);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
}};
}
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
+ const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
+ if (!RootDef)
+ return std::make_pair(Root.getReg(), 0);
+
+ int64_t ConstAddr = 0;
+
+ Register PtrBase;
+ int64_t Offset;
+ std::tie(PtrBase, Offset) =
+ getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
+
+ if (Offset) {
+ if (isDSOffsetLegal(PtrBase, Offset, 16)) {
+ // (add n0, c0)
+ return std::make_pair(PtrBase, Offset);
+ }
+ } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
+ // TODO
+
+ } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
+ // TODO
+
+ }
+
+ return std::make_pair(Root.getReg(), 0);
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
+ Register Reg;
+ unsigned Offset;
+ std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
+ Register Reg;
+ unsigned Offset;
+ std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
+ }};
+}
+
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
- if (!RootDef) {
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
- }};
- }
+ if (!RootDef)
+ return std::make_pair(Root.getReg(), 0);
int64_t ConstAddr = 0;
- if (isBaseWithConstantOffset(Root, *MRI)) {
- const MachineOperand &LHS = RootDef->getOperand(1);
- const MachineOperand &RHS = RootDef->getOperand(2);
- const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
- const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
- if (LHSDef && RHSDef) {
- int64_t PossibleOffset =
- RHSDef->getOperand(1).getCImm()->getSExtValue();
- if (isDSOffsetLegal(*MRI, LHS, PossibleOffset, 16)) {
- // (add n0, c0)
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(PossibleOffset); }
- }};
- }
+
+ Register PtrBase;
+ int64_t Offset;
+ std::tie(PtrBase, Offset) =
+ getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
+
+ if (Offset) {
+ int64_t DWordOffset0 = Offset / 4;
+ int64_t DWordOffset1 = DWordOffset0 + 1;
+ if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
+ // (add n0, c0)
+ return std::make_pair(PtrBase, DWordOffset0);
}
} else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
+ // TODO
+ } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
+ // TODO
+ }
- } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
+ return std::make_pair(Root.getReg(), 0);
+}
+
+/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
+/// the base value with the constant offset. There may be intervening copies
+/// between \p Root and the identified constant. Returns \p Root, 0 if this does
+/// not match the pattern.
+std::pair<Register, int64_t>
+AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
+ Register Root, const MachineRegisterInfo &MRI) const {
+ MachineInstr *RootI = MRI.getVRegDef(Root);
+ if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
+ return {Root, 0};
+
+ MachineOperand &RHS = RootI->getOperand(2);
+ Optional<ValueAndVReg> MaybeOffset
+ = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
+ if (!MaybeOffset)
+ return {Root, 0};
+ return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
+}
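+
+// Illustrative example (hypothetical vregs):
+//   %c:_(s64) = G_CONSTANT i64 16
+//   %p:_(p1) = G_PTR_ADD %base, %c
+// getPtrBaseWithConstantOffset(%p, ...) returns {%base, 16}; the constant may
+// also be found through intervening COPYs. Any other definition of %p yields
+// {%p, 0}.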
+
+static void addZeroImm(MachineInstrBuilder &MIB) {
+ MIB.addImm(0);
+}
+
+/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
+/// BasePtr is not valid, a null base pointer will be used.
+static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ uint32_t FormatLo, uint32_t FormatHi,
+ Register BasePtr) {
+ Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
+
+ B.buildInstr(AMDGPU::S_MOV_B32)
+ .addDef(RSrc2)
+ .addImm(FormatLo);
+ B.buildInstr(AMDGPU::S_MOV_B32)
+ .addDef(RSrc3)
+ .addImm(FormatHi);
+
+ // Build the subregister half that holds the constants before building the
+ // full 128-bit register. If we are building multiple resource descriptors,
+ // this will allow CSEing of the 2-component register.
+ B.buildInstr(AMDGPU::REG_SEQUENCE)
+ .addDef(RSrcHi)
+ .addReg(RSrc2)
+ .addImm(AMDGPU::sub0)
+ .addReg(RSrc3)
+ .addImm(AMDGPU::sub1);
+
+ Register RSrcLo = BasePtr;
+ if (!BasePtr) {
+ RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ B.buildInstr(AMDGPU::S_MOV_B64)
+ .addDef(RSrcLo)
+ .addImm(0);
+ }
+
+ B.buildInstr(AMDGPU::REG_SEQUENCE)
+ .addDef(RSrc)
+ .addReg(RSrcLo)
+ .addImm(AMDGPU::sub0_sub1)
+ .addReg(RSrcHi)
+ .addImm(AMDGPU::sub2_sub3);
+
+ return RSrc;
+}
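+
+// The resulting 128-bit descriptor above is laid out as:
+//   sub0_sub1 = BasePtr (or an S_MOV_B64 0 when no base pointer is given)
+//   sub2 = FormatLo
+//   sub3 = FormatHi
+// with the constant sub2_sub3 half built as its own REG_SEQUENCE so it can be
+// CSE'd across multiple descriptors.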
+
+static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ const SIInstrInfo &TII, Register BasePtr) {
+ uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
+
+ // FIXME: Why are half the "default" bits ignored based on the addressing
+ // mode?
+ return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
+}
+
+static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ const SIInstrInfo &TII, Register BasePtr) {
+ uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
+
+ // FIXME: Why are half the "default" bits ignored based on the addressing
+ // mode?
+ return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
+}
+
+AMDGPUInstructionSelector::MUBUFAddressData
+AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
+ MUBUFAddressData Data;
+ Data.N0 = Src;
+
+ Register PtrBase;
+ int64_t Offset;
+
+ std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
+ if (isUInt<32>(Offset)) {
+ Data.N0 = PtrBase;
+ Data.Offset = Offset;
+ }
+
+ if (MachineInstr *InputAdd
+ = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
+ Data.N2 = InputAdd->getOperand(1).getReg();
+ Data.N3 = InputAdd->getOperand(2).getReg();
+
+ // FIXME: Need to fix extra SGPR->VGPR copies inserted
+ // FIXME: Don't know that this was defined by operand 0
+ //
+ // TODO: Remove this when we have copy folding optimizations after
+ // RegBankSelect.
+ Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
+ Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
+ }
+
+ return Data;
+}
+
+/// Return true if the addr64 MUBUF mode should be used for the given address.
+bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
+ // (ptr_add N2, N3) -> addr64, or
+ // (ptr_add (ptr_add N2, N3), C1) -> addr64
+ if (Addr.N2)
+ return true;
+
+ const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
+ return N0Bank->getID() == AMDGPU::VGPRRegBankID;
+}
+
+/// Split an immediate offset \p ImmOffset depending on whether it fits in the
+/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
+/// component.
+void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
+ MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
+ if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
+ return;
+
+ // Illegal offset, store it in soffset.
+ SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ B.buildInstr(AMDGPU::S_MOV_B32)
+ .addDef(SOffset)
+ .addImm(ImmOffset);
+ ImmOffset = 0;
+}
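+
+// Illustrative example (hypothetical vreg): an immediate offset of 8192 does
+// not fit the MUBUF offset field, so it is materialized as
+//   %soffset:sreg_32 = S_MOV_B32 8192
+// and the instruction's immediate offset becomes 0.
+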
+bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
+ MachineOperand &Root, Register &VAddr, Register &RSrcReg,
+ Register &SOffset, int64_t &Offset) const {
+ // FIXME: Predicates should stop this from reaching here.
+ // The addr64 bit was removed for Volcanic Islands.
+ if (!STI.hasAddr64() || STI.useFlatForGlobal())
+ return false;
+
+ MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
+ if (!shouldUseAddr64(AddrData))
+ return false;
+
+ Register N0 = AddrData.N0;
+ Register N2 = AddrData.N2;
+ Register N3 = AddrData.N3;
+ Offset = AddrData.Offset;
+
+ // Base pointer for the SRD.
+ Register SRDPtr;
+
+ if (N2) {
+ if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
+ assert(N3);
+ if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
+ // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
+ // addr64, and construct the default resource from a 0 address.
+ VAddr = N0;
+ } else {
+ SRDPtr = N3;
+ VAddr = N2;
+ }
+ } else {
+ // N2 is not divergent.
+ SRDPtr = N2;
+ VAddr = N3;
+ }
+ } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
+ // Use the default null pointer in the resource
+ VAddr = N0;
+ } else {
+ // N0 -> offset, or
+ // (N0 + C1) -> offset
+ SRDPtr = N0;
}
+ MachineIRBuilder B(*Root.getParent());
+ RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
+ splitIllegalMUBUFOffset(B, SOffset, Offset);
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
+ MachineOperand &Root, Register &RSrcReg, Register &SOffset,
+ int64_t &Offset) const {
+ MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
+ if (shouldUseAddr64(AddrData))
+ return false;
+
+ // N0 -> offset, or
+ // (N0 + C1) -> offset
+ Register SRDPtr = AddrData.N0;
+ Offset = AddrData.Offset;
+
+ // TODO: Look through extensions for 32-bit soffset.
+ MachineIRBuilder B(*Root.getParent());
+
+ RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
+ splitIllegalMUBUFOffset(B, SOffset, Offset);
+ return true;
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
+ Register VAddr;
+ Register RSrcReg;
+ Register SOffset;
+ int64_t Offset = 0;
+
+ if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
+ return {};
+
+ // FIXME: Use defaulted operands for trailing 0s and remove from the complex
+ // pattern.
return {{
- [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(RSrcReg);
+ },
+ [=](MachineInstrBuilder &MIB) { // vaddr
+ MIB.addReg(VAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ if (SOffset)
+ MIB.addReg(SOffset);
+ else
+ MIB.addImm(0);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(Offset);
+ },
+ addZeroImm, // glc
+ addZeroImm, // slc
+ addZeroImm, // tfe
+ addZeroImm, // dlc
+ addZeroImm // swz
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
+ Register RSrcReg;
+ Register SOffset;
+ int64_t Offset = 0;
+
+ if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
+ return {};
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(RSrcReg);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ if (SOffset)
+ MIB.addReg(SOffset);
+ else
+ MIB.addImm(0);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
+ addZeroImm, // glc
+ addZeroImm, // slc
+ addZeroImm, // tfe
+ addZeroImm, // dlc
+ addZeroImm // swz
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
+ Register VAddr;
+ Register RSrcReg;
+ Register SOffset;
+ int64_t Offset = 0;
+
+ if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
+ return {};
+
+ // FIXME: Use defaulted operands for trailing 0s and remove from the complex
+ // pattern.
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(RSrcReg);
+ },
+ [=](MachineInstrBuilder &MIB) { // vaddr
+ MIB.addReg(VAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ if (SOffset)
+ MIB.addReg(SOffset);
+ else
+ MIB.addImm(0);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(Offset);
+ },
+ addZeroImm // slc
}};
}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
+ Register RSrcReg;
+ Register SOffset;
+ int64_t Offset = 0;
+
+ if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
+ return {};
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(RSrcReg);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ if (SOffset)
+ MIB.addReg(SOffset);
+ else
+ MIB.addImm(0);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
+ addZeroImm // slc
+ }};
+}
+
+/// Get an immediate that must fit in 32 bits, treated as zero-extended.
+static Optional<uint64_t> getConstantZext32Val(Register Reg,
+ const MachineRegisterInfo &MRI) {
+ // getConstantVRegVal sexts any values, so see if that matters.
+ Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
+ if (!OffsetVal || !isInt<32>(*OffsetVal))
+ return None;
+ return Lo_32(*OffsetVal);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
+ Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
+ if (!OffsetVal)
+ return {};
+
+ Optional<int64_t> EncodedImm =
+ AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
+ if (!EncodedImm)
+ return {};
+
+ return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
+ assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
+
+ Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
+ if (!OffsetVal)
+ return {};
+
+ Optional<int64_t> EncodedImm
+ = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
+ if (!EncodedImm)
+ return {};
+
+ return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
+}
+
void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
"Expected G_CONSTANT");
- Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), *MRI);
- assert(CstVal && "Expected constant value");
- MIB.addImm(CstVal.getValue());
+ MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
}
void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
@@ -2316,6 +3805,34 @@ void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
MIB.addImm(MI.getOperand(OpIdx).getImm());
}
+void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(OpIdx >= 0 && "expected to match an immediate operand");
+ MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
+}
+
+void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(OpIdx >= 0 && "expected to match an immediate operand");
+ MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
+}
+
+void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(OpIdx >= 0 && "expected to match an immediate operand");
+ MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
+}
+
+void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(OpIdx >= 0 && "expected to match an immediate operand");
+ MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
+}
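+
+// Taken together, the renderers above unpack a packed cache-policy immediate
+// in which bit 0 is glc, bit 1 is slc, bit 2 is dlc, and bit 3 is swz, as the
+// shift amounts show.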
+
bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
}