Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 770
 1 file changed, 572 insertions(+), 198 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index b7d0f0580cda..3f242fdb6d8e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -80,8 +81,11 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
if (RC) {
const LLT Ty = MRI.getType(Reg);
- return RC->hasSuperClassEq(TRI.getBoolRC()) &&
- Ty.isValid() && Ty.getSizeInBits() == 1;
+ if (!Ty.isValid() || Ty.getSizeInBits() != 1)
+ return false;
+ // G_TRUNC s1 result is never vcc.
+ return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
+ RC->hasSuperClassEq(TRI.getBoolRC());
}
const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
@@ -91,7 +95,7 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
unsigned NewOpc) const {
MI.setDesc(TII.get(NewOpc));
- MI.RemoveOperand(1); // Remove intrinsic ID.
+ MI.removeOperand(1); // Remove intrinsic ID.
MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
MachineOperand &Dst = MI.getOperand(0);
@@ -216,7 +220,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
}
const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
- DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
+ DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
if (!DefRC) {
LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
return false;
@@ -454,6 +458,24 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
return true;
}
+bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
+ MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
+
+ unsigned Opc;
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11)
+ Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
+ : AMDGPU::V_MAD_I64_I32_gfx11_e64;
+ else
+ Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
+ I.setDesc(TII.get(Opc));
+ I.addOperand(*MF, MachineOperand::CreateImm(0));
+ I.addImplicitDefUseOperands(*MF);
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
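The new selectG_AMDGPU_MAD_64_32 maps the generic MAD node onto V_MAD_U64_U32 / V_MAD_I64_I32 (gfx11 uses separately-named encodings of the same instructions). As a minimal sketch of the arithmetic those instructions perform, ignoring the carry-out operand; the names here are illustrative, not LLVM API:

    #include <cstdint>

    // 32 x 32 -> 64-bit multiply with a 64-bit addend, unsigned flavour.
    static uint64_t madU64U32(uint32_t A, uint32_t B, uint64_t C) {
      return static_cast<uint64_t>(A) * B + C;
    }

    // Signed flavour: both 32-bit operands are sign-extended first.
    static int64_t madI64I32(int32_t A, int32_t B, int64_t C) {
      return static_cast<int64_t>(A) * B + C;
    }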
// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
@@ -481,7 +503,7 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
const TargetRegisterClass *SrcRC =
- TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
+ TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
if (!SrcRC)
return false;
unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
@@ -514,7 +536,7 @@ bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
const unsigned DstSize = DstTy.getSizeInBits();
const TargetRegisterClass *DstRC =
- TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
+ TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
if (!DstRC)
return false;
@@ -556,7 +578,7 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
const TargetRegisterClass *SrcRC =
- TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
+ TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
return false;
@@ -630,7 +652,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
MI.setDesc(TII.get(AMDGPU::COPY));
- MI.RemoveOperand(2);
+ MI.removeOperand(2);
return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
}
@@ -643,6 +665,8 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
//
// (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
// => (S_PACK_HH_B32_B16 $src0, $src1)
+ // (build_vector_trunc (lshr_oneuse SReg_32:$src0, 16), $src1)
+ // => (S_PACK_HL_B32_B16 $src0, $src1)
// (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
// => (S_PACK_LH_B32_B16 $src0, $src1)
// (build_vector_trunc $src0, $src1)
@@ -662,14 +686,20 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
} else if (Shift1) {
Opc = AMDGPU::S_PACK_LH_B32_B16;
MI.getOperand(2).setReg(ShiftSrc1);
- } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
- // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
- auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
- .addReg(ShiftSrc0)
- .addImm(16);
+ } else if (Shift0) {
+ if (ConstSrc1 && ConstSrc1->Value == 0) {
+ // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
+ auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
+ .addReg(ShiftSrc0)
+ .addImm(16);
- MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ }
+ if (STI.hasSPackHL()) {
+ Opc = AMDGPU::S_PACK_HL_B32_B16;
+ MI.getOperand(1).setReg(ShiftSrc0);
+ }
}
MI.setDesc(TII.get(Opc));
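The opcode choice in this hunk reduces to a small decision table. A sketch of it follows, leaving out the (lshr $src0, 16), 0 fold into S_LSHR_B32 shown above; the strings stand in for the AMDGPU opcode enums and HasSPackHL mirrors STI.hasSPackHL():

    // "HiShift" means the operand is a one-use (lshr x, 16), i.e. its high
    // 16 bits, rather than its low 16 bits, should be packed.
    static const char *pickSPackOpcode(bool Src0IsHiShift, bool Src1IsHiShift,
                                       bool HasSPackHL) {
      if (Src0IsHiShift && Src1IsHiShift)
        return "S_PACK_HH_B32_B16";
      if (Src1IsHiShift)
        return "S_PACK_LH_B32_B16";
      if (Src0IsHiShift && HasSPackHL)
        return "S_PACK_HL_B32_B16"; // the case this change adds
      return "S_PACK_LL_B32_B16";
    }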
@@ -722,16 +752,16 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
const TargetRegisterClass *DstRC =
- TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
+ TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
if (!DstRC)
return false;
const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
const TargetRegisterClass *Src0RC =
- TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
+ TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
const TargetRegisterClass *Src1RC =
- TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
+ TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
// Deal with weird cases where the class only partially supports the subreg
// index.
@@ -970,6 +1000,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return selectGroupStaticSize(I);
case Intrinsic::returnaddress:
return selectReturnAddress(I);
+ case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
+ case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
+ case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+ return selectSMFMACIntrin(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -1142,7 +1179,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
Optional<ValueAndVReg> Arg =
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
- if (Arg.hasValue()) {
+ if (Arg) {
const int64_t Value = Arg.getValue().Value.getSExtValue();
if (Value == 0) {
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
@@ -1164,8 +1201,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
- const TargetRegisterClass *DstRC =
- TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
+ const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
return false;
@@ -1300,12 +1336,14 @@ bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
unsigned Offset0 = OrderedCountIndex << 2;
- unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
- (Instruction << 4);
+ unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
Offset1 |= (CountDw - 1) << 6;
+ if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
+ Offset1 |= ShaderType << 2;
+
unsigned Offset = Offset0 | (Offset1 << 8);
Register M0Val = MI.getOperand(2).getReg();
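A standalone sketch of the ds_ordered_count offset word assembled above, including the two generation-dependent fields (dword count on GFX10 onwards, shader type only below GFX11); the field layout is copied from this hunk and the function name is illustrative:

    static unsigned packDSOrderedOffset(unsigned OrderedCountIndex,
                                        bool WaveRelease, bool WaveDone,
                                        unsigned ShaderType, unsigned Instruction,
                                        unsigned CountDw, bool IsGFX10Plus,
                                        bool IsGFX11Plus) {
      unsigned Offset0 = OrderedCountIndex << 2;
      unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
      if (IsGFX10Plus)                  // dword count, GFX10 onwards
        Offset1 |= (CountDw - 1) << 6;
      if (!IsGFX11Plus)                 // shader type field dropped on GFX11
        Offset1 |= ShaderType << 2;
      return Offset0 | (Offset1 << 8);  // Offset0 in the low byte
    }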
@@ -1424,23 +1462,7 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
if (HasVSrc) {
Register VSrc = MI.getOperand(1).getReg();
-
- if (STI.needsAlignedVGPRs()) {
- // Add implicit aligned super-reg to force alignment on the data operand.
- Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
- Register NewVR =
- MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
- BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR)
- .addReg(VSrc, 0, MI.getOperand(1).getSubReg())
- .addImm(AMDGPU::sub0)
- .addReg(Undef)
- .addImm(AMDGPU::sub1);
- MIB.addReg(NewVR, 0, AMDGPU::sub0);
- MIB.addReg(NewVR, RegState::Implicit);
- } else {
- MIB.addReg(VSrc);
- }
+ MIB.addReg(VSrc);
if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
return false;
@@ -1449,6 +1471,8 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
MIB.addImm(ImmOffset)
.cloneMemRefs(MI);
+ TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
+
MI.eraseFromParent();
return true;
}
@@ -1523,6 +1547,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
unsigned IntrOpcode = Intr->BaseOpcode;
const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
+ const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
@@ -1627,7 +1652,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
}
// The legalizer preprocessed the intrinsic arguments. If we aren't using
- // NSA, these should have beeen packed into a single value in the first
+ // NSA, these should have been packed into a single value in the first
// address register
const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
@@ -1639,13 +1664,29 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
++NumVDataDwords;
int Opcode = -1;
- if (IsGFX10Plus) {
+ if (IsGFX11Plus) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
+ UseNSA ? AMDGPU::MIMGEncGfx11NSA
+ : AMDGPU::MIMGEncGfx11Default,
+ NumVDataDwords, NumVAddrDwords);
+ } else if (IsGFX10Plus) {
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
UseNSA ? AMDGPU::MIMGEncGfx10NSA
: AMDGPU::MIMGEncGfx10Default,
NumVDataDwords, NumVAddrDwords);
} else {
- if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->hasGFX90AInsts()) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
+ NumVDataDwords, NumVAddrDwords);
+ if (Opcode == -1) {
+ LLVM_DEBUG(
+ dbgs()
+ << "requested image instruction is not supported on this GPU\n");
+ return false;
+ }
+ }
+ if (Opcode == -1 &&
+ STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
NumVDataDwords, NumVAddrDwords);
if (Opcode == -1)
@@ -1703,7 +1744,13 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
if (IsGFX10Plus)
MIB.addImm(IsA16 ? -1 : 0);
- MIB.addImm(TFE); // tfe
+ if (!Subtarget->hasGFX90AInsts()) {
+ MIB.addImm(TFE); // tfe
+ } else if (TFE) {
+ LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
+ return false;
+ }
+
MIB.addImm(LWE); // lwe
if (!IsGFX10Plus)
MIB.addImm(DimInfo->DA ? -1 : 0);
@@ -1743,7 +1790,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
}
MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
+ return true;
}
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
@@ -1770,10 +1819,22 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectSBarrier(I);
case Intrinsic::amdgcn_global_atomic_fadd:
return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
- default: {
- return selectImpl(I, *CoverageInfo);
- }
+ case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_lds:
+ return selectBufferLoadLds(I);
+ case Intrinsic::amdgcn_global_load_lds:
+ return selectGlobalLoadLds(I);
+ case Intrinsic::amdgcn_exp_compr:
+ if (!STI.hasCompressedExport()) {
+ Function &F = I.getMF()->getFunction();
+ DiagnosticInfoUnsupported NoFpRet(
+ F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
+ F.getContext().diagnose(NoFpRet);
+ return false;
+ }
+ break;
}
+ return selectImpl(I, *CoverageInfo);
}
bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
@@ -1872,10 +1933,10 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = SrcTy.getSizeInBits();
- const TargetRegisterClass *SrcRC
- = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
- const TargetRegisterClass *DstRC
- = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
+ const TargetRegisterClass *SrcRC =
+ TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
+ const TargetRegisterClass *DstRC =
+ TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
if (!SrcRC || !DstRC)
return false;
@@ -2014,10 +2075,10 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
return selectCOPY(I);
const TargetRegisterClass *SrcRC =
- TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
+ TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
const TargetRegisterClass *DstRC =
- TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
+ TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
Register UndefReg = MRI->createVirtualRegister(SrcRC);
BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
@@ -2384,65 +2445,6 @@ bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
return selectImpl(I, *CoverageInfo);
}
-// TODO: No rtn optimization.
-bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
- MachineInstr &MI) const {
- Register PtrReg = MI.getOperand(1).getReg();
- const LLT PtrTy = MRI->getType(PtrReg);
- if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
- STI.useFlatForGlobal())
- return selectImpl(MI, *CoverageInfo);
-
- Register DstReg = MI.getOperand(0).getReg();
- const LLT Ty = MRI->getType(DstReg);
- const bool Is64 = Ty.getSizeInBits() == 64;
- const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
- Register TmpReg = MRI->createVirtualRegister(
- Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
-
- const DebugLoc &DL = MI.getDebugLoc();
- MachineBasicBlock *BB = MI.getParent();
-
- Register VAddr, RSrcReg, SOffset;
- int64_t Offset = 0;
-
- unsigned Opcode;
- if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
- Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
- AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
- } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
- RSrcReg, SOffset, Offset)) {
- Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
- AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
- } else
- return selectImpl(MI, *CoverageInfo);
-
- auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
- .addReg(MI.getOperand(2).getReg());
-
- if (VAddr)
- MIB.addReg(VAddr);
-
- MIB.addReg(RSrcReg);
- if (SOffset)
- MIB.addReg(SOffset);
- else
- MIB.addImm(0);
-
- MIB.addImm(Offset);
- MIB.addImm(AMDGPU::CPol::GLC);
- MIB.cloneMemRefs(MI);
-
- BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
- .addReg(TmpReg, RegState::Kill, SubReg);
-
- MI.eraseFromParent();
-
- MRI->setRegClass(
- DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
-}
-
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
if (Reg.isPhysical())
return false;
@@ -2551,7 +2553,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
// Try to avoid emitting a bit operation when we only need to touch half of
// the 64-bit pointer.
- APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
+ APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zext(64);
const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
@@ -2571,12 +2573,10 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
const TargetRegisterClass &RegRC
= IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
- const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
- *MRI);
- const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
- *MRI);
+ const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
+ const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
const TargetRegisterClass *MaskRC =
- TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
+ TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
@@ -2689,10 +2689,10 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
return false;
- const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
- *MRI);
- const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
- *MRI);
+ const TargetRegisterClass *SrcRC =
+ TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
+ const TargetRegisterClass *DstRC =
+ TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
if (!SrcRC || !DstRC)
return false;
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
@@ -2771,10 +2771,10 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
return false;
- const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
- *MRI);
- const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
- *MRI);
+ const TargetRegisterClass *VecRC =
+ TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
+ const TargetRegisterClass *ValRC =
+ TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
!RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
@@ -2867,7 +2867,6 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
return false;
assert(ShufMask.size() == 2);
- assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
@@ -2924,17 +2923,28 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
}
} else if (Mask[0] == 0 && Mask[1] == 0) {
if (IsVALU) {
- // Write low half of the register into the high half.
- MachineInstr *MovSDWA =
- BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
- .addImm(0) // $src0_modifiers
- .addReg(SrcVec) // $src0
- .addImm(0) // $clamp
- .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
- .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
- .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
- .addReg(SrcVec, RegState::Implicit);
- MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ if (STI.hasSDWA()) {
+ // Write low half of the register into the high half.
+ MachineInstr *MovSDWA =
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+ .addImm(0) // $src0_modifiers
+ .addReg(SrcVec) // $src0
+ .addImm(0) // $clamp
+ .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
+ .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+ .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
+ .addReg(SrcVec, RegState::Implicit);
+ MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ } else {
+ Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
+ .addImm(0xFFFF)
+ .addReg(SrcVec);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), DstReg)
+ .addReg(TmpReg)
+ .addImm(16)
+ .addReg(TmpReg);
+ }
} else {
BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
.addReg(SrcVec)
@@ -2942,17 +2952,28 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
}
} else if (Mask[0] == 1 && Mask[1] == 1) {
if (IsVALU) {
- // Write high half of the register into the low half.
- MachineInstr *MovSDWA =
- BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
- .addImm(0) // $src0_modifiers
- .addReg(SrcVec) // $src0
- .addImm(0) // $clamp
- .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel
- .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
- .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel
- .addReg(SrcVec, RegState::Implicit);
- MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ if (STI.hasSDWA()) {
+ // Write high half of the register into the low half.
+ MachineInstr *MovSDWA =
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+ .addImm(0) // $src0_modifiers
+ .addReg(SrcVec) // $src0
+ .addImm(0) // $clamp
+ .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel
+ .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+ .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel
+ .addReg(SrcVec, RegState::Implicit);
+ MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ } else {
+ Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
+ .addImm(16)
+ .addReg(SrcVec);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), DstReg)
+ .addReg(TmpReg)
+ .addImm(16)
+ .addReg(TmpReg);
+ }
} else {
BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
.addReg(SrcVec)
@@ -2965,13 +2986,19 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
.addReg(SrcVec)
.addImm(16);
} else {
- Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
- .addReg(SrcVec)
- .addImm(16);
- BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
- .addReg(TmpReg)
- .addReg(SrcVec);
+ if (STI.hasSPackHL()) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HL_B32_B16), DstReg)
+ .addReg(SrcVec)
+ .addReg(SrcVec);
+ } else {
+ Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
+ .addReg(SrcVec)
+ .addImm(16);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
+ .addReg(TmpReg)
+ .addReg(SrcVec);
+ }
}
} else
llvm_unreachable("all shuffle masks should be handled");
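The VALU changes above handle the two splat shuffles ([0,0] and [1,1]) either with an SDWA mov or, when SDWA is unavailable, with V_AND/V_LSHRREV plus V_LSHL_OR. Stated as plain 32-bit arithmetic, a sketch of what the emitted sequences compute:

    #include <cstdint>

    static uint32_t splatLowHalf(uint32_t Src) {  // shuffle mask [0,0]
      uint32_t Lo = Src & 0xFFFFu;
      return (Lo << 16) | Lo;
    }

    static uint32_t splatHighHalf(uint32_t Src) { // shuffle mask [1,1]
      uint32_t Hi = Src >> 16;
      return (Hi << 16) | Hi;
    }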
@@ -2982,13 +3009,15 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
MachineInstr &MI) const {
- if (STI.hasGFX90AInsts())
+ const Register DefReg = MI.getOperand(0).getReg();
+ LLT DefTy = MRI->getType(DefReg);
+ if (AMDGPU::hasAtomicFaddRtnForTy(STI, DefTy))
return selectImpl(MI, *CoverageInfo);
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
+ if (!MRI->use_nodbg_empty(DefReg)) {
Function &F = MBB->getParent()->getFunction();
DiagnosticInfoUnsupported
NoFpRet(F, "return versions of fp atomics not supported",
@@ -3105,9 +3134,236 @@ bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
+bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
+ unsigned Opc;
+ unsigned Size = MI.getOperand(3).getImm();
+
+ // The struct intrinsic variants add one additional operand over raw.
+ const bool HasVIndex = MI.getNumOperands() == 9;
+ Register VIndex;
+ int OpOffset = 0;
+ if (HasVIndex) {
+ VIndex = MI.getOperand(4).getReg();
+ OpOffset = 1;
+ }
+
+ Register VOffset = MI.getOperand(4 + OpOffset).getReg();
+ Optional<ValueAndVReg> MaybeVOffset =
+ getIConstantVRegValWithLookThrough(VOffset, *MRI);
+ const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
+
+ switch (Size) {
+ default:
+ return false;
+ case 1:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
+ break;
+ case 2:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
+ break;
+ case 4:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
+ break;
+ }
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .add(MI.getOperand(2));
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
+
+ if (HasVIndex && HasVOffset) {
+ Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
+ BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
+ .addReg(VIndex)
+ .addImm(AMDGPU::sub0)
+ .addReg(VOffset)
+ .addImm(AMDGPU::sub1);
+
+ MIB.addReg(IdxReg);
+ } else if (HasVIndex) {
+ MIB.addReg(VIndex);
+ } else if (HasVOffset) {
+ MIB.addReg(VOffset);
+ }
+
+ MIB.add(MI.getOperand(1)); // rsrc
+ MIB.add(MI.getOperand(5 + OpOffset)); // soffset
+ MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
+ unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
+ MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
+ MIB.addImm((Aux >> 3) & 1); // swz
+
+ MachineMemOperand *LoadMMO = *MI.memoperands_begin();
+ MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+ LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
+ MachinePointerInfo StorePtrI = LoadPtrI;
+ StorePtrI.V = nullptr;
+ StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+
+ auto F = LoadMMO->getFlags() &
+ ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+ LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+ Size, LoadMMO->getBaseAlign());
+
+ MachineMemOperand *StoreMMO =
+ MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+ sizeof(int32_t), LoadMMO->getBaseAlign());
+
+ MIB.setMemRefs({LoadMMO, StoreMMO});
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
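The opcode selection in selectBufferLoadLds reduces to picking one of four MUBUF addressing modes. A sketch of that table, with strings standing in for the _UBYTE/_USHORT/_DWORD LDS-load opcode families, and HasVOffset meaning "a voffset operand is present and not known to be zero", as in the code:

    static const char *bufferLoadLdsAddrMode(bool HasVIndex, bool HasVOffset) {
      if (HasVIndex)
        return HasVOffset ? "BOTHEN" : "IDXEN";
      return HasVOffset ? "OFFEN" : "OFFSET";
    }

The auxiliary immediate is then split exactly as in the hunk: the low bits become the cpol operand (Aux & CPol::ALL) and bit 3 becomes the swz operand.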
+/// Match a zero extend from a 32-bit value to 64-bits.
+static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
+ Register ZExtSrc;
+ if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
+ return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
+
+ // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+ if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
+ return false;
+
+ if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
+ return Def->getOperand(1).getReg();
+ }
+
+ return Register();
+}
+
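Why G_MERGE_VALUES (s32 x), (s32 0) is accepted as a zero extend here: merging a 32-bit low half with a zero high half produces the same 64-bit value as zext. A one-liner to that effect, illustrative only:

    #include <cstdint>

    static uint64_t mergeS32(uint32_t Lo, uint32_t Hi) {
      return (static_cast<uint64_t>(Hi) << 32) | Lo;
    }
    // mergeS32(x, 0) == uint64_t(x), i.e. the zero extend of x.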
+bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
+ unsigned Opc;
+ unsigned Size = MI.getOperand(3).getImm();
+
+ switch (Size) {
+ default:
+ return false;
+ case 1:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
+ break;
+ case 2:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
+ break;
+ case 4:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
+ break;
+ }
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .add(MI.getOperand(2));
+
+ Register Addr = MI.getOperand(1).getReg();
+ Register VOffset;
+ // Try to split SAddr and VOffset. Global and LDS pointers share the same
+ // immediate offset, so we cannot use a regular SelectGlobalSAddr().
+ if (!isSGPR(Addr)) {
+ auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
+ if (isSGPR(AddrDef->Reg)) {
+ Addr = AddrDef->Reg;
+ } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
+ Register SAddr =
+ getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
+ if (SAddr && isSGPR(SAddr)) {
+ Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
+ if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+ Addr = SAddr;
+ VOffset = Off;
+ }
+ }
+ }
+ }
+
+ if (isSGPR(Addr)) {
+ Opc = AMDGPU::getGlobalSaddrOp(Opc);
+ if (!VOffset) {
+ VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
+ .addImm(0);
+ }
+ }
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
+ .addReg(Addr);
+
+ if (isSGPR(Addr))
+ MIB.addReg(VOffset);
+
+ MIB.add(MI.getOperand(4)) // offset
+ .add(MI.getOperand(5)); // cpol
+
+ MachineMemOperand *LoadMMO = *MI.memoperands_begin();
+ MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+ LoadPtrI.Offset = MI.getOperand(4).getImm();
+ MachinePointerInfo StorePtrI = LoadPtrI;
+ LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
+ StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+ auto F = LoadMMO->getFlags() &
+ ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+ LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+ Size, LoadMMO->getBaseAlign());
+ MachineMemOperand *StoreMMO =
+ MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+ sizeof(int32_t), Align(4));
+
+ MIB.setMemRefs({LoadMMO, StoreMMO});
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
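The SAddr/VOffset split in selectGlobalLoadLds targets the saddr form of global FLAT addressing, where (assuming the standard address computation for the _SADDR variants) the effective address is the 64-bit scalar base plus a zero-extended 32-bit vector offset plus the immediate offset:

    #include <cstdint>

    static uint64_t globalSaddrAddress(uint64_t SAddrBase, uint32_t VOffset,
                                       int64_t InstOffset) {
      return SAddrBase + VOffset + InstOffset; // VOffset is zero-extended
    }

That is why a (ptr_add sgpr_base, (zext s32 vgpr_off)) chain can be folded directly into the saddr and voffset operands.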
bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
MI.setDesc(TII.get(MI.getOperand(1).getImm()));
- MI.RemoveOperand(1);
+ MI.removeOperand(1);
+ MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
+ unsigned Opc;
+ switch (MI.getIntrinsicID()) {
+ case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
+ Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
+ Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
+ Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
+ Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
+ Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+ Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
+ break;
+ default:
+ llvm_unreachable("unhandled smfmac intrinsic");
+ }
+
+ auto VDst_In = MI.getOperand(4);
+
+ MI.setDesc(TII.get(Opc));
+ MI.removeOperand(4); // VDst_In
+ MI.removeOperand(1); // Intrinsic ID
+ MI.addOperand(VDst_In); // Readd VDst_In to the end
MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
return true;
}
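The operand shuffle in selectSMFMACIntrin is order-sensitive: removing operand 4 before operand 1 keeps the lower index valid. A tiny sketch of the same reordering on a plain operand list (the slot names are a rough guess at the intrinsic layout, not MachineInstr API):

    #include <string>
    #include <vector>

    static void reorderSMFMACOperands(std::vector<std::string> &Ops) {
      // Roughly: { dst, intrinsic_id, src_a, src_b, vdst_in, ... }.
      std::string VDstIn = Ops[4];
      Ops.erase(Ops.begin() + 4); // drop VDst_In first so index 1 is unchanged
      Ops.erase(Ops.begin() + 1); // then drop the intrinsic ID
      Ops.push_back(VDstIn);      // re-add VDst_In at the end
    }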
@@ -3166,6 +3422,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_UADDE:
case TargetOpcode::G_USUBE:
return selectG_UADDO_USUBO_UADDE_USUBE(I);
+ case AMDGPU::G_AMDGPU_MAD_U64_U32:
+ case AMDGPU::G_AMDGPU_MAD_I64_I32:
+ return selectG_AMDGPU_MAD_64_32(I);
case TargetOpcode::G_INTTOPTR:
case TargetOpcode::G_BITCAST:
case TargetOpcode::G_PTRTOINT:
@@ -3226,8 +3485,6 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
return selectG_LOAD_STORE_ATOMICRMW(I);
- case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
- return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
case TargetOpcode::G_SELECT:
return selectG_SELECT(I);
case TargetOpcode::G_TRUNC:
@@ -3286,9 +3543,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
}
-std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
- bool AllowAbs) const {
+std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
+ MachineOperand &Root, bool AllowAbs, bool OpSel, bool ForceVGPR) const {
Register Src = Root.getReg();
Register OrigSrc = Src;
unsigned Mods = 0;
@@ -3305,7 +3561,10 @@ AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
Mods |= SISrcMods::ABS;
}
- if (Mods != 0 &&
+ if (OpSel)
+ Mods |= SISrcMods::OP_SEL_0;
+
+ if ((Mods != 0 || ForceVGPR) &&
RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
MachineInstr *UseMI = Root.getParent();
@@ -3407,7 +3666,7 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PModsImpl(
- Register Src, const MachineRegisterInfo &MRI) const {
+ Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
unsigned Mods = 0;
MachineInstr *MI = MRI.getVRegDef(Src);
@@ -3421,6 +3680,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(
}
// TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
+ (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
// Packed instructions do not have abs modifiers.
Mods |= SISrcMods::OP_SEL_1;
@@ -3444,6 +3704,50 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI
+ = Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const {
+ // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
+ // Value is in Imm operand as i1 sign extended to int64_t.
+ // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
+ assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
+ "expected i1 value");
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ if (Root.getImm() == -1)
+ Mods ^= SISrcMods::NEG;
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
+ MachineOperand &Root) const {
+ assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
+ "expected i1 value");
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ if (Root.getImm() != 0)
+ Mods |= SISrcMods::OP_SEL_0;
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
Register Src;
unsigned Mods;
@@ -3467,6 +3771,36 @@ AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ /* AllowAbs */ false,
+ /* OpSel */ false,
+ /* ForceVGPR */ true);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ /* AllowAbs */ false,
+ /* OpSel */ true,
+ /* ForceVGPR */ true);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
SmallVector<GEPInfo, 4> AddrInfo;
getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
@@ -3594,24 +3928,6 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
}};
}
-/// Match a zero extend from a 32-bit value to 64-bits.
-static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
- Register ZExtSrc;
- if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
- return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
-
- // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
- const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
- if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
- return false;
-
- if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
- return Def->getOperand(1).getReg();
- }
-
- return Register();
-}
-
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
@@ -3631,9 +3947,6 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
ImmOffset = ConstOffset;
} else {
auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
- if (!PtrBaseDef)
- return None;
-
if (isSGPR(PtrBaseDef->Reg)) {
if (ConstOffset > 0) {
// Offset is too large.
@@ -3679,11 +3992,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
}
}
- auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
- if (!AddrDef)
- return None;
-
// Match the variable offset.
+ auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
// Look through the SGPR->VGPR copy.
Register SAddr =
@@ -3749,9 +4059,6 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
}
auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
- if (!AddrDef)
- return None;
-
if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
int FI = AddrDef->MI->getOperand(1).getIndex();
return {{
@@ -3768,8 +4075,7 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
- if (LHSDef && RHSDef &&
- LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
+ if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
isSGPR(RHSDef->Reg)) {
int FI = LHSDef->MI->getOperand(1).getIndex();
MachineInstr &I = *Root.getParent();
@@ -3792,6 +4098,74 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
}};
}
+// Check whether the flat scratch SVS swizzle bug affects this access.
+bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
+ Register VAddr, Register SAddr, uint64_t ImmOffset) const {
+ if (!Subtarget->hasFlatScratchSVSSwizzleBug())
+ return false;
+
+ // The bug affects the swizzling of SVS accesses if there is any carry out
+ // from the two low order bits (i.e. from bit 1 into bit 2) when adding
+ // voffset to (soffset + inst_offset).
+ auto VKnown = KnownBits->getKnownBits(VAddr);
+ auto SKnown = KnownBits::computeForAddSub(
+ true, false, KnownBits->getKnownBits(SAddr),
+ KnownBits::makeConstant(APInt(32, ImmOffset)));
+ uint64_t VMax = VKnown.getMaxValue().getZExtValue();
+ uint64_t SMax = SKnown.getMaxValue().getZExtValue();
+ return (VMax & 3) + (SMax & 3) >= 4;
+}
+
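The test in checkFlatScratchSVSSwizzleBug boils down to a conservative carry check: the KnownBits maxima have every unknown bit set, so their low two bits bound the low two bits of any possible voffset and (soffset + inst_offset), and a carry from bit 1 into bit 2 is ruled out whenever those bounds sum to less than 4. The check itself, restated standalone (sketch only):

    #include <cstdint>

    static bool mayCarryIntoBit2(uint64_t VMax, uint64_t SMax) {
      return (VMax & 3) + (SMax & 3) >= 4;
    }
    // e.g. mayCarryIntoBit2(3, 1) is true (3 + 1 = 4), while
    // mayCarryIntoBit2(1, 2) is false (1 + 2 = 3).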
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
+ Register Addr = Root.getReg();
+ Register PtrBase;
+ int64_t ConstOffset;
+ int64_t ImmOffset = 0;
+
+ // Match the immediate offset first, which canonically is moved as low as
+ // possible.
+ std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+
+ if (ConstOffset != 0 &&
+ TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+ Addr = PtrBase;
+ ImmOffset = ConstOffset;
+ }
+
+ auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
+ if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
+ return None;
+
+ Register RHS = AddrDef->MI->getOperand(2).getReg();
+ if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
+ return None;
+
+ Register LHS = AddrDef->MI->getOperand(1).getReg();
+ auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
+
+ if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
+ return None;
+
+ if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
+ int FI = LHSDef->MI->getOperand(1).getIndex();
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
+ [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ }};
+ }
+
+ if (!isSGPR(LHS))
+ return None;
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ }};
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
MachineInstr *MI = Root.getParent();
@@ -3856,7 +4230,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
MIB.addReg(Info->getScratchRSrcReg());
},
[=](MachineInstrBuilder &MIB) { // vaddr
- if (FI.hasValue())
+ if (FI)
MIB.addFrameIndex(FI.getValue());
else
MIB.addReg(VAddr);