aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 1019
1 files changed, 759 insertions, 260 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 278cf2b69ee3..0a06fa88b6b1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -18,6 +18,7 @@
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -105,9 +106,27 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}
+static bool canRemat(const MachineInstr &MI) {
+ // Instruction-class pre-filter used by isReallyTriviallyReMaterializable.
+ if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
+ SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
+ SIInstrInfo::isSALU(MI))
+ return true;
+ // SMRD qualifies only if it has memoperands and each is an invariant load.
+ if (SIInstrInfo::isSMRD(MI)) {
+ return !MI.memoperands_empty() &&
+ llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
+ return MMO->isLoad() && MMO->isInvariant();
+ });
+ }
+ // Every other instruction class is rejected by this filter.
+ return false;
+}
+
bool SIInstrInfo::isReallyTriviallyReMaterializable(
const MachineInstr &MI) const {
- if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
+
+ if (canRemat(MI)) {
// Normally VALU use of exec would block the rematerialization, but that
// is OK in this case to have an implicit exec read as all VALU do.
// We really want all of the generic logic for this except for this.
@@ -119,12 +138,13 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(
// There is difference to generic method which does not allow
// rematerialization if there are virtual register uses. We allow this,
// therefore this method includes SOP instructions as well.
- return !MI.hasImplicitDef() &&
- MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
- !MI.mayRaiseFPException();
+ if (!MI.hasImplicitDef() &&
+ MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
+ !MI.mayRaiseFPException())
+ return true;
}
- return false;
+ return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
}
// Returns true if the scalar result of a VALU instruction depends on exec.
@@ -169,6 +189,48 @@ bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
}
+bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
+ MachineBasicBlock *SuccToSinkTo,
+ MachineCycleInfo *CI) const {  // false: do not sink MI into SuccToSinkTo
+ // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
+ if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
+ return true;
+ // Only uses that are virtual registers of an SGPR class are examined below.
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ // Check if sinking of MI would create temporal divergent use.
+ for (auto Op : MI.uses()) {  // NOTE(review): Op is copied by value here
+ if (Op.isReg() && Op.getReg().isVirtual() &&
+ RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
+ MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());  // NOTE(review): used below without a null check
+ // A def whose block belongs to no cycle needs no further checking.
+ // Look up the cycle containing SgprDef's block, if any.
+ MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
+ if (FromCycle == nullptr)
+ continue;
+ // Walk outward through parent cycles until one also contains ToCycle.
+ MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
+ // Check if there is a FromCycle that contains SgprDef's basic block but
+ // does not contain SuccToSinkTo and also has divergent exit condition.
+ while (FromCycle && !FromCycle->contains(ToCycle)) {
+ // After structurize-cfg, there should be exactly one cycle exit.
+ SmallVector<MachineBasicBlock *, 1> ExitBlocks;
+ FromCycle->getExitBlocks(ExitBlocks);
+ assert(ExitBlocks.size() == 1);
+ assert(ExitBlocks[0]->getSinglePredecessor());
+ // NOTE(review): the two checks above are asserts only; confirm callers.
+ // FromCycle has divergent exit condition.
+ if (hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor())) {
+ return false;
+ }
+ // No divergent exit at this level; continue with the enclosing cycle.
+ FromCycle = FromCycle->getParentCycle();
+ }
+ }
+ }
+ // No examined SGPR use led to a cycle with a divergent exit branch.
+ return true;
+}
+
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
int64_t &Offset0,
int64_t &Offset1) const {
@@ -479,8 +541,10 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
}
bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ int64_t Offset1, bool OffsetIsScalable1,
ArrayRef<const MachineOperand *> BaseOps2,
- unsigned NumLoads,
+ int64_t Offset2, bool OffsetIsScalable2,
+ unsigned ClusterSize,
unsigned NumBytes) const {
// If the mem ops (to be clustered) do not have the same base ptr, then they
// should not be clustered
@@ -506,8 +570,8 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
// (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
// (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
// (5) LoadSize >= 17: do not cluster
- const unsigned LoadSize = NumBytes / NumLoads;
- const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads;
+ const unsigned LoadSize = NumBytes / ClusterSize;
+ const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
return NumDWORDs <= 8;
}
@@ -619,7 +683,7 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
}
RS.enterBasicBlockEnd(MBB);
- RS.backward(MI);
+ RS.backward(std::next(MI));
// Ideally we want to have three registers for a long reg_sequence copy
// to hide 2 waitstates between v_mov_b32 and accvgpr_write.
@@ -680,23 +744,27 @@ static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
int16_t SubIdx = BaseIndices[Idx];
- Register Reg = RI.getSubReg(DestReg, SubIdx);
+ Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
+ Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
+ assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
unsigned Opcode = AMDGPU::S_MOV_B32;
// Is SGPR aligned? If so try to combine with next.
- Register Src = RI.getSubReg(SrcReg, SubIdx);
- bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0;
- bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0;
+ bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
+ bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
// Can use SGPR64 copy
unsigned Channel = RI.getChannelFromSubReg(SubIdx);
SubIdx = RI.getSubRegFromChannel(Channel, 2);
+ DestSubReg = RI.getSubReg(DestReg, SubIdx);
+ SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
+ assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
Opcode = AMDGPU::S_MOV_B64;
Idx++;
}
- LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx))
- .addReg(RI.getSubReg(SrcReg, SubIdx))
+ LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
+ .addReg(SrcSubReg)
.addReg(SrcReg, RegState::Implicit);
if (!FirstMI)
@@ -722,24 +790,32 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
const DebugLoc &DL, MCRegister DestReg,
MCRegister SrcReg, bool KillSrc) const {
const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
+ unsigned Size = RI.getRegSizeInBits(*RC);
+ const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
+ unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
- // FIXME: This is hack to resolve copies between 16 bit and 32 bit
- // registers until all patterns are fixed.
- if (Fix16BitCopies &&
- ((RI.getRegSizeInBits(*RC) == 16) ^
- (RI.getRegSizeInBits(*RI.getPhysRegBaseClass(SrcReg)) == 16))) {
- MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
- MCRegister Super = RI.get32BitRegister(RegToFix);
- assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
- RegToFix = Super;
+ // The rest of copyPhysReg assumes Src and Dst size are the same size.
+ // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
+ // we remove Fix16BitCopies and this code block?
+ if (Fix16BitCopies) {
+ if (((Size == 16) != (SrcSize == 16))) {
+ // Non-VGPR Src and Dst will later be expanded back to 32 bits.
+ assert(ST.hasTrue16BitInsts());
+ MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
+ MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
+ RegToFix = SubReg;
- if (DestReg == SrcReg) {
- // Insert empty bundle since ExpandPostRA expects an instruction here.
- BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
- return;
+ if (DestReg == SrcReg) {
+ // Identity copy. Insert empty bundle since ExpandPostRA expects an
+ // instruction here.
+ BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
+ return;
+ }
+ RC = RI.getPhysRegBaseClass(DestReg);
+ Size = RI.getRegSizeInBits(*RC);
+ SrcRC = RI.getPhysRegBaseClass(SrcReg);
+ SrcSize = RI.getRegSizeInBits(*SrcRC);
}
-
- RC = RI.getPhysRegBaseClass(DestReg);
}
if (RC == &AMDGPU::VGPR_32RegClass) {
@@ -863,10 +939,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- const unsigned Size = RI.getRegSizeInBits(*RC);
if (Size == 16) {
- assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
- AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
+ assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
@@ -904,6 +978,25 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ if (ST.hasTrue16BitInsts()) {
+ if (IsSGPRSrc) {
+ assert(SrcLow);
+ SrcReg = NewSrcReg;
+ }
+ // Use the smaller instruction encoding if possible.
+ if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
+ (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
+ .addReg(SrcReg);
+ } else {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
+ .addImm(0) // src0_modifiers
+ .addReg(SrcReg)
+ .addImm(0); // op_sel
+ }
+ return;
+ }
+
if (IsSGPRSrc && !ST.hasSDWAScalar()) {
if (!DstLow || !SrcLow) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
@@ -930,14 +1023,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
if (ST.hasMovB64()) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
- if (ST.hasPackedFP32Ops()) {
+ if (ST.hasPkMovB32()) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
.addImm(SISrcMods::OP_SEL_1)
.addReg(SrcReg)
@@ -984,7 +1076,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (ST.hasMovB64()) {
Opcode = AMDGPU::V_MOV_B64_e32;
EltSize = 8;
- } else if (ST.hasPackedFP32Ops()) {
+ } else if (ST.hasPkMovB32()) {
Opcode = AMDGPU::V_PK_MOV_B32;
EltSize = 8;
}
@@ -1012,6 +1104,9 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
SubIdx = SubIndices[Idx];
else
SubIdx = SubIndices[SubIndices.size() - Idx - 1];
+ Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
+ Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
+ assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
bool IsFirstSubreg = Idx == 0;
bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
@@ -1019,30 +1114,26 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
Register ImpUseSuper = SrcReg;
- indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
- RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, Overlap,
- ImpDefSuper, ImpUseSuper);
+ indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
+ *RS, Overlap, ImpDefSuper, ImpUseSuper);
} else if (Opcode == AMDGPU::V_PK_MOV_B32) {
- Register DstSubReg = RI.getSubReg(DestReg, SubIdx);
- Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
MachineInstrBuilder MIB =
- BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg)
- .addImm(SISrcMods::OP_SEL_1)
- .addReg(SrcSubReg)
- .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
- .addReg(SrcSubReg)
- .addImm(0) // op_sel_lo
- .addImm(0) // op_sel_hi
- .addImm(0) // neg_lo
- .addImm(0) // neg_hi
- .addImm(0) // clamp
- .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
+ .addImm(SISrcMods::OP_SEL_1)
+ .addReg(SrcSubReg)
+ .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
+ .addReg(SrcSubReg)
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0) // clamp
+ .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
if (IsFirstSubreg)
MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
} else {
MachineInstrBuilder Builder =
- BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx))
- .addReg(RI.getSubReg(SrcReg, SubIdx));
+ BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
if (IsFirstSubreg)
Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
@@ -1286,7 +1377,11 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
if (RI.isAGPRClass(DstRC))
return AMDGPU::COPY;
- if (RI.getRegSizeInBits(*DstRC) == 32) {
+ if (RI.getRegSizeInBits(*DstRC) == 16) {
+ // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
+ // before RA.
+ return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
+ } else if (RI.getRegSizeInBits(*DstRC) == 32) {
return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
} else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
return AMDGPU::S_MOV_B64;
@@ -1587,11 +1682,15 @@ static unsigned getAVSpillSaveOpcode(unsigned Size) {
}
}
-static unsigned getWWMRegSpillSaveOpcode(unsigned Size) {
+static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
+ bool IsVectorSuperClass) {  // selects the WWM spill-save pseudo opcode
// Currently, there is only 32-bit WWM register spills needed.
if (Size != 4)
llvm_unreachable("unknown wwm register spill size");
+ if (IsVectorSuperClass)
+ return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
+ // Not a vector super class: use the V32 WWM save pseudo.
return AMDGPU::SI_SPILL_WWM_V32_SAVE;
}
@@ -1600,11 +1699,13 @@ static unsigned getVectorRegSpillSaveOpcode(Register Reg,
unsigned Size,
const SIRegisterInfo &TRI,
const SIMachineFunctionInfo &MFI) {
+ bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
+
// Choose the right opcode if spilling a WWM register.
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
- return getWWMRegSpillSaveOpcode(Size);
+ return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
- if (TRI.isVectorSuperClass(RC))
+ if (IsVectorSuperClass)
return getAVSpillSaveOpcode(Size);
return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
@@ -1807,11 +1908,15 @@ static unsigned getAVSpillRestoreOpcode(unsigned Size) {
}
}
-static unsigned getWWMRegSpillRestoreOpcode(unsigned Size) {
+static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
+ bool IsVectorSuperClass) {  // selects the WWM spill-restore pseudo opcode
// Currently, there is only 32-bit WWM register spills needed.
if (Size != 4)
llvm_unreachable("unknown wwm register spill size");
+ if (IsVectorSuperClass)
+ return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
+ // Not a vector super class: use the V32 WWM restore pseudo.
return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
}
@@ -1819,11 +1924,13 @@ static unsigned
getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
unsigned Size, const SIRegisterInfo &TRI,
const SIMachineFunctionInfo &MFI) {
+ bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
+
// Choose the right opcode if restoring a WWM register.
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
- return getWWMRegSpillRestoreOpcode(Size);
+ return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
- if (TRI.isVectorSuperClass(RC))
+ if (IsVectorSuperClass)
return getAVSpillRestoreOpcode(Size);
return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
@@ -2006,6 +2113,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
break;
+ case AMDGPU::SI_SPILL_S32_TO_VGPR:
+ MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
+ break;
+
+ case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
+ MI.setDesc(get(AMDGPU::V_READLANE_B32));
+ break;
+
case AMDGPU::V_MOV_B64_PSEUDO: {
Register Dst = MI.getOperand(0).getReg();
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -2024,7 +2139,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
APInt Imm(64, SrcOp.getImm());
APInt Lo(32, Imm.getLoBits(32).getZExtValue());
APInt Hi(32, Imm.getHiBits(32).getZExtValue());
- if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) {
+ if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
.addImm(SISrcMods::OP_SEL_1)
.addImm(Lo.getSExtValue())
@@ -2045,7 +2160,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
} else {
assert(SrcOp.isReg());
- if (ST.hasPackedFP32Ops() &&
+ if (ST.hasPkMovB32() &&
!RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
.addImm(SISrcMods::OP_SEL_1) // src0_mod
@@ -2275,23 +2390,34 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
Register Reg = MI.getOperand(0).getReg();
Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
+ MachineOperand OpLo = MI.getOperand(1);
+ MachineOperand OpHi = MI.getOperand(2);
// Create a bundle so these instructions won't be re-ordered by the
// post-RA scheduler.
MIBundleBuilder Bundler(MBB, MI);
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
- // Add 32-bit offset from this instruction to the start of the
- // constant data.
- Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
- .addReg(RegLo)
- .add(MI.getOperand(1)));
+ // What we want here is an offset from the value returned by s_getpc (which
+ // is the address of the s_add_u32 instruction) to the global variable, but
+ // since the encoding of $symbol starts 4 bytes after the start of the
+ // s_add_u32 instruction, we end up with an offset that is 4 bytes too
+ // small. This requires us to add 4 to the global variable offset in order
+ // to compute the correct address. Similarly for the s_addc_u32 instruction,
+ // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
+ // instruction.
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
- .addReg(RegHi);
- MIB.add(MI.getOperand(2));
+ if (OpLo.isGlobal())
+ OpLo.setOffset(OpLo.getOffset() + 4);
+ Bundler.append(
+ BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
+
+ if (OpHi.isGlobal())
+ OpHi.setOffset(OpHi.getOffset() + 12);
+ Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
+ .addReg(RegHi)
+ .add(OpHi));
- Bundler.append(MIB);
finalizeBundle(MBB, Bundler.begin());
MI.eraseFromParent();
@@ -2350,12 +2476,98 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
+void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, Register DestReg,
+ unsigned SubIdx, const MachineInstr &Orig,
+ const TargetRegisterInfo &RI) const {
+ // Special-cases two wide S_LOAD opcodes; all else goes to TargetInstrInfo.
+ // Try shrinking the instruction to remat only the part needed for current
+ // context.
+ // TODO: Handle more cases.
+ unsigned Opcode = Orig.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::S_LOAD_DWORDX16_IMM:
+ case AMDGPU::S_LOAD_DWORDX8_IMM: {
+ if (SubIdx != 0)
+ break;
+ // An end iterator gives no instruction whose operands could be inspected.
+ if (I == MBB.end())
+ break;
+ // Bundled instructions are not handled by the narrowing path.
+ if (I->isBundled())
+ break;
+ // Only the operands of the instruction at I are scanned, not all uses.
+ // Look for a single use of the register that is also a subreg.
+ Register RegToFind = Orig.getOperand(0).getReg();
+ MachineOperand *UseMO = nullptr;
+ for (auto &CandMO : I->operands()) {
+ if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
+ continue;
+ if (UseMO) {  // a second matching use operand: give up on narrowing
+ UseMO = nullptr;
+ break;
+ }
+ UseMO = &CandMO;
+ }
+ if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
+ break;
+ // Offset and size of the used subregister; presumably bits (see /8 below).
+ unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
+ unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
+ // MRI is needed to change DestReg's register class further down.
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
+ // Pick the load opcode matching SubregSize; other sizes are not narrowed.
+ unsigned NewOpcode = -1;  // placeholder; overwritten below or we break
+ if (SubregSize == 256)
+ NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
+ else if (SubregSize == 128)
+ NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
+ else
+ break;
+ // Give DestReg the allocatable class of the new opcode's operand 0.
+ const MCInstrDesc &TID = get(NewOpcode);
+ const TargetRegisterClass *NewRC =
+ RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
+ MRI.setRegClass(DestReg, NewRC);
+ // The user is rewritten to read DestReg directly, with no subreg index.
+ UseMO->setReg(DestReg);
+ UseMO->setSubReg(AMDGPU::NoSubRegister);
+ // Clone Orig, then patch its descriptor, operand 0 and offset operand.
+ // Use a smaller load with the desired size, possibly with updated offset.
+ MachineInstr *MI = MF->CloneMachineInstr(&Orig);
+ MI->setDesc(TID);
+ MI->getOperand(0).setReg(DestReg);
+ MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
+ if (Offset) {
+ MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
+ int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;  // presumably bits -> bytes
+ OffsetMO->setImm(FinalOffset);
+ }
+ SmallVector<MachineMemOperand *> NewMMOs;  // Orig's memoperands, rebuilt with size SubregSize / 8
+ for (const MachineMemOperand *MemOp : Orig.memoperands())
+ NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
+ SubregSize / 8));
+ MI->setMemRefs(*MF, NewMMOs);
+ // Insert the narrowed load at I; the generic path below is skipped.
+ MBB.insert(I, MI);
+ return;
+ }
+ // All other opcodes take the generic path.
+ default:
+ break;
+ }
+ // Fallback, also reached by every break above: generic rematerialization.
+ TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
+}
+
std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
if (ST.hasMovB64() &&
- AMDGPU::isLegal64BitDPPControl(
+ AMDGPU::isLegalDPALU_DPPControl(
getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
return std::pair(&MI, nullptr);
@@ -2482,6 +2694,9 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
if (CommutedOpcode == -1)
return nullptr;
+ if (Src0Idx > Src1Idx)
+ std::swap(Src0Idx, Src1Idx);
+
assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
static_cast<int>(Src0Idx) &&
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
@@ -2564,14 +2779,8 @@ bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
return isIntN(BranchOffsetBits, BrOffset);
}
-MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
- const MachineInstr &MI) const {
- if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
- // This would be a difficult analysis to perform, but can always be legal so
- // there's no need to analyze it.
- return nullptr;
- }
-
+MachineBasicBlock *
+SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {  // S_SETPC_B64 is no longer special-cased; operand 0's MBB is always returned
return MI.getOperand(0).getMBB();
}
@@ -2882,7 +3091,6 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
= getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
if (!FBB) {
- Cond[1].isUndef();
MachineInstr *CondBr =
BuildMI(&MBB, DL, get(Opcode))
.addMBB(TBB);
@@ -3087,6 +3295,7 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
case AMDGPU::V_MOV_B64_e64:
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B64:
+ case AMDGPU::S_MOV_B64_IMM_PSEUDO:
case AMDGPU::COPY:
case AMDGPU::WWM_COPY:
case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
@@ -3120,11 +3329,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
switch (DefMI.getOpcode()) {
default:
return false;
+ case AMDGPU::V_MOV_B64_e32:
case AMDGPU::S_MOV_B64:
- // TODO: We could fold 64-bit immediates, but this get complicated
- // when there are sub-registers.
- return false;
-
+ case AMDGPU::V_MOV_B64_PSEUDO:
+ case AMDGPU::S_MOV_B64_IMM_PSEUDO:
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::S_MOV_B32:
case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
@@ -3137,19 +3345,45 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (!ImmOp->isImm())
return false;
+ auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
+ int64_t Imm = ImmOp->getImm();
+ switch (UseOp.getSubReg()) {
+ default:
+ return Imm;
+ case AMDGPU::sub0:
+ return Lo_32(Imm);
+ case AMDGPU::sub1:
+ return Hi_32(Imm);
+ case AMDGPU::lo16:
+ return APInt(16, Imm).getSExtValue();
+ case AMDGPU::hi16:
+ return APInt(32, Imm).ashr(16).getSExtValue();
+ case AMDGPU::sub1_lo16:
+ return APInt(16, Hi_32(Imm)).getSExtValue();
+ case AMDGPU::sub1_hi16:
+ return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue();
+ }
+ };
+
+ assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
+
unsigned Opc = UseMI.getOpcode();
if (Opc == AMDGPU::COPY) {
+ assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
+
Register DstReg = UseMI.getOperand(0).getReg();
- bool Is16Bit = getOpSize(UseMI, 0) == 2;
+ unsigned OpSize = getOpSize(UseMI, 0);
+ bool Is16Bit = OpSize == 2;
+ bool Is64Bit = OpSize == 8;
bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
- unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
- APInt Imm(32, ImmOp->getImm());
-
- if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16)
- Imm = Imm.ashr(16);
+ unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
+ : AMDGPU::V_MOV_B32_e32
+ : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
+ : AMDGPU::S_MOV_B32;
+ APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)));
if (RI.isAGPR(*MRI, DstReg)) {
- if (!isInlineConstant(Imm))
+ if (Is64Bit || !isInlineConstant(Imm))
return false;
NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
}
@@ -3209,14 +3443,32 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
// Multiplied part is the constant: Use v_madmk_{f16, f32}.
- // We should only expect these to be on src0 due to canonicalization.
- if (Src0->isReg() && Src0->getReg() == Reg) {
- if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
+ if ((Src0->isReg() && Src0->getReg() == Reg) ||
+ (Src1->isReg() && Src1->getReg() == Reg)) {
+ MachineOperand *RegSrc =
+ Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
+ if (!RegSrc->isReg())
+ return false;
+ if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
+ ST.getConstantBusLimit(Opc) < 2)
return false;
if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
return false;
+ // If src2 is also a literal constant then we have to choose which one to
+ // fold. In general it is better to choose madak so that the other literal
+ // can be materialized in an sgpr instead of a vgpr:
+ // s_mov_b32 s0, literal
+ // v_madak_f32 v0, s0, v0, literal
+ // Instead of:
+ // v_mov_b32 v1, literal
+ // v_madmk_f32 v0, v0, literal, v1
+ MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
+ if (Def && Def->isMoveImmediate() &&
+ !isInlineConstant(Def->getOperand(1)))
+ return false;
+
unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
@@ -3225,18 +3477,22 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // We need to swap operands 0 and 1 since madmk constant is at operand 1.
+ // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
+ // would also require restricting their register classes. For now
+ // just bail out.
+ if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
+ return false;
- const int64_t Imm = ImmOp->getImm();
+ const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
- Register Src1Reg = Src1->getReg();
- unsigned Src1SubReg = Src1->getSubReg();
- Src0->setReg(Src1Reg);
- Src0->setSubReg(Src1SubReg);
- Src0->setIsKill(Src1->isKill());
+ Register SrcReg = RegSrc->getReg();
+ unsigned SrcSubReg = RegSrc->getSubReg();
+ Src0->setReg(SrcReg);
+ Src0->setSubReg(SrcSubReg);
+ Src0->setIsKill(RegSrc->isKill());
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
@@ -3258,43 +3514,38 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// Added part is the constant: Use v_madak_{f16, f32}.
if (Src2->isReg() && Src2->getReg() == Reg) {
- // Not allowed to use constant bus for another operand.
- // We can however allow an inline immediate as src0.
- bool Src0Inlined = false;
- if (Src0->isReg()) {
- // Try to inline constant if possible.
- // If the Def moves immediate and the use is single
- // We are saving VGPR here.
- MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
- if (Def && Def->isMoveImmediate() &&
- isInlineConstant(Def->getOperand(1)) &&
- MRI->hasOneUse(Src0->getReg())) {
- Src0->ChangeToImmediate(Def->getOperand(1).getImm());
- Src0Inlined = true;
- } else if ((Src0->getReg().isPhysical() &&
- (ST.getConstantBusLimit(Opc) <= 1 &&
- RI.isSGPRClass(RI.getPhysRegBaseClass(Src0->getReg())))) ||
- (Src0->getReg().isVirtual() &&
- (ST.getConstantBusLimit(Opc) <= 1 &&
- RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
- return false;
+ if (ST.getConstantBusLimit(Opc) < 2) {
+ // Not allowed to use constant bus for another operand.
+ // We can however allow an inline immediate as src0.
+ bool Src0Inlined = false;
+ if (Src0->isReg()) {
+ // Try to inline constant if possible.
+ // If the Def moves immediate and the use is single
+ // We are saving VGPR here.
+ MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
+ if (Def && Def->isMoveImmediate() &&
+ isInlineConstant(Def->getOperand(1)) &&
+ MRI->hasOneUse(Src0->getReg())) {
+ Src0->ChangeToImmediate(Def->getOperand(1).getImm());
+ Src0Inlined = true;
+ } else if (ST.getConstantBusLimit(Opc) <= 1 &&
+ RI.isSGPRReg(*MRI, Src0->getReg())) {
+ return false;
+ }
// VGPR is okay as Src0 - fallthrough
- }
+ }
- if (Src1->isReg() && !Src0Inlined ) {
- // We have one slot for inlinable constant so far - try to fill it
- MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
- if (Def && Def->isMoveImmediate() &&
- isInlineConstant(Def->getOperand(1)) &&
- MRI->hasOneUse(Src1->getReg()) &&
- commuteInstruction(UseMI)) {
+ if (Src1->isReg() && !Src0Inlined) {
+ // We have one slot for inlinable constant so far - try to fill it
+ MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
+ if (Def && Def->isMoveImmediate() &&
+ isInlineConstant(Def->getOperand(1)) &&
+ MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
Src0->ChangeToImmediate(Def->getOperand(1).getImm());
- } else if ((Src1->getReg().isPhysical() &&
- RI.isSGPRClass(RI.getPhysRegBaseClass(Src1->getReg()))) ||
- (Src1->getReg().isVirtual() &&
- RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
- return false;
+ else if (RI.isSGPRReg(*MRI, Src1->getReg()))
+ return false;
// VGPR is okay as Src1 - fallthrough
+ }
}
unsigned NewOpc =
@@ -3305,7 +3556,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- const int64_t Imm = ImmOp->getImm();
+ // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
+ // would also require restricting their register classes. For now
+ // just bail out.
+ if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
+ return false;
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
@@ -3317,7 +3572,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
// ChangingToImmediate adds Src2 back to the instruction.
- Src2->ChangeToImmediate(Imm);
+ Src2->ChangeToImmediate(getImmFor(*Src2));
// These come before src2.
removeModOperands(UseMI);
@@ -3412,19 +3667,30 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
if (isMUBUF(MIb) || isMTBUF(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(MIb) && !isSMRD(MIb);
+ if (isFLAT(MIb))
+ return isFLATScratch(MIb);
+
+ return !isSMRD(MIb);
}
if (isSMRD(MIa)) {
if (isSMRD(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
+ if (isFLAT(MIb))
+ return isFLATScratch(MIb);
+
+ return !isMUBUF(MIb) && !isMTBUF(MIb);
}
if (isFLAT(MIa)) {
- if (isFLAT(MIb))
+ if (isFLAT(MIb)) {
+ if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
+ (isFLATGlobal(MIa) && isFLATScratch(MIb)))
+ return true;
+
return checkInstOffsetsDoNotOverlap(MIa, MIb);
+ }
return false;
}
@@ -3731,13 +3997,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
}
bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
- return Opcode == AMDGPU::DS_ORDERED_COUNT ||
- Opcode == AMDGPU::DS_GWS_INIT ||
- Opcode == AMDGPU::DS_GWS_SEMA_V ||
- Opcode == AMDGPU::DS_GWS_SEMA_BR ||
- Opcode == AMDGPU::DS_GWS_SEMA_P ||
- Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
- Opcode == AMDGPU::DS_GWS_BARRIER;
+ return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
}
bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
@@ -3782,7 +4042,9 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
// However, executing them with EXEC = 0 causes them to operate on undefined
// data, which we avoid by returning true here.
if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
- Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32)
+ Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
+ Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
+ Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
return true;
return false;
@@ -3836,9 +4098,7 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
uint8_t OperandType) const {
assert(!MO.isReg() && "isInlineConstant called on register operand!");
- if (!MO.isImm() ||
- OperandType < AMDGPU::OPERAND_SRC_FIRST ||
- OperandType > AMDGPU::OPERAND_SRC_LAST)
+ if (!MO.isImm())
return false;
// MachineOperand provides no way to tell the true operand size, since it only
@@ -3886,12 +4146,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
- // This suffers the same problem as the scalar 16-bit cases.
- return AMDGPU::isInlinableIntLiteralV216(Imm);
+ return (isInt<16>(Imm) || isUInt<16>(Imm)) &&
+ AMDGPU::isInlinableIntLiteral((int16_t)Imm);
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
// A few special case instructions have 16-bit operands on subtargets
// where 16-bit instructions are not legal.
@@ -3904,17 +4167,26 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
return false;
}
- case AMDGPU::OPERAND_REG_IMM_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
- uint32_t Trunc = static_cast<uint32_t>(Imm);
- return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
- }
case AMDGPU::OPERAND_KIMM32:
case AMDGPU::OPERAND_KIMM16:
return false;
+ case AMDGPU::OPERAND_INPUT_MODS:
+ case MCOI::OPERAND_IMMEDIATE:
+ // Always embedded in the instruction for free.
+ return true;
+ case MCOI::OPERAND_UNKNOWN:
+ case MCOI::OPERAND_REGISTER:
+ case MCOI::OPERAND_PCREL:
+ case MCOI::OPERAND_GENERIC_0:
+ case MCOI::OPERAND_GENERIC_1:
+ case MCOI::OPERAND_GENERIC_2:
+ case MCOI::OPERAND_GENERIC_3:
+ case MCOI::OPERAND_GENERIC_4:
+ case MCOI::OPERAND_GENERIC_5:
+ // Just ignore anything else.
+ return true;
default:
- llvm_unreachable("invalid bitwidth");
+ llvm_unreachable("invalid operand type");
}
}
@@ -4163,7 +4435,9 @@ static bool shouldReadExec(const MachineInstr &MI) {
if (SIInstrInfo::isVALU(MI)) {
switch (MI.getOpcode()) {
case AMDGPU::V_READLANE_B32:
+ case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
case AMDGPU::V_WRITELANE_B32:
+ case AMDGPU::SI_SPILL_S32_TO_VGPR:
return false;
}
@@ -4788,20 +5062,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
- int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
-
if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
- ((DstIdx >= 0 &&
- (Desc.operands()[DstIdx].RegClass == AMDGPU::VReg_64RegClassID ||
- Desc.operands()[DstIdx].RegClass ==
- AMDGPU::VReg_64_Align2RegClassID)) ||
- ((Src0Idx >= 0 &&
- (Desc.operands()[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID ||
- Desc.operands()[Src0Idx].RegClass ==
- AMDGPU::VReg_64_Align2RegClassID)))) &&
- !AMDGPU::isLegal64BitDPPControl(DC)) {
+ !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) {
ErrInfo = "Invalid dpp_ctrl value: "
- "64 bit dpp only support row_newbcast";
+ "DP ALU dpp only support row_newbcast";
return false;
}
}
@@ -4969,6 +5233,64 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
+ case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
+ case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
+ case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
+ case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
+ case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
+ case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
+ case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64;
+ case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
+ case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
+ case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
+ case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
+ case AMDGPU::S_CEIL_F16: return AMDGPU::V_CEIL_F16_t16_e64;
+ case AMDGPU::S_FLOOR_F16: return AMDGPU::V_FLOOR_F16_t16_e64;
+ case AMDGPU::S_TRUNC_F16: return AMDGPU::V_TRUNC_F16_t16_e64;
+ case AMDGPU::S_RNDNE_F16: return AMDGPU::V_RNDNE_F16_t16_e64;
+ case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
+ case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
+ case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
+ case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
+ case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
+ case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
+ case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
+ case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
+ case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
+ case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
+ case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
+ case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
+ case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
+ case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
+ case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
+ case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
+ case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
+ case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
+ case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
+ case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
+ case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
+ case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
+ case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
+ case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
+ case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
+ case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
+ case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
+ case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
+ case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
+ case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64;
+ case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64;
+ case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64;
+ case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64;
+ case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64;
+ case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64;
+ case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64;
+ case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64;
+ case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64;
+ case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64;
+ case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64;
+ case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
+ case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
+ case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
}
llvm_unreachable(
"Unexpected scalar opcode without corresponding vector one!");
@@ -5123,13 +5445,10 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
MO.ChangeToRegister(Reg, false);
}
-unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
- MachineRegisterInfo &MRI,
- MachineOperand &SuperReg,
- const TargetRegisterClass *SuperRC,
- unsigned SubIdx,
- const TargetRegisterClass *SubRC)
- const {
+unsigned SIInstrInfo::buildExtractSubReg(
+ MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
+ const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
+ unsigned SubIdx, const TargetRegisterClass *SubRC) const {
MachineBasicBlock *MBB = MI->getParent();
DebugLoc DL = MI->getDebugLoc();
Register SubReg = MRI.createVirtualRegister(SubRC);
@@ -5156,12 +5475,9 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
}
MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
- MachineBasicBlock::iterator MII,
- MachineRegisterInfo &MRI,
- MachineOperand &Op,
- const TargetRegisterClass *SuperRC,
- unsigned SubIdx,
- const TargetRegisterClass *SubRC) const {
+ MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
+ const MachineOperand &Op, const TargetRegisterClass *SuperRC,
+ unsigned SubIdx, const TargetRegisterClass *SubRC) const {
if (Op.isImm()) {
if (SubIdx == AMDGPU::sub0)
return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
@@ -5256,9 +5572,8 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
return false;
SGPRsUsed.insert(SGPR);
}
- } else if (InstDesc.operands()[i].OperandType == AMDGPU::OPERAND_KIMM32 ||
- (AMDGPU::isSISrcOperand(InstDesc, i) &&
- !isInlineConstant(Op, InstDesc.operands()[i]))) {
+ } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
+ !isInlineConstant(Op, InstDesc.operands()[i])) {
if (!LiteralLimit--)
return false;
if (--ConstantBusLimit <= 0)
@@ -5306,6 +5621,27 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
return true;
}
+ if (MO->isImm()) {
+ uint64_t Imm = MO->getImm();
+ bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
+ bool Is64BitOp = Is64BitFPOp ||
+ OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
+ OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
+ OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
+ if (Is64BitOp &&
+ !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
+ if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
+ return false;
+
+ // FIXME: We can use sign-extended 64-bit literals, but only for signed
+ // operands. At the moment we do not know if an operand is signed.
+ // Such an operand will be encoded as its low 32 bits and then either
+ // correctly sign-extended or incorrectly zero-extended by HW.
+ if (!Is64BitFPOp && (int32_t)Imm < 0)
+ return false;
+ }
+ }
+
// Handle non-register types that are treated like immediates.
assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
@@ -5363,6 +5699,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
legalizeOpWithMove(MI, Src1Idx);
+ // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
+ if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
+ legalizeOpWithMove(MI, Src2Idx);
+ }
+
// VOP2 src0 instructions support all operand types, so we don't need to check
// their legality. If src1 is already legal, we don't need to do anything.
if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
@@ -5512,6 +5855,11 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
// legalize it.
legalizeOpWithMove(MI, Idx);
}
+
+ // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
+ if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
+ !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
+ legalizeOpWithMove(MI, VOP3Idx[2]);
}
Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
@@ -5883,6 +6231,17 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+ // Save SCC. Waterfall Loop may overwrite SCC.
+ Register SaveSCCReg;
+ bool SCCNotDead = (MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI, 30) !=
+ MachineBasicBlock::LQR_Dead);
+ if (SCCNotDead) {
+ SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
+ .addImm(1)
+ .addImm(0);
+ }
+
Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
// Save the EXEC mask
@@ -5938,8 +6297,15 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps);
- // Restore the EXEC mask
MachineBasicBlock::iterator First = RemainderBB->begin();
+ // Restore SCC
+ if (SCCNotDead) {
+ BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
+ .addReg(SaveSCCReg, RegState::Kill)
+ .addImm(0);
+ }
+
+ // Restore the EXEC mask
BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
return BodyBB;
}
@@ -6124,6 +6490,18 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
return CreatedBB;
}
+ // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
+ if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
+ MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
+ MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
+ MI.getOpcode() == AMDGPU::S_WQM_B32 ||
+ MI.getOpcode() == AMDGPU::S_WQM_B64) {
+ MachineOperand &Src = MI.getOperand(1);
+ if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
+ Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
+ return CreatedBB;
+ }
+
// Legalize MIMG and MUBUF/MTBUF for shaders.
//
// Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
@@ -6391,10 +6769,11 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
default:
break;
case AMDGPU::S_ADD_U64_PSEUDO:
+ NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
+ break;
case AMDGPU::S_SUB_U64_PSEUDO:
- splitScalar64BitAddSub(Worklist, Inst, MDT);
- Inst.eraseFromParent();
- return;
+ NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
+ break;
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32: {
// FIXME: The u32 versions currently selected use the carry.
@@ -6644,21 +7023,78 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
case AMDGPU::S_CMP_LT_U32:
case AMDGPU::S_CMP_LE_U32:
case AMDGPU::S_CMP_EQ_U64:
- case AMDGPU::S_CMP_LG_U64: {
- const MCInstrDesc &NewDesc = get(NewOpcode);
+ case AMDGPU::S_CMP_LG_U64:
+ case AMDGPU::S_CMP_LT_F32:
+ case AMDGPU::S_CMP_EQ_F32:
+ case AMDGPU::S_CMP_LE_F32:
+ case AMDGPU::S_CMP_GT_F32:
+ case AMDGPU::S_CMP_LG_F32:
+ case AMDGPU::S_CMP_GE_F32:
+ case AMDGPU::S_CMP_O_F32:
+ case AMDGPU::S_CMP_U_F32:
+ case AMDGPU::S_CMP_NGE_F32:
+ case AMDGPU::S_CMP_NLG_F32:
+ case AMDGPU::S_CMP_NGT_F32:
+ case AMDGPU::S_CMP_NLE_F32:
+ case AMDGPU::S_CMP_NEQ_F32:
+ case AMDGPU::S_CMP_NLT_F32:
+ case AMDGPU::S_CMP_LT_F16:
+ case AMDGPU::S_CMP_EQ_F16:
+ case AMDGPU::S_CMP_LE_F16:
+ case AMDGPU::S_CMP_GT_F16:
+ case AMDGPU::S_CMP_LG_F16:
+ case AMDGPU::S_CMP_GE_F16:
+ case AMDGPU::S_CMP_O_F16:
+ case AMDGPU::S_CMP_U_F16:
+ case AMDGPU::S_CMP_NGE_F16:
+ case AMDGPU::S_CMP_NLG_F16:
+ case AMDGPU::S_CMP_NGT_F16:
+ case AMDGPU::S_CMP_NLE_F16:
+ case AMDGPU::S_CMP_NEQ_F16:
+ case AMDGPU::S_CMP_NLT_F16: {
Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
- MachineInstr *NewInstr =
- BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg)
- .add(Inst.getOperand(0))
- .add(Inst.getOperand(1));
+ auto NewInstr =
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
+ .setMIFlags(Inst.getFlags());
+ if (AMDGPU::getNamedOperandIdx(NewOpcode,
+ AMDGPU::OpName::src0_modifiers) >= 0) {
+ NewInstr
+ .addImm(0) // src0_modifiers
+ .add(Inst.getOperand(0)) // src0
+ .addImm(0) // src1_modifiers
+ .add(Inst.getOperand(1)) // src1
+ .addImm(0); // clamp
+ } else {
+ NewInstr
+ .add(Inst.getOperand(0))
+ .add(Inst.getOperand(1));
+ }
legalizeOperands(*NewInstr, MDT);
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
MachineOperand SCCOp = Inst.getOperand(SCCIdx);
addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
Inst.eraseFromParent();
+ return;
}
+ case AMDGPU::S_CVT_HI_F32_F16: {
+ const DebugLoc &DL = Inst.getDebugLoc();
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
+ .addImm(16)
+ .add(Inst.getOperand(1));
+ BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
+ .addImm(0) // src0_modifiers
+ .addReg(TmpReg)
+ .addImm(0) // clamp
+ .addImm(0); // omod
+
+ MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
+ addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
+ Inst.eraseFromParent();
return;
}
+ }
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
// We cannot move this instruction to the VALU, so we should try to
@@ -6702,8 +7138,61 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
// Use the new VALU Opcode.
auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
.setMIFlags(Inst.getFlags());
- for (const MachineOperand &Op : Inst.explicit_operands())
- NewInstr->addOperand(Op);
+ if (isVOP3(NewOpcode)) {
+ // Intersperse VOP3 modifiers among the SALU operands.
+ NewInstr->addOperand(Inst.getOperand(0));
+ if (AMDGPU::getNamedOperandIdx(NewOpcode,
+ AMDGPU::OpName::src0_modifiers) >= 0)
+ NewInstr.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0) >= 0)
+ NewInstr->addOperand(Inst.getOperand(1));
+
+ if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
+ // We are converting these to a BFE, so we need to add the missing
+ // operands for the size and offset.
+ unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
+ NewInstr.addImm(0);
+ NewInstr.addImm(Size);
+ } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
+ // The VALU version adds the second operand to the result, so insert an
+ // extra 0 operand.
+ NewInstr.addImm(0);
+ } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+ const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
+ // If we need to move this to VGPRs, we need to unpack the second
+ // operand back into the 2 separate ones for bit offset and width.
+ assert(OffsetWidthOp.isImm() &&
+ "Scalar BFE is only implemented for constant width and offset");
+ uint32_t Imm = OffsetWidthOp.getImm();
+
+ uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+ uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+ NewInstr.addImm(Offset);
+ NewInstr.addImm(BitWidth);
+ } else {
+ if (AMDGPU::getNamedOperandIdx(NewOpcode,
+ AMDGPU::OpName::src1_modifiers) >= 0)
+ NewInstr.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
+ NewInstr->addOperand(Inst.getOperand(2));
+ if (AMDGPU::getNamedOperandIdx(NewOpcode,
+ AMDGPU::OpName::src2_modifiers) >= 0)
+ NewInstr.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
+ NewInstr->addOperand(Inst.getOperand(3));
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
+ NewInstr.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
+ NewInstr.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
+ NewInstr.addImm(0);
+ }
+ } else {
+ // Just copy the SALU operands.
+ for (const MachineOperand &Op : Inst.explicit_operands())
+ NewInstr->addOperand(Op);
+ }
+
// Remove any references to SCC. Vector instructions can't read from it, and
// We're just about to add the implicit use / defs of VCC, and we don't want
// both.
@@ -6727,30 +7216,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
}
- if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
- // We are converting these to a BFE, so we need to add the missing
- // operands for the size and offset.
- unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
- NewInstr.addImm(0);
- NewInstr.addImm(Size);
- } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
- // The VALU version adds the second operand to the result, so insert an
- // extra 0 operand.
- NewInstr.addImm(0);
- }
- if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
- const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2);
- // If we need to move this to VGPRs, we need to unpack the second operand
- // back into the 2 separate ones for bit offset and width.
- assert(OffsetWidthOp.isImm() &&
- "Scalar BFE is only implemented for constant width and offset");
- uint32_t Imm = OffsetWidthOp.getImm();
- uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
- uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
- NewInstr->removeOperand(2);
- NewInstr.addImm(Offset);
- NewInstr.addImm(BitWidth);
- }
fixImplicitOperands(*NewInstr);
// Legalize the operands
legalizeOperands(*NewInstr, MDT);
@@ -6808,27 +7273,27 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineOperand &Src1 = Inst.getOperand(2);
MachineOperand &Cond = Inst.getOperand(3);
- Register SCCSource = Cond.getReg();
- bool IsSCC = (SCCSource == AMDGPU::SCC);
+ Register CondReg = Cond.getReg();
+ bool IsSCC = (CondReg == AMDGPU::SCC);
// If this is a trivial select where the condition is effectively not SCC
- // (SCCSource is a source of copy to SCC), then the select is semantically
- // equivalent to copying SCCSource. Hence, there is no need to create
+ // (CondReg is a source of copy to SCC), then the select is semantically
+ // equivalent to copying CondReg. Hence, there is no need to create
// V_CNDMASK, we can just use that and bail out.
if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
(Src1.getImm() == 0)) {
- MRI.replaceRegWith(Dest.getReg(), SCCSource);
+ MRI.replaceRegWith(Dest.getReg(), CondReg);
return;
}
- const TargetRegisterClass *TC =
- RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
-
- Register CopySCC = MRI.createVirtualRegister(TC);
-
+ Register NewCondReg = CondReg;
if (IsSCC) {
+ const TargetRegisterClass *TC =
+ RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+ NewCondReg = MRI.createVirtualRegister(TC);
+
// Now look for the closest SCC def if it is a copy
- // replacing the SCCSource with the COPY source register
+ // replacing the CondReg with the COPY source register
bool CopyFound = false;
for (MachineInstr &CandI :
make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
@@ -6836,7 +7301,7 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
-1) {
if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
- BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC)
+ BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
.addReg(CandI.getOperand(1).getReg());
CopyFound = true;
}
@@ -6851,24 +7316,31 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
: AMDGPU::S_CSELECT_B32;
auto NewSelect =
- BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
+ BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
}
}
- Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- auto UpdatedInst =
- BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
- .addImm(0)
- .add(Src1) // False
- .addImm(0)
- .add(Src0) // True
- .addReg(IsSCC ? CopySCC : SCCSource);
-
- MRI.replaceRegWith(Dest.getReg(), ResultReg);
- legalizeOperands(*UpdatedInst, MDT);
- addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+ Register NewDestReg = MRI.createVirtualRegister(
+ RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
+ MachineInstr *NewInst;
+ if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
+ NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
+ .addImm(0)
+ .add(Src1) // False
+ .addImm(0)
+ .add(Src0) // True
+ .addReg(NewCondReg);
+ } else {
+ NewInst =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
+ .add(Src1) // False
+ .add(Src0) // True
+ .addReg(NewCondReg);
+ }
+ MRI.replaceRegWith(Dest.getReg(), NewDestReg);
+ legalizeOperands(*NewInst, MDT);
+ addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
}
void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
@@ -8011,9 +8483,26 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
return AMDGPU::COPY;
}
-bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
- return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
- MI.modifiesRegister(AMDGPU::EXEC, &RI);
+bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
+ Register Reg) const {
+ // We need to handle instructions which may be inserted during register
+ // allocation to handle the prolog. The initial prolog instruction may have
+ // been separated from the start of the block by spills and copies that
+ // the prolog needs. However, the insertions for scalar registers can
+ // always be placed at the BB top as they are independent of the exec mask
+ // value.
+ bool IsNullOrVectorRegister = true;
+ if (Reg) {
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
+ }
+
+ uint16_t Opc = MI.getOpcode();
+ // FIXME: Copies inserted in the block prolog for live-range split should also
+ // be included.
+ return IsNullOrVectorRegister &&
+ (isSpillOpcode(Opc) || (!MI.isTerminator() && Opc != AMDGPU::COPY &&
+ MI.modifiesRegister(AMDGPU::EXEC, &RI)));
}
MachineInstrBuilder
@@ -8254,6 +8743,8 @@ static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
return SIEncodingFamily::GFX10;
case AMDGPUSubtarget::GFX11:
return SIEncodingFamily::GFX11;
+ case AMDGPUSubtarget::GFX12:
+ return SIEncodingFamily::GFX12;
}
llvm_unreachable("Unknown subtarget generation!");
}
@@ -8313,6 +8804,12 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
+ // TODO-GFX12: Remove this.
+ // Hack to allow some GFX12 codegen tests to run before all the encodings are
+ // implemented.
+ if (MCOp == (uint16_t)-1 && Gen == SIEncodingFamily::GFX12)
+ MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX11);
+
// -1 means that Opcode is already a native instruction.
if (MCOp == -1)
return Opcode;
@@ -8603,9 +9100,8 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
InstructionUniformity
SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
unsigned opcode = MI.getOpcode();
- if (opcode == AMDGPU::G_INTRINSIC ||
- opcode == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS) {
- auto IID = static_cast<Intrinsic::ID>(MI.getIntrinsicID());
+ if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
+ auto IID = GI->getIntrinsicID();
if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
return InstructionUniformity::NeverUniform;
if (AMDGPU::isIntrinsicAlwaysUniform(IID))
@@ -8643,7 +9139,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
- opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
+ opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
+ AMDGPU::isGenericAtomic(opcode)) {
return InstructionUniformity::NeverUniform;
}
return InstructionUniformity::Default;
@@ -8656,7 +9153,9 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::NeverUniform;
unsigned opcode = MI.getOpcode();
- if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32)
+ if (opcode == AMDGPU::V_READLANE_B32 ||
+ opcode == AMDGPU::V_READFIRSTLANE_B32 ||
+ opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
return InstructionUniformity::AlwaysUniform;
if (isCopyInstr(MI)) {