path: root/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
author     Dimitry Andric <dim@FreeBSD.org>    2020-07-26 19:36:28 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2020-07-26 19:36:28 +0000
commit     cfca06d7963fa0909f90483b42a6d7d194d01e08 (patch)
tree       209fb2a2d68f8f277793fc8df46c753d31bc853b /llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
parent     706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff)
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r--    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp    1090
1 file changed, 802 insertions, 288 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d53950ca44655..9af8ffedce0f3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -63,6 +63,8 @@
using namespace llvm;
+#define DEBUG_TYPE "si-instr-info"
+
#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"
@@ -83,6 +85,12 @@ static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
cl::desc("Restrict range of branch instructions (DEBUG)"));
+static cl::opt<bool> Fix16BitCopies(
+ "amdgpu-fix-16-bit-physreg-copies",
+ cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
+ cl::init(true),
+ cl::ReallyHidden);
+
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
: AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
RI(ST), ST(ST) {
@@ -136,6 +144,8 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
case AMDGPU::V_MOV_B64_PSEUDO:
+ case AMDGPU::V_ACCVGPR_READ_B32:
+ case AMDGPU::V_ACCVGPR_WRITE_B32:
// No implicit operands.
return MI.getNumOperands() == MI.getDesc().getNumOperands();
default:
@@ -258,43 +268,49 @@ static bool isStride64(unsigned Opc) {
}
}
-bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
- const MachineOperand *&BaseOp,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const {
+bool SIInstrInfo::getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
+ int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const {
if (!LdSt.mayLoadOrStore())
return false;
unsigned Opc = LdSt.getOpcode();
+ OffsetIsScalable = false;
+ const MachineOperand *BaseOp, *OffsetOp;
+ int DataOpIdx;
if (isDS(LdSt)) {
- const MachineOperand *OffsetImm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset);
- if (OffsetImm) {
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
+ OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ if (OffsetOp) {
// Normal, single offset LDS instruction.
- BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
- // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
- // report that here?
- if (!BaseOp || !BaseOp->isReg())
+ if (!BaseOp) {
+ // DS_CONSUME/DS_APPEND use M0 for the base address.
+ // TODO: find the implicit use operand for M0 and use that as BaseOp?
+ return false;
+ }
+ BaseOps.push_back(BaseOp);
+ Offset = OffsetOp->getImm();
+ // Get appropriate operand, and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (DataOpIdx == -1)
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+ Width = getOpSize(LdSt, DataOpIdx);
+ } else {
+ // The 2 offset instructions use offset0 and offset1 instead. We can treat
+ // these as a load with a single offset if the 2 offsets are consecutive.
+ // We will use this for some partially aligned loads.
+ const MachineOperand *Offset0Op =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset0);
+ const MachineOperand *Offset1Op =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset1);
+
+ unsigned Offset0 = Offset0Op->getImm();
+ unsigned Offset1 = Offset1Op->getImm();
+ if (Offset0 + 1 != Offset1)
return false;
- Offset = OffsetImm->getImm();
-
- return true;
- }
-
- // The 2 offset instructions use offset0 and offset1 instead. We can treat
- // these as a load with a single offset if the 2 offsets are consecutive. We
- // will use this for some partially aligned loads.
- const MachineOperand *Offset0Imm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset0);
- const MachineOperand *Offset1Imm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset1);
-
- uint8_t Offset0 = Offset0Imm->getImm();
- uint8_t Offset1 = Offset1Imm->getImm();
-
- if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
// Each of these offsets is in element sized units, so we need to convert
// to bytes of the individual reads.
@@ -310,16 +326,20 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
if (isStride64(Opc))
EltSize *= 64;
- BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
- if (!BaseOp->isReg())
- return false;
-
+ BaseOps.push_back(BaseOp);
Offset = EltSize * Offset0;
-
- return true;
+ // Get appropriate operand(s), and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (DataOpIdx == -1) {
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+ Width = getOpSize(LdSt, DataOpIdx);
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
+ Width += getOpSize(LdSt, DataOpIdx);
+ } else {
+ Width = getOpSize(LdSt, DataOpIdx);
+ }
}
-
- return false;
+ return true;
}
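
A minimal standalone sketch (hypothetical helper names, not part of the patch) of the two-offset DS handling above: a read2/write2 pair is reported as a single access only when its element-sized offsets are consecutive, with the byte offset scaled by the element size and the width covering both data operands.

    #include <cassert>
    #include <cstdint>

    struct MergedAccess { bool Ok; int64_t ByteOffset; unsigned WidthBytes; };

    // Mirrors the consecutive-offset check and offset scaling shown above.
    static MergedAccess mergeRead2(unsigned Offset0, unsigned Offset1, unsigned EltSize) {
      if (Offset0 + 1 != Offset1)
        return {false, 0, 0};
      return {true, int64_t(EltSize) * Offset0, 2 * EltSize};
    }

    int main() {
      MergedAccess M = mergeRead2(4, 5, 8);  // e.g. ds_read2_b64 at elements 4 and 5
      assert(M.Ok && M.ByteOffset == 32 && M.WidthBytes == 16);
      assert(!mergeRead2(4, 6, 8).Ok);       // non-consecutive offsets are not merged
      return 0;
    }
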
if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
@@ -339,59 +359,78 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
const MachineOperand *OffsetImm =
getNamedOperand(LdSt, AMDGPU::OpName::offset);
- BaseOp = SOffset;
+ BaseOps.push_back(RSrc);
+ BaseOps.push_back(SOffset);
Offset = OffsetImm->getImm();
- return true;
- }
-
- const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
- if (!AddrReg)
- return false;
+ } else {
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
+ if (!BaseOp) // e.g. BUFFER_WBINVL1_VOL
+ return false;
+ BaseOps.push_back(BaseOp);
- const MachineOperand *OffsetImm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset);
- BaseOp = AddrReg;
- Offset = OffsetImm->getImm();
- if (SOffset) // soffset can be an inline immediate.
- Offset += SOffset->getImm();
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ if (BaseOp)
+ BaseOps.push_back(BaseOp);
- if (!BaseOp->isReg())
- return false;
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ Offset = OffsetImm->getImm();
+ if (SOffset) // soffset can be an inline immediate.
+ Offset += SOffset->getImm();
+ }
+ // Get appropriate operand, and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (DataOpIdx == -1)
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ Width = getOpSize(LdSt, DataOpIdx);
+ return true;
+ }
+ if (isMIMG(LdSt)) {
+ int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
+ int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
+ if (VAddr0Idx >= 0) {
+ // GFX10 possible NSA encoding.
+ for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
+ BaseOps.push_back(&LdSt.getOperand(I));
+ } else {
+ BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
+ }
+ Offset = 0;
+ // Get appropriate operand, and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ Width = getOpSize(LdSt, DataOpIdx);
return true;
}
if (isSMRD(LdSt)) {
- const MachineOperand *OffsetImm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset);
- if (!OffsetImm)
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
+ if (!BaseOp) // e.g. S_MEMTIME
return false;
-
- const MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
- BaseOp = SBaseReg;
- Offset = OffsetImm->getImm();
- if (!BaseOp->isReg())
- return false;
-
+ BaseOps.push_back(BaseOp);
+ OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ Offset = OffsetOp ? OffsetOp->getImm() : 0;
+ // Get appropriate operand, and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
+ Width = getOpSize(LdSt, DataOpIdx);
return true;
}
if (isFLAT(LdSt)) {
- const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
- if (VAddr) {
- // Can't analyze 2 offsets.
- if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
- return false;
-
- BaseOp = VAddr;
- } else {
- // scratch instructions have either vaddr or saddr.
- BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
- }
-
+ // Instructions have either vaddr or saddr or both.
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ if (BaseOp)
+ BaseOps.push_back(BaseOp);
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
+ if (BaseOp)
+ BaseOps.push_back(BaseOp);
Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
- if (!BaseOp->isReg())
- return false;
+ // Get appropriate operand, and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (DataOpIdx == -1)
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ Width = getOpSize(LdSt, DataOpIdx);
return true;
}
@@ -399,15 +438,13 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
}
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
- const MachineOperand &BaseOp1,
+ ArrayRef<const MachineOperand *> BaseOps1,
const MachineInstr &MI2,
- const MachineOperand &BaseOp2) {
- // Support only base operands with base registers.
- // Note: this could be extended to support FI operands.
- if (!BaseOp1.isReg() || !BaseOp2.isReg())
- return false;
-
- if (BaseOp1.isIdenticalTo(BaseOp2))
+ ArrayRef<const MachineOperand *> BaseOps2) {
+ // Only examine the first "base" operand of each instruction, on the
+ // assumption that it represents the real base address of the memory access.
+ // Other operands are typically offsets or indices from this base address.
+ if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
return true;
if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
@@ -433,62 +470,31 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
return Base1 == Base2;
}
-bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
- const MachineOperand &BaseOp2,
- unsigned NumLoads) const {
- const MachineInstr &FirstLdSt = *BaseOp1.getParent();
- const MachineInstr &SecondLdSt = *BaseOp2.getParent();
-
- if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2))
- return false;
-
- const MachineOperand *FirstDst = nullptr;
- const MachineOperand *SecondDst = nullptr;
-
- if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
- (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
- (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
- const unsigned MaxGlobalLoadCluster = 6;
- if (NumLoads > MaxGlobalLoadCluster)
- return false;
-
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
- if (!FirstDst)
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
- if (!SecondDst)
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
- } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
- } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
- }
-
- if (!FirstDst || !SecondDst)
+bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2,
+ unsigned NumLoads,
+ unsigned NumBytes) const {
+ // If the current pair of memory ops does not share the same base pointer, then
+ // they cannot be clustered.
+ assert(!BaseOps1.empty() && !BaseOps2.empty());
+ const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
+ const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
+ if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
return false;
- // Try to limit clustering based on the total number of bytes loaded
- // rather than the number of instructions. This is done to help reduce
- // register pressure. The method used is somewhat inexact, though,
- // because it assumes that all loads in the cluster will load the
- // same number of bytes as FirstLdSt.
-
- // The unit of this value is bytes.
- // FIXME: This needs finer tuning.
- unsigned LoadClusterThreshold = 16;
-
- const MachineRegisterInfo &MRI =
- FirstLdSt.getParent()->getParent()->getRegInfo();
-
- const Register Reg = FirstDst->getReg();
-
- const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
- ? MRI.getRegClass(Reg)
- : RI.getPhysRegClass(Reg);
-
- return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
+ // Compute the maximum cluster size from the average number of bytes clustered
+ // so far, and use it to decide whether this pair of memory ops can be clustered.
+ assert((NumLoads > 0) && (NumBytes > 0) && (NumBytes >= NumLoads) &&
+ "Invalid NumLoads/NumBytes values");
+ unsigned MaxNumLoads;
+ if (NumBytes <= 4 * NumLoads) {
+ // Loads are dword or smaller (on average).
+ MaxNumLoads = 5;
+ } else {
+ // Loads are bigger than a dword (on average).
+ MaxNumLoads = 4;
+ }
+ return NumLoads <= MaxNumLoads;
}
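
The new heuristic above caps a cluster by load count, with the cap picked from the average load size rather than from a fixed byte threshold. A self-contained sketch of that decision (hypothetical function name, not the patch's API):

    #include <cassert>

    static bool shouldClusterSketch(unsigned NumLoads, unsigned NumBytes) {
      assert(NumLoads > 0 && NumBytes >= NumLoads);
      // Dword-or-smaller loads (on average) may cluster up to 5 deep; wider
      // loads are capped at 4 to keep register pressure in check.
      unsigned MaxNumLoads = (NumBytes <= 4 * NumLoads) ? 5 : 4;
      return NumLoads <= MaxNumLoads;
    }

    int main() {
      assert(shouldClusterSketch(3, 12));   // three dword loads: keep clustering
      assert(!shouldClusterSketch(5, 40));  // five 8-byte loads: over the cap
      return 0;
    }
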
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
@@ -516,11 +522,10 @@ bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, MCRegister DestReg,
- MCRegister SrcReg, bool KillSrc) {
+ MCRegister SrcReg, bool KillSrc,
+ const char *Msg = "illegal SGPR to VGPR copy") {
MachineFunction *MF = MBB.getParent();
- DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
- "illegal SGPR to VGPR copy",
- DL, DS_Error);
+ DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
LLVMContext &C = MF->getFunction().getContext();
C.diagnose(IllegalCopy);
@@ -534,6 +539,25 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MCRegister SrcReg, bool KillSrc) const {
const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
+ // FIXME: This is a hack to resolve copies between 16 bit and 32 bit
+ // registers until all patterns are fixed.
+ if (Fix16BitCopies &&
+ ((RI.getRegSizeInBits(*RC) == 16) ^
+ (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) {
+ MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
+ MCRegister Super = RI.get32BitRegister(RegToFix);
+ assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
+ RegToFix = Super;
+
+ if (DestReg == SrcReg) {
+ // Insert empty bundle since ExpandPostRA expects an instruction here.
+ BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
+ return;
+ }
+
+ RC = RI.getPhysRegClass(DestReg);
+ }
+
if (RC == &AMDGPU::VGPR_32RegClass) {
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
AMDGPU::SReg_32RegClass.contains(SrcReg) ||
@@ -580,6 +604,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (RC == &AMDGPU::SReg_64RegClass) {
+ if (SrcReg == AMDGPU::SCC) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
+ .addImm(1)
+ .addImm(0);
+ return;
+ }
+
if (DestReg == AMDGPU::VCC) {
if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
@@ -606,10 +637,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (DestReg == AMDGPU::SCC) {
+ // Copying 64-bit or 32-bit sources to SCC barely makes sense,
+ // but SelectionDAG emits such copies for i1 sources.
+ // TODO: Use S_BITCMP0_B32 instead and only consider the 0th bit.
+ if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+ SrcReg = RI.getSubReg(SrcReg, AMDGPU::sub0);
+ }
assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+
BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
- .addReg(SrcReg, getKillRegState(KillSrc))
- .addImm(0);
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0);
+
return;
}
@@ -660,7 +699,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Registers in the sequence are allocated contiguously so we can just
// use register number to pick one of three round-robin temps.
unsigned RegNo = DestReg % 3;
- unsigned Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+ Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
if (!Tmp)
report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
RS.setRegUsed(Tmp);
@@ -685,6 +724,72 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ if (RI.getRegSizeInBits(*RC) == 16) {
+ assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
+ AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
+ AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
+
+ bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
+ bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
+ bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
+ bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
+ bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) ||
+ AMDGPU::SReg_LO16RegClass.contains(DestReg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(DestReg);
+ bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
+ AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
+ MCRegister NewDestReg = RI.get32BitRegister(DestReg);
+ MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
+
+ if (IsSGPRDst) {
+ if (!IsSGPRSrc) {
+ reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+ return;
+ }
+
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
+ .addReg(NewSrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ if (IsAGPRDst || IsAGPRSrc) {
+ if (!DstLow || !SrcLow) {
+ reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
+ "Cannot use hi16 subreg with an AGPR!");
+ }
+
+ copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
+ return;
+ }
+
+ if (IsSGPRSrc && !ST.hasSDWAScalar()) {
+ if (!DstLow || !SrcLow) {
+ reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
+ "Cannot use hi16 subreg on VI!");
+ }
+
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
+ .addReg(NewSrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
+ .addImm(0) // src0_modifiers
+ .addReg(NewSrcReg)
+ .addImm(0) // clamp
+ .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
+ : AMDGPU::SDWA::SdwaSel::WORD_1)
+ .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
+ .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
+ : AMDGPU::SDWA::SdwaSel::WORD_1)
+ .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
+ // First implicit operand is $exec.
+ MIB->tieOperands(0, MIB->getNumOperands() - 1);
+ return;
+ }
+
unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (RI.isSGPRClass(RC)) {
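
A hypothetical sketch (local helper, not the patch itself) of the sub-word move the SDWA copy above performs: each 16-bit register is the low or high half of a 32-bit super-register, so the copy selects WORD_0 or WORD_1 of the source and overwrites only the matching half of the destination, preserving the other half.

    #include <cassert>
    #include <cstdint>

    static uint32_t copyHalf(uint32_t Dst32, uint32_t Src32, bool DstLow, bool SrcLow) {
      uint32_t Half = SrcLow ? (Src32 & 0xFFFFu) : (Src32 >> 16);   // src_sel WORD_0 / WORD_1
      return DstLow ? (Dst32 & 0xFFFF0000u) | Half                  // dst_sel WORD_0
                    : (Dst32 & 0x0000FFFFu) | (Half << 16);         // dst_sel WORD_1, UNUSED_PRESERVE
    }

    int main() {
      // lo16 -> hi16 copy: the destination's low half is preserved.
      assert(copyHalf(0xAAAABBBBu, 0x1111CCCCu, /*DstLow=*/false, /*SrcLow=*/true) == 0xCCCCBBBBu);
      return 0;
    }
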
@@ -806,7 +911,7 @@ void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
int64_t IdxValue = Idx == 0 ? Value : 0;
MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
- get(Opcode), RI.getSubReg(DestReg, Idx));
+ get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
Builder.addImm(IdxValue);
}
}
@@ -818,10 +923,10 @@ SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DstReg,
+ const DebugLoc &DL, Register DstReg,
ArrayRef<MachineOperand> Cond,
- unsigned TrueReg,
- unsigned FalseReg) const {
+ Register TrueReg,
+ Register FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
@@ -944,10 +1049,10 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
}
}
-unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
+Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
- unsigned SrcReg, int Value) const {
+ Register SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
@@ -957,10 +1062,10 @@ unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
return Reg;
}
-unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
+Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
- unsigned SrcReg, int Value) const {
+ Register SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
@@ -984,6 +1089,80 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
return AMDGPU::COPY;
}
+static unsigned getIndirectVGPRWritePseudoOpc(unsigned VecSize) {
+ if (VecSize <= 32) // 4 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V1;
+ if (VecSize <= 64) // 8 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V2;
+ if (VecSize <= 96) // 12 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V3;
+ if (VecSize <= 128) // 16 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V4;
+ if (VecSize <= 160) // 20 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V5;
+ if (VecSize <= 256) // 32 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V8;
+ if (VecSize <= 512) // 64 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V16;
+ if (VecSize <= 1024) // 128 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V32;
+
+ llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
+}
+
+static unsigned getIndirectSGPRWritePseudo32(unsigned VecSize) {
+ if (VecSize <= 32) // 4 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V1;
+ if (VecSize <= 64) // 8 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V2;
+ if (VecSize <= 96) // 12 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V3;
+ if (VecSize <= 128) // 16 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V4;
+ if (VecSize <= 160) // 20 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V5;
+ if (VecSize <= 256) // 32 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V8;
+ if (VecSize <= 512) // 64 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V16;
+ if (VecSize <= 1024) // 128 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V32;
+
+ llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
+}
+
+static unsigned getIndirectSGPRWritePseudo64(unsigned VecSize) {
+ if (VecSize <= 64) // 8 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B64_V1;
+ if (VecSize <= 128) // 16 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B64_V2;
+ if (VecSize <= 256) // 32 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B64_V4;
+ if (VecSize <= 512) // 64 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B64_V8;
+ if (VecSize <= 1024) // 128 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B64_V16;
+
+ llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
+}
+
+const MCInstrDesc &SIInstrInfo::getIndirectRegWritePseudo(
+ unsigned VecSize, unsigned EltSize, bool IsSGPR) const {
+ if (IsSGPR) {
+ switch (EltSize) {
+ case 32:
+ return get(getIndirectSGPRWritePseudo32(VecSize));
+ case 64:
+ return get(getIndirectSGPRWritePseudo64(VecSize));
+ default:
+ llvm_unreachable("invalid reg indexing elt size");
+ }
+ }
+
+ assert(EltSize == 32 && "invalid reg indexing elt size");
+ return get(getIndirectVGPRWritePseudoOpc(VecSize));
+}
+
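
The helpers above bucket a vector's bit width into the smallest supported 32-bit-element pseudo variant (V1, V2, V3, V4, V5, V8, V16, V32). A sketch of that bucketing (hypothetical helper returning the variant's element count):

    #include <cassert>

    static unsigned indirectWriteVariant(unsigned VecSizeBits) {
      static const unsigned Widths[] = {1, 2, 3, 4, 5, 8, 16, 32};  // dwords per variant
      for (unsigned W : Widths)
        if (VecSizeBits <= W * 32)
          return W;
      return 0;  // unsupported size; the real code hits llvm_unreachable here
    }

    int main() {
      assert(indirectWriteVariant(96) == 3);    // v3i32 maps to the ..._B32_V3 pseudo
      assert(indirectWriteVariant(200) == 8);   // rounds up past V5 to the V8 variant
      return 0;
    }
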
static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
switch (Size) {
case 4:
@@ -996,6 +1175,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S128_SAVE;
case 20:
return AMDGPU::SI_SPILL_S160_SAVE;
+ case 24:
+ return AMDGPU::SI_SPILL_S192_SAVE;
case 32:
return AMDGPU::SI_SPILL_S256_SAVE;
case 64:
@@ -1019,6 +1200,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V128_SAVE;
case 20:
return AMDGPU::SI_SPILL_V160_SAVE;
+ case 24:
+ return AMDGPU::SI_SPILL_V192_SAVE;
case 32:
return AMDGPU::SI_SPILL_V256_SAVE;
case 64:
@@ -1049,7 +1232,7 @@ static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill,
+ Register SrcReg, bool isKill,
int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
@@ -1058,18 +1241,18 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const DebugLoc &DL = MBB.findDebugLoc(MI);
- unsigned Size = FrameInfo.getObjectSize(FrameIndex);
- unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
- MachineMemOperand *MMO
- = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
- Size, Align);
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
+ FrameInfo.getObjectAlign(FrameIndex));
unsigned SpillSize = TRI->getSpillSize(*RC);
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
+ assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
+ SrcReg != AMDGPU::EXEC && "exec should not be spilled");
// We are only allowed to create one new instruction when spilling
// registers, so we need to use pseudo instruction for spilling SGPRs.
@@ -1079,7 +1262,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// to make sure we are using the correct register class.
if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
- MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
+ MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
BuildMI(MBB, MI, DL, OpDesc)
@@ -1126,6 +1309,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S128_RESTORE;
case 20:
return AMDGPU::SI_SPILL_S160_RESTORE;
+ case 24:
+ return AMDGPU::SI_SPILL_S192_RESTORE;
case 32:
return AMDGPU::SI_SPILL_S256_RESTORE;
case 64:
@@ -1149,6 +1334,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V128_RESTORE;
case 20:
return AMDGPU::SI_SPILL_V160_RESTORE;
+ case 24:
+ return AMDGPU::SI_SPILL_V192_RESTORE;
case 32:
return AMDGPU::SI_SPILL_V256_RESTORE;
case 64:
@@ -1179,33 +1366,34 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const DebugLoc &DL = MBB.findDebugLoc(MI);
- unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
- unsigned Size = FrameInfo.getObjectSize(FrameIndex);
unsigned SpillSize = TRI->getSpillSize(*RC);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
MachineMemOperand *MMO = MF->getMachineMemOperand(
- PtrInfo, MachineMemOperand::MOLoad, Size, Align);
+ PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
+ FrameInfo.getObjectAlign(FrameIndex));
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
+ assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
+ DestReg != AMDGPU::EXEC && "exec should not be spilled");
// FIXME: Maybe this should not include a memoperand because it will be
// lowered to non-memory instructions.
const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
- if (Register::isVirtualRegister(DestReg) && SpillSize == 4) {
+ if (DestReg.isVirtual() && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
- MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
+ MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
if (RI.spillSGPRToVGPR())
@@ -1244,7 +1432,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
unsigned WavefrontSize = ST.getWavefrontSize();
- unsigned TIDReg = MFI->getTIDReg();
+ Register TIDReg = MFI->getTIDReg();
if (!MFI->hasCalculatedTID()) {
MachineBasicBlock &Entry = MBB.getParent()->front();
MachineBasicBlock::iterator Insert = Entry.front();
@@ -1272,8 +1460,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
RS->enterBasicBlock(Entry);
// FIXME: Can we scavenge an SReg_64 and access the subregs?
- unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
- unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+ Register STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+ Register STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
.addReg(InputPtrReg)
.addImm(SI::KernelInputOffsets::NGROUPS_Z);
@@ -1482,30 +1670,55 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
- case AMDGPU::V_MOVRELD_B32_V1:
- case AMDGPU::V_MOVRELD_B32_V2:
- case AMDGPU::V_MOVRELD_B32_V4:
- case AMDGPU::V_MOVRELD_B32_V8:
- case AMDGPU::V_MOVRELD_B32_V16: {
- const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V1:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V2:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V3:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V4:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V5:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V8:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V16:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V32:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V1:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V2:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V3:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V4:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V5:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V8:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V16:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V32:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B64_V1:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B64_V2:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B64_V4:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B64_V8:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B64_V16: {
+ const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
+
+ unsigned Opc;
+ if (RI.hasVGPRs(EltRC)) {
+ Opc = ST.useVGPRIndexMode() ?
+ AMDGPU::V_MOV_B32_indirect : AMDGPU::V_MOVRELD_B32_e32;
+ } else {
+ Opc = RI.getRegSizeInBits(*EltRC) == 64 ?
+ AMDGPU::S_MOVRELD_B64 : AMDGPU::S_MOVRELD_B32;
+ }
+
+ const MCInstrDesc &OpDesc = get(Opc);
Register VecReg = MI.getOperand(0).getReg();
bool IsUndef = MI.getOperand(1).isUndef();
- unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
+ unsigned SubReg = MI.getOperand(3).getImm();
assert(VecReg == MI.getOperand(1).getReg());
- MachineInstr *MovRel =
- BuildMI(MBB, MI, DL, MovRelDesc)
- .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
- .add(MI.getOperand(2))
- .addReg(VecReg, RegState::ImplicitDefine)
- .addReg(VecReg,
- RegState::Implicit | (IsUndef ? RegState::Undef : 0));
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, DL, OpDesc)
+ .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
+ .add(MI.getOperand(2))
+ .addReg(VecReg, RegState::ImplicitDefine)
+ .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
const int ImpDefIdx =
- MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
+ OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
const int ImpUseIdx = ImpDefIdx + 1;
- MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
-
+ MIB->tieOperands(ImpDefIdx, ImpUseIdx);
MI.eraseFromParent();
break;
}
@@ -1549,22 +1762,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
- case TargetOpcode::BUNDLE: {
- if (!MI.mayLoad() || MI.hasUnmodeledSideEffects())
- return false;
-
- // If it is a load it must be a memory clause
- for (MachineBasicBlock::instr_iterator I = MI.getIterator();
- I->isBundledWithSucc(); ++I) {
- I->unbundleFromSucc();
- for (MachineOperand &MO : I->operands())
- if (MO.isReg())
- MO.setIsInternalRead(false);
- }
-
- MI.eraseFromParent();
- break;
- }
}
return true;
}
@@ -1662,9 +1859,15 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
RegOp.ChangeToImmediate(NonRegOp.getImm());
else if (NonRegOp.isFI())
RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
- else
+ else if (NonRegOp.isGlobal()) {
+ RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
+ NonRegOp.getTargetFlags());
+ } else
return nullptr;
+ // Make sure we don't reinterpret a subreg index in the target flags.
+ RegOp.setTargetFlags(NonRegOp.getTargetFlags());
+
NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
NonRegOp.setSubReg(SubReg);
@@ -2085,6 +2288,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
// Copy the flags onto the implicit condition register operand.
preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
+ fixImplicitOperands(*CondBr);
if (BytesAdded)
*BytesAdded = 4;
@@ -2125,8 +2329,8 @@ bool SIInstrInfo::reverseBranchCondition(
bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg,
- int &CondCycles,
+ Register DstReg, Register TrueReg,
+ Register FalseReg, int &CondCycles,
int &TrueCycles, int &FalseCycles) const {
switch (Cond[0].getImm()) {
case VCCNZ:
@@ -2165,8 +2369,8 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- unsigned DstReg, ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg) const {
+ Register DstReg, ArrayRef<MachineOperand> Cond,
+ Register TrueReg, Register FalseReg) const {
BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
if (Pred == VCCZ || Pred == SCC_FALSE) {
Pred = static_cast<BranchPredicate>(-Pred);
@@ -2178,14 +2382,17 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
unsigned DstSize = RI.getRegSizeInBits(*DstRC);
if (DstSize == 32) {
- unsigned SelOp = Pred == SCC_TRUE ?
- AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
-
- // Instruction's operands are backwards from what is expected.
- MachineInstr *Select =
- BuildMI(MBB, I, DL, get(SelOp), DstReg)
- .addReg(FalseReg)
- .addReg(TrueReg);
+ MachineInstr *Select;
+ if (Pred == SCC_TRUE) {
+ Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
+ .addReg(TrueReg)
+ .addReg(FalseReg);
+ } else {
+ // Instruction's operands are backwards from what is expected.
+ Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg);
+ }
preserveCondRegFlags(Select->getOperand(3), Cond[1]);
return;
@@ -2194,8 +2401,8 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
if (DstSize == 64 && Pred == SCC_TRUE) {
MachineInstr *Select =
BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
- .addReg(FalseReg)
- .addReg(TrueReg);
+ .addReg(TrueReg)
+ .addReg(FalseReg);
preserveCondRegFlags(Select->getOperand(3), Cond[1]);
return;
@@ -2239,17 +2446,26 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
I = MIB->getIterator();
- SmallVector<unsigned, 8> Regs;
+ SmallVector<Register, 8> Regs;
for (int Idx = 0; Idx != NElts; ++Idx) {
Register DstElt = MRI.createVirtualRegister(EltRC);
Regs.push_back(DstElt);
unsigned SubIdx = SubIndices[Idx];
- MachineInstr *Select =
- BuildMI(MBB, I, DL, get(SelOp), DstElt)
- .addReg(FalseReg, 0, SubIdx)
- .addReg(TrueReg, 0, SubIdx);
+ MachineInstr *Select;
+ if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
+ Select =
+ BuildMI(MBB, I, DL, get(SelOp), DstElt)
+ .addReg(FalseReg, 0, SubIdx)
+ .addReg(TrueReg, 0, SubIdx);
+ } else {
+ Select =
+ BuildMI(MBB, I, DL, get(SelOp), DstElt)
+ .addReg(TrueReg, 0, SubIdx)
+ .addReg(FalseReg, 0, SubIdx);
+ }
+
preserveCondRegFlags(Select->getOperand(3), Cond[1]);
fixImplicitOperands(*Select);
@@ -2313,7 +2529,7 @@ static void removeModOperands(MachineInstr &MI) {
}
bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
- unsigned Reg, MachineRegisterInfo *MRI) const {
+ Register Reg, MachineRegisterInfo *MRI) const {
if (!MRI->hasOneNonDBGUse(Reg))
return false;
@@ -2339,15 +2555,40 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned Opc = UseMI.getOpcode();
if (Opc == AMDGPU::COPY) {
- bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
+ Register DstReg = UseMI.getOperand(0).getReg();
+ bool Is16Bit = getOpSize(UseMI, 0) == 2;
+ bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
- if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) {
- if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32))
+ APInt Imm(32, ImmOp->getImm());
+
+ if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16)
+ Imm = Imm.ashr(16);
+
+ if (RI.isAGPR(*MRI, DstReg)) {
+ if (!isInlineConstant(Imm))
return false;
NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32;
}
+
+ if (Is16Bit) {
+ if (isVGPRCopy)
+ return false; // Do not clobber vgpr_hi16
+
+ if (DstReg.isVirtual() &&
+ UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
+ return false;
+
+ UseMI.getOperand(0).setSubReg(0);
+ if (DstReg.isPhysical()) {
+ DstReg = RI.get32BitRegister(DstReg);
+ UseMI.getOperand(0).setReg(DstReg);
+ }
+ assert(UseMI.getOperand(1).getReg().isVirtual());
+ }
+
UseMI.setDesc(get(NewOpc));
- UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
+ UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
+ UseMI.getOperand(1).setTargetFlags(0);
UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
return true;
}
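
A small sketch (assumed semantics, local helper) of the immediate adjustment above: when the folded COPY reads the hi16 half of a materialized 32-bit immediate, the high 16 bits are used, obtained with an arithmetic shift as APInt::ashr(16) followed by getSExtValue() does.

    #include <cassert>
    #include <cstdint>

    static int64_t foldHi16Imm(uint32_t Imm32) {
      // Arithmetic shift (what mainstream compilers do; guaranteed since C++20).
      int32_t Hi = static_cast<int32_t>(Imm32) >> 16;
      return Hi;  // sign-extended to 64 bits, like getSExtValue()
    }

    int main() {
      assert(foldHi16Imm(0x3C00ABCDu) == 0x3C00);  // high half of a packed constant
      assert(foldHi16Imm(0xFFFF0000u) == -1);      // negative high halves sign-extend
      return 0;
    }
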
@@ -2517,6 +2758,18 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
return false;
}
+static bool
+memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2) {
+ if (BaseOps1.size() != BaseOps2.size())
+ return false;
+ for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
+ if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
+ return false;
+ }
+ return true;
+}
+
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
int WidthB, int OffsetB) {
int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
@@ -2527,26 +2780,26 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
const MachineInstr &MIb) const {
- const MachineOperand *BaseOp0, *BaseOp1;
+ SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
int64_t Offset0, Offset1;
+ unsigned Dummy0, Dummy1;
+ bool Offset0IsScalable, Offset1IsScalable;
+ if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
+ Dummy0, &RI) ||
+ !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
+ Dummy1, &RI))
+ return false;
- if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) &&
- getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) {
- if (!BaseOp0->isIdenticalTo(*BaseOp1))
- return false;
+ if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
+ return false;
- if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
- // FIXME: Handle ds_read2 / ds_write2.
- return false;
- }
- unsigned Width0 = (*MIa.memoperands_begin())->getSize();
- unsigned Width1 = (*MIb.memoperands_begin())->getSize();
- if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
- return true;
- }
+ if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
+ // FIXME: Handle ds_read2 / ds_write2.
+ return false;
}
-
- return false;
+ unsigned Width0 = MIa.memoperands().front()->getSize();
+ unsigned Width1 = MIb.memoperands().front()->getSize();
+ return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
}
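
A self-contained sketch of the interval test the rewritten check above relies on: two accesses of WidthA and WidthB bytes at OffsetA and OffsetB are disjoint exactly when the lower one ends at or before the higher one starts.

    #include <algorithm>
    #include <cassert>

    static bool offsetsDoNotOverlapSketch(int WidthA, int OffsetA, int WidthB, int OffsetB) {
      int LowOffset  = std::min(OffsetA, OffsetB);
      int HighOffset = std::max(OffsetA, OffsetB);
      int LowWidth   = (LowOffset == OffsetA) ? WidthA : WidthB;
      return LowOffset + LowWidth <= HighOffset;
    }

    int main() {
      assert(offsetsDoNotOverlapSketch(4, 0, 4, 4));   // adjacent dwords do not overlap
      assert(!offsetsDoNotOverlapSketch(8, 0, 4, 4));  // second dword lies inside the first access
      return 0;
    }
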
bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
@@ -2586,7 +2839,7 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
if (isSMRD(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
+ return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
}
if (isFLAT(MIa)) {
@@ -2732,16 +2985,30 @@ static bool changesVGPRIndexingMode(const MachineInstr &MI) {
bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const {
- // XXX - Do we want the SP check in the base implementation?
+ // Skipping the check for SP writes in the base implementation. It was
+ // apparently added due to compile time concerns.
+ //
+ // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
+ // but is probably avoidable.
+
+ // Copied from base implementation.
+ // Terminators and labels can't be scheduled around.
+ if (MI.isTerminator() || MI.isPosition())
+ return true;
+
+ // INLINEASM_BR can jump to another block
+ if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
+ return true;
// Target-independent instructions do not have an implicit-use of EXEC, even
// when they operate on VGPRs. Treating EXEC modifications as scheduling
// boundaries prevents incorrect movements of such instructions.
- return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
- MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
+
+ // TODO: Don't treat setreg with known constant that only changes MODE as
+ // barrier.
+ return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
- MI.getOpcode() == AMDGPU::S_DENORM_MODE ||
changesVGPRIndexingMode(MI);
}
@@ -2755,6 +3022,20 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
Opcode == AMDGPU::DS_GWS_BARRIER;
}
+bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
+ // Skip the full operand and register alias search modifiesRegister
+ // does. There's only a handful of instructions that touch this, it's only an
+ // implicit def, and doesn't alias any other registers.
+ if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) {
+ for (; ImpDef && *ImpDef; ++ImpDef) {
+ if (*ImpDef == AMDGPU::MODE)
+ return true;
+ }
+ }
+
+ return false;
+}
+
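
A sketch of the scan above, assuming only the null-terminated-array convention that MCInstrDesc::getImplicitDefs exposes (the register numbers below are placeholders, not real AMDGPU encodings):

    #include <cassert>

    static bool listContains(const unsigned *ImpDefs, unsigned Reg) {
      // Implicit defs come as a pointer to a 0-terminated array (or null).
      for (; ImpDefs && *ImpDefs; ++ImpDefs)
        if (*ImpDefs == Reg)
          return true;
      return false;
    }

    int main() {
      const unsigned MODE = 3, SCC = 5;        // placeholder encodings
      const unsigned Defs[] = {SCC, MODE, 0};  // 0 terminates the list
      assert(listContains(Defs, MODE));
      assert(!listContains(nullptr, MODE));
      return 0;
    }
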
bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
@@ -2780,6 +3061,10 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
if (MI.isCall() || MI.isInlineAsm())
return true; // conservative assumption
+ // A mode change is a scalar operation that influences vector instructions.
+ if (modifiesModeRegister(MI))
+ return true;
+
// These are like SALU instructions in terms of effects, so it's questionable
// whether we should return true for those.
//
@@ -2866,10 +3151,26 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
return AMDGPU::isInlinableLiteral64(MO.getImm(),
ST.hasInv2PiInlineImm());
case AMDGPU::OPERAND_REG_IMM_INT16:
- case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ // We would expect inline immediates to not be concerned with an integer/fp
+ // distinction. However, in the case of 16-bit integer operations, the
+ // "floating point" values appear to not work. It seems to read the low 16-bits
+ // of 32-bit immediates, which happens to always work for the integer
+ // values.
+ //
+ // See llvm bugzilla 46302.
+ //
+ // TODO: Theoretically we could use op-sel to use the high bits of the
+ // 32-bit FP values.
+ return AMDGPU::isInlinableIntLiteral(Imm);
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ // This suffers the same problem as the scalar 16-bit cases.
+ return AMDGPU::isInlinableIntLiteralV216(Imm);
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
// A few special case instructions have 16-bit operands on subtargets
@@ -2883,11 +3184,8 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
return false;
}
- case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
uint32_t Trunc = static_cast<uint32_t>(Imm);
return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
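
For the 16-bit integer cases above, the code falls back to the integer inline-constant test. A sketch of that range check (local helper mirroring AMDGPU::isInlinableIntLiteral, i.e. the hardware's -16..64 integer inline range):

    #include <cassert>
    #include <cstdint>

    static bool isInlinableIntLiteralSketch(int64_t Imm) {
      return Imm >= -16 && Imm <= 64;
    }

    int main() {
      assert(isInlinableIntLiteralSketch(64) && isInlinableIntLiteralSketch(-16));
      assert(!isInlinableIntLiteralSketch(0x3800));  // fp16 0.5 bit pattern is not an integer inline
      return 0;
    }
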
@@ -3056,7 +3354,8 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI,
const MachineOperand &Orig) {
for (MachineOperand &Use : MI.implicit_operands()) {
- if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
+ if (Use.isUse() &&
+ (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
Use.setIsUndef(Orig.isUndef());
Use.setIsKill(Orig.isKill());
return;
@@ -3068,7 +3367,8 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
unsigned Op32) const {
MachineBasicBlock *MBB = MI.getParent();;
MachineInstrBuilder Inst32 =
- BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));
+ BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
+ .setMIFlags(MI.getFlags());
// Add the dst operand if the 32-bit encoding also has an explicit $vdst.
// For VOPC instructions, this is replaced by an implicit def of vcc.
@@ -3138,7 +3438,7 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
}
}
-static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
+static Register findImplicitSGPRRead(const MachineInstr &MI) {
for (const MachineOperand &MO : MI.implicit_operands()) {
// We only care about reads.
if (MO.isDef())
@@ -3239,6 +3539,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
return true;
}
+ if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
+ ErrInfo = "missing memory operand from MIMG instruction.";
+ return false;
+ }
+
// Make sure the register classes are correct.
for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
if (MI.getOperand(i).isFPImm()) {
@@ -3446,8 +3751,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
++ConstantBusCount;
- SmallVector<unsigned, 2> SGPRsUsed;
- unsigned SGPRUsed = findImplicitSGPRRead(MI);
+ SmallVector<Register, 2> SGPRsUsed;
+ Register SGPRUsed = findImplicitSGPRRead(MI);
if (SGPRUsed != AMDGPU::NoRegister) {
++ConstantBusCount;
SGPRsUsed.push_back(SGPRUsed);
@@ -3482,7 +3787,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
if (isVOP3(MI) && LiteralCount) {
- if (LiteralCount && !ST.hasVOP3Literal()) {
+ if (!ST.hasVOP3Literal()) {
ErrInfo = "VOP3 instruction uses literal";
return false;
}
@@ -3665,11 +3970,34 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
return false;
}
+ bool IsA16 = false;
+ if (ST.hasR128A16()) {
+ const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
+ IsA16 = R128A16->getImm() != 0;
+ } else if (ST.hasGFX10A16()) {
+ const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
+ IsA16 = A16->getImm() != 0;
+ }
+
+ bool PackDerivatives = IsA16 || BaseOpcode->G16;
bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
- unsigned AddrWords = BaseOpcode->NumExtraArgs +
- (BaseOpcode->Gradients ? Dim->NumGradients : 0) +
- (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
- (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+
+ unsigned AddrWords = BaseOpcode->NumExtraArgs;
+ unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ if (IsA16)
+ AddrWords += (AddrComponents + 1) / 2;
+ else
+ AddrWords += AddrComponents;
+
+ if (BaseOpcode->Gradients) {
+ if (PackDerivatives)
+ // There are two gradients per coordinate, we pack them separately.
+ // For the 3d case, we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv)
+ AddrWords += (Dim->NumGradients / 2 + 1) / 2 * 2;
+ else
+ AddrWords += Dim->NumGradients;
+ }
unsigned VAddrWords;
if (IsNSA) {
@@ -3681,14 +4009,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
AddrWords = 16;
else if (AddrWords > 4)
AddrWords = 8;
- else if (AddrWords == 3 && VAddrWords == 4) {
- // CodeGen uses the V4 variant of instructions for three addresses,
- // because the selection DAG does not support non-power-of-two types.
+ else if (AddrWords == 4)
AddrWords = 4;
- }
+ else if (AddrWords == 3)
+ AddrWords = 3;
}
if (VAddrWords != AddrWords) {
+ LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
+ << " but got " << VAddrWords << "\n");
ErrInfo = "bad vaddr size";
return false;
}
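
A sketch of the vaddr-word accounting introduced above, under the stated packing rules: with 16-bit addresses (A16) the coordinate components pack two per dword, and packed derivatives occupy (NumGradients / 2 + 1) / 2 * 2 dwords (parameter names here are hypothetical):

    #include <cassert>

    static unsigned mimgAddrWords(unsigned NumExtraArgs, unsigned NumCoords, bool LodOrClampOrMip,
                                  bool Gradients, unsigned NumGradients, bool PackDerivatives,
                                  bool IsA16) {
      unsigned AddrWords = NumExtraArgs;
      unsigned AddrComponents = NumCoords + (LodOrClampOrMip ? 1 : 0);
      AddrWords += IsA16 ? (AddrComponents + 1) / 2 : AddrComponents;
      if (Gradients)
        AddrWords += PackDerivatives ? (NumGradients / 2 + 1) / 2 * 2 : NumGradients;
      return AddrWords;
    }

    int main() {
      // 2D sample with 16-bit coords and no gradients: two coords pack into one dword.
      assert(mimgAddrWords(0, 2, false, false, 0, false, true) == 1);
      // 3D sample_d with A16/G16: 3 coords -> 2 dwords, 6 gradients -> 4 dwords.
      assert(mimgAddrWords(0, 3, false, true, 6, true, true) == 6);
      return 0;
    }
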
@@ -4217,7 +4546,7 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
}
}
-unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
+Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
MachineRegisterInfo &MRI) const {
const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
@@ -5002,6 +5331,76 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
Inst.eraseFromParent();
continue;
+
+ // TODO: remove as soon as everything is ready
+ // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
+ // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
+ // can only be selected from the uniform SDNode.
+ case AMDGPU::S_ADD_CO_PSEUDO:
+ case AMDGPU::S_SUB_CO_PSEUDO: {
+ unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
+ ? AMDGPU::V_ADDC_U32_e64
+ : AMDGPU::V_SUBB_U32_e64;
+ const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+
+ Register CarryInReg = Inst.getOperand(4).getReg();
+ if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
+ Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
+ BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
+ .addReg(CarryInReg);
+ }
+
+ Register CarryOutReg = Inst.getOperand(1).getReg();
+
+ Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
+ MRI.getRegClass(Inst.getOperand(0).getReg())));
+ MachineInstr *CarryOp =
+ BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
+ .addReg(CarryOutReg, RegState::Define)
+ .add(Inst.getOperand(2))
+ .add(Inst.getOperand(3))
+ .addReg(CarryInReg)
+ .addImm(0);
+ legalizeOperands(*CarryOp);
+ MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
+ addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
+ Inst.eraseFromParent();
+ }
+ continue;
+ case AMDGPU::S_UADDO_PSEUDO:
+ case AMDGPU::S_USUBO_PSEUDO: {
+ const DebugLoc &DL = Inst.getDebugLoc();
+ MachineOperand &Dest0 = Inst.getOperand(0);
+ MachineOperand &Dest1 = Inst.getOperand(1);
+ MachineOperand &Src0 = Inst.getOperand(2);
+ MachineOperand &Src1 = Inst.getOperand(3);
+
+ unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
+ ? AMDGPU::V_ADD_I32_e64
+ : AMDGPU::V_SUB_I32_e64;
+ const TargetRegisterClass *NewRC =
+ RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
+ Register DestReg = MRI.createVirtualRegister(NewRC);
+ MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
+ .addReg(Dest1.getReg(), RegState::Define)
+ .add(Src0)
+ .add(Src1)
+ .addImm(0); // clamp bit
+
+ legalizeOperands(*NewInstr, MDT);
+
+ MRI.replaceRegWith(Dest0.getReg(), DestReg);
+ addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
+ Worklist);
+ Inst.eraseFromParent();
+ }
+ continue;
+
+ case AMDGPU::S_CSELECT_B32:
+ case AMDGPU::S_CSELECT_B64:
+ lowerSelect(Worklist, Inst, MDT);
+ Inst.eraseFromParent();
+ continue;
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -5142,6 +5541,78 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
return false;
}
+void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
+
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ DebugLoc DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ MachineOperand &Cond = Inst.getOperand(3);
+
+ Register SCCSource = Cond.getReg();
+ // Find SCC def, and if that is a copy (SCC = COPY reg) then use reg instead.
+ if (!Cond.isUndef()) {
+ for (MachineInstr &CandI :
+ make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
+ Inst.getParent()->rend())) {
+ if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
+ -1) {
+ if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
+ SCCSource = CandI.getOperand(1).getReg();
+ }
+ break;
+ }
+ }
+ }
+
+ // If this is a trivial select where the condition is effectively not SCC
+ // (SCCSource is a source of copy to SCC), then the select is semantically
+ // equivalent to copying SCCSource. Hence, there is no need to create
+ // V_CNDMASK, we can just use that and bail out.
+ if ((SCCSource != AMDGPU::SCC) && Src0.isImm() && (Src0.getImm() == -1) &&
+ Src1.isImm() && (Src1.getImm() == 0)) {
+ MRI.replaceRegWith(Dest.getReg(), SCCSource);
+ return;
+ }
+
+ const TargetRegisterClass *TC = ST.getWavefrontSize() == 64
+ ? &AMDGPU::SReg_64_XEXECRegClass
+ : &AMDGPU::SReg_32_XM0_XEXECRegClass;
+ Register CopySCC = MRI.createVirtualRegister(TC);
+
+ if (SCCSource == AMDGPU::SCC) {
+ // Insert a trivial select instead of creating a copy, because a copy from
+ // SCC would semantically mean just copying a single bit, but we may need
+ // the result to be a vector condition mask that needs preserving.
+ unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
+ : AMDGPU::S_CSELECT_B32;
+ auto NewSelect =
+ BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
+ NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
+ } else {
+ BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC).addReg(SCCSource);
+ }
+
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ auto UpdatedInst =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
+ .addImm(0)
+ .add(Src1) // False
+ .addImm(0)
+ .add(Src0) // True
+ .addReg(CopySCC);
+
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ legalizeOperands(*UpdatedInst, MDT);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
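
A tiny sketch (assumed lane semantics) of the shortcut taken above: when the select's condition is already a per-lane mask and the operands are -1 and 0, each lane of the would-be V_CNDMASK result is nonzero exactly when the corresponding mask bit is set, so the mask register can be reused and no select is emitted.

    #include <cassert>
    #include <cstdint>

    static bool laneResultNonZero(uint32_t LaneMask, unsigned Lane) {
      int32_t LaneValue = ((LaneMask >> Lane) & 1) ? -1 : 0;  // V_CNDMASK with true=-1, false=0
      return LaneValue != 0;                                  // reinterpreted as a condition bit
    }

    int main() {
      uint32_t Mask = 0xDEADBEEFu;
      for (unsigned Lane = 0; Lane < 32; ++Lane)
        assert(laneResultNonZero(Mask, Lane) == (((Mask >> Lane) & 1) != 0));
      return 0;
    }
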
void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -5623,7 +6094,7 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
}
void SIInstrInfo::addUsersToMoveToVALUWorklist(
- unsigned DstReg,
+ Register DstReg,
MachineRegisterInfo &MRI,
SetVectorType &Worklist) const {
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
@@ -5723,20 +6194,60 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
MachineInstr &SCCDefInst,
SetVectorType &Worklist) const {
+ bool SCCUsedImplicitly = false;
+
// Ensure that def inst defines SCC, which is still live.
assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
!Op.isDead() && Op.getParent() == &SCCDefInst);
+ SmallVector<MachineInstr *, 4> CopyToDelete;
// This assumes that all the users of SCC are in the same block
// as the SCC def.
for (MachineInstr &MI : // Skip the def inst itself.
make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
SCCDefInst.getParent()->end())) {
// Check if SCC is used first.
- if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
- Worklist.insert(&MI);
+ if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) {
+ if (MI.isCopy()) {
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ unsigned DestReg = MI.getOperand(0).getReg();
+
+ for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
+ if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
+ (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) {
+ User.getOperand(4).setReg(RI.getVCC());
+ Worklist.insert(&User);
+ } else if (User.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) {
+ User.getOperand(5).setReg(RI.getVCC());
+ // No need to add to Worklist.
+ }
+ }
+ CopyToDelete.push_back(&MI);
+ } else {
+ if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
+ MI.getOpcode() == AMDGPU::S_CSELECT_B64) {
+ // This is an implicit use of SCC and it is really expected by
+ // the SCC users to handle.
+ // We cannot preserve the edge to the user so add the explicit
+ // copy: SCC = COPY VCC.
+ // The copy will be cleaned up during the processing of the user
+ // in lowerSelect.
+ SCCUsedImplicitly = true;
+ }
+
+ Worklist.insert(&MI);
+ }
+ }
// Exit if we find another SCC def.
if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
- return;
+ break;
+ }
+ for (auto &Copy : CopyToDelete)
+ Copy->eraseFromParent();
+
+ if (SCCUsedImplicitly) {
+ BuildMI(*SCCDefInst.getParent(), std::next(SCCDefInst.getIterator()),
+ SCCDefInst.getDebugLoc(), get(AMDGPU::COPY), AMDGPU::SCC)
+ .addReg(RI.getVCC());
}
}
@@ -5789,7 +6300,7 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
}
// Find the one SGPR operand we are allowed to use.
-unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
+Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
int OpIndices[3]) const {
const MCInstrDesc &Desc = MI.getDesc();
@@ -5802,11 +6313,11 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
//
// If the operand's class is an SGPR, we can never move it.
- unsigned SGPRReg = findImplicitSGPRRead(MI);
+ Register SGPRReg = findImplicitSGPRRead(MI);
if (SGPRReg != AMDGPU::NoRegister)
return SGPRReg;
- unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
+ Register UsedSGPRs[3] = { AMDGPU::NoRegister };
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
for (unsigned i = 0; i < 3; ++i) {
@@ -5919,10 +6430,9 @@ bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
return isSMRD(Opc);
}
-bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
- unsigned Opc = MI.getOpcode();
-
- return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
+bool SIInstrInfo::isHighLatencyDef(int Opc) const {
+ return get(Opc).mayLoad() &&
+ (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
}
unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
@@ -6198,7 +6708,7 @@ MachineInstrBuilder
SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
- unsigned DestReg) const {
+ Register DestReg) const {
if (ST.hasAddNoCarry())
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
@@ -6608,20 +7118,24 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
// %0 may even spill. We can't spill $m0 normally (it would require copying to
// a numbered SGPR anyway), and since it is in the SReg_32 register class,
// TargetInstrInfo::foldMemoryOperand() is going to try.
+ // A similar issue also exists with spilling and reloading $exec registers.
//
// To prevent that, constrain the %0 register class here.
if (MI.isFullCopy()) {
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
-
- if (DstReg == AMDGPU::M0 && SrcReg.isVirtual()) {
- MF.getRegInfo().constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
- return nullptr;
- }
-
- if (SrcReg == AMDGPU::M0 && DstReg.isVirtual()) {
- MF.getRegInfo().constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
- return nullptr;
+ if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
+ (DstReg.isVirtual() != SrcReg.isVirtual())) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
+ const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
+ if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
+ MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
+ return nullptr;
+ } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
+ MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
+ return nullptr;
+ }
}
}