path: root/lib/Target/AMDGPU/SIRegisterInfo.cpp
author    Dimitry Andric <dim@FreeBSD.org>    2019-10-23 17:51:42 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2019-10-23 17:51:42 +0000
commit    1d5ae1026e831016fc29fd927877c86af904481f (patch)
tree      2cdfd12620fcfa5d9e4a0389f85368e8e36f63f9 /lib/Target/AMDGPU/SIRegisterInfo.cpp
parent    e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (diff)
Diffstat (limited to 'lib/Target/AMDGPU/SIRegisterInfo.cpp')
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.cpp | 439
1 file changed, 167 insertions(+), 272 deletions(-)
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index f152deb28004..f58bc3060c42 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -48,11 +48,6 @@ void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
}
}
-static cl::opt<bool> EnableSpillSGPRToSMEM(
- "amdgpu-spill-sgpr-to-smem",
- cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
- cl::init(false));
-
static cl::opt<bool> EnableSpillSGPRToVGPR(
"amdgpu-spill-sgpr-to-vgpr",
cl::desc("Enable spilling VGPRs to SGPRs"),
@@ -61,17 +56,12 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
AMDGPURegisterInfo(),
+ ST(ST),
SGPRPressureSets(getNumRegPressureSets()),
VGPRPressureSets(getNumRegPressureSets()),
AGPRPressureSets(getNumRegPressureSets()),
- SpillSGPRToVGPR(false),
- SpillSGPRToSMEM(false),
+ SpillSGPRToVGPR(EnableSpillSGPRToVGPR),
isWave32(ST.isWave32()) {
- if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
- SpillSGPRToSMEM = true;
- else if (EnableSpillSGPRToVGPR)
- SpillSGPRToVGPR = true;
-
unsigned NumRegPressureSets = getNumRegPressureSets();
SGPRSetID = NumRegPressureSets;
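
The recurring change in this commit is visible in the constructor above: the subtarget is stored once as the new ST member, so the member functions below stop re-deriving it from a MachineFunction. A minimal standalone sketch of that pattern, with hypothetical Subtarget/RegisterInfo stand-ins rather than the real LLVM classes:

#include <iostream>

// Hypothetical stand-ins for GCNSubtarget and SIRegisterInfo.
struct Subtarget {
  unsigned maxSGPRs() const { return 102; }
};

class RegisterInfo {
  const Subtarget &ST; // bound once in the constructor, as in this commit

public:
  explicit RegisterInfo(const Subtarget &S) : ST(S) {}

  // Previously each method re-derived the subtarget from a MachineFunction
  // argument; now it reads the cached reference.
  unsigned reservedBase() const { return ST.maxSGPRs() - 4; }
};

int main() {
  Subtarget ST;
  RegisterInfo TRI(ST);
  std::cout << TRI.reservedBase() << '\n'; // prints 98
}
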
@@ -118,11 +108,9 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
- return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+ return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
}
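
The reserved buffer register above is located by pure index arithmetic: alignDown(MaxNumSGPRs, 4) - 4 is the base of the highest 4-aligned SGPR quad inside the function's SGPR budget, which then names an SGPR_128 tuple. A small self-contained check of that arithmetic, with alignDown written locally as a stand-in for llvm::alignDown:

#include <cassert>
#include <cstdint>

// Local stand-in for llvm::alignDown: round down to a multiple of Align.
static uint64_t alignDown(uint64_t Value, uint64_t Align) {
  return Value / Align * Align;
}

int main() {
  unsigned MaxNumSGPRs = 102;                       // example SGPR budget
  unsigned BaseIdx = alignDown(MaxNumSGPRs, 4) - 4; // == 96
  assert(BaseIdx % 4 == 0 && BaseIdx + 4 <= MaxNumSGPRs);
  (void)BaseIdx;
  return 0;
}
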
static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
@@ -144,7 +132,6 @@ static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
@@ -202,8 +189,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(AMDGPU::VCC_HI);
}
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-
unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
@@ -220,6 +205,14 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, Reg);
}
+ // Reserve all remaining AGPRs if the subtarget has no instructions to use them.
+ if (!ST.hasMAIInsts()) {
+ for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
+ unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
+ }
+ }
+
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
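
The new block above reserves the accumulator register file whenever it cannot be used. A rough sketch of the idea, using std::bitset in place of llvm::BitVector and ignoring the tuple side effects of reserveRegisterTuples:

#include <bitset>
#include <iostream>

int main() {
  constexpr unsigned NumAGPRs = 256;
  bool HasMAIInsts = false;       // assumption: a subtarget without MAI
  std::bitset<NumAGPRs> Reserved;

  if (!HasMAIInsts)
    for (unsigned i = 0; i < NumAGPRs; ++i)
      Reserved.set(i);            // reserveRegisterTuples also marks tuples

  std::cout << Reserved.count() << " AGPRs reserved\n";
}
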
@@ -293,32 +286,17 @@ bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const
bool SIRegisterInfo::requiresFrameIndexScavenging(
const MachineFunction &MF) const {
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (MFI.hasStackObjects())
- return true;
-
- // May need to deal with callee saved registers.
- const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- return !Info->isEntryFunction();
+ // Do not use frame virtual registers. They used to be used for SGPRs, but
+ // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
+ // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
+ // spill.
+ return false;
}
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (!MFI.hasStackObjects())
- return false;
-
- // The scavenger is used for large frames which may require finding a free
- // register for large offsets.
- if (!isUInt<12>(MFI.getStackSize()))
- return true;
-
- // If using scalar stores, for spills, m0 is needed for the scalar store
- // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual
- // register for it during frame index elimination, so the scavenger is
- // directly needed.
- return MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
- MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
+ return MFI.hasStackObjects();
}
bool SIRegisterInfo::requiresVirtualBaseRegisters(
@@ -372,8 +350,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
DL = Ins->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = Subtarget.getInstrInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
if (Offset == 0) {
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
@@ -382,9 +359,9 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
}
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
.addImm(Offset);
@@ -399,11 +376,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
int64_t Offset) const {
-
- MachineBasicBlock *MBB = MI.getParent();
- MachineFunction *MF = MBB->getParent();
- const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = Subtarget.getInstrInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
#ifndef NDEBUG
// FIXME: Is it possible to be storing a frame index to itself?
@@ -419,12 +392,15 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
#endif
MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+#ifndef NDEBUG
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
+#endif
assert(FIOp && FIOp->isFI() && "frame index must be address operand");
assert(TII->isMUBUF(MI));
assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
- MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
- "should only be seeing frame offset relative FrameIndex");
-
+ MF->getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg() &&
+ "should only be seeing stack pointer offset relative FrameIndex");
MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
int64_t NewOffset = OffsetOp->getImm() + Offset;
@@ -564,7 +540,8 @@ static int getOffsetMUBUFLoad(unsigned Opc) {
}
}
-static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
+static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
+ MachineBasicBlock::iterator MI,
int Index,
unsigned Lane,
unsigned ValueReg,
@@ -572,7 +549,6 @@ static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MI->getParent()->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
@@ -595,11 +571,12 @@ static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
-static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
+static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
MachineFrameInfo &MFI,
MachineBasicBlock::iterator MI,
int Index,
int64_t Offset) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
MachineBasicBlock *MBB = MI->getParent();
const DebugLoc &DL = MI->getDebugLoc();
bool IsStore = MI->mayStore();
@@ -611,7 +588,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
return false;
const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
- if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr())
+ if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr())
return true;
MachineInstrBuilder NewMI =
@@ -624,6 +601,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
+ .addImm(0) // swz
.cloneMemRefs(*MI);
const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
@@ -645,7 +623,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
RegScavenger *RS) const {
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MI->getParent()->getParent();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const MachineFrameInfo &MFI = MF->getFrameInfo();
@@ -707,8 +684,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
}
for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
- unsigned SubReg = NumSubRegs == 1 ?
- ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
+ Register SubReg = NumSubRegs == 1
+ ? Register(ValueReg)
+ : getSubReg(ValueReg, getSubRegFromChannel(i));
unsigned SOffsetRegState = 0;
unsigned SrcDstRegState = getDefRegState(!IsStore);
@@ -718,7 +696,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
SrcDstRegState |= getKillRegState(IsKill);
}
- auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill);
+ auto MIB = spillVGPRtoAGPR(ST, MI, Index, i, SubReg, IsKill);
if (!MIB.getInstr()) {
unsigned FinalReg = SubReg;
@@ -743,6 +721,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
+ .addImm(0) // swz
.addMemOperand(NewMMO);
if (!IsStore && TmpReg != AMDGPU::NoRegister)
@@ -763,22 +742,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
}
}
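
Throughout buildSpillLoadStore, a wide value is handled as NumSubRegs independent dword operations, with the scratch offset advancing by EltSize per iteration (the "i != e; ++i, Offset += EltSize" loop header above). A toy version of that iteration, names illustrative only:

#include <cstdio>

int main() {
  const unsigned EltSize = 4;   // bytes per spilled dword
  unsigned RegSizeBytes = 16;   // e.g. a 128-bit VReg_128 value
  unsigned NumSubRegs = RegSizeBytes / EltSize;

  long long Offset = 0;         // base scratch offset of the frame slot
  for (unsigned i = 0; i != NumSubRegs; ++i, Offset += EltSize)
    std::printf("store sub%u at offset %lld\n", i, Offset);
}
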
-static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
- bool Store) {
- if (SuperRegSize % 16 == 0) {
- return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
- AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
- }
-
- if (SuperRegSize % 8 == 0) {
- return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
- AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
- }
-
- return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
- AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
-}
-
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS,
@@ -794,98 +757,37 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
if (OnlyToVGPR && !SpillToVGPR)
return false;
- MachineRegisterInfo &MRI = MF->getRegInfo();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
- unsigned SuperReg = MI->getOperand(0).getReg();
+ Register SuperReg = MI->getOperand(0).getReg();
bool IsKill = MI->getOperand(0).isKill();
const DebugLoc &DL = MI->getDebugLoc();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- bool SpillToSMEM = spillSGPRToSMEM();
- if (SpillToSMEM && OnlyToVGPR)
- return false;
-
- Register FrameReg = getFrameRegister(*MF);
-
assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
SuperReg != MFI->getFrameOffsetReg() &&
SuperReg != MFI->getScratchWaveOffsetReg()));
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
- unsigned OffsetReg = AMDGPU::M0;
unsigned M0CopyReg = AMDGPU::NoRegister;
- if (SpillToSMEM) {
- if (RS->isRegUsed(AMDGPU::M0)) {
- M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
- .addReg(AMDGPU::M0);
- }
- }
-
- unsigned ScalarStoreOp;
unsigned EltSize = 4;
const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
- if (SpillToSMEM && isSGPRClass(RC)) {
- // XXX - if private_element_size is larger than 4 it might be useful to be
- // able to spill wider vmem spills.
- std::tie(EltSize, ScalarStoreOp) =
- getSpillEltSize(getRegSizeInBits(*RC) / 8, true);
- }
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+ // Scavenged temporary VGPR to use. It must be scavenged once for any number
+ // of spilled subregs.
+ Register TmpVGPR;
+
// SubReg carries the "Kill" flag when SubReg == SuperReg.
unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- unsigned SubReg = NumSubRegs == 1 ?
- SuperReg : getSubReg(SuperReg, SplitParts[i]);
-
- if (SpillToSMEM) {
- int64_t FrOffset = FrameInfo.getObjectOffset(Index);
-
- // The allocated memory size is really the wavefront size * the frame
- // index size. The widest register class is 64 bytes, so a 4-byte scratch
- // allocation is enough to spill this in a single stack object.
- //
- // FIXME: Frame size/offsets are computed earlier than this, so the extra
- // space is still unnecessarily allocated.
-
- unsigned Align = FrameInfo.getObjectAlignment(Index);
- MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
- MachineMemOperand *MMO
- = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
- EltSize, MinAlign(Align, EltSize * i));
-
- // SMEM instructions only support a single offset, so increment the wave
- // offset.
-
- int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
- if (Offset != 0) {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(FrameReg)
- .addImm(Offset);
- } else {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
- .addReg(FrameReg);
- }
-
- BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
- .addReg(SubReg, getKillRegState(IsKill)) // sdata
- .addReg(MFI->getScratchRSrcReg()) // sbase
- .addReg(OffsetReg, RegState::Kill) // soff
- .addImm(0) // glc
- .addImm(0) // dlc
- .addMemOperand(MMO);
-
- continue;
- }
+ Register SubReg =
+ NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
@@ -915,15 +817,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
return false;
// Spill SGPR to a frame index.
- // TODO: Should VI try to spill to VGPR and then spill to SMEM?
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- // TODO: Should VI try to spill to VGPR and then spill to SMEM?
+ if (!TmpVGPR.isValid())
+ TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
MachineInstrBuilder Mov
- = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(SubReg, SubKillState);
-
// There could be undef components of a spilled super register.
// TODO: Can we detect this and skip the spill?
if (NumSubRegs > 1) {
@@ -941,7 +841,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
EltSize, MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
- .addReg(TmpReg, RegState::Kill) // src
+ .addReg(TmpVGPR, RegState::Kill) // src
.addFrameIndex(Index) // vaddr
.addReg(MFI->getScratchRSrcReg()) // srsrc
.addReg(MFI->getStackPtrOffsetReg()) // soffset
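
The TmpVGPR change above replaces per-subreg createVirtualRegister calls, which are no longer valid this late in the pipeline, with a single lazily scavenged physical VGPR that is reused for every element of the spill. A sketch of that lazy-scavenge shape, with std::optional standing in for an invalid Register:

#include <iostream>
#include <optional>

static unsigned scavengeVGPR() { return 42; } // pretend v42 is free

int main() {
  std::optional<unsigned> TmpVGPR; // plays the role of an invalid Register
  const unsigned NumSubRegs = 4;

  for (unsigned i = 0; i < NumSubRegs; ++i) {
    if (!TmpVGPR)                  // scavenge once, not once per subreg
      TmpVGPR = scavengeVGPR();
    std::cout << "spill lane " << i << " through v" << *TmpVGPR << '\n';
  }
}
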
@@ -965,7 +865,6 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
RegScavenger *RS,
bool OnlyToVGPR) const {
MachineFunction *MF = MI->getParent()->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
@@ -976,84 +875,27 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
return false;
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = MI->getDebugLoc();
- unsigned SuperReg = MI->getOperand(0).getReg();
- bool SpillToSMEM = spillSGPRToSMEM();
- if (SpillToSMEM && OnlyToVGPR)
- return false;
+ Register SuperReg = MI->getOperand(0).getReg();
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
- unsigned OffsetReg = AMDGPU::M0;
unsigned M0CopyReg = AMDGPU::NoRegister;
- if (SpillToSMEM) {
- if (RS->isRegUsed(AMDGPU::M0)) {
- M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
- .addReg(AMDGPU::M0);
- }
- }
-
unsigned EltSize = 4;
- unsigned ScalarLoadOp;
-
- Register FrameReg = getFrameRegister(*MF);
const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
- if (SpillToSMEM && isSGPRClass(RC)) {
- // XXX - if private_element_size is larger than 4 it might be useful to be
- // able to spill wider vmem spills.
- std::tie(EltSize, ScalarLoadOp) =
- getSpillEltSize(getRegSizeInBits(*RC) / 8, false);
- }
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
- // SubReg carries the "Kill" flag when SubReg == SuperReg.
- int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+ Register TmpVGPR;
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- unsigned SubReg = NumSubRegs == 1 ?
- SuperReg : getSubReg(SuperReg, SplitParts[i]);
-
- if (SpillToSMEM) {
- // FIXME: Size may be > 4 but extra bytes wasted.
- unsigned Align = FrameInfo.getObjectAlignment(Index);
- MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
- MachineMemOperand *MMO
- = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
- EltSize, MinAlign(Align, EltSize * i));
-
- // Add i * 4 offset
- int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
- if (Offset != 0) {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(FrameReg)
- .addImm(Offset);
- } else {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
- .addReg(FrameReg);
- }
-
- auto MIB =
- BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
- .addReg(MFI->getScratchRSrcReg()) // sbase
- .addReg(OffsetReg, RegState::Kill) // soff
- .addImm(0) // glc
- .addImm(0) // dlc
- .addMemOperand(MMO);
-
- if (NumSubRegs > 1 && i == 0)
- MIB.addReg(SuperReg, RegState::ImplicitDefine);
-
- continue;
- }
+ Register SubReg =
+ NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
@@ -1071,7 +913,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
// Restore SGPR from a stack slot.
// FIXME: We should use S_LOAD_DWORD here for VI.
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ if (!TmpVGPR.isValid())
+ TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
unsigned Align = FrameInfo.getObjectAlignment(Index);
MachinePointerInfo PtrInfo
@@ -1081,7 +924,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
MachineMemOperand::MOLoad, EltSize,
MinAlign(Align, EltSize * i));
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR)
.addFrameIndex(Index) // vaddr
.addReg(MFI->getScratchRSrcReg()) // srsrc
.addReg(MFI->getStackPtrOffsetReg()) // soffset
@@ -1090,7 +933,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
auto MIB =
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
- .addReg(TmpReg, RegState::Kill);
+ .addReg(TmpVGPR, RegState::Kill);
if (NumSubRegs > 1)
MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
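
The restore path mirrors the spill: the dword is reloaded into a scavenged VGPR, then V_READFIRSTLANE_B32 moves it back into the scalar register. This works because every active lane reloads the same value; a toy model of that uniformity assumption:

#include <array>
#include <cassert>

int main() {
  std::array<unsigned, 64> TmpVGPR; // one dword per lane of a wave64 VGPR
  TmpVGPR.fill(0xCAFE);             // the reload writes the same value to
                                    // every lane, so the register is uniform

  unsigned SGPR = TmpVGPR[0];       // v_readfirstlane_b32: read the first
                                    // active lane (lane 0 in this model)
  assert(SGPR == 0xCAFE);
  (void)SGPR;
  return 0;
}
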
@@ -1141,11 +984,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
MachineFunction *MF = MI->getParent()->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
@@ -1255,13 +1096,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// In an entry function/kernel the offset is already the absolute
// address relative to the frame register.
- unsigned DiffReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register TmpDiffReg =
+ RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+
+ // If there's no free SGPR, modify the FP in place
+ Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg;
bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
Register ResultReg = IsCopy ?
MI->getOperand(0).getReg() :
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
.addReg(FrameReg)
@@ -1271,35 +1115,80 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (Offset == 0) {
// XXX - This never happens because of emergency scavenging slot at 0?
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
- .addImm(Log2_32(ST.getWavefrontSize()))
+ .addImm(ST.getWavefrontSizeLog2())
.addReg(DiffReg);
} else {
- unsigned ScaledReg
- = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
+ Register ScaledReg =
+ RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
- .addImm(Log2_32(ST.getWavefrontSize()))
- .addReg(DiffReg, RegState::Kill);
+ BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+ ScaledReg)
+ .addImm(ST.getWavefrontSizeLog2())
+ .addReg(DiffReg, RegState::Kill);
- // TODO: Fold if use instruction is another add of a constant.
- if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
- TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
- .addImm(Offset)
- .addReg(ScaledReg, RegState::Kill)
- .addImm(0); // clamp bit
+ const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
+
+ // TODO: Fold if use instruction is another add of a constant.
+ if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
+ // FIXME: This can fail
+ MIB.addImm(Offset);
+ MIB.addReg(ScaledReg, RegState::Kill);
+ if (!IsVOP2)
+ MIB.addImm(0); // clamp bit
+ } else {
+ Register ConstOffsetReg =
+ RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MIB, 0, false);
+
+ // This should always be able to use the unused carry out.
+ assert(ConstOffsetReg && "this scavenge should not be able to fail");
+
+ BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
+ .addImm(Offset);
+ MIB.addReg(ConstOffsetReg, RegState::Kill);
+ MIB.addReg(ScaledReg, RegState::Kill);
+ MIB.addImm(0); // clamp bit
+ }
} else {
- unsigned ConstOffsetReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ // We have to produce a carry out, and there isn't a free SGPR
+ // pair for it. We can keep the whole computation on the SALU to
+ // avoid clobbering an additional register at the cost of an extra
+ // mov.
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
- .addImm(Offset);
- TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
- .addReg(ConstOffsetReg, RegState::Kill)
+ // We may have 1 free scratch SGPR even though a carry out is
+ // unavailable. Only one additional mov is needed.
+ Register TmpScaledReg =
+ RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+ Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg;
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
+ .addReg(DiffReg, RegState::Kill)
+ .addImm(ST.getWavefrontSizeLog2());
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
.addReg(ScaledReg, RegState::Kill)
- .addImm(0); // clamp bit
+ .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
+ .addReg(ScaledReg, RegState::Kill);
+
+ // If there were truly no free SGPRs, we need to undo everything.
+ if (!TmpScaledReg.isValid()) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
+ .addReg(ScaledReg, RegState::Kill)
+ .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
+ .addReg(DiffReg, RegState::Kill)
+ .addImm(ST.getWavefrontSizeLog2());
+ }
}
}
+ if (!TmpDiffReg.isValid()) {
+ // Restore the FP.
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg)
+ .addReg(FrameReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ }
+
// Don't introduce an extra copy if we're just materializing in a mov.
if (IsCopy)
MI->eraseFromParent();
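
The fallback in this hunk is worth spelling out: when neither a scratch SGPR nor a carry-out pair can be scavenged, the frame pointer itself becomes the working register, and each SALU step is later undone in reverse order so the FP is bit-exact on exit (the shift round-trips only because the intermediate difference is wave-size aligned). Plain integers model the register contents in this sketch:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned WavefrontSizeLog2 = 6;  // wave64
  const int64_t ScratchWaveOffset = 0x400;
  const int64_t Offset = 16;             // frame-index offset to apply

  int64_t FP = 0x1400;                   // frame register doubles as scratch
  const int64_t SavedFP = FP;

  FP -= ScratchWaveOffset;               // S_SUB_U32: diff = FP - wave offset
  FP >>= WavefrontSizeLog2;              // S_LSHR_B32: scale to per-lane bytes
  FP += Offset;                          // S_ADD_U32: add the FI offset
  const int64_t Result = FP;             // COPY into the result register

  FP -= Offset;                          // now undo everything, reversed
  FP <<= WavefrontSizeLog2;              // exact only because the diff was
                                         // wave-size aligned to begin with
  FP += ScratchWaveOffset;

  assert(FP == SavedFP && Result == (0x1000 >> 6) + 16);
  (void)SavedFP; (void)Result;
  return 0;
}
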
@@ -1325,7 +1214,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int64_t NewOffset = OldImm + Offset;
if (isUInt<12>(NewOffset) &&
- buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
+ buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
MI->eraseFromParent();
return;
}
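
The guard here is the MUBUF encoding limit: a frame offset can be folded into the instruction's immediate only while the sum still fits in 12 unsigned bits. A standalone sketch, with isUIntN written locally as a stand-in for llvm::isUInt:

#include <cstdint>
#include <iostream>

// Local stand-in for llvm::isUInt<N>.
template <unsigned N> bool isUIntN(uint64_t x) { return x < (1ull << N); }

int main() {
  int64_t OldImm = 4000, FrameOffset = 64;
  int64_t NewOffset = OldImm + FrameOffset; // 4064 < 4096: still encodable

  if (NewOffset >= 0 && isUIntN<12>(static_cast<uint64_t>(NewOffset)))
    std::cout << "fold " << NewOffset << " into the MUBUF offset field\n";
  else
    std::cout << "materialize the offset in a register instead\n";
}
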
@@ -1337,7 +1226,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int64_t Offset = FrameInfo.getObjectOffset(Index);
FIOp.ChangeToImmediate(Offset);
if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
.addImm(Offset);
FIOp.ChangeToRegister(TmpReg, false, false, true);
@@ -1347,27 +1236,13 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
- const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg);
- unsigned Size = getRegSizeInBits(*RC);
- unsigned AltName = AMDGPU::NoRegAltName;
-
- switch (Size) {
- case 32: AltName = AMDGPU::Reg32; break;
- case 64: AltName = AMDGPU::Reg64; break;
- case 96: AltName = AMDGPU::Reg96; break;
- case 128: AltName = AMDGPU::Reg128; break;
- case 160: AltName = AMDGPU::Reg160; break;
- case 256: AltName = AMDGPU::Reg256; break;
- case 512: AltName = AMDGPU::Reg512; break;
- case 1024: AltName = AMDGPU::Reg1024; break;
- }
- return AMDGPUInstPrinter::getRegisterName(Reg, AltName);
+ return AMDGPUInstPrinter::getRegisterName(Reg);
}
// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
- assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+ assert(!Register::isVirtualRegister(Reg));
static const TargetRegisterClass *const BaseClasses[] = {
&AMDGPU::VGPR_32RegClass,
@@ -1408,8 +1283,6 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
unsigned Size = getRegSizeInBits(*RC);
- if (Size < 32)
- return false;
switch (Size) {
case 32:
return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
@@ -1427,8 +1300,11 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
case 1024:
return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr;
+ case 1:
+ return getCommonSubClass(&AMDGPU::VReg_1RegClass, RC) != nullptr;
default:
- llvm_unreachable("Invalid register class size");
+ assert(Size < 32 && "Invalid register class size");
+ return false;
}
}
@@ -1476,6 +1352,8 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
return &AMDGPU::VReg_512RegClass;
case 1024:
return &AMDGPU::VReg_1024RegClass;
+ case 1:
+ return &AMDGPU::VReg_1RegClass;
default:
llvm_unreachable("Invalid register class size");
}
@@ -1509,7 +1387,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
case 96:
return &AMDGPU::SReg_96RegClass;
case 128:
- return &AMDGPU::SReg_128RegClass;
+ return &AMDGPU::SGPR_128RegClass;
case 160:
return &AMDGPU::SReg_160RegClass;
case 256:
@@ -1539,7 +1417,7 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
case 3:
return &AMDGPU::SReg_96RegClass;
case 4:
- return &AMDGPU::SReg_128RegClass;
+ return &AMDGPU::SGPR_128RegClass;
case 5:
return &AMDGPU::SReg_160RegClass;
case 8:
@@ -1587,6 +1465,15 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
}
}
+bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
+ if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
+ OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
+ return !ST.hasMFMAInlineLiteralBug();
+
+ return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
+ OpType <= AMDGPU::OPERAND_SRC_LAST;
+}
+
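
The new predicate above is two range checks over the operand-type enum, with the AC (accumulator) range additionally gated on an MFMA inline-literal erratum. A sketch with made-up enum values, not the real AMDGPU operand-type numbering:

#include <iostream>

// Made-up operand-type numbering, not AMDGPU's real values.
enum OperandType : unsigned {
  OPERAND_SRC_FIRST = 0, OPERAND_SRC_LAST = 15,
  OPERAND_REG_INLINE_AC_FIRST = 16, OPERAND_REG_INLINE_AC_LAST = 23,
};

static bool opCanUseInlineConstant(unsigned OpType, bool HasMFMALiteralBug) {
  if (OpType >= OPERAND_REG_INLINE_AC_FIRST &&
      OpType <= OPERAND_REG_INLINE_AC_LAST)
    return !HasMFMALiteralBug;  // accumulator operands: vetoed on buggy parts
  return OpType >= OPERAND_SRC_FIRST && OpType <= OPERAND_SRC_LAST;
}

int main() {
  std::cout << opCanUseInlineConstant(18, true) << '\n';  // 0
  std::cout << opCanUseInlineConstant(3, false) << '\n';  // 1
}
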
bool SIRegisterInfo::shouldRewriteCopySrc(
const TargetRegisterClass *DefRC,
unsigned DefSubReg,
@@ -1802,7 +1689,7 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC
const TargetRegisterClass*
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
unsigned Reg) const {
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
return MRI.getRegClass(Reg);
return getPhysRegClass(Reg);
@@ -1845,8 +1732,6 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
@@ -1900,18 +1785,22 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
return isWave32 ?
&AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass;
case AMDGPU::SGPRRegBankID:
- return &AMDGPU::SReg_32_XM0RegClass;
+ return &AMDGPU::SReg_32RegClass;
case AMDGPU::SCCRegBankID:
// This needs to return an allocatable class, so don't bother returning
// the dummy SCC class.
- return &AMDGPU::SReg_32_XM0RegClass;
+ //
+ // FIXME: This is a grotesque hack. We use SGPR_32 as an indication this
+ // was not a VCC bank value since we use the larger class SReg_32 for
+ // other values. These should all use SReg_32.
+ return &AMDGPU::SGPR_32RegClass;
default:
llvm_unreachable("unknown register bank");
}
}
case 32:
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
- &AMDGPU::SReg_32_XM0RegClass;
+ &AMDGPU::SReg_32RegClass;
case 64:
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
&AMDGPU::SReg_64_XEXECRegClass;
@@ -1920,7 +1809,7 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
&AMDGPU::SReg_96RegClass;
case 128:
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
- &AMDGPU::SReg_128RegClass;
+ &AMDGPU::SGPR_128RegClass;
case 160:
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
&AMDGPU::SReg_160RegClass;
@@ -1930,10 +1819,13 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
case 512:
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass :
&AMDGPU::SReg_512RegClass;
+ case 1024:
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_1024RegClass :
+ &AMDGPU::SReg_1024RegClass;
default:
if (Size < 32)
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
- &AMDGPU::SReg_32_XM0RegClass;
+ &AMDGPU::SReg_32RegClass;
return nullptr;
}
}
@@ -1941,9 +1833,12 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
const TargetRegisterClass *
SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
const MachineRegisterInfo &MRI) const {
- if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()))
+ const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
+ if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
- return nullptr;
+
+ const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>();
+ return getAllocatableClass(RC);
}
unsigned SIRegisterInfo::getVCC() const {
@@ -1974,7 +1869,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
SlotIndex UseIdx = LIS->getInstructionIndex(Use);
SlotIndex DefIdx;
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
if (!LIS->hasInterval(Reg))
return nullptr;
LiveInterval &LI = LIS->getInterval(Reg);