| field | value | date |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2019-10-23 17:51:42 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2019-10-23 17:51:42 +0000 |
| commit | 1d5ae1026e831016fc29fd927877c86af904481f | |
| tree | 2cdfd12620fcfa5d9e4a0389f85368e8e36f63f9 /lib/Target/AMDGPU/SIRegisterInfo.cpp | |
| parent | e6d1592492a3a379186bfb02bd0f4eda0669c0d5 | |
Diffstat (limited to 'lib/Target/AMDGPU/SIRegisterInfo.cpp')

| mode | path | changes |
|---|---|---|
| -rw-r--r-- | lib/Target/AMDGPU/SIRegisterInfo.cpp | 439 |

1 file changed, 167 insertions(+), 272 deletions(-)
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index f152deb28004..f58bc3060c42 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -48,11 +48,6 @@ void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
   }
 }
 
-static cl::opt<bool> EnableSpillSGPRToSMEM(
-  "amdgpu-spill-sgpr-to-smem",
-  cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
-  cl::init(false));
-
 static cl::opt<bool> EnableSpillSGPRToVGPR(
   "amdgpu-spill-sgpr-to-vgpr",
   cl::desc("Enable spilling VGPRs to SGPRs"),
@@ -61,17 +56,12 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
 
 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
   AMDGPURegisterInfo(),
+  ST(ST),
   SGPRPressureSets(getNumRegPressureSets()),
   VGPRPressureSets(getNumRegPressureSets()),
   AGPRPressureSets(getNumRegPressureSets()),
-  SpillSGPRToVGPR(false),
-  SpillSGPRToSMEM(false),
+  SpillSGPRToVGPR(EnableSpillSGPRToVGPR),
   isWave32(ST.isWave32()) {
-  if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
-    SpillSGPRToSMEM = true;
-  else if (EnableSpillSGPRToVGPR)
-    SpillSGPRToVGPR = true;
-
   unsigned NumRegPressureSets = getNumRegPressureSets();
 
   SGPRSetID = NumRegPressureSets;
@@ -118,11 +108,9 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
 
 unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
   const MachineFunction &MF) const {
-
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
   unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
-  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
 }
 
 static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
@@ -144,7 +132,6 @@ static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
 
 unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
   const MachineFunction &MF) const {
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
   return AMDGPU::SGPR_32RegClass.getRegister(Reg);
 }
@@ -202,8 +189,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
     Reserved.set(AMDGPU::VCC_HI);
   }
 
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-
   unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
   unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
   for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
@@ -220,6 +205,14 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
     reserveRegisterTuples(Reserved, Reg);
   }
 
+  // Reserve all the rest AGPRs if there are no instructions to use it.
+  if (!ST.hasMAIInsts()) {
+    for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
+      unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
+      reserveRegisterTuples(Reserved, Reg);
+    }
+  }
+
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
@@ -293,32 +286,17 @@ bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const
 
 bool SIRegisterInfo::requiresFrameIndexScavenging(
   const MachineFunction &MF) const {
-  const MachineFrameInfo &MFI = MF.getFrameInfo();
-  if (MFI.hasStackObjects())
-    return true;
-
-  // May need to deal with callee saved registers.
-  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  return !Info->isEntryFunction();
+  // Do not use frame virtual registers. They used to be used for SGPRs, but
+  // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
+  // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
+  // spill.
+  return false;
 }
 
 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
   const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  if (!MFI.hasStackObjects())
-    return false;
-
-  // The scavenger is used for large frames which may require finding a free
-  // register for large offsets.
-  if (!isUInt<12>(MFI.getStackSize()))
-    return true;
-
-  // If using scalar stores, for spills, m0 is needed for the scalar store
-  // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual
-  // register for it during frame index elimination, so the scavenger is
-  // directly needed.
-  return MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
-         MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
+  return MFI.hasStackObjects();
 }
 
 bool SIRegisterInfo::requiresVirtualBaseRegisters(
@@ -372,8 +350,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
     DL = Ins->getDebugLoc();
 
   MachineFunction *MF = MBB->getParent();
-  const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = Subtarget.getInstrInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
 
   if (Offset == 0) {
     BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
@@ -382,9 +359,9 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
   }
 
   MachineRegisterInfo &MRI = MF->getRegInfo();
-  unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+  Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 
-  unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
 
   BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
     .addImm(Offset);
@@ -399,11 +376,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
 
 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
                                        int64_t Offset) const {
-
-  MachineBasicBlock *MBB = MI.getParent();
-  MachineFunction *MF = MBB->getParent();
-  const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = Subtarget.getInstrInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
 
 #ifndef NDEBUG
   // FIXME: Is it possible to be storing a frame index to itself?
@@ -419,12 +392,15 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
 #endif
 
   MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+#ifndef NDEBUG
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineFunction *MF = MBB->getParent();
+#endif
   assert(FIOp && FIOp->isFI() && "frame index must be address operand");
   assert(TII->isMUBUF(MI));
   assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
-         MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
-         "should only be seeing frame offset relative FrameIndex");
-
+         MF->getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg() &&
+         "should only be seeing stack pointer offset relative FrameIndex");
 
   MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
   int64_t NewOffset = OffsetOp->getImm() + Offset;
@@ -564,7 +540,8 @@ static int getOffsetMUBUFLoad(unsigned Opc) {
   }
 }
 
-static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
+static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
+                                           MachineBasicBlock::iterator MI,
                                            int Index,
                                            unsigned Lane,
                                            unsigned ValueReg,
@@ -572,7 +549,6 @@ static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
   MachineBasicBlock *MBB = MI->getParent();
   MachineFunction *MF = MI->getParent()->getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
-  const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
   MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
@@ -595,11 +571,12 @@ static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
 
 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
 // need to handle the case where an SGPR may need to be spilled while spilling.
-static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
+static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
                                       MachineFrameInfo &MFI,
                                       MachineBasicBlock::iterator MI,
                                       int Index,
                                       int64_t Offset) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
   MachineBasicBlock *MBB = MI->getParent();
   const DebugLoc &DL = MI->getDebugLoc();
   bool IsStore = MI->mayStore();
@@ -611,7 +588,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
     return false;
 
   const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
-  if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr())
+  if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr())
     return true;
 
   MachineInstrBuilder NewMI =
@@ -624,6 +601,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
       .addImm(0) // slc
       .addImm(0) // tfe
      .addImm(0) // dlc
+      .addImm(0) // swz
      .cloneMemRefs(*MI);
 
   const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
@@ -645,7 +623,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
                                          RegScavenger *RS) const {
   MachineBasicBlock *MBB = MI->getParent();
   MachineFunction *MF = MI->getParent()->getParent();
-  const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const MachineFrameInfo &MFI = MF->getFrameInfo();
 
@@ -707,8 +684,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
   }
 
   for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
-    unsigned SubReg = NumSubRegs == 1 ?
-      ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
+    Register SubReg = NumSubRegs == 1
+                          ? Register(ValueReg)
+                          : getSubReg(ValueReg, getSubRegFromChannel(i));
 
     unsigned SOffsetRegState = 0;
     unsigned SrcDstRegState = getDefRegState(!IsStore);
@@ -718,7 +696,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
       SrcDstRegState |= getKillRegState(IsKill);
     }
 
-    auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill);
+    auto MIB = spillVGPRtoAGPR(ST, MI, Index, i, SubReg, IsKill);
 
     if (!MIB.getInstr()) {
       unsigned FinalReg = SubReg;
@@ -743,6 +721,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
         .addImm(0) // slc
         .addImm(0) // tfe
        .addImm(0) // dlc
+        .addImm(0) // swz
        .addMemOperand(NewMMO);
 
       if (!IsStore && TmpReg != AMDGPU::NoRegister)
@@ -763,22 +742,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
   }
 }
 
-static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
-                                                     bool Store) {
-  if (SuperRegSize % 16 == 0) {
-    return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
-                         AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
-  }
-
-  if (SuperRegSize % 8 == 0) {
-    return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
-                        AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
-  }
-
-  return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
-                      AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
-}
-
 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                                int Index,
                                RegScavenger *RS,
@@ -794,98 +757,37 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
   if (OnlyToVGPR && !SpillToVGPR)
     return false;
 
-  MachineRegisterInfo &MRI = MF->getRegInfo();
-  const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
-  unsigned SuperReg = MI->getOperand(0).getReg();
+  Register SuperReg = MI->getOperand(0).getReg();
   bool IsKill = MI->getOperand(0).isKill();
   const DebugLoc &DL = MI->getDebugLoc();
 
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
 
-  bool SpillToSMEM = spillSGPRToSMEM();
-  if (SpillToSMEM && OnlyToVGPR)
-    return false;
-
-  Register FrameReg = getFrameRegister(*MF);
-
   assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
                          SuperReg != MFI->getFrameOffsetReg() &&
                          SuperReg != MFI->getScratchWaveOffsetReg()));
 
   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
 
-  unsigned OffsetReg = AMDGPU::M0;
   unsigned M0CopyReg = AMDGPU::NoRegister;
 
-  if (SpillToSMEM) {
-    if (RS->isRegUsed(AMDGPU::M0)) {
-      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
-        .addReg(AMDGPU::M0);
-    }
-  }
-
-  unsigned ScalarStoreOp;
   unsigned EltSize = 4;
   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
-  if (SpillToSMEM && isSGPRClass(RC)) {
-    // XXX - if private_element_size is larger than 4 it might be useful to be
-    // able to spill wider vmem spills.
-    std::tie(EltSize, ScalarStoreOp) =
-          getSpillEltSize(getRegSizeInBits(*RC) / 8, true);
-  }
 
   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
 
+  // Scavenged temporary VGPR to use. It must be scavenged once for any number
+  // of spilled subregs.
+  Register TmpVGPR;
+
   // SubReg carries the "Kill" flag when SubReg == SuperReg.
   unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
-    unsigned SubReg = NumSubRegs == 1 ?
-      SuperReg : getSubReg(SuperReg, SplitParts[i]);
-
-    if (SpillToSMEM) {
-      int64_t FrOffset = FrameInfo.getObjectOffset(Index);
-
-      // The allocated memory size is really the wavefront size * the frame
-      // index size. The widest register class is 64 bytes, so a 4-byte scratch
-      // allocation is enough to spill this in a single stack object.
-      //
-      // FIXME: Frame size/offsets are computed earlier than this, so the extra
-      // space is still unnecessarily allocated.
-
-      unsigned Align = FrameInfo.getObjectAlignment(Index);
-      MachinePointerInfo PtrInfo
-        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
-      MachineMemOperand *MMO
-        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
-                                   EltSize, MinAlign(Align, EltSize * i));
-
-      // SMEM instructions only support a single offset, so increment the wave
-      // offset.
-
-      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
-      if (Offset != 0) {
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
-          .addReg(FrameReg)
-          .addImm(Offset);
-      } else {
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
-          .addReg(FrameReg);
-      }
-
-      BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
-        .addReg(SubReg, getKillRegState(IsKill)) // sdata
-        .addReg(MFI->getScratchRSrcReg())        // sbase
-        .addReg(OffsetReg, RegState::Kill)       // soff
-        .addImm(0)                               // glc
-        .addImm(0)                               // dlc
-        .addMemOperand(MMO);
-
-      continue;
-    }
+    Register SubReg =
+        NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
 
     if (SpillToVGPR) {
       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
@@ -915,15 +817,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
       return false;
 
     // Spill SGPR to a frame index.
-    // TODO: Should VI try to spill to VGPR and then spill to SMEM?
-    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    // TODO: Should VI try to spill to VGPR and then spill to SMEM?
+    if (!TmpVGPR.isValid())
+      TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
 
     MachineInstrBuilder Mov
-      = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+      = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
       .addReg(SubReg, SubKillState);
 
-
     // There could be undef components of a spilled super register.
     // TODO: Can we detect this and skip the spill?
     if (NumSubRegs > 1) {
@@ -941,7 +841,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
-        .addReg(TmpReg, RegState::Kill)      // src
+        .addReg(TmpVGPR, RegState::Kill)     // src
        .addFrameIndex(Index)                // vaddr
        .addReg(MFI->getScratchRSrcReg())    // srrsrc
        .addReg(MFI->getStackPtrOffsetReg()) // soffset
@@ -965,7 +865,6 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                  RegScavenger *RS,
                                  bool OnlyToVGPR) const {
   MachineFunction *MF = MI->getParent()->getParent();
-  MachineRegisterInfo &MRI = MF->getRegInfo();
   MachineBasicBlock *MBB = MI->getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 
@@ -976,84 +875,27 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
     return false;
 
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
-  const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const DebugLoc &DL = MI->getDebugLoc();
 
-  unsigned SuperReg = MI->getOperand(0).getReg();
-  bool SpillToSMEM = spillSGPRToSMEM();
-  if (SpillToSMEM && OnlyToVGPR)
-    return false;
+  Register SuperReg = MI->getOperand(0).getReg();
 
   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
 
-  unsigned OffsetReg = AMDGPU::M0;
   unsigned M0CopyReg = AMDGPU::NoRegister;
 
-  if (SpillToSMEM) {
-    if (RS->isRegUsed(AMDGPU::M0)) {
-      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
-        .addReg(AMDGPU::M0);
-    }
-  }
-
   unsigned EltSize = 4;
-  unsigned ScalarLoadOp;
-
-  Register FrameReg = getFrameRegister(*MF);
 
   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
-  if (SpillToSMEM && isSGPRClass(RC)) {
-    // XXX - if private_element_size is larger than 4 it might be useful to be
-    // able to spill wider vmem spills.
-    std::tie(EltSize, ScalarLoadOp) =
-          getSpillEltSize(getRegSizeInBits(*RC) / 8, false);
-  }
 
   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
 
-  // SubReg carries the "Kill" flag when SubReg == SuperReg.
-  int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+  Register TmpVGPR;
 
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
-    unsigned SubReg = NumSubRegs == 1 ?
-      SuperReg : getSubReg(SuperReg, SplitParts[i]);
-
-    if (SpillToSMEM) {
-      // FIXME: Size may be > 4 but extra bytes wasted.
-      unsigned Align = FrameInfo.getObjectAlignment(Index);
-      MachinePointerInfo PtrInfo
-        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
-      MachineMemOperand *MMO
-        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
-                                   EltSize, MinAlign(Align, EltSize * i));
-
-      // Add i * 4 offset
-      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
-      if (Offset != 0) {
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
-          .addReg(FrameReg)
-          .addImm(Offset);
-      } else {
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
-          .addReg(FrameReg);
-      }
-
-      auto MIB =
-        BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
-        .addReg(MFI->getScratchRSrcReg()) // sbase
-        .addReg(OffsetReg, RegState::Kill) // soff
-        .addImm(0) // glc
-        .addImm(0) // dlc
-        .addMemOperand(MMO);
-
-      if (NumSubRegs > 1 && i == 0)
-        MIB.addReg(SuperReg, RegState::ImplicitDefine);
-
-      continue;
-    }
+    Register SubReg =
+        NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
 
     if (SpillToVGPR) {
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
@@ -1071,7 +913,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
 
    // Restore SGPR from a stack slot.
    // FIXME: We should use S_LOAD_DWORD here for VI.
-    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    if (!TmpVGPR.isValid())
+      TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
    unsigned Align = FrameInfo.getObjectAlignment(Index);
 
    MachinePointerInfo PtrInfo
@@ -1081,7 +924,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
      MachineMemOperand::MOLoad, EltSize,
      MinAlign(Align, EltSize * i));
 
-    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
+    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR)
      .addFrameIndex(Index)                // vaddr
      .addReg(MFI->getScratchRSrcReg())    // srsrc
      .addReg(MFI->getStackPtrOffsetReg()) // soffset
@@ -1090,7 +933,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
 
    auto MIB =
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
-      .addReg(TmpReg, RegState::Kill);
+      .addReg(TmpVGPR, RegState::Kill);
 
    if (NumSubRegs > 1)
      MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
@@ -1141,11 +984,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                        int SPAdj, unsigned FIOperandNum,
                                        RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
-  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
-  const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();
 
@@ -1255,13 +1096,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
        // In an entry function/kernel the offset is already the absolute
        // address relative to the frame register.
-        unsigned DiffReg
-          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+        Register TmpDiffReg =
+          RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+
+        // If there's no free SGPR, in-place modify the FP
+        Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg;
 
        bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
        Register ResultReg = IsCopy ?
          MI->getOperand(0).getReg() :
-          MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+          RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
 
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
          .addReg(FrameReg)
          .addReg(MFI->getScratchWaveOffsetReg());
@@ -1271,35 +1115,80 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
        if (Offset == 0) {
          // XXX - This never happens because of emergency scavenging slot at 0?
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
-            .addImm(Log2_32(ST.getWavefrontSize()))
+            .addImm(ST.getWavefrontSizeLog2())
            .addReg(DiffReg);
        } else {
-          unsigned ScaledReg
-            = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+          if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
+            Register ScaledReg =
+              RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0);
 
-          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
-            .addImm(Log2_32(ST.getWavefrontSize()))
-            .addReg(DiffReg, RegState::Kill);
+            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                    ScaledReg)
+              .addImm(ST.getWavefrontSizeLog2())
+              .addReg(DiffReg, RegState::Kill);
 
-          // TODO: Fold if use instruction is another add of a constant.
-          if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
-            TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
-              .addImm(Offset)
-              .addReg(ScaledReg, RegState::Kill)
-              .addImm(0); // clamp bit
+            const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
+
+            // TODO: Fold if use instruction is another add of a constant.
+            if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
+              // FIXME: This can fail
+              MIB.addImm(Offset);
+              MIB.addReg(ScaledReg, RegState::Kill);
+              if (!IsVOP2)
+                MIB.addImm(0); // clamp bit
+            } else {
+              Register ConstOffsetReg =
+                RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MIB, 0, false);
+
+              // This should always be able to use the unused carry out.
+              assert(ConstOffsetReg && "this scavenge should not be able to fail");
+
+              BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
+                .addImm(Offset);
+              MIB.addReg(ConstOffsetReg, RegState::Kill);
+              MIB.addReg(ScaledReg, RegState::Kill);
+              MIB.addImm(0); // clamp bit
+            }
          } else {
-            unsigned ConstOffsetReg
-              = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+            // We have to produce a carry out, and we there isn't a free SGPR
+            // pair for it. We can keep the whole computation on the SALU to
+            // avoid clobbering an additional register at the cost of an extra
+            // mov.
 
-            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
-              .addImm(Offset);
-            TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
-              .addReg(ConstOffsetReg, RegState::Kill)
+            // We may have 1 free scratch SGPR even though a carry out is
+            // unavailable. Only one additional mov is needed.
+            Register TmpScaledReg =
+              RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+            Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg;
+
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
+              .addReg(DiffReg, RegState::Kill)
+              .addImm(ST.getWavefrontSizeLog2());
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
              .addReg(ScaledReg, RegState::Kill)
-              .addImm(0); // clamp bit
+              .addImm(Offset);
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
+              .addReg(ScaledReg, RegState::Kill);
+
+            // If there were truly no free SGPRs, we need to undo everything.
+            if (!TmpScaledReg.isValid()) {
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
+                .addReg(ScaledReg, RegState::Kill)
+                .addImm(Offset);
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
                .addReg(DiffReg, RegState::Kill)
                .addImm(ST.getWavefrontSizeLog2());
            }
          }
        }
 
+        if (!TmpDiffReg.isValid()) {
+          // Restore the FP.
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg)
+            .addReg(FrameReg)
+            .addReg(MFI->getScratchWaveOffsetReg());
+        }
+
        // Don't introduce an extra copy if we're just materializing in a mov.
        if (IsCopy)
          MI->eraseFromParent();
@@ -1325,7 +1214,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
            int64_t NewOffset = OldImm + Offset;
 
            if (isUInt<12>(NewOffset) &&
-                buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
+                buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
              MI->eraseFromParent();
              return;
            }
@@ -1337,7 +1226,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
        int64_t Offset = FrameInfo.getObjectOffset(Index);
        FIOp.ChangeToImmediate(Offset);
        if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
-          unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+          Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
            .addImm(Offset);
          FIOp.ChangeToRegister(TmpReg, false, false, true);
@@ -1347,27 +1236,13 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 }
 
 StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
-  const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg);
-  unsigned Size = getRegSizeInBits(*RC);
-  unsigned AltName = AMDGPU::NoRegAltName;
-
-  switch (Size) {
-  case 32: AltName = AMDGPU::Reg32; break;
-  case 64: AltName = AMDGPU::Reg64; break;
-  case 96: AltName = AMDGPU::Reg96; break;
-  case 128: AltName = AMDGPU::Reg128; break;
-  case 160: AltName = AMDGPU::Reg160; break;
-  case 256: AltName = AMDGPU::Reg256; break;
-  case 512: AltName = AMDGPU::Reg512; break;
-  case 1024: AltName = AMDGPU::Reg1024; break;
-  }
-  return AMDGPUInstPrinter::getRegisterName(Reg, AltName);
+  return AMDGPUInstPrinter::getRegisterName(Reg);
 }
 
 // FIXME: This is very slow. It might be worth creating a map from physreg to
 // register class.
 const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
-  assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+  assert(!Register::isVirtualRegister(Reg));
 
  static const TargetRegisterClass *const BaseClasses[] = {
    &AMDGPU::VGPR_32RegClass,
@@ -1408,8 +1283,6 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
 // TargetRegisterClass to mark which classes are VGPRs to make this trivial.
 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
  unsigned Size = getRegSizeInBits(*RC);
-  if (Size < 32)
-    return false;
  switch (Size) {
  case 32:
    return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
@@ -1427,8 +1300,11 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
    return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
  case 1024:
    return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr;
+  case 1:
+    return getCommonSubClass(&AMDGPU::VReg_1RegClass, RC) != nullptr;
  default:
-    llvm_unreachable("Invalid register class size");
+    assert(Size < 32 && "Invalid register class size");
+    return false;
  }
 }
 
@@ -1476,6 +1352,8 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
    return &AMDGPU::VReg_512RegClass;
  case 1024:
    return &AMDGPU::VReg_1024RegClass;
+  case 1:
+    return &AMDGPU::VReg_1RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
@@ -1509,7 +1387,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
  case 96:
    return &AMDGPU::SReg_96RegClass;
  case 128:
-    return &AMDGPU::SReg_128RegClass;
+    return &AMDGPU::SGPR_128RegClass;
  case 160:
    return &AMDGPU::SReg_160RegClass;
  case 256:
@@ -1539,7 +1417,7 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
    case 3:
      return &AMDGPU::SReg_96RegClass;
    case 4:
-      return &AMDGPU::SReg_128RegClass;
+      return &AMDGPU::SGPR_128RegClass;
    case 5:
      return &AMDGPU::SReg_160RegClass;
    case 8:
@@ -1587,6 +1465,15 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
  }
 }
 
+bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
+  if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
+      OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
+    return !ST.hasMFMAInlineLiteralBug();
+
+  return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
+         OpType <= AMDGPU::OPERAND_SRC_LAST;
+}
+
 bool SIRegisterInfo::shouldRewriteCopySrc(
  const TargetRegisterClass *DefRC,
  unsigned DefSubReg,
@@ -1802,7 +1689,7 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC
 
 const TargetRegisterClass*
 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
                                  unsigned Reg) const {
-  if (TargetRegisterInfo::isVirtualRegister(Reg))
+  if (Register::isVirtualRegister(Reg))
    return MRI.getRegClass(Reg);
 
  return getPhysRegClass(Reg);
@@ -1845,8 +1732,6 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
 
 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                             MachineFunction &MF) const {
-
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
@@ -1900,18 +1785,22 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
      return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass :
                        &AMDGPU::SReg_64_XEXECRegClass;
    case AMDGPU::SGPRRegBankID:
-      return &AMDGPU::SReg_32_XM0RegClass;
+      return &AMDGPU::SReg_32RegClass;
    case AMDGPU::SCCRegBankID:
      // This needs to return an allocatable class, so don't bother returning
      // the dummy SCC class.
-      return &AMDGPU::SReg_32_XM0RegClass;
+      //
+      // FIXME: This is a grotesque hack. We use SGPR_32 as an indication this
+      // was not an VCC bank value since we use the larger class SReg_32 for
+      // other values. These should all use SReg_32.
+      return &AMDGPU::SGPR_32RegClass;
    default:
      llvm_unreachable("unknown register bank");
    }
  }
  case 32:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
-                                                &AMDGPU::SReg_32_XM0RegClass;
+                                                &AMDGPU::SReg_32RegClass;
  case 64:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
                                                 &AMDGPU::SReg_64_XEXECRegClass;
@@ -1920,7 +1809,7 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
                                                &AMDGPU::SReg_96RegClass;
  case 128:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
-                                                &AMDGPU::SReg_128RegClass;
+                                                &AMDGPU::SGPR_128RegClass;
  case 160:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
                                                 &AMDGPU::SReg_160RegClass;
@@ -1930,10 +1819,13 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
  case 512:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass :
                                                 &AMDGPU::SReg_512RegClass;
+  case 1024:
+    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_1024RegClass :
+                                                 &AMDGPU::SReg_1024RegClass;
  default:
    if (Size < 32)
      return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
-                                                  &AMDGPU::SReg_32_XM0RegClass;
+                                                  &AMDGPU::SReg_32RegClass;
    return nullptr;
  }
 }
@@ -1941,9 +1833,12 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
 const TargetRegisterClass *
 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
                                         const MachineRegisterInfo &MRI) const {
-  if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()))
+  const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
+  if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
    return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
-  return nullptr;
+
+  const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>();
+  return getAllocatableClass(RC);
 }
 
 unsigned SIRegisterInfo::getVCC() const {
@@ -1974,7 +1869,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
  SlotIndex UseIdx = LIS->getInstructionIndex(Use);
  SlotIndex DefIdx;
 
-  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+  if (Register::isVirtualRegister(Reg)) {
    if (!LIS->hasInterval(Reg))
      return nullptr;
    LiveInterval &LI = LIS->getInterval(Reg);
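
The most mechanical change running through this diff is the migration from bare `unsigned` register numbers to `llvm::Register`. The stand-in below is not the real class (that lives in `llvm/CodeGen/Register.h`); it is a minimal, runnable sketch of the idiom the new code leans on: a default-constructed register is invalid, and `isValid()` replaces comparisons against `AMDGPU::NoRegister`.

```cpp
#include <cassert>

// Minimal stand-in for llvm::Register, only to illustrate the
// unsigned -> Register migration visible throughout the diff above.
class Register {
  unsigned Reg = 0; // 0 means "no register", as in LLVM
public:
  Register() = default;
  Register(unsigned R) : Reg(R) {}
  bool isValid() const { return Reg != 0; }
  unsigned id() const { return Reg; }
};

int main() {
  Register TmpVGPR;            // not scavenged yet, like in spillSGPR above
  assert(!TmpVGPR.isValid());
  TmpVGPR = Register(42);      // pretend the scavenger returned a register
  assert(TmpVGPR.isValid() && TmpVGPR.id() == 42);
  return 0;
}
```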
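`spillSGPR` and `restoreSGPR` stop calling `MRI.createVirtualRegister()` and instead ask the `RegScavenger` for a temporary VGPR: by the time PrologEpilogInserter eliminates frame indices, creating fresh virtual registers is no longer an option. Below is a minimal sketch of the scavenge-once-and-reuse pattern, assuming the `RegisterScavenging.h` API as it is called in the diff; the helper and its callback are hypothetical, not part of the patch.

```cpp
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

// Hypothetical helper: spill NumSubRegs lanes through one scavenged VGPR.
// EmitLane stands in for the per-subreg V_MOV + SI_SPILL_V32_SAVE sequence.
template <typename EmitLaneFn>
void spillThroughScavengedVGPR(RegScavenger &RS,
                               const TargetRegisterClass &VGPRClass,
                               MachineBasicBlock::iterator MI,
                               unsigned NumSubRegs, EmitLaneFn EmitLane) {
  Register TmpVGPR; // invalid until the first lane actually needs it
  for (unsigned I = 0; I != NumSubRegs; ++I) {
    // Scavenge at most once; every lane reuses the same temporary, which is
    // what the `if (!TmpVGPR.isValid())` guard in the diff achieves.
    if (!TmpVGPR.isValid())
      TmpVGPR = RS.scavengeRegister(&VGPRClass, MI, /*SPAdj=*/0);
    EmitLane(TmpVGPR, I);
  }
}
```

The four-argument form seen elsewhere in the diff, `scavengeRegister(RC, MI, 0, false)`, disables spilling inside the scavenger, so it can come back with an invalid `Register` instead of aborting; that is what makes the in-place frame-pointer fallback in `eliminateFrameIndex` possible.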
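The trickiest new path in `eliminateFrameIndex` is the one where no SGPR can be scavenged at all: the frame pointer itself is modified in place (subtract the scratch wave offset, shift, add the constant), the result is copied off, and then every step is undone. The self-contained check below replays that arithmetic with made-up values; it assumes `fp - scratchWaveOffset` is a multiple of the wavefront size, which is what keeps the shift round-trip lossless.

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t WaveLog2 = 6;                           // wave64
  const uint32_t ScratchWaveOffset = 0x4000;             // hypothetical value
  uint32_t FP = ScratchWaveOffset + (0x80u << WaveLog2); // wave-aligned FP
  const uint32_t Offset = 16;                            // frame-index offset
  const uint32_t Saved = FP;

  // S_SUB_U32: operate on the FP itself, since no scratch SGPR was free.
  FP -= ScratchWaveOffset;
  // S_LSHR_B32 + S_ADD_U32: scale to a per-lane byte offset, add the constant.
  FP >>= WaveLog2;
  FP += Offset;
  const uint32_t ResultReg = FP; // COPY into the result register

  // Undo everything (S_SUB_U32 + S_LSHL_B32), then the trailing S_ADD_U32
  // from the `if (!TmpDiffReg.isValid())` block restores the FP.
  FP -= Offset;
  FP <<= WaveLog2;
  FP += ScratchWaveOffset;

  assert(FP == Saved && "undo sequence must restore the frame pointer");
  (void)ResultReg;
  return 0;
}
```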
