diff options
Diffstat (limited to 'lib/Target/AMDGPU/SIFrameLowering.cpp')
| -rw-r--r-- | lib/Target/AMDGPU/SIFrameLowering.cpp | 142 |
1 files changed, 113 insertions, 29 deletions
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 89bb98dbd028..ac0ef90f25a4 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -12,7 +12,9 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -21,19 +23,19 @@ using namespace llvm; -static ArrayRef<MCPhysReg> getAllSGPR128(const SISubtarget &ST, +static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST, const MachineFunction &MF) { return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4); } -static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST, +static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST, const MachineFunction &MF) { return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); } -void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST, +void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB) const { const SIInstrInfo *TII = ST.getInstrInfo(); @@ -96,7 +98,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST, } unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( - const SISubtarget &ST, + const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, @@ -147,7 +149,7 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( // SGPRs. std::pair<unsigned, unsigned> SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( - const SISubtarget &ST, + const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, @@ -218,7 +220,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was // specified. - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (ST.debuggerEmitPrologue()) emitDebuggerPrologue(MF, MBB); @@ -235,6 +237,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function &F = MF.getFunction(); // We need to do the replacement of the private segment buffer and wave offset // register even if there are no stack objects. There could be stores to undef @@ -286,7 +289,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdCodeObjectV2(MF)) { + if (ST.isAmdCodeObjectV2(F)) { PreloadedPrivateBufferReg = MFI->getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); } @@ -305,7 +308,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, } if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) { - assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)); + assert(ST.isAmdCodeObjectV2(F) || ST.isMesaGfxShader(F)); MRI.addLiveIn(PreloadedPrivateBufferReg); MBB.addLiveIn(PreloadedPrivateBufferReg); } @@ -330,7 +333,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, bool CopyBuffer = ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister && - ST.isAmdCodeObjectV2(MF) && + ST.isAmdCodeObjectV2(F) && ScratchRsrcReg != PreloadedPrivateBufferReg; // This needs to be careful of the copying order to avoid overwriting one of @@ -361,13 +364,14 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, } // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. -void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST, +void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI, MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg, unsigned ScratchRsrcReg) const { const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const Function &Fn = MF.getFunction(); DebugLoc DL; if (ST.isAmdPalOS()) { @@ -387,12 +391,27 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST, const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64); BuildMI(MBB, I, DL, GetPC64, Rsrc01); } + auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in + if (ST.hasMergedShaders()) { + switch (MF.getFunction().getCallingConv()) { + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_GS: + // Low GIT address is passed in s8 rather than s0 for an LS+HS or + // ES+GS merged shader on gfx9+. + GitPtrLo = AMDGPU::SGPR8; + break; + default: + break; + } + } + MF.getRegInfo().addLiveIn(GitPtrLo); + MF.front().addLiveIn(GitPtrLo); BuildMI(MBB, I, DL, SMovB32, RsrcLo) - .addReg(AMDGPU::SGPR0) // Low address passed in + .addReg(GitPtrLo) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); // We now have the GIT ptr - now get the scratch descriptor from the entry - // at offset 0. + // at offset 0 (or offset 16 for a compute shader). PointerType *PtrTy = PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()), AMDGPUAS::CONSTANT_ADDRESS); @@ -403,17 +422,18 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST, MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable, 0, 0); + unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg) .addReg(Rsrc01) - .addImm(0) // offset + .addImm(Offset) // offset .addImm(0) // glc .addReg(ScratchRsrcReg, RegState::ImplicitDefine) .addMemOperand(MMO); return; } - if (ST.isMesaGfxShader(MF) + if (ST.isMesaGfxShader(Fn) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) { - assert(!ST.isAmdCodeObjectV2(MF)); + assert(!ST.isAmdCodeObjectV2(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); @@ -474,17 +494,52 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST, } } +// Find a scratch register that we can use at the start of the prologue to +// re-align the stack pointer. We avoid using callee-save registers since they +// may appear to be free when this is called from canUseAsPrologue (during +// shrink wrapping), but then no longer be free when this is called from +// emitPrologue. +// +// FIXME: This is a bit conservative, since in the above case we could use one +// of the callee-save registers as a scratch temp to re-align the stack pointer, +// but we would then have to make sure that we were in fact saving at least one +// callee-save register in the prologue, which is additional complexity that +// doesn't seem worth the benefit. +static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) { + MachineFunction *MF = MBB.getParent(); + + const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo(); + LivePhysRegs LiveRegs(TRI); + LiveRegs.addLiveIns(MBB); + + // Mark callee saved registers as used so we will not choose them. + const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF); + for (unsigned i = 0; CSRegs[i]; ++i) + LiveRegs.addReg(CSRegs[i]); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + + for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) { + if (LiveRegs.available(MRI, Reg)) + return Reg; + } + + return AMDGPU::NoRegister; +} + void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); if (FuncInfo->isEntryFunction()) { emitEntryFunctionPrologue(MF, MBB); return; } const MachineFrameInfo &MFI = MF.getFrameInfo(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); unsigned FramePtrReg = FuncInfo->getFrameOffsetReg(); @@ -492,8 +547,34 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.begin(); DebugLoc DL; + // XXX - Is this the right predicate? + bool NeedFP = hasFP(MF); - if (NeedFP) { + uint32_t NumBytes = MFI.getStackSize(); + uint32_t RoundedSize = NumBytes; + const bool NeedsRealignment = TRI.needsStackRealignment(MF); + + if (NeedsRealignment) { + assert(NeedFP); + const unsigned Alignment = MFI.getMaxAlignment(); + + RoundedSize += Alignment; + + unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB); + assert(ScratchSPReg != AMDGPU::NoRegister); + + // s_add_u32 tmp_reg, s32, NumBytes + // s_and_b32 s32, tmp_reg, 0b111...0000 + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg) + .addReg(StackPtrReg) + .addImm((Alignment - 1) * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) + .addReg(ScratchSPReg, RegState::Kill) + .addImm(-Alignment * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameSetup); + FuncInfo->setIsStackRealigned(true); + } else if (NeedFP) { // If we need a base pointer, set it up here. It's whatever the value of // the stack pointer is at this point. Any variable size objects will be // allocated after this, so we can still use the base pointer to reference @@ -503,11 +584,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } - uint32_t NumBytes = MFI.getStackSize(); - if (NumBytes != 0 && hasSP(MF)) { + if (RoundedSize != 0 && hasSP(MF)) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) .addReg(StackPtrReg) - .addImm(NumBytes * ST.getWavefrontSize()) + .addImm(RoundedSize * ST.getWavefrontSize()) .setMIFlag(MachineInstr::FrameSetup); } @@ -527,7 +607,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, if (FuncInfo->isEntryFunction()) return; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); @@ -553,10 +633,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, // it's really whether we need SP to be accurate or not. if (NumBytes != 0 && hasSP(MF)) { + uint32_t RoundedSize = FuncInfo->isStackRealigned() ? + NumBytes + MFI.getMaxAlignment() : NumBytes; + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) .addReg(StackPtrReg) - .addImm(NumBytes * ST.getWavefrontSize()) - .setMIFlag(MachineInstr::FrameDestroy); + .addImm(RoundedSize * ST.getWavefrontSize()); } } @@ -572,7 +654,7 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { - const SIRegisterInfo *RI = MF.getSubtarget<SISubtarget>().getRegisterInfo(); + const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); FrameReg = RI->getFrameRegister(MF); return MF.getFrameInfo().getObjectOffset(FI); @@ -586,7 +668,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (!MFI.hasStackObjects()) return; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); @@ -611,6 +693,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (TII->isSGPRSpill(MI)) { int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); + assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL); if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS); (void)Spilled; @@ -667,7 +750,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( if (Amount == 0) return MBB.erase(I); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const DebugLoc &DL = I->getDebugLoc(); unsigned Opc = I->getOpcode(); @@ -696,7 +779,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -746,7 +829,8 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const { } bool SIFrameLowering::hasSP(const MachineFunction &MF) const { + const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); // All stack operations are relative to the frame offset SGPR. const MachineFrameInfo &MFI = MF.getFrameInfo(); - return MFI.hasCalls() || MFI.hasVarSizedObjects(); + return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF); } |
