Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIFrameLowering.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 997
1 file changed, 567 insertions, 430 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 8364665dda04c..a2e802009d098 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -24,18 +24,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "frame-info"
 
-static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
-                                         const MachineFunction &MF) {
-  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
-                      ST.getMaxNumSGPRs(MF) / 4);
-}
-
-static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
-                                       const MachineFunction &MF) {
-  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
-                      ST.getMaxNumSGPRs(MF));
-}
-
 // Find a scratch register that we can use at the start of the prologue to
 // re-align the stack pointer. We avoid using callee-save registers since they
 // may appear to be free when this is called from canUseAsPrologue (during
@@ -47,10 +35,10 @@ static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
 // but we would then have to make sure that we were in fact saving at least one
 // callee-save register in the prologue, which is additional complexity that
 // doesn't seem worth the benefit.
-static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
-                                                 LivePhysRegs &LiveRegs,
-                                                 const TargetRegisterClass &RC,
-                                                 bool Unused = false) {
+static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
+                                                   LivePhysRegs &LiveRegs,
+                                                   const TargetRegisterClass &RC,
+                                                   bool Unused = false) {
   // Mark callee saved registers as used so we will not choose them.
   const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
   for (unsigned i = 0; CSRegs[i]; ++i)
@@ -59,12 +47,12 @@ static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
   if (Unused) {
     // We are looking for a register that can be used throughout the entire
     // function, so any use is unacceptable.
-    for (unsigned Reg : RC) {
+    for (MCRegister Reg : RC) {
       if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
         return Reg;
     }
   } else {
-    for (unsigned Reg : RC) {
+    for (MCRegister Reg : RC) {
       if (LiveRegs.available(MRI, Reg))
         return Reg;
     }
@@ -76,14 +64,67 @@ static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
   if (!Unused)
     report_fatal_error("failed to find free scratch register");
 
-  return AMDGPU::NoRegister;
+  return MCRegister();
 }
 
-static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
-  LivePhysRegs LiveRegs;
-  LiveRegs.init(*MRI.getTargetRegisterInfo());
-  return findScratchNonCalleeSaveRegister(
-    MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
+                                           LivePhysRegs &LiveRegs,
+                                           Register &TempSGPR,
+                                           Optional<int> &FrameIndex,
+                                           bool IsFP) {
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+#ifndef NDEBUG
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+#endif
+
+  // We need to save and restore the current FP/BP.
+
+  // 1: If there is already a VGPR with free lanes, use it. We
+  // may already have to pay the penalty for spilling a CSR VGPR.
+  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
+    int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
+                                            TargetStackID::SGPRSpill);
+
+    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+      llvm_unreachable("allocate SGPR spill should have worked");
+
+    FrameIndex = NewFI;
+
+    LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+               dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to "
+                      << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+                      << '\n');
+    return;
+  }
+
+  // 2: Next, try to save the FP/BP in an unused SGPR.
+  TempSGPR = findScratchNonCalleeSaveRegister(
+      MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+
+  if (!TempSGPR) {
+    int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
+                                            TargetStackID::SGPRSpill);
+
+    if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
+      // 3: There's no free lane to spill, and no free register to save FP/BP,
+      // so we're forced to spill another VGPR to use for the spill.
+      FrameIndex = NewFI;
+    } else {
+      // 4: If all else fails, spill the FP/BP to memory.
+      FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
+    }
+
+    LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+               dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
+                      << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+                      << '\n';);
+  } else {
+    LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
+                      << printReg(TempSGPR, TRI) << '\n');
+  }
 }
 
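The helper added above tries four save locations for the FP/BP in decreasing order of preference. A minimal sketch of that decision cascade, with hypothetical boolean inputs standing in for the MFI/LiveRegs queries (not the actual LLVM API):

// Hypothetical stand-in for the cascade in getVGPRSpillLaneOrTempRegister.
enum class SaveKind { FreeVGPRLane, ScratchSGPRCopy, NewVGPRLane, Memory };

SaveKind chooseSaveLocation(bool HasFreeLane, bool HasFreeSGPR,
                            bool CanAllocVGPRLane) {
  if (HasFreeLane)      // 1: reuse a lane of an already-spilled CSR VGPR
    return SaveKind::FreeVGPRLane;
  if (HasFreeSGPR)      // 2: plain SGPR-to-SGPR copy, no memory traffic
    return SaveKind::ScratchSGPRCopy;
  if (CanAllocVGPRLane) // 3: spill a fresh VGPR just to get a spare lane
    return SaveKind::NewVGPRLane;
  return SaveKind::Memory; // 4: last resort, a real scratch stack slot
}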
 // We need to specially emit stack operations here because a different frame
@@ -91,8 +132,8 @@ static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
 // use.
 static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I,
-                             const SIInstrInfo *TII, unsigned SpillReg,
-                             unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+                             const SIInstrInfo *TII, Register SpillReg,
+                             Register ScratchRsrcReg, Register SPReg, int FI) {
   MachineFunction *MF = MBB.getParent();
   MachineFrameInfo &MFI = MF->getFrameInfo();
 
@@ -100,7 +141,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
   MachineMemOperand *MMO = MF->getMachineMemOperand(
       MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
-      MFI.getObjectAlignment(FI));
+      MFI.getObjectAlign(FI));
 
   if (isUInt<12>(Offset)) {
     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
@@ -139,15 +180,15 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
 
 static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
-                              const SIInstrInfo *TII, unsigned SpillReg,
-                              unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+                              const SIInstrInfo *TII, Register SpillReg,
+                              Register ScratchRsrcReg, Register SPReg, int FI) {
   MachineFunction *MF = MBB.getParent();
   MachineFrameInfo &MFI = MF->getFrameInfo();
   int64_t Offset = MFI.getObjectOffset(FI);
 
   MachineMemOperand *MMO = MF->getMachineMemOperand(
       MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
-      MFI.getObjectAlignment(FI));
+      MFI.getObjectAlign(FI));
 
   if (isUInt<12>(Offset)) {
     BuildMI(MBB, I, DebugLoc(),
@@ -184,11 +225,13 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
       .addMemOperand(MMO);
 }
 
-void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
-                                          MachineFunction &MF,
-                                          MachineBasicBlock &MBB) const {
+// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
+void SIFrameLowering::emitEntryFunctionFlatScratchInit(
+    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo* TRI = &TII->getRegisterInfo();
+  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   // We don't need this if we only have spills since there is no user facing
@@ -201,11 +244,6 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
   // pointer. Because we only detect if flat instructions are used at all,
   // this will be used more often than necessary on VI.
 
-  // Debug location must be unknown since the first debug location is used to
-  // determine the end of the prologue.
-  DebugLoc DL;
-  MachineBasicBlock::iterator I = MBB.begin();
-
   Register FlatScratchInitReg =
       MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
 
@@ -216,8 +254,6 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
   Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
   Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
 
-  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
-
   // Do a 64-bit pointer add.
   if (ST.flatScratchIsPointer()) {
     if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
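The hunk above ends inside the flat-scratch setup's 64-bit pointer add, whose body is elided here as unchanged context: the add is split across a 32-bit SGPR pair using S_ADD_U32 on the low half and S_ADDC_U32 on the high half, and the SRSRC base update later in this patch reuses the same pattern. A minimal sketch of the lo/hi carry arithmetic, in plain C++ rather than emitted MIR:

#include <cstdint>

// Add a 32-bit offset into a 64-bit value held as two 32-bit halves,
// propagating the carry the way S_ADD_U32 / S_ADDC_U32 do through SCC.
void addToLoHiPair(uint32_t &Lo, uint32_t &Hi, uint32_t Offset) {
  uint32_t OldLo = Lo;
  Lo += Offset;               // S_ADD_U32: wraps, carry-out lands in SCC
  Hi += (Lo < OldLo) ? 1 : 0; // S_ADDC_U32 with 0: adds only the carry
}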
@@ -266,19 +302,22 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
     .addImm(8);
 }
 
-unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
-  const GCNSubtarget &ST,
-  const SIInstrInfo *TII,
-  const SIRegisterInfo *TRI,
-  SIMachineFunctionInfo *MFI,
-  MachineFunction &MF) const {
+// Shift down registers reserved for the scratch RSRC.
+Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
+    MachineFunction &MF) const {
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  assert(MFI->isEntryFunction());
+
+  Register ScratchRsrcReg = MFI->getScratchRSrcReg();
 
-  // We need to insert initialization of the scratch resource descriptor.
-  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
-  if (ScratchRsrcReg == AMDGPU::NoRegister ||
-      !MRI.isPhysRegUsed(ScratchRsrcReg))
-    return AMDGPU::NoRegister;
+  if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
+    return Register();
 
   if (ST.hasSGPRInitBug() ||
       ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
@@ -293,18 +332,19 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
   // cannot do this for the resources required for scratch access. For now we
   // skip over user SGPRs and may leave unused holes.
 
-  // We find the resource first because it has an alignment requirement.
-
   unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
-  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
+  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
   AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
 
   // Skip the last N reserved elements because they should have already been
   // reserved for VCC etc.
+  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
   for (MCPhysReg Reg : AllSGPR128s) {
     // Pick the first unallocated one. Make sure we don't clobber the other
-    // reserved input we needed.
-    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
+    // reserved input we needed. Also for PAL, make sure we don't clobber
+    // the GIT pointer passed in SGPR0 or SGPR8.
+    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
+        !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
       MRI.replaceRegWith(ScratchRsrcReg, Reg);
       MFI->setScratchRSrcReg(Reg);
       return Reg;
@@ -314,231 +354,138 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
   return ScratchRsrcReg;
 }
 
-// Shift down registers reserved for the scratch wave offset.
-std::pair<unsigned, bool>
-SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
-    const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
-    SIMachineFunctionInfo *MFI, MachineFunction &MF) const {
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
-
-  assert(MFI->isEntryFunction());
-
-  // No replacement necessary.
-  if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
-      (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) {
-    return std::make_pair(AMDGPU::NoRegister, false);
-  }
-
-  if (ST.hasSGPRInitBug())
-    return std::make_pair(ScratchWaveOffsetReg, false);
-
-  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
-
-  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
-  if (NumPreloaded > AllSGPRs.size())
-    return std::make_pair(ScratchWaveOffsetReg, false);
-
-  AllSGPRs = AllSGPRs.slice(NumPreloaded);
-
-  // We need to drop register from the end of the list that we cannot use
-  // for the scratch wave offset.
-  // + 2 s102 and s103 do not exist on VI.
-  // + 2 for vcc
-  // + 2 for xnack_mask
-  // + 2 for flat_scratch
-  // + 4 for registers reserved for scratch resource register
-  // + 1 for register reserved for scratch wave offset.  (By exluding this
-  //     register from the list to consider, it means that when this
-  //     register is being used for the scratch wave offset and there
-  //     are no other free SGPRs, then the value will stay in this register.
-  // + 1 if stack pointer is used.
-  // ----
-  //  13 (+1)
-  unsigned ReservedRegCount = 13;
-
-  if (AllSGPRs.size() < ReservedRegCount)
-    return std::make_pair(ScratchWaveOffsetReg, false);
-
-  bool HandledScratchWaveOffsetReg =
-    ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
-  bool FPAdjusted = false;
-
-  for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
-    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
-    // scratch descriptor, since we haven't added its uses yet.
-    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
-      if (!HandledScratchWaveOffsetReg) {
-        HandledScratchWaveOffsetReg = true;
-
-        MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
-        if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) {
-          assert(!hasFP(MF));
-          MFI->setStackPtrOffsetReg(Reg);
-        }
-
-        MFI->setScratchWaveOffsetReg(Reg);
-        MFI->setFrameOffsetReg(Reg);
-        ScratchWaveOffsetReg = Reg;
-        FPAdjusted = true;
-        break;
-      }
-    }
-  }
-
-  return std::make_pair(ScratchWaveOffsetReg, FPAdjusted);
-}
-
 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                 MachineBasicBlock &MBB) const {
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
 
-  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
-  // If we only have SGPR spills, we won't actually be using scratch memory
-  // since these spill to VGPRs.
-  //
-  // FIXME: We should be cleaning up these unused SGPR spill frame indices
-  // somewhere.
-
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  const Function &F = MF.getFunction();
-
-  // We need to do the replacement of the private segment buffer and wave offset
-  // register even if there are no stack objects. There could be stores to undef
-  // or a constant without an associated object.
+  // FIXME: If we only have SGPR spills, we won't actually be using scratch
+  // memory since these spill to VGPRs. We should be cleaning up these unused
+  // SGPR spill frame indices somewhere.
 
   // FIXME: We still have implicit uses on SGPR spill instructions in case they
   // need to spill to vector memory. It's likely that will not happen, but at
   // this point it appears we need the setup. This part of the prolog should be
   // emitted after frame indices are eliminated.
 
-  if (MFI->hasFlatScratchInit())
-    emitFlatScratchInit(ST, MF, MBB);
+  // FIXME: Remove all of the isPhysRegUsed checks
 
-  unsigned ScratchRsrcReg
-    = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const Function &F = MF.getFunction();
 
-  unsigned ScratchWaveOffsetReg;
-  bool FPAdjusted;
-  std::tie(ScratchWaveOffsetReg, FPAdjusted) =
-      getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
+  assert(MFI->isEntryFunction());
 
-  // We need to insert initialization of the scratch resource descriptor.
   Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
-  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
-  if (ST.isAmdHsaOrMesa(F)) {
-    PreloadedPrivateBufferReg = MFI->getPreloadedReg(
-      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
-  }
-
-  bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister &&
-                       MRI.isPhysRegUsed(ScratchWaveOffsetReg);
-  bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
-                         MRI.isPhysRegUsed(ScratchRsrcReg);
-
   // FIXME: Hack to not crash in situations which emitted an error.
-  if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister)
+  if (!PreloadedScratchWaveOffsetReg)
     return;
 
-  // We added live-ins during argument lowering, but since they were not used
-  // they were deleted. We're adding the uses now, so add them back.
-  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
-  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
-
-  if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
-    assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F));
-    MRI.addLiveIn(PreloadedPrivateBufferReg);
-    MBB.addLiveIn(PreloadedPrivateBufferReg);
-  }
+  // We need to do the replacement of the private segment buffer register even
+  // if there are no stack objects. There could be stores to undef or a
+  // constant without an associated object.
+  //
+  // This will return `Register()` in cases where there are no actual
+  // uses of the SRSRC.
+  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
+
+  // Make the selected register live throughout the function.
+  if (ScratchRsrcReg) {
+    for (MachineBasicBlock &OtherBB : MF) {
+      if (&OtherBB != &MBB) {
+        OtherBB.addLiveIn(ScratchRsrcReg);
+      }
+    }
+  }
 
-  // Make the register selected live throughout the function.
-  for (MachineBasicBlock &OtherBB : MF) {
-    if (&OtherBB == &MBB)
-      continue;
-
-    if (OffsetRegUsed || FPAdjusted)
-      OtherBB.addLiveIn(ScratchWaveOffsetReg);
-
-    if (ResourceRegUsed)
-      OtherBB.addLiveIn(ScratchRsrcReg);
+  // Now that we have fixed the reserved SRSRC we need to locate the
+  // (potentially) preloaded SRSRC.
+  Register PreloadedScratchRsrcReg;
+  if (ST.isAmdHsaOrMesa(F)) {
+    PreloadedScratchRsrcReg =
+        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
+    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
+      // We added live-ins during argument lowering, but since they were not
+      // used they were deleted. We're adding the uses now, so add them back.
+      MRI.addLiveIn(PreloadedScratchRsrcReg);
+      MBB.addLiveIn(PreloadedScratchRsrcReg);
+    }
   }
 
+  // Debug location must be unknown since the first debug location is used to
+  // determine the end of the prologue.
   DebugLoc DL;
   MachineBasicBlock::iterator I = MBB.begin();
 
-  // If we reserved the original input registers, we don't need to copy to the
-  // reserved registers.
-
-  bool CopyBuffer = ResourceRegUsed &&
-    PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
-    ST.isAmdHsaOrMesa(F) &&
-    ScratchRsrcReg != PreloadedPrivateBufferReg;
-
-  // This needs to be careful of the copying order to avoid overwriting one of
-  // the input registers before it's been copied to it's final
-  // destination. Usually the offset should be copied first.
-  bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
-                                              ScratchWaveOffsetReg);
-  if (CopyBuffer && CopyBufferFirst) {
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
-      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+  // We found the SRSRC first because it needs four registers and has an
+  // alignment requirement. If the SRSRC that we found is clobbering with
+  // the scratch wave offset, which may be in a fixed SGPR or a free SGPR
+  // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
+  // wave offset to a free SGPR.
+  Register ScratchWaveOffsetReg;
+  if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
+    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
+    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+    AllSGPRs = AllSGPRs.slice(
+        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
+    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
+    for (MCPhysReg Reg : AllSGPRs) {
+      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
+          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
+        ScratchWaveOffsetReg = Reg;
+        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
+            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
+        break;
+      }
+    }
+  } else {
+    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
   }
+  assert(ScratchWaveOffsetReg);
 
-  unsigned SPReg = MFI->getStackPtrOffsetReg();
-  assert(SPReg != AMDGPU::SP_REG);
-
-  // FIXME: Remove the isPhysRegUsed checks
-  const bool HasFP = hasFP(MF);
-
-  if (HasFP || OffsetRegUsed) {
-    assert(ScratchWaveOffsetReg);
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
-      .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0);
+  if (MF.getFrameInfo().hasCalls()) {
+    Register SPReg = MFI->getStackPtrOffsetReg();
+    assert(SPReg != AMDGPU::SP_REG);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
+        .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
   }
 
-  if (CopyBuffer && !CopyBufferFirst) {
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
-      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+  if (hasFP(MF)) {
+    Register FPReg = MFI->getFrameOffsetReg();
+    assert(FPReg != AMDGPU::FP_REG);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
   }
 
-  if (ResourceRegUsed) {
-    emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
-                                  PreloadedPrivateBufferReg, ScratchRsrcReg);
+  if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
+    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
   }
 
-  if (HasFP) {
-    DebugLoc DL;
-    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-    int64_t StackSize = FrameInfo.getStackSize();
+  if (MFI->hasFlatScratchInit()) {
+    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
+  }
 
-    // On kernel entry, the private scratch wave offset is the SP value.
-    if (StackSize == 0) {
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg)
-        .addReg(MFI->getScratchWaveOffsetReg());
-    } else {
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
-        .addReg(MFI->getScratchWaveOffsetReg())
-        .addImm(StackSize * ST.getWavefrontSize());
-    }
+  if (ScratchRsrcReg) {
+    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
+                                         PreloadedScratchRsrcReg,
+                                         ScratchRsrcReg, ScratchWaveOffsetReg);
   }
 }
 
-// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
-void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
-      MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
-      MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
-      unsigned ScratchRsrcReg) const {
+// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
+void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
+    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
+    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
 
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const Function &Fn = MF.getFunction();
 
-  DebugLoc DL;
-
   if (ST.isAmdPalOS()) {
     // The pointer to the GIT is formed from the offset passed in and either
@@ -557,19 +504,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
       const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
       BuildMI(MBB, I, DL, GetPC64, Rsrc01);
     }
-    auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
-    if (ST.hasMergedShaders()) {
-      switch (MF.getFunction().getCallingConv()) {
-      case CallingConv::AMDGPU_HS:
-      case CallingConv::AMDGPU_GS:
-        // Low GIT address is passed in s8 rather than s0 for an LS+HS or
-        // ES+GS merged shader on gfx9+.
-        GitPtrLo = AMDGPU::SGPR8;
-        break;
-      default:
-        break;
-      }
-    }
+    Register GitPtrLo = MFI->getGITPtrLoReg(MF);
     MF.getRegInfo().addLiveIn(GitPtrLo);
     MBB.addLiveIn(GitPtrLo);
     BuildMI(MBB, I, DL, SMovB32, RsrcLo)
@@ -582,12 +517,12 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
     const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
     auto MMO = MF.getMachineMemOperand(PtrInfo,
                                        MachineMemOperand::MOLoad |
-                                       MachineMemOperand::MOInvariant |
-                                       MachineMemOperand::MODereferenceable,
-                                       16, 4);
+                                           MachineMemOperand::MOInvariant |
+                                           MachineMemOperand::MODereferenceable,
+                                       16, Align(4));
     unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
     const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
-    unsigned EncodedOffset = AMDGPU::getSMRDEncodedOffset(Subtarget, Offset);
+    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
     BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
       .addReg(Rsrc01)
       .addImm(EncodedOffset) // offset
       .addImm(0) // dlc
       .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
       .addMemOperand(MMO);
-    return;
-  }
-  if (ST.isMesaGfxShader(Fn)
-      || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
+  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
     assert(!ST.isAmdHsaOrMesa(Fn));
 
     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
@@ -621,11 +553,11 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
       const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
 
       MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
-      auto MMO = MF.getMachineMemOperand(PtrInfo,
-                                         MachineMemOperand::MOLoad |
-                                         MachineMemOperand::MOInvariant |
-                                         MachineMemOperand::MODereferenceable,
-                                         8, 4);
+      auto MMO = MF.getMachineMemOperand(
+          PtrInfo,
+          MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
+              MachineMemOperand::MODereferenceable,
+          8, Align(4));
       BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
         .addReg(MFI->getImplicitBufferPtrUserSGPR())
         .addImm(0) // offset
@@ -658,7 +590,37 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
       BuildMI(MBB, I, DL, SMovB32, Rsrc3)
         .addImm(Rsrc23 >> 32)
         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+  } else if (ST.isAmdHsaOrMesa(Fn)) {
+    assert(PreloadedScratchRsrcReg);
+
+    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+    }
   }
+
+  // Add the scratch wave offset into the scratch RSRC.
+  //
+  // We only want to update the first 48 bits, which is the base address
+  // pointer, without touching the adjacent 16 bits of flags. We know this add
+  // cannot carry-out from bit 47, otherwise the scratch allocation would be
+  // impossible to fit in the 48-bit global address space.
+  //
+  // TODO: Evaluate if it is better to just construct an SRD using the flat
+  // scratch init and some constants rather than update the one we are passed.
+  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+
+  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
+  // the kernel body via inreg arguments.
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
+      .addReg(ScratchRsrcSub0)
+      .addReg(ScratchWaveOffsetReg)
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
+      .addReg(ScratchRsrcSub1)
+      .addImm(0)
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 }
 
 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
@@ -673,6 +635,50 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
   llvm_unreachable("Invalid TargetStackID::Value");
 }
 
+// Activate all lanes, returns saved exec.
+static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
+                                     MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     bool IsProlog) {
+  Register ScratchExecCopy;
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  DebugLoc DL;
+
+  if (LiveRegs.empty()) {
+    if (IsProlog) {
+      LiveRegs.init(TRI);
+      LiveRegs.addLiveIns(MBB);
+      if (FuncInfo->SGPRForFPSaveRestoreCopy)
+        LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+
+      if (FuncInfo->SGPRForBPSaveRestoreCopy)
+        LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy);
+    } else {
+      // In epilog.
+      LiveRegs.init(*ST.getRegisterInfo());
+      LiveRegs.addLiveOuts(MBB);
+      LiveRegs.stepBackward(*MBBI);
+    }
+  }
+
+  ScratchExecCopy = findScratchNonCalleeSaveRegister(
+      MRI, LiveRegs, *TRI.getWaveMaskRegClass());
+
+  if (!IsProlog)
+    LiveRegs.removeReg(ScratchExecCopy);
+
+  const unsigned OrSaveExec =
+      ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+  BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);
+
+  return ScratchExecCopy;
+}
+
 void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
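buildScratchExecCopy, added just above, factors out the save/activate/restore discipline that both the prologue and the epilogue need around whole-wave VGPR spills. A minimal sketch of the idea on a plain 64-bit mask (a hypothetical wave64 model, not the emitted MIR):

#include <cstdint>
#include <functional>

// Model of the exec-mask handling around a whole-wave VGPR spill.
void runWithAllLanesOn(uint64_t &Exec, const std::function<void()> &Spill) {
  uint64_t Saved = Exec; // S_OR_SAVEEXEC_B64 copies exec out...
  Exec = ~0ull;          // ...while ORing -1 in, enabling every lane
  Spill();               // buildPrologSpill / buildEpilogReload run here
  Exec = Saved;          // S_MOV_B64 exec, <saved copy> restores the mask
}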
@@ -687,51 +693,81 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
 
-  unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
-  unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
+  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+  Register BasePtrReg =
+      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
   LivePhysRegs LiveRegs;
 
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc DL;
 
   bool HasFP = false;
+  bool HasBP = false;
   uint32_t NumBytes = MFI.getStackSize();
   uint32_t RoundedSize = NumBytes;
   // To avoid clobbering VGPRs in lanes that weren't active on function entry,
   // turn on all lanes before doing the spill to memory.
-  unsigned ScratchExecCopy = AMDGPU::NoRegister;
+  Register ScratchExecCopy;
+
+  bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
+  bool SpillFPToMemory = false;
+  // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
+  // Otherwise we are spilling the FP to memory.
+  if (HasFPSaveIndex) {
+    SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
+                      TargetStackID::SGPRSpill;
+  }
+
+  bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
+  bool SpillBPToMemory = false;
+  // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
+  // Otherwise we are spilling the BP to memory.
+  if (HasBPSaveIndex) {
+    SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
+                      TargetStackID::SGPRSpill;
+  }
 
   // Emit the copy if we need an FP, and are using a free SGPR to save it.
-  if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
+  if (FuncInfo->SGPRForFPSaveRestoreCopy) {
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
             FuncInfo->SGPRForFPSaveRestoreCopy)
       .addReg(FramePtrReg)
       .setMIFlag(MachineInstr::FrameSetup);
   }
 
+  // Emit the copy if we need a BP, and are using a free SGPR to save it.
+  if (FuncInfo->SGPRForBPSaveRestoreCopy) {
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
+            FuncInfo->SGPRForBPSaveRestoreCopy)
+        .addReg(BasePtrReg)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // If a copy has been emitted for FP and/or BP, Make the SGPRs
+  // used in the copy instructions live throughout the function.
+  SmallVector<MCPhysReg, 2> TempSGPRs;
+  if (FuncInfo->SGPRForFPSaveRestoreCopy)
+    TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy);
+
+  if (FuncInfo->SGPRForBPSaveRestoreCopy)
+    TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy);
+
+  if (!TempSGPRs.empty()) {
+    for (MachineBasicBlock &MBB : MF) {
+      for (MCPhysReg Reg : TempSGPRs)
+        MBB.addLiveIn(Reg);
+
+      MBB.sortUniqueLiveIns();
+    }
+  }
+
   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
          : FuncInfo->getSGPRSpillVGPRs()) {
     if (!Reg.FI.hasValue())
       continue;
 
-    if (ScratchExecCopy == AMDGPU::NoRegister) {
-      if (LiveRegs.empty()) {
-        LiveRegs.init(TRI);
-        LiveRegs.addLiveIns(MBB);
-        if (FuncInfo->SGPRForFPSaveRestoreCopy)
-          LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
-      }
-
-      ScratchExecCopy
-        = findScratchNonCalleeSaveRegister(MRI, LiveRegs,
-                                           *TRI.getWaveMaskRegClass());
-      assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy);
-
-      const unsigned OrSaveExec = ST.isWave32() ?
-        AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
-      BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
-              ScratchExecCopy)
-        .addImm(-1);
-    }
+    if (!ScratchExecCopy)
+      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
 
     buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                      FuncInfo->getScratchRSrcReg(),
@@ -739,84 +775,153 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
                      Reg.FI.getValue());
   }
 
-  if (ScratchExecCopy != AMDGPU::NoRegister) {
+  if (HasFPSaveIndex && SpillFPToMemory) {
+    assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue()));
+
+    if (!ScratchExecCopy)
+      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+
+    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
+        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
+        .addReg(FramePtrReg);
+
+    buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
+                     FuncInfo->getScratchRSrcReg(), StackPtrReg,
+                     FuncInfo->FramePointerSaveIndex.getValue());
+  }
+
+  if (HasBPSaveIndex && SpillBPToMemory) {
+    assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex));
+
+    if (!ScratchExecCopy)
+      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+
+    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
+        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
+        .addReg(BasePtrReg);
+
+    buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
+                     FuncInfo->getScratchRSrcReg(), StackPtrReg,
+                     *FuncInfo->BasePointerSaveIndex);
+  }
+
+  if (ScratchExecCopy) {
     // FIXME: Split block and make terminator.
     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
-      .addReg(ScratchExecCopy, RegState::Kill);
+        .addReg(ScratchExecCopy, RegState::Kill);
     LiveRegs.addReg(ScratchExecCopy);
   }
 
-
-  if (FuncInfo->FramePointerSaveIndex) {
+  // In this case, spill the FP to a reserved VGPR.
+  if (HasFPSaveIndex && !SpillFPToMemory) {
     const int FI = FuncInfo->FramePointerSaveIndex.getValue();
-    assert(!MFI.isDeadObjectIndex(FI) &&
-           MFI.getStackID(FI) == TargetStackID::SGPRSpill);
-    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
-      = FuncInfo->getSGPRToVGPRSpills(FI);
+    assert(!MFI.isDeadObjectIndex(FI));
+
+    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+        FuncInfo->getSGPRToVGPRSpills(FI);
     assert(Spill.size() == 1);
 
     // Save FP before setting it up.
     // FIXME: This should respect spillSGPRToVGPR;
     BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
             Spill[0].VGPR)
-      .addReg(FramePtrReg)
-      .addImm(Spill[0].Lane)
-      .addReg(Spill[0].VGPR, RegState::Undef);
+        .addReg(FramePtrReg)
+        .addImm(Spill[0].Lane)
+        .addReg(Spill[0].VGPR, RegState::Undef);
+  }
+
+  // In this case, spill the BP to a reserved VGPR.
+  if (HasBPSaveIndex && !SpillBPToMemory) {
+    const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
+    assert(!MFI.isDeadObjectIndex(BasePtrFI));
+
+    assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
+    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+        FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
+    assert(Spill.size() == 1);
+
+    // Save BP before setting it up.
+    // FIXME: This should respect spillSGPRToVGPR;
+    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+            Spill[0].VGPR)
+        .addReg(BasePtrReg)
+        .addImm(Spill[0].Lane)
+        .addReg(Spill[0].VGPR, RegState::Undef);
   }
 
   if (TRI.needsStackRealignment(MF)) {
     HasFP = true;
-    const unsigned Alignment = MFI.getMaxAlignment();
+    const unsigned Alignment = MFI.getMaxAlign().value();
 
     RoundedSize += Alignment;
     if (LiveRegs.empty()) {
       LiveRegs.init(TRI);
       LiveRegs.addLiveIns(MBB);
       LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+      LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
     }
 
-    unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(
+    Register ScratchSPReg = findScratchNonCalleeSaveRegister(
         MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
-    assert(ScratchSPReg != AMDGPU::NoRegister &&
-           ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
+    assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy &&
+           ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy);
 
     // s_add_u32 tmp_reg, s32, NumBytes
     // s_and_b32 s32, tmp_reg, 0b111...0000
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
-      .addReg(StackPtrReg)
-      .addImm((Alignment - 1) * ST.getWavefrontSize())
-      .setMIFlag(MachineInstr::FrameSetup);
+        .addReg(StackPtrReg)
+        .addImm((Alignment - 1) * ST.getWavefrontSize())
+        .setMIFlag(MachineInstr::FrameSetup);
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
-      .addReg(ScratchSPReg, RegState::Kill)
-      .addImm(-Alignment * ST.getWavefrontSize())
-      .setMIFlag(MachineInstr::FrameSetup);
+        .addReg(ScratchSPReg, RegState::Kill)
+        .addImm(-Alignment * ST.getWavefrontSize())
+        .setMIFlag(MachineInstr::FrameSetup);
     FuncInfo->setIsStackRealigned(true);
   } else if ((HasFP = hasFP(MF))) {
-    // If we need a base pointer, set it up here. It's whatever the value of
-    // the stack pointer is at this point. Any variable size objects will be
-    // allocated after this, so we can still use the base pointer to reference
-    // locals.
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
-      .addReg(StackPtrReg)
-      .setMIFlag(MachineInstr::FrameSetup);
+        .addReg(StackPtrReg)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // If we need a base pointer, set it up here. It's whatever the value of
+  // the stack pointer is at this point. Any variable size objects will be
+  // allocated after this, so we can still use the base pointer to reference
+  // the incoming arguments.
+  if ((HasBP = TRI.hasBasePointer(MF))) {
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
+        .addReg(StackPtrReg)
+        .setMIFlag(MachineInstr::FrameSetup);
   }
 
   if (HasFP && RoundedSize != 0) {
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
-      .addReg(StackPtrReg)
-      .addImm(RoundedSize * ST.getWavefrontSize())
-      .setMIFlag(MachineInstr::FrameSetup);
+        .addReg(StackPtrReg)
+        .addImm(RoundedSize * ST.getWavefrontSize())
+        .setMIFlag(MachineInstr::FrameSetup);
   }
 
-  assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister ||
+  assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy ||
                      FuncInfo->FramePointerSaveIndex)) &&
          "Needed to save FP but didn't save it anywhere");
 
-  assert((HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister &&
+  assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
                     !FuncInfo->FramePointerSaveIndex)) &&
          "Saved FP but didn't need it");
+
+  assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy ||
+                     FuncInfo->BasePointerSaveIndex)) &&
+         "Needed to save BP but didn't save it anywhere");
+
+  assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy &&
+                    !FuncInfo->BasePointerSaveIndex)) &&
+         "Saved BP but didn't need it");
 }
 
 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
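The realignment sequence in the prologue above (s_add_u32 then s_and_b32) is the usual add-then-mask align-up, except every quantity is scaled by the wavefront size because the scalar SP counts bytes for the whole wave rather than per lane. A small sketch of the arithmetic with hypothetical values:

#include <cstdint>

// Wave-scaled align-up: Alignment and WaveSize are powers of two, so the
// s_and_b32 immediate -Alignment*WaveSize equals ~(Alignment*WaveSize - 1).
uint32_t alignWaveSP(uint32_t SP, uint32_t Alignment, uint32_t WaveSize) {
  uint32_t Tmp = SP + (Alignment - 1) * WaveSize; // s_add_u32 tmp, s32, imm
  return Tmp & ~(Alignment * WaveSize - 1);       // s_and_b32 s32, tmp, imm
}
// e.g. alignWaveSP(/*SP=*/4160, /*Alignment=*/16, /*WaveSize=*/64) == 5120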
@@ -828,81 +933,126 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
   LivePhysRegs LiveRegs;
   DebugLoc DL;
 
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   uint32_t NumBytes = MFI.getStackSize();
-  uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
-    NumBytes + MFI.getMaxAlignment() : NumBytes;
+  uint32_t RoundedSize = FuncInfo->isStackRealigned()
+                             ? NumBytes + MFI.getMaxAlign().value()
+                             : NumBytes;
+  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+  const Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+  const Register BasePtrReg =
+      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
+
+  bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
+  bool SpillFPToMemory = false;
+  if (HasFPSaveIndex) {
+    SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
+                      TargetStackID::SGPRSpill;
+  }
+
+  bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
+  bool SpillBPToMemory = false;
+  if (HasBPSaveIndex) {
+    SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
+                      TargetStackID::SGPRSpill;
+  }
 
   if (RoundedSize != 0 && hasFP(MF)) {
-    const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
       .addReg(StackPtrReg)
       .addImm(RoundedSize * ST.getWavefrontSize())
       .setMIFlag(MachineInstr::FrameDestroy);
   }
 
-  if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
-    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->getFrameOffsetReg())
-      .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
-      .setMIFlag(MachineInstr::FrameSetup);
+  if (FuncInfo->SGPRForFPSaveRestoreCopy) {
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
+        .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
+        .setMIFlag(MachineInstr::FrameSetup);
   }
 
-  if (FuncInfo->FramePointerSaveIndex) {
-    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+  if (FuncInfo->SGPRForBPSaveRestoreCopy) {
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
+        .addReg(FuncInfo->SGPRForBPSaveRestoreCopy)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
 
-    assert(!MF.getFrameInfo().isDeadObjectIndex(FI) &&
-           MF.getFrameInfo().getStackID(FI) == TargetStackID::SGPRSpill);
+  Register ScratchExecCopy;
+  if (HasFPSaveIndex) {
+    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+    assert(!MFI.isDeadObjectIndex(FI));
+    if (SpillFPToMemory) {
+      if (!ScratchExecCopy)
+        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+
+      MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
+          MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+      buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
+                        FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
+      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
+          .addReg(TempVGPR, RegState::Kill);
+    } else {
+      // Reload from VGPR spill.
+      assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+      ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+          FuncInfo->getSGPRToVGPRSpills(FI);
+      assert(Spill.size() == 1);
+      BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+              FramePtrReg)
+          .addReg(Spill[0].VGPR)
+          .addImm(Spill[0].Lane);
+    }
+  }
 
-    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
-      = FuncInfo->getSGPRToVGPRSpills(FI);
-    assert(Spill.size() == 1);
-    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
-            FuncInfo->getFrameOffsetReg())
-      .addReg(Spill[0].VGPR)
-      .addImm(Spill[0].Lane);
+  if (HasBPSaveIndex) {
+    const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
+    assert(!MFI.isDeadObjectIndex(BasePtrFI));
+    if (SpillBPToMemory) {
+      if (!ScratchExecCopy)
+        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+
+      MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
+          MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+      buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
+                        FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
+      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
+          .addReg(TempVGPR, RegState::Kill);
+    } else {
+      // Reload from VGPR spill.
+      assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
+      ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+          FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
+      assert(Spill.size() == 1);
+      BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+              BasePtrReg)
+          .addReg(Spill[0].VGPR)
+          .addImm(Spill[0].Lane);
+    }
   }
 
-  unsigned ScratchExecCopy = AMDGPU::NoRegister;
-  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
-         : FuncInfo->getSGPRSpillVGPRs()) {
+  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
+       FuncInfo->getSGPRSpillVGPRs()) {
     if (!Reg.FI.hasValue())
       continue;
 
-    const SIRegisterInfo &TRI = TII->getRegisterInfo();
-    if (ScratchExecCopy == AMDGPU::NoRegister) {
-      // See emitPrologue
-      if (LiveRegs.empty()) {
-        LiveRegs.init(*ST.getRegisterInfo());
-        LiveRegs.addLiveOuts(MBB);
-        LiveRegs.stepBackward(*MBBI);
-      }
-
-      ScratchExecCopy = findScratchNonCalleeSaveRegister(
-          MRI, LiveRegs, *TRI.getWaveMaskRegClass());
-      LiveRegs.removeReg(ScratchExecCopy);
-
-      const unsigned OrSaveExec =
-          ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
-
-      BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
-        .addImm(-1);
-    }
+    if (!ScratchExecCopy)
+      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
 
     buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
-                      FuncInfo->getScratchRSrcReg(),
-                      FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue());
+                      FuncInfo->getScratchRSrcReg(), StackPtrReg,
+                      Reg.FI.getValue());
   }
 
-  if (ScratchExecCopy != AMDGPU::NoRegister) {
+  if (ScratchExecCopy) {
     // FIXME: Split block and make terminator.
     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
-      .addReg(ScratchExecCopy, RegState::Kill);
+        .addReg(ScratchExecCopy, RegState::Kill);
   }
 }
 
@@ -920,12 +1070,14 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
 
 #ifndef NDEBUG
 static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
-                                 Optional<int> FramePointerSaveIndex) {
+                                 Optional<int> FramePointerSaveIndex,
+                                 Optional<int> BasePointerSaveIndex) {
   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E;
        ++I) {
     if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
-        FramePointerSaveIndex && I != FramePointerSaveIndex) {
+        ((FramePointerSaveIndex && I != FramePointerSaveIndex) ||
+         (BasePointerSaveIndex && I != BasePointerSaveIndex))) {
       return false;
     }
   }
@@ -935,7 +1087,7 @@ static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
 #endif
 
 int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
-                                            unsigned &FrameReg) const {
+                                            Register &FrameReg) const {
   const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
 
   FrameReg = RI->getFrameRegister(MF);
@@ -952,7 +1104,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
 
   FuncInfo->removeDeadFrameIndices(MFI);
-  assert(allSGPRSpillsAreDead(MFI, None) &&
+  assert(allSGPRSpillsAreDead(MFI, None, None) &&
          "SGPR spill should have been removed in SILowerSGPRSpills");
 
   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
@@ -967,9 +1119,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
       RS->addScavengingFrameIndex(ScavengeFI);
     } else {
       int ScavengeFI = MFI.CreateStackObject(
-        TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
-        TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
-        false);
+          TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
+          TRI->getSpillAlign(AMDGPU::SGPR_32RegClass), false);
       RS->addScavengingFrameIndex(ScavengeFI);
     }
   }
@@ -984,7 +1135,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
   if (MFI->isEntryFunction())
     return;
 
-  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
 
@@ -1008,46 +1159,19 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
   for (auto SSpill : MFI->getSGPRSpillVGPRs())
     SavedVGPRs.reset(SSpill.VGPR);
 
-  const bool HasFP = WillHaveFP || hasFP(MF);
-  if (!HasFP)
-    return;
-
-  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
-    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
-                                                    TargetStackID::SGPRSpill);
-
-    // If there is already a VGPR with free lanes, use it. We may already have
-    // to pay the penalty for spilling a CSR VGPR.
-    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
-      llvm_unreachable("allocate SGPR spill should have worked");
-
-    MFI->FramePointerSaveIndex = NewFI;
+  LivePhysRegs LiveRegs;
+  LiveRegs.init(*TRI);
 
-    LLVM_DEBUG(
-      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
-      dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI)
-             << ':' << Spill.Lane << '\n');
-    return;
+  if (WillHaveFP || hasFP(MF)) {
+    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy,
+                                   MFI->FramePointerSaveIndex, true);
   }
 
-  MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
-
-  if (!MFI->SGPRForFPSaveRestoreCopy) {
-    // There's no free lane to spill, and no free register to save FP, so we're
-    // forced to spill another VGPR to use for the spill.
-    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
-                                                    TargetStackID::SGPRSpill);
-    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
-      llvm_unreachable("allocate SGPR spill should have worked");
-    MFI->FramePointerSaveIndex = NewFI;
-
-    LLVM_DEBUG(
-      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
-      dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
-             << ':' << Spill.Lane << '\n';);
-  } else {
-    LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
-               printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
+  if (TRI->hasBasePointer(MF)) {
+    if (MFI->SGPRForFPSaveRestoreCopy)
+      LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy);
+    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy,
+                                   MFI->BasePointerSaveIndex, false);
   }
 }
 
@@ -1074,14 +1198,31 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots(
     return true; // Early exit if no callee saved registers are modified!
 
   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-  if (!FuncInfo->SGPRForFPSaveRestoreCopy)
+  if (!FuncInfo->SGPRForFPSaveRestoreCopy &&
+      !FuncInfo->SGPRForBPSaveRestoreCopy)
     return false;
 
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *RI = ST.getRegisterInfo();
+  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+  Register BasePtrReg = RI->getBaseRegister();
+  unsigned NumModifiedRegs = 0;
+
+  if (FuncInfo->SGPRForFPSaveRestoreCopy)
+    NumModifiedRegs++;
+  if (FuncInfo->SGPRForBPSaveRestoreCopy)
+    NumModifiedRegs++;
+
   for (auto &CS : CSI) {
-    if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
-      if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
-        CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
-      break;
+    if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) {
+      CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+      if (--NumModifiedRegs)
+        break;
+    } else if (CS.getReg() == BasePtrReg &&
+               FuncInfo->SGPRForBPSaveRestoreCopy) {
+      CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy);
+      if (--NumModifiedRegs)
+        break;
     }
   }
 
@@ -1104,12 +1245,10 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
 
   if (!hasReservedCallFrame(MF)) {
-    unsigned Align = getStackAlignment();
-
-    Amount = alignTo(Amount, Align);
+    Amount = alignTo(Amount, getStackAlign());
     assert(isUInt<32>(Amount) && "exceeded stack address space size");
     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-    unsigned SPReg = MFI->getStackPtrOffsetReg();
+    Register SPReg = MFI->getStackPtrOffsetReg();
 
     unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
     BuildMI(MBB, I, DL, TII->get(Op), SPReg)
@@ -1124,19 +1263,17 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
 
 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  if (MFI.hasCalls()) {
+
+  // For entry functions we can use an immediate offset in most cases, so the
+  // presence of calls doesn't imply we need a distinct frame pointer.
+  if (MFI.hasCalls() &&
+      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    // All offsets are unsigned, so need to be addressed in the same direction
     // as stack growth.
 
     // FIXME: This function is pretty broken, since it can be called before the
     // frame layout is determined or CSR spills are inserted.
-    if (MFI.getStackSize() != 0)
-      return true;
-
-    // For the entry point, the input wave scratch offset must be copied to the
-    // API SP if there are calls.
-    if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
-      return true;
+    return MFI.getStackSize() != 0;
   }
 
   return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||