| author | Dimitry Andric <dim@FreeBSD.org> | 2017-04-16 16:01:22 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2017-04-16 16:01:22 +0000 |
| commit | 71d5a2540a98c81f5bcaeb48805e0e2881f530ef | |
| tree | 5343938942df402b49ec7300a1c25a2d4ccd5821 /lib/Target/AMDGPU/SIFrameLowering.cpp | |
| parent | 31bbf64f3a4974a2d6c8b3b27ad2f519caf74057 | |
Diffstat (limited to 'lib/Target/AMDGPU/SIFrameLowering.cpp')
-rw-r--r-- | lib/Target/AMDGPU/SIFrameLowering.cpp | 138
1 file changed, 110 insertions, 28 deletions
```diff
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0b5715515880..abe6af9a6d3f 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -21,22 +21,24 @@ using namespace llvm;
 
-static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
-                                         const SIRegisterInfo *TRI) {
+static ArrayRef<MCPhysReg> getAllSGPR128(const SISubtarget &ST,
+                                         const MachineFunction &MF) {
   return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
-                      TRI->getMaxNumSGPRs(MF) / 4);
+                      ST.getMaxNumSGPRs(MF) / 4);
 }
 
-static ArrayRef<MCPhysReg> getAllSGPRs(const MachineFunction &MF,
-                                       const SIRegisterInfo *TRI) {
+static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST,
+                                       const MachineFunction &MF) {
   return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
-                      TRI->getMaxNumSGPRs(MF));
+                      ST.getMaxNumSGPRs(MF));
 }
 
-void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
-                                          const SIRegisterInfo* TRI,
+void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
                                           MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo* TRI = &TII->getRegisterInfo();
+
   // We don't need this if we only have spills since there is no user facing
   // scratch.
@@ -59,16 +61,28 @@ void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
   MRI.addLiveIn(FlatScratchInitReg);
   MBB.addLiveIn(FlatScratchInitReg);
 
-  // Copy the size in bytes.
-  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
-  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
-    .addReg(FlatScrInitHi, RegState::Kill);
-
   unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
 
+  // Do a 64-bit pointer add.
+  if (ST.flatScratchIsPointer()) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
+      .addReg(FlatScrInitLo)
+      .addReg(ScratchWaveOffsetReg);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
+      .addReg(FlatScrInitHi)
+      .addImm(0);
+
+    return;
+  }
+
+  // Copy the size in bytes.
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
+    .addReg(FlatScrInitHi, RegState::Kill);
+
   // Add wave offset in bytes to private base offset.
   // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
@@ -111,16 +125,15 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
   MachineRegisterInfo &MRI = MF.getRegInfo();
   unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
 
-  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(MF, TRI);
+  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
   AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
 
-  // Skip the last 2 elements because the last one is reserved for VCC, and
-  // this is the 2nd to last element already.
+  // Skip the last N reserved elements because they should have already been
+  // reserved for VCC etc.
   for (MCPhysReg Reg : AllSGPR128s) {
     // Pick the first unallocated one. Make sure we don't clobber the other
     // reserved input we needed.
     if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
-      //assert(MRI.isAllocatable(Reg));
       MRI.replaceRegWith(ScratchRsrcReg, Reg);
       MFI->setScratchRSrcReg(Reg);
       return Reg;
@@ -143,10 +156,9 @@ unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
   unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-
   unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
 
-  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(MF, TRI);
+  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
   if (NumPreloaded > AllSGPRs.size())
     return ScratchWaveOffsetReg;
@@ -190,6 +202,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
   // specified.
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  auto AMDGPUASI = ST.getAMDGPUAS();
   if (ST.debuggerEmitPrologue())
     emitDebuggerPrologue(MF, MBB);
@@ -229,7 +242,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   // emitted after frame indices are eliminated.
 
   if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
-    emitFlatScratchInit(TII, TRI, MF, MBB);
+    emitFlatScratchInit(ST, MF, MBB);
 
   // We need to insert initialization of the scratch resource descriptor.
   unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
@@ -328,7 +341,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
     PointerType *PtrTy =
       PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()),
-                       AMDGPUAS::CONSTANT_ADDRESS);
+                       AMDGPUASI.CONSTANT_ADDRESS);
     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
     auto MMO = MF.getMachineMemOperand(PtrInfo,
                                        MachineMemOperand::MOLoad |
@@ -371,6 +384,24 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
 
 }
 
+static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
+  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
+       I != E; ++I) {
+    if (!MFI.isDeadObjectIndex(I))
+      return false;
+  }
+
+  return true;
+}
+
+int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+                                            unsigned &FrameReg) const {
+  const SIRegisterInfo *RI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+
+  FrameReg = RI->getFrameRegister(MF);
+  return MF.getFrameInfo().getObjectOffset(FI);
+}
+
 void SIFrameLowering::processFunctionBeforeFrameFinalized(
   MachineFunction &MF,
   RegScavenger *RS) const {
@@ -379,15 +410,66 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
   if (!MFI.hasStackObjects())
     return;
 
-  bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  bool AllSGPRSpilledToVGPRs = false;
+
+  if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) {
+    AllSGPRSpilledToVGPRs = true;
+
+    // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
+    // are spilled to VGPRs, in which case we can eliminate the stack usage.
+    //
+    // XXX - This operates under the assumption that only other SGPR spills are
+    // users of the frame index. I'm not 100% sure this is correct. The
+    // StackColoring pass has a comment saying a future improvement would be to
+    // merge allocas with spill slots, but for now according to
+    // MachineFrameInfo isSpillSlot can't alias any other object.
+    for (MachineBasicBlock &MBB : MF) {
+      MachineBasicBlock::iterator Next;
+      for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
+        MachineInstr &MI = *I;
+        Next = std::next(I);
+
+        if (TII->isSGPRSpill(MI)) {
+          int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
+          if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
+            bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
+            (void)Spilled;
+            assert(Spilled && "failed to spill SGPR to VGPR when allocated");
+          } else
+            AllSGPRSpilledToVGPRs = false;
+        }
+      }
+    }
 
-  assert((RS || !MayNeedScavengingEmergencySlot) &&
-         "RegScavenger required if spilling");
+    FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
+  }
 
-  if (MayNeedScavengingEmergencySlot) {
-    int ScavengeFI = MFI.CreateStackObject(
-      AMDGPU::SGPR_32RegClass.getSize(),
-      AMDGPU::SGPR_32RegClass.getAlignment(), false);
+  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
+  // but currently hasNonSpillStackObjects is set only from source
+  // allocas. Stack temps produced from legalization are not counted currently.
+  if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
+      !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
+    assert(RS && "RegScavenger required if spilling");
+
+    // We force this to be at offset 0 so no user object ever has 0 as an
+    // address, so we may use 0 as an invalid pointer value. This is because
+    // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
+    // is required to be in address space 0, we are forced to accept this for
+    // now. Ideally we could have the stack in another address space with 0 as a
+    // valid pointer, and -1 as the null value.
+    //
+    // This will also waste additional space when user stack objects require > 4
+    // byte alignment.
+    //
+    // The main cost here is losing the offset for addressing modes. However
+    // this also ensures we shouldn't need a register for the offset when
+    // emergency scavenging.
+    int ScavengeFI = MFI.CreateFixedObject(
+      AMDGPU::SGPR_32RegClass.getSize(), 0, false);
     RS->addScavengingFrameIndex(ScavengeFI);
   }
 }
```
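
The `S_ADD_U32`/`S_ADDC_U32` pair emitted by `emitFlatScratchInit` above builds a 64-bit pointer add out of two 32-bit scalar adds: the low add produces a carry, and the high add consumes it. A minimal standalone C++ sketch of the same carry-chain technique, with plain `uint32_t` pairs standing in for the FLAT_SCR_LO/FLAT_SCR_HI register halves (illustrative only, not LLVM code):

```cpp
#include <cassert>
#include <cstdint>

// Model of the S_ADD_U32 / S_ADDC_U32 carry chain: add a 32-bit offset
// to a 64-bit base held in a low/high 32-bit pair.
static void addPointerOffset(uint32_t &Lo, uint32_t &Hi, uint32_t Offset) {
  uint32_t OldLo = Lo;
  Lo += Offset;                // like S_ADD_U32: low halves, carry out
  uint32_t Carry = Lo < OldLo; // carry out of the low 32 bits (SCC)
  Hi += 0 + Carry;             // like S_ADDC_U32: high half + 0 + carry-in
}

int main() {
  uint32_t Lo = 0xFFFFFF00u, Hi = 0x1u;
  addPointerOffset(Lo, Hi, 0x200u); // crosses the 32-bit boundary
  assert(Lo == 0x100u && Hi == 0x2u);
  return 0;
}
```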
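The spill-lowering loop in `processFunctionBeforeFrameFinalized` captures `Next = std::next(I)` before visiting each instruction because `eliminateSGPRToVGPRSpillFrameIndex` may erase the current instruction and invalidate `I`. The same idiom in a self-contained sketch, with a `std::list<int>` standing in for the machine-instruction list and an even-number test standing in for `TII->isSGPRSpill(MI)`:

```cpp
#include <iostream>
#include <list>

// Erase elements mid-iteration without invalidating the loop: capture the
// successor before any operation that might remove the current element.
int main() {
  std::list<int> Insts = {1, 2, 3, 4, 5, 6};
  std::list<int>::iterator Next;
  for (auto I = Insts.begin(), E = Insts.end(); I != E; I = Next) {
    Next = std::next(I);  // still valid after *I is erased
    if (*I % 2 == 0)      // stand-in for the isSGPRSpill(MI) check
      Insts.erase(I);     // stand-in for rewriting/removing the spill
  }
  for (int V : Insts)
    std::cout << V << ' '; // prints: 1 3 5
  std::cout << '\n';
  return 0;
}
```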
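The new `getFrameIndexReference` override resolves an abstract frame index to a (frame register, byte offset) pair, reporting the register through an out-parameter; later passes combine the two to materialize the object's address. A rough model of that contract in plain C++, using a hypothetical `FrameInfo` struct and register number rather than the real LLVM classes:

```cpp
#include <cstdio>
#include <vector>

// Hypothetical stand-ins for MachineFrameInfo and the frame register;
// the real types live in LLVM's CodeGen libraries.
struct FrameInfo {
  std::vector<int> ObjectOffsets; // byte offset recorded per frame index
};
constexpr unsigned FrameReg = 42; // pretend physical register number

// Modeled on the getFrameIndexReference contract: return the object's
// offset and report which register that offset is relative to.
int getFrameIndexReference(const FrameInfo &MFI, int FI, unsigned &Reg) {
  Reg = FrameReg;
  return MFI.ObjectOffsets[FI];
}

int main() {
  FrameInfo MFI{{0, 16, 24}};
  unsigned Reg = 0;
  int Offset = getFrameIndexReference(MFI, 1, Reg);
  std::printf("frame index 1 -> reg %u + %d bytes\n", Reg, Offset);
  return 0;
}
```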