Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp')
| -rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 195 |
1 file changed, 97 insertions, 98 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index c9376d0ea653..e8142244b7db 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -7,17 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "SIMachineFunctionInfo.h"
-#include "AMDGPUTargetMachine.h"
 #include "AMDGPUSubtarget.h"
-#include "SIRegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MIRParser/MIParser.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MIRParser/MIParser.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
@@ -36,28 +37,12 @@ const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
 
 SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
                                              const GCNSubtarget *STI)
-    : AMDGPUMachineFunction(F, *STI),
-      Mode(F),
-      GWSResourcePSV(getTM(STI)),
-      PrivateSegmentBuffer(false),
-      DispatchPtr(false),
-      QueuePtr(false),
-      KernargSegmentPtr(false),
-      DispatchID(false),
-      FlatScratchInit(false),
-      WorkGroupIDX(false),
-      WorkGroupIDY(false),
-      WorkGroupIDZ(false),
-      WorkGroupInfo(false),
-      LDSKernelId(false),
-      PrivateSegmentWaveByteOffset(false),
-      WorkItemIDX(false),
-      WorkItemIDY(false),
-      WorkItemIDZ(false),
-      ImplicitBufferPtr(false),
-      ImplicitArgPtr(false),
-      GITPtrHigh(0xffffffff),
-      HighBitsOf32BitAddress(0) {
+    : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
+      UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false),
+      WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
+      PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
+      WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
+      GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
@@ -67,16 +52,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
 
   VRegFlags.reserve(1024);
 
-  // FIXME: Should have analysis or something rather than attribute to detect
-  // calls.
-  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
-
   const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
                         CC == CallingConv::SPIR_KERNEL;
 
   if (IsKernel) {
-    if (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)
-      KernargSegmentPtr = true;
     WorkGroupIDX = true;
     WorkItemIDX = true;
   } else if (CC == CallingConv::AMDGPU_PS) {
@@ -85,7 +64,20 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
 
   MayNeedAGPRs = ST.hasMAIInsts();
 
-  if (!isEntryFunction()) {
+  if (AMDGPU::isChainCC(CC)) {
+    // Chain functions don't receive an SP from their caller, but are free to
+    // set one up. For now, we can use s32 to match what amdgpu_gfx functions
+    // would use if called, but this can be revisited.
+    // FIXME: Only reserve this if we actually need it.
+    StackPtrOffsetReg = AMDGPU::SGPR32;
+
+    ScratchRSrcReg = AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51;
+
+    ArgInfo.PrivateSegmentBuffer =
+        ArgDescriptor::createRegister(ScratchRSrcReg);
+
+    ImplicitArgPtr = false;
+  } else if (!isEntryFunction()) {
     if (CC != CallingConv::AMDGPU_Gfx)
       ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
 
@@ -115,12 +107,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
     MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
   }
 
-  bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
-  if (isAmdHsaOrMesa && !ST.enableFlatScratch())
-    PrivateSegmentBuffer = true;
-  else if (ST.isMesaGfxShader(F))
-    ImplicitBufferPtr = true;
-
   if (!AMDGPU::isGraphics(CC) ||
       (CC == CallingConv::AMDGPU_CS && ST.hasArchitectedSGPRs())) {
     if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
@@ -145,33 +131,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
         ST.getMaxWorkitemID(F, 2) != 0)
       WorkItemIDZ = true;
 
-    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
-      DispatchPtr = true;
-
-    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
-      QueuePtr = true;
-
-    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
-      DispatchID = true;
-
     if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
       LDSKernelId = true;
   }
 
-  // FIXME: This attribute is a hack, we just need an analysis on the function
-  // to look for allocas.
-  bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
-
-  // TODO: This could be refined a lot. The attribute is a poor way of
-  // detecting calls or stack objects that may require it before argument
-  // lowering.
-  if (ST.hasFlatAddressSpace() && isEntryFunction() &&
-      (isAmdHsaOrMesa || ST.enableFlatScratch()) &&
-      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
-      !ST.flatScratchIsArchitected()) {
-    FlatScratchInit = true;
-  }
-
   if (isEntryFunction()) {
     // X, XY, and XYZ are the only supported combinations, so make sure Y is
     // enabled if Z is.
@@ -280,12 +243,47 @@ Register SIMachineFunctionInfo::addLDSKernelId() {
   return ArgInfo.LDSKernelId.getRegister();
 }
 
+SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
+    const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
+    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
+  assert(!ArgInfo.PreloadKernArgs.count(KernArgIdx) &&
+         "Preload kernel argument allocated twice.");
+  NumUserSGPRs += PaddingSGPRs;
+  // If the available register tuples are aligned with the kernarg to be
+  // preloaded use that register, otherwise we need to use a set of SGPRs and
+  // merge them.
+  Register PreloadReg =
+      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
+  if (PreloadReg &&
+      (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
+    ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(PreloadReg);
+    NumUserSGPRs += AllocSizeDWord;
+  } else {
+    for (unsigned I = 0; I < AllocSizeDWord; ++I) {
+      ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(getNextUserSGPR());
+      NumUserSGPRs++;
+    }
+  }
+
+  // Track the actual number of SGPRs that HW will preload to.
+  UserSGPRInfo.allocKernargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs);
+  return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
+}
+
 void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                              uint64_t Size, Align Alignment) {
   // Skip if it is an entry function or the register is already added.
   if (isEntryFunction() || WWMSpills.count(VGPR))
     return;
 
+  // Skip if this is a function with the amdgpu_cs_chain or
+  // amdgpu_cs_chain_preserve calling convention and this is a scratch register.
+  // We never need to allocate a spill for these because we don't even need to
+  // restore the inactive lanes for them (they're scratchier than the usual
+  // scratch registers).
+  if (isChainFunction() && SIRegisterInfo::isChainScratchRegister(VGPR))
+    return;
+
   WWMSpills.insert(std::make_pair(
       VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
 }
@@ -314,37 +312,23 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
   return false;
 }
 
-bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF,
-                                                      int FI,
-                                                      unsigned LaneIndex) {
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
+    MachineFunction &MF, int FI, unsigned LaneIndex) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   Register LaneVGPR;
   if (!LaneIndex) {
-    LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
-    if (LaneVGPR == AMDGPU::NoRegister) {
-      // We have no VGPRs left for spilling SGPRs. Reset because we will not
-      // partially spill the SGPR to VGPRs.
-      SGPRSpillToVGPRLanes.erase(FI);
-      return false;
-    }
-
+    LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     SpillVGPRs.push_back(LaneVGPR);
-    // Add this register as live-in to all blocks to avoid machine verifier
-    // complaining about use of an undefined physical register.
-    for (MachineBasicBlock &BB : MF)
-      BB.addLiveIn(LaneVGPR);
   } else {
     LaneVGPR = SpillVGPRs.back();
   }
 
-  SGPRSpillToVGPRLanes[FI].push_back(
+  SGPRSpillsToVirtualVGPRLanes[FI].push_back(
       SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
   return true;
 }
 
-bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
+bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
     MachineFunction &MF, int FI, unsigned LaneIndex) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -355,16 +339,22 @@ bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
     if (LaneVGPR == AMDGPU::NoRegister) {
       // We have no VGPRs left for spilling SGPRs. Reset because we will not
       // partially spill the SGPR to VGPRs.
-      PrologEpilogSGPRSpillToVGPRLanes.erase(FI);
+      SGPRSpillsToPhysicalVGPRLanes.erase(FI);
       return false;
     }
 
     allocateWWMSpill(MF, LaneVGPR);
+    reserveWWMRegister(LaneVGPR);
+    for (MachineBasicBlock &MBB : MF) {
+      MBB.addLiveIn(LaneVGPR);
+      MBB.sortUniqueLiveIns();
+    }
+    SpillPhysVGPRs.push_back(LaneVGPR);
   } else {
-    LaneVGPR = WWMSpills.back().first;
+    LaneVGPR = SpillPhysVGPRs.back();
  }
 
-  PrologEpilogSGPRSpillToVGPRLanes[FI].push_back(
+  SGPRSpillsToPhysicalVGPRLanes[FI].push_back(
      SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
   return true;
 }
@@ -373,8 +363,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
                                                         int FI,
                                                         bool IsPrologEpilog) {
   std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
-      IsPrologEpilog ? PrologEpilogSGPRSpillToVGPRLanes[FI]
-                     : SGPRSpillToVGPRLanes[FI];
+      IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI]
+                     : SGPRSpillsToVirtualVGPRLanes[FI];
 
   // This has already been allocated.
   if (!SpillLanes.empty())
@@ -395,15 +385,14 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
          "not spilling SGPRs to VGPRs");
 
   unsigned &NumSpillLanes =
-      IsPrologEpilog ? NumVGPRPrologEpilogSpillLanes : NumVGPRSpillLanes;
+      IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes;
 
   for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
     unsigned LaneIndex = (NumSpillLanes % WaveSize);
 
-    bool Allocated =
-        IsPrologEpilog
-            ? allocateVGPRForPrologEpilogSGPRSpills(MF, FI, LaneIndex)
-            : allocateVGPRForSGPRSpills(MF, FI, LaneIndex);
+    bool Allocated = IsPrologEpilog
+                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex)
+                         : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
     if (!Allocated) {
       NumSpillLanes -= I;
       return false;
@@ -484,16 +473,25 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
 
 bool SIMachineFunctionInfo::removeDeadFrameIndices(
     MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
-  // Remove dead frame indices from function frame. And also make sure to remove
-  // the frame indices from `SGPRSpillToVGPRLanes` data structure, otherwise, it
-  // could result in an unexpected side effect and bug, in case of any
-  // re-mapping of freed frame indices by later pass(es) like "stack slot
+  // Remove dead frame indices from function frame, however keep FP & BP since
+  // spills for them haven't been inserted yet. And also make sure to remove the
+  // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure,
+  // otherwise, it could result in an unexpected side effect and bug, in case of
+  // any re-mapping of freed frame indices by later pass(es) like "stack slot
   // coloring".
-  for (auto &R : make_early_inc_range(SGPRSpillToVGPRLanes)) {
+  for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
     MFI.RemoveStackObject(R.first);
-    SGPRSpillToVGPRLanes.erase(R.first);
+    SGPRSpillsToVirtualVGPRLanes.erase(R.first);
   }
 
+  // Remove the dead frame indices of CSR SGPRs which are spilled to physical
+  // VGPR lanes during SILowerSGPRSpills pass.
+  if (!ResetSGPRSpillStackIDs) {
+    for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
+      MFI.RemoveStackObject(R.first);
+      SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
+    }
+  }
   bool HaveSGPRToMemory = false;
 
   if (ResetSGPRSpillStackIDs) {
@@ -522,7 +520,7 @@ int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
                                          const SIRegisterInfo &TRI) {
   if (ScavengeFI)
     return *ScavengeFI;
-  if (isEntryFunction()) {
+  if (isBottomOfStack()) {
    ScavengeFI = MFI.CreateFixedObject(
         TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
   } else {
@@ -608,6 +606,7 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
     return true;
   };
 
+  // TODO: Need to serialize kernarg preloads.
   bool Any = false;
   Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
   Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
@@ -730,7 +729,7 @@ bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
     for (const auto &CI : IA->ParseConstraints()) {
      for (StringRef Code : CI.Codes) {
         Code.consume_front("{");
-        if (Code.startswith("a"))
+        if (Code.starts_with("a"))
           return true;
       }
     }
