Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp  195
1 file changed, 97 insertions(+), 98 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index c9376d0ea653..e8142244b7db 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -7,17 +7,18 @@
//===----------------------------------------------------------------------===//
#include "SIMachineFunctionInfo.h"
-#include "AMDGPUTargetMachine.h"
#include "AMDGPUSubtarget.h"
-#include "SIRegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
@@ -36,28 +37,12 @@ const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
const GCNSubtarget *STI)
- : AMDGPUMachineFunction(F, *STI),
- Mode(F),
- GWSResourcePSV(getTM(STI)),
- PrivateSegmentBuffer(false),
- DispatchPtr(false),
- QueuePtr(false),
- KernargSegmentPtr(false),
- DispatchID(false),
- FlatScratchInit(false),
- WorkGroupIDX(false),
- WorkGroupIDY(false),
- WorkGroupIDZ(false),
- WorkGroupInfo(false),
- LDSKernelId(false),
- PrivateSegmentWaveByteOffset(false),
- WorkItemIDX(false),
- WorkItemIDY(false),
- WorkItemIDZ(false),
- ImplicitBufferPtr(false),
- ImplicitArgPtr(false),
- GITPtrHigh(0xffffffff),
- HighBitsOf32BitAddress(0) {
+ : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
+ UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false),
+ WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
+ PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
+ WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
+ GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
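Aside (not part of the patch): the user-SGPR request flags dropped from this initializer list (PrivateSegmentBuffer, DispatchPtr, QueuePtr, KernargSegmentPtr, DispatchID, FlatScratchInit) appear to be tracked by the new UserSGPRInfo member instead, so one constructor derives them from the function's attributes rather than scattered per-field code (see the attribute checks removed later in this patch). A minimal standalone C++ sketch of that consolidation pattern; FunctionAttrs and UserSGPRUsage are hypothetical stand-ins, not LLVM types:

    #include <set>
    #include <string>

    struct FunctionAttrs {                  // stand-in for querying llvm::Function
      std::set<std::string> Attrs;
      bool has(const std::string &A) const { return Attrs.count(A) != 0; }
    };

    struct UserSGPRUsage {                  // stand-in for the UserSGPRInfo member
      bool DispatchPtr, QueuePtr, DispatchID;
      explicit UserSGPRUsage(const FunctionAttrs &F)
          : DispatchPtr(!F.has("amdgpu-no-dispatch-ptr")),
            QueuePtr(!F.has("amdgpu-no-queue-ptr")),
            DispatchID(!F.has("amdgpu-no-dispatch-id")) {}
    };

    int main() {
      FunctionAttrs F{{"amdgpu-no-queue-ptr"}};
      return UserSGPRUsage(F).QueuePtr ? 1 : 0; // opted out, so exit code 0
    }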
@@ -67,16 +52,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
VRegFlags.reserve(1024);
- // FIXME: Should have analysis or something rather than attribute to detect
- // calls.
- const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
-
const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
CC == CallingConv::SPIR_KERNEL;
if (IsKernel) {
- if (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)
- KernargSegmentPtr = true;
WorkGroupIDX = true;
WorkItemIDX = true;
} else if (CC == CallingConv::AMDGPU_PS) {
@@ -85,7 +64,20 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
MayNeedAGPRs = ST.hasMAIInsts();
- if (!isEntryFunction()) {
+ if (AMDGPU::isChainCC(CC)) {
+ // Chain functions don't receive an SP from their caller, but are free to
+ // set one up. For now, we can use s32 to match what amdgpu_gfx functions
+ // would use if called, but this can be revisited.
+ // FIXME: Only reserve this if we actually need it.
+ StackPtrOffsetReg = AMDGPU::SGPR32;
+
+ ScratchRSrcReg = AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51;
+
+ ArgInfo.PrivateSegmentBuffer =
+ ArgDescriptor::createRegister(ScratchRSrcReg);
+
+ ImplicitArgPtr = false;
+ } else if (!isEntryFunction()) {
if (CC != CallingConv::AMDGPU_Gfx)
ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
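Aside (not part of the patch): SGPR48_SGPR49_SGPR50_SGPR51 is a four-SGPR tuple because the scratch resource on this target is a 128-bit buffer descriptor. A tiny standalone C++ sketch summarizing the fixed assignments this hunk hard-codes; ChainABI is a hypothetical summary type:

    #include <cstdio>

    struct ChainABI {                       // hypothetical, for illustration only
      unsigned StackPtrSGPR = 32;           // s32, matching amdgpu_gfx callees
      unsigned ScratchRSrcFirstSGPR = 48;   // descriptor occupies s[48:51]
    };

    int main() {
      ChainABI ABI;
      std::printf("SP: s%u, scratch rsrc: s[%u:%u]\n", ABI.StackPtrSGPR,
                  ABI.ScratchRSrcFirstSGPR, ABI.ScratchRSrcFirstSGPR + 3);
    }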
@@ -115,12 +107,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
}
- bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
- if (isAmdHsaOrMesa && !ST.enableFlatScratch())
- PrivateSegmentBuffer = true;
- else if (ST.isMesaGfxShader(F))
- ImplicitBufferPtr = true;
-
if (!AMDGPU::isGraphics(CC) ||
(CC == CallingConv::AMDGPU_CS && ST.hasArchitectedSGPRs())) {
if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
@@ -145,33 +131,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
ST.getMaxWorkitemID(F, 2) != 0)
WorkItemIDZ = true;
- if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
- DispatchPtr = true;
-
- if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
- QueuePtr = true;
-
- if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
- DispatchID = true;
-
if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
LDSKernelId = true;
}
- // FIXME: This attribute is a hack, we just need an analysis on the function
- // to look for allocas.
- bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
-
- // TODO: This could be refined a lot. The attribute is a poor way of
- // detecting calls or stack objects that may require it before argument
- // lowering.
- if (ST.hasFlatAddressSpace() && isEntryFunction() &&
- (isAmdHsaOrMesa || ST.enableFlatScratch()) &&
- (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
- !ST.flatScratchIsArchitected()) {
- FlatScratchInit = true;
- }
-
if (isEntryFunction()) {
// X, XY, and XYZ are the only supported combinations, so make sure Y is
// enabled if Z is.
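Aside (not part of the patch): a work-item ID survives only if the function does not opt out via attribute and the dimension can actually be non-zero (ST.getMaxWorkitemID(F, 2) != 0, as for Z above). A standalone C++ model of that predicate; needsWorkItemID and its parameters are hypothetical names:

    #include <cassert>

    bool needsWorkItemID(bool HasNoIDAttr, unsigned MaxWorkitemID) {
      return !HasNoIDAttr && MaxWorkitemID != 0;
    }

    int main() {
      assert(!needsWorkItemID(false, 0));   // dimension statically 1: ID always 0
      assert(needsWorkItemID(false, 1023));
      assert(!needsWorkItemID(true, 1023)); // "amdgpu-no-workitem-id-*" set
    }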
@@ -280,12 +243,47 @@ Register SIMachineFunctionInfo::addLDSKernelId() {
return ArgInfo.LDSKernelId.getRegister();
}
+SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
+ const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
+ unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
+ assert(!ArgInfo.PreloadKernArgs.count(KernArgIdx) &&
+ "Preload kernel argument allocated twice.");
+ NumUserSGPRs += PaddingSGPRs;
+ // If the available register tuples are aligned with the kernarg to be
+ // preloaded, use that register; otherwise we need to use a set of SGPRs
+ // and merge them.
+ Register PreloadReg =
+ TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
+ if (PreloadReg &&
+ (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
+ ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(PreloadReg);
+ NumUserSGPRs += AllocSizeDWord;
+ } else {
+ for (unsigned I = 0; I < AllocSizeDWord; ++I) {
+ ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(getNextUserSGPR());
+ NumUserSGPRs++;
+ }
+ }
+
+ // Track the actual number of SGPRs that HW will preload to.
+ UserSGPRInfo.allocKernargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs);
+ return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
+}
+
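Aside (not part of the patch): addPreloadedKernArg either takes one covering register (when getMatchingSuperReg finds a tuple whose sub0 is the next free user SGPR and the class is SReg_32 or SReg_64) or falls back to one SGPR per dword, to be merged later. A standalone C++ model of that bookkeeping; allocatePreload and its parameters are hypothetical:

    #include <vector>

    // Returns the dword-register indices assigned to one preloaded kernarg.
    std::vector<unsigned> allocatePreload(unsigned &NumUserSGPRs,
                                          unsigned AllocSizeDWord,
                                          unsigned PaddingSGPRs,
                                          bool HaveMatchingTuple) {
      std::vector<unsigned> Regs;
      NumUserSGPRs += PaddingSGPRs;        // skipped SGPRs still count as used
      if (HaveMatchingTuple) {
        Regs.push_back(NumUserSGPRs);      // one covering tuple, used whole
        NumUserSGPRs += AllocSizeDWord;
      } else {
        for (unsigned I = 0; I < AllocSizeDWord; ++I)
          Regs.push_back(NumUserSGPRs++);  // one SGPR per dword, merged later
      }
      return Regs;
    }

    int main() {
      unsigned NumUserSGPRs = 3;           // hypothetical: next free user SGPR is s3
      auto Regs = allocatePreload(NumUserSGPRs, 2, 1, false);
      return (Regs.size() == 2 && NumUserSGPRs == 6) ? 0 : 1;
    }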
void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
uint64_t Size, Align Alignment) {
// Skip if it is an entry function or the register is already added.
if (isEntryFunction() || WWMSpills.count(VGPR))
return;
+ // Skip if this is a function with the amdgpu_cs_chain or
+ // amdgpu_cs_chain_preserve calling convention and this is a scratch register.
+ // We never need to allocate a spill for these because we don't even need to
+ // restore the inactive lanes for them (they're scratchier than the usual
+ // scratch registers).
+ if (isChainFunction() && SIRegisterInfo::isChainScratchRegister(VGPR))
+ return;
+
WWMSpills.insert(std::make_pair(
VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
}
@@ -314,37 +312,23 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
return false;
}
-bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF,
- int FI,
- unsigned LaneIndex) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
+bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
+ MachineFunction &MF, int FI, unsigned LaneIndex) {
MachineRegisterInfo &MRI = MF.getRegInfo();
Register LaneVGPR;
if (!LaneIndex) {
- LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
- if (LaneVGPR == AMDGPU::NoRegister) {
- // We have no VGPRs left for spilling SGPRs. Reset because we will not
- // partially spill the SGPR to VGPRs.
- SGPRSpillToVGPRLanes.erase(FI);
- return false;
- }
-
+ LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
SpillVGPRs.push_back(LaneVGPR);
- // Add this register as live-in to all blocks to avoid machine verifier
- // complaining about use of an undefined physical register.
- for (MachineBasicBlock &BB : MF)
- BB.addLiveIn(LaneVGPR);
} else {
LaneVGPR = SpillVGPRs.back();
}
- SGPRSpillToVGPRLanes[FI].push_back(
+ SGPRSpillsToVirtualVGPRLanes[FI].push_back(
SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
return true;
}
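Aside (not part of the patch): unlike the physical path below, the virtual path cannot run out of registers, which is why the failure handling disappears here; lane 0 of a spill simply opens a fresh virtual VGPR and later lanes reuse it. A standalone C++ model of that lane bookkeeping; SpillState is hypothetical, not the LLVM API:

    #include <cassert>
    #include <map>
    #include <utility>
    #include <vector>

    struct SpillState {                          // hypothetical model
      std::vector<int> SpillVGPRs;               // one entry per spill VGPR
      std::map<int, std::vector<std::pair<int, unsigned>>> Lanes; // FI -> (vreg, lane)
      int NextVReg = 0;

      void allocate(int FI, unsigned LaneIndex) {
        if (LaneIndex == 0)
          SpillVGPRs.push_back(NextVReg++);      // cf. MRI.createVirtualRegister
        Lanes[FI].push_back({SpillVGPRs.back(), LaneIndex});
      }
    };

    int main() {
      SpillState S;
      S.allocate(/*FI=*/4, /*LaneIndex=*/0);
      S.allocate(/*FI=*/4, /*LaneIndex=*/1);
      assert(S.SpillVGPRs.size() == 1);          // both lanes share one VGPR
      assert(S.Lanes[4].size() == 2);
    }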
-bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
+bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
MachineFunction &MF, int FI, unsigned LaneIndex) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -355,16 +339,22 @@ bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
if (LaneVGPR == AMDGPU::NoRegister) {
// We have no VGPRs left for spilling SGPRs. Reset because we will not
// partially spill the SGPR to VGPRs.
- PrologEpilogSGPRSpillToVGPRLanes.erase(FI);
+ SGPRSpillsToPhysicalVGPRLanes.erase(FI);
return false;
}
allocateWWMSpill(MF, LaneVGPR);
+ reserveWWMRegister(LaneVGPR);
+ for (MachineBasicBlock &MBB : MF) {
+ MBB.addLiveIn(LaneVGPR);
+ MBB.sortUniqueLiveIns();
+ }
+ SpillPhysVGPRs.push_back(LaneVGPR);
} else {
- LaneVGPR = WWMSpills.back().first;
+ LaneVGPR = SpillPhysVGPRs.back();
}
- PrologEpilogSGPRSpillToVGPRLanes[FI].push_back(
+ SGPRSpillsToPhysicalVGPRLanes[FI].push_back(
SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
return true;
}
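Aside (not part of the patch): MachineBasicBlock::addLiveIn only appends, so after adding the lane VGPR to every block the lists are re-canonicalized with sortUniqueLiveIns(), which is assumed to behave like this standalone sketch:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    void sortUniqueLiveIns(std::vector<unsigned> &LiveIns) {
      std::sort(LiveIns.begin(), LiveIns.end());
      LiveIns.erase(std::unique(LiveIns.begin(), LiveIns.end()), LiveIns.end());
    }

    int main() {
      std::vector<unsigned> LiveIns{7, 3, 7, 1};
      sortUniqueLiveIns(LiveIns);
      assert((LiveIns == std::vector<unsigned>{1, 3, 7}));
    }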
@@ -373,8 +363,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
int FI,
bool IsPrologEpilog) {
std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
- IsPrologEpilog ? PrologEpilogSGPRSpillToVGPRLanes[FI]
- : SGPRSpillToVGPRLanes[FI];
+ IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI]
+ : SGPRSpillsToVirtualVGPRLanes[FI];
// This has already been allocated.
if (!SpillLanes.empty())
@@ -395,15 +385,14 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
"not spilling SGPRs to VGPRs");
unsigned &NumSpillLanes =
- IsPrologEpilog ? NumVGPRPrologEpilogSpillLanes : NumVGPRSpillLanes;
+ IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes;
for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
unsigned LaneIndex = (NumSpillLanes % WaveSize);
- bool Allocated =
- IsPrologEpilog
- ? allocateVGPRForPrologEpilogSGPRSpills(MF, FI, LaneIndex)
- : allocateVGPRForSGPRSpills(MF, FI, LaneIndex);
+ bool Allocated = IsPrologEpilog
+ ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex)
+ : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
if (!Allocated) {
NumSpillLanes -= I;
return false;
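Aside (not part of the patch): each spill VGPR provides WaveSize lanes, one 32-bit SGPR per lane, so NumSpillLanes % WaveSize wraps to 0 exactly when a fresh VGPR must be allocated. A standalone C++ check of the resulting VGPR count; vgprsNeeded is a hypothetical helper:

    #include <cassert>

    unsigned vgprsNeeded(unsigned NumSGPRLanes, unsigned WaveSize) {
      return (NumSGPRLanes + WaveSize - 1) / WaveSize; // ceiling division
    }

    int main() {
      assert(vgprsNeeded(70, 64) == 2); // lanes 0..63, then lanes 0..5
      assert(vgprsNeeded(64, 64) == 1); // exactly one full VGPR
      assert(vgprsNeeded(70, 32) == 3); // wave32 wraps twice as often
    }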
@@ -484,16 +473,25 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
bool SIMachineFunctionInfo::removeDeadFrameIndices(
MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
- // Remove dead frame indices from function frame. And also make sure to remove
- // the frame indices from `SGPRSpillToVGPRLanes` data structure, otherwise, it
- // could result in an unexpected side effect and bug, in case of any
- // re-mapping of freed frame indices by later pass(es) like "stack slot
+ // Remove dead frame indices from function frame, however keep FP & BP since
+ // spills for them haven't been inserted yet. And also make sure to remove the
+ // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure,
+ // otherwise, it could result in an unexpected side effect and bug, in case of
+ // any re-mapping of freed frame indices by later pass(es) like "stack slot
// coloring".
- for (auto &R : make_early_inc_range(SGPRSpillToVGPRLanes)) {
+ for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
MFI.RemoveStackObject(R.first);
- SGPRSpillToVGPRLanes.erase(R.first);
+ SGPRSpillsToVirtualVGPRLanes.erase(R.first);
}
+ // Remove the dead frame indices of CSR SGPRs which are spilled to physical
+ // VGPR lanes during SILowerSGPRSpills pass.
+ if (!ResetSGPRSpillStackIDs) {
+ for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
+ MFI.RemoveStackObject(R.first);
+ SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
+ }
+ }
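Aside (not part of the patch): make_early_inc_range makes the erase-while-iterating in these loops safe by advancing the iterator before the loop body runs. The same idiom, spelled out on a plain std::map:

    #include <cassert>
    #include <map>

    void eraseAll(std::map<int, int> &M) {
      for (auto It = M.begin(); It != M.end();) {
        auto Cur = It++;  // advance first, as make_early_inc_range does
        M.erase(Cur);     // erasing Cur leaves It untouched and valid
      }
    }

    int main() {
      std::map<int, int> M{{1, 10}, {2, 20}};
      eraseAll(M);
      assert(M.empty());
    }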
bool HaveSGPRToMemory = false;
if (ResetSGPRSpillStackIDs) {
@@ -522,7 +520,7 @@ int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
const SIRegisterInfo &TRI) {
if (ScavengeFI)
return *ScavengeFI;
- if (isEntryFunction()) {
+ if (isBottomOfStack()) {
ScavengeFI = MFI.CreateFixedObject(
TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
} else {
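Aside (not part of the patch): the switch from isEntryFunction() to isBottomOfStack() widens the fixed-object path to chain functions, assuming the predicate amounts to "entry function or chain function". A standalone C++ sketch; FrameOwner is a hypothetical stand-in for the function info:

    #include <cassert>

    struct FrameOwner {                    // hypothetical stand-in
      bool IsEntry = false;
      bool IsChain = false;
      bool isBottomOfStack() const { return IsEntry || IsChain; }
    };

    int main() {
      assert(FrameOwner{true, false}.isBottomOfStack());   // kernels, entry points
      assert(FrameOwner{false, true}.isBottomOfStack());   // amdgpu_cs_chain
      assert(!FrameOwner{false, false}.isBottomOfStack()); // ordinary callees
    }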
@@ -608,6 +606,7 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
return true;
};
+ // TODO: Need to serialize kernarg preloads.
bool Any = false;
Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
@@ -730,7 +729,7 @@ bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
for (const auto &CI : IA->ParseConstraints()) {
for (StringRef Code : CI.Codes) {
Code.consume_front("{");
- if (Code.startswith("a"))
+ if (Code.starts_with("a"))
return true;
}
}
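Aside (not part of the patch): startswith was renamed starts_with to mirror the C++20 std API, with no behavior change. The same constraint scan expressed over std::string_view, where "a" marks an AGPR constraint such as "{a5}"; usesAGPRConstraint is a hypothetical helper (requires C++20):

    #include <cassert>
    #include <string_view>

    bool usesAGPRConstraint(std::string_view Code) {
      if (!Code.empty() && Code.front() == '{')
        Code.remove_prefix(1);            // the Code.consume_front("{") above
      return Code.starts_with("a");       // AGPR constraints: "a", "{a5}", ...
    }

    int main() {
      assert(usesAGPRConstraint("{a5}"));
      assert(!usesAGPRConstraint("{v5}"));
      assert(usesAGPRConstraint("a"));
    }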