aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp')
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp 95
1 files changed, 66 insertions, 29 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 6e2984f2a04f..57a4660bc1eb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -27,6 +27,8 @@
#include "SIMachineFunctionInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -34,6 +36,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
+#include "llvm/Support/TargetParser.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
@@ -111,6 +114,12 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
}
void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
+ IsTargetStreamerInitialized = false;
+}
+
+void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
+ IsTargetStreamerInitialized = true;
+
// TODO: Which one is called first, emitStartOfAsmFile or
// emitFunctionBodyStart?
if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
@@ -143,6 +152,10 @@ void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
}
void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
+ // Initialize the target streamer if that has not happened yet.
+ if (!IsTargetStreamerInitialized)
+ initTargetStreamer(M);
+
// Following code requires TargetStreamer to be present.
if (!getTargetStreamer())
return;
@@ -234,8 +247,8 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
auto &ObjectFileInfo = *Context.getObjectFileInfo();
auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
- Streamer.PushSection();
- Streamer.SwitchSection(&ReadOnlySection);
+ Streamer.pushSection();
+ Streamer.switchSection(&ReadOnlySection);
// CP microcode requires the kernel descriptor to be allocated on 64 byte
// alignment.
@@ -256,7 +269,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
CurrentProgramInfo.FlatUsed),
CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
- Streamer.PopSection();
+ Streamer.popSection();
}
void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
@@ -319,7 +332,7 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
const DataLayout &DL = GV->getParent()->getDataLayout();
uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
- Align Alignment = GV->getAlign().getValueOr(Align(4));
+ Align Alignment = GV->getAlign().value_or(Align(4));
emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
emitLinkage(GV, GVSym);
@@ -339,7 +352,7 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
(STI.getTargetTriple().getOS() == Triple::AMDHSA ||
STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
- OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+ OutStreamer->switchSection(getObjFileLowering().getTextSection());
getTargetStreamer()->EmitCodeEnd(STI);
}
@@ -381,7 +394,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
}
- if (MFI.hasQueuePtr()) {
+ if (MFI.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
}
@@ -437,6 +450,11 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
}
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ // Init target streamer lazily on the first function so that previous passes
+ // can set metadata.
+ if (!IsTargetStreamerInitialized)
+ initTargetStreamer(*MF.getFunction().getParent());
+
ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
CurrentProgramInfo = SIProgramInfo();
@@ -454,7 +472,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
MCSectionELF *ConfigSection =
Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
- OutStreamer->SwitchSection(ConfigSection);
+ OutStreamer->switchSection(ConfigSection);
}
if (MFI->isModuleEntryFunction()) {
@@ -491,7 +509,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
if (isVerbose()) {
MCSectionELF *CommentSection =
Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
- OutStreamer->SwitchSection(CommentSection);
+ OutStreamer->switchSection(CommentSection);
if (!MFI->isEntryFunction()) {
OutStreamer->emitRawComment(" Function info:", false);
@@ -590,7 +608,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
if (DumpCodeInstEmitter) {
- OutStreamer->SwitchSection(
+ OutStreamer->switchSection(
Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
for (size_t i = 0; i < DisasmLines.size(); ++i) {
@@ -677,7 +695,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
const uint64_t MaxScratchPerWorkitem =
- GCNSubtarget::MaxWaveScratchSize / STM.getWavefrontSize();
+ STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
ProgInfo.ScratchSize,
@@ -857,22 +875,18 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
LDSAlignShift = 9;
}
- unsigned LDSSpillSize =
- MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();
-
- ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
+ ProgInfo.LDSSize = MFI->getLDSSize();
ProgInfo.LDSBlocks =
alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
- // Scratch is allocated in 256 dword blocks.
- unsigned ScratchAlignShift = 10;
+ // Scratch is allocated in 64-dword or 256-dword blocks.
+ unsigned ScratchAlignShift =
+ STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
// We need to program the hardware with the amount of scratch memory that
// is used by the entire wave. ProgInfo.ScratchSize is the amount of
// scratch memory used per thread.
- ProgInfo.ScratchBlocks =
- alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
- 1ULL << ScratchAlignShift) >>
- ScratchAlignShift;
+ ProgInfo.ScratchBlocks = divideCeil(
+ ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
@@ -886,8 +900,14 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
else if (MFI->hasWorkItemIDY())
TIDIGCompCnt = 1;
+ // The private segment wave byte offset is the last of the system SGPRs. We
+ // initially assumed it was allocated, and may have used it. It shouldn't harm
+ // anything to disable it if we know the stack isn't used here. We may still
+ // have emitted code reading it to initialize scratch, but if that's unused
+ // reading garbage should be OK.
+ const bool EnablePrivateSegment = ProgInfo.ScratchBlocks > 0;
ProgInfo.ComputePGMRSrc2 =
- S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
+ S_00B84C_SCRATCH_EN(EnablePrivateSegment) |
S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
// For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
@@ -931,6 +951,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
const SIProgramInfo &CurrentProgramInfo) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
@@ -942,7 +963,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);
OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
- OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks));
+ OutStreamer->emitInt32(
+ STM.getGeneration() >= AMDGPUSubtarget::GFX11
+ ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+ : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
// 0" comment but I don't see a corresponding field in the register spec.
@@ -951,14 +975,18 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
- OutStreamer->emitIntValue(
- S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
+ OutStreamer->emitInt32(
+ STM.getGeneration() >= AMDGPUSubtarget::GFX11
+ ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+ : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
}
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
- OutStreamer->emitInt32(
- S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
+ unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
+ ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
+ : CurrentProgramInfo.LDSBlocks;
+ OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
OutStreamer->emitInt32(MFI->getPSInputEnable());
OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
@@ -984,6 +1012,13 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
MD->setEntryPoint(CC, MF.getFunction().getName());
MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+
+ // Only report AGPR usage for subtargets that have MAI instructions.
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ if (STM.hasMAIInsts()) {
+ MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
+ }
+
MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
if (AMDGPU::isCompute(CC)) {
@@ -995,12 +1030,14 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
// ScratchSize is in bytes, 16 aligned.
MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
- MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
+ unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
+ ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
+ : CurrentProgramInfo.LDSBlocks;
+ MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
MD->setSpiPsInputEna(MFI->getPSInputEnable());
MD->setSpiPsInputAddr(MFI->getPSInputAddr());
}
- const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
if (STM.isWave32())
MD->setWave32(MF.getFunction().getCallingConv());
}
@@ -1067,7 +1104,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (MFI->hasDispatchPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
- if (MFI->hasQueuePtr())
+ if (MFI->hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5)
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
if (MFI->hasKernargSegmentPtr())