diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 95 |
1 files changed, 66 insertions, 29 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 6e2984f2a04f..57a4660bc1eb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -27,6 +27,8 @@ #include "SIMachineFunctionInfo.h" #include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -34,6 +36,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" @@ -111,6 +114,12 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { } void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { + IsTargetStreamerInitialized = false; +} + +void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { + IsTargetStreamerInitialized = true; + // TODO: Which one is called first, emitStartOfAsmFile or // emitFunctionBodyStart? if (getTargetStreamer() && !getTargetStreamer()->getTargetID()) @@ -143,6 +152,10 @@ void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { } void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { + // Init target streamer if it has not yet happened + if (!IsTargetStreamerInitialized) + initTargetStreamer(M); + // Following code requires TargetStreamer to be present. if (!getTargetStreamer()) return; @@ -234,8 +247,8 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { auto &ObjectFileInfo = *Context.getObjectFileInfo(); auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection(); - Streamer.PushSection(); - Streamer.SwitchSection(&ReadOnlySection); + Streamer.pushSection(); + Streamer.switchSection(&ReadOnlySection); // CP microcode requires the kernel descriptor to be allocated on 64 byte // alignment. @@ -256,7 +269,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { CurrentProgramInfo.FlatUsed), CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed); - Streamer.PopSection(); + Streamer.popSection(); } void AMDGPUAsmPrinter::emitFunctionEntryLabel() { @@ -319,7 +332,7 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { const DataLayout &DL = GV->getParent()->getDataLayout(); uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); - Align Alignment = GV->getAlign().getValueOr(Align(4)); + Align Alignment = GV->getAlign().value_or(Align(4)); emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); emitLinkage(GV, GVSym); @@ -339,7 +352,7 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) && (STI.getTargetTriple().getOS() == Triple::AMDHSA || STI.getTargetTriple().getOS() == Triple::AMDPAL)) { - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + OutStreamer->switchSection(getObjFileLowering().getTextSection()); getTargetStreamer()->EmitCodeEnd(STI); } @@ -381,7 +394,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; } - if (MFI.hasQueuePtr()) { + if (MFI.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; } @@ -437,6 +450,11 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( } bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + // Init target streamer lazily on the first function so that previous passes + // can set metadata. + if (!IsTargetStreamerInitialized) + initTargetStreamer(*MF.getFunction().getParent()); + ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>(); CurrentProgramInfo = SIProgramInfo(); @@ -454,7 +472,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) { MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(ConfigSection); + OutStreamer->switchSection(ConfigSection); } if (MFI->isModuleEntryFunction()) { @@ -491,7 +509,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (isVerbose()) { MCSectionELF *CommentSection = Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(CommentSection); + OutStreamer->switchSection(CommentSection); if (!MFI->isEntryFunction()) { OutStreamer->emitRawComment(" Function info:", false); @@ -590,7 +608,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (DumpCodeInstEmitter) { - OutStreamer->SwitchSection( + OutStreamer->switchSection( Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0)); for (size_t i = 0; i < DisasmLines.size(); ++i) { @@ -677,7 +695,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion; const uint64_t MaxScratchPerWorkitem = - GCNSubtarget::MaxWaveScratchSize / STM.getWavefrontSize(); + STM.getMaxWaveScratchSize() / STM.getWavefrontSize(); if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) { DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ProgInfo.ScratchSize, @@ -857,22 +875,18 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, LDSAlignShift = 9; } - unsigned LDSSpillSize = - MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize(); - - ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize; + ProgInfo.LDSSize = MFI->getLDSSize(); ProgInfo.LDSBlocks = alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift; - // Scratch is allocated in 256 dword blocks. - unsigned ScratchAlignShift = 10; + // Scratch is allocated in 64-dword or 256-dword blocks. + unsigned ScratchAlignShift = + STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10; // We need to program the hardware with the amount of scratch memory that // is used by the entire wave. ProgInfo.ScratchSize is the amount of // scratch memory used per thread. - ProgInfo.ScratchBlocks = - alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(), - 1ULL << ScratchAlignShift) >> - ScratchAlignShift; + ProgInfo.ScratchBlocks = divideCeil( + ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift); if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1; @@ -886,8 +900,14 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, else if (MFI->hasWorkItemIDY()) TIDIGCompCnt = 1; + // The private segment wave byte offset is the last of the system SGPRs. We + // initially assumed it was allocated, and may have used it. It shouldn't harm + // anything to disable it if we know the stack isn't used here. We may still + // have emitted code reading it to initialize scratch, but if that's unused + // reading garbage should be OK. + const bool EnablePrivateSegment = ProgInfo.ScratchBlocks > 0; ProgInfo.ComputePGMRSrc2 = - S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | + S_00B84C_SCRATCH_EN(EnablePrivateSegment) | S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) | @@ -931,6 +951,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) { void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv()); if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { @@ -942,7 +963,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2); OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE); - OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks)); + OutStreamer->emitInt32( + STM.getGeneration() >= AMDGPUSubtarget::GFX11 + ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks) + : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks)); // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = // 0" comment but I don't see a corresponding field in the register spec. @@ -951,14 +975,18 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE); - OutStreamer->emitIntValue( - S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); + OutStreamer->emitInt32( + STM.getGeneration() >= AMDGPUSubtarget::GFX11 + ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks) + : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks)); } if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS); - OutStreamer->emitInt32( - S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks)); + unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11 + ? divideCeil(CurrentProgramInfo.LDSBlocks, 2) + : CurrentProgramInfo.LDSBlocks; + OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA); OutStreamer->emitInt32(MFI->getPSInputEnable()); OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR); @@ -984,6 +1012,13 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, MD->setEntryPoint(CC, MF.getFunction().getName()); MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU); + + // Only set AGPRs for supported devices + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); + if (STM.hasMAIInsts()) { + MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR); + } + MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU); MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC)); if (AMDGPU::isCompute(CC)) { @@ -995,12 +1030,14 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, // ScratchSize is in bytes, 16 aligned. MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16)); if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { - MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks)); + unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11 + ? divideCeil(CurrentProgramInfo.LDSBlocks, 2) + : CurrentProgramInfo.LDSBlocks; + MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); MD->setSpiPsInputEna(MFI->getPSInputEnable()); MD->setSpiPsInputAddr(MFI->getPSInputAddr()); } - const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); if (STM.isWave32()) MD->setWave32(MF.getFunction().getCallingConv()); } @@ -1067,7 +1104,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, if (MFI->hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; - if (MFI->hasQueuePtr()) + if (MFI->hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; if (MFI->hasKernargSegmentPtr()) |