diff options
Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 339 |
1 files changed, 179 insertions, 160 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 2ded7cdb6489..743ac64b8f10 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -20,7 +19,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" -#include "InstPrinter/AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "R600AsmPrinter.h" @@ -31,10 +30,12 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" @@ -100,7 +101,7 @@ extern "C" void LLVMInitializeAMDGPUAsmPrinter() { AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)) { - if (IsaInfo::hasCodeObjectV3(getSTI())) + if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) HSAMetadataStream.reset(new MetadataStreamerV3()); else HSAMetadataStream.reset(new MetadataStreamerV2()); @@ -110,7 +111,7 @@ StringRef AMDGPUAsmPrinter::getPassName() const { return "AMDGPU Assembly Printer"; } -const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const { +const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const { return TM.getMCSubtargetInfo(); } @@ -121,10 +122,10 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { } void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { - if (IsaInfo::hasCodeObjectV3(getSTI())) { + if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) { std::string ExpectedTarget; raw_string_ostream ExpectedTargetOS(ExpectedTarget); - IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS); + IsaInfo::streamIsaVersion(getGlobalSTI(), ExpectedTargetOS); getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget); } @@ -137,9 +138,9 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { HSAMetadataStream->begin(M); if (TM.getTargetTriple().getOS() == Triple::AMDPAL) - readPALMetadata(M); + getTargetStreamer()->getPALMetadata()->readFromIR(M); - if (IsaInfo::hasCodeObjectV3(getSTI())) + if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) return; // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2. @@ -147,7 +148,7 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1); // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2. - IsaVersion Version = getIsaVersion(getSTI()->getCPU()); + IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU()); getTargetStreamer()->EmitDirectiveHSACodeObjectISA( Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU"); } @@ -157,11 +158,11 @@ void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { if (!getTargetStreamer()) return; - if (!IsaInfo::hasCodeObjectV3(getSTI())) { + if (!IsaInfo::hasCodeObjectV3(getGlobalSTI())) { // Emit ISA Version (NT_AMD_AMDGPU_ISA). std::string ISAVersionString; raw_string_ostream ISAVersionStream(ISAVersionString); - IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream); + IsaInfo::streamIsaVersion(getGlobalSTI(), ISAVersionStream); getTargetStreamer()->EmitISAVersion(ISAVersionStream.str()); } @@ -172,20 +173,6 @@ void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { (void)Success; assert(Success && "Malformed HSA Metadata"); } - - if (!IsaInfo::hasCodeObjectV3(getSTI())) { - // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA). - if (TM.getTargetTriple().getOS() == Triple::AMDPAL) { - // Copy the PAL metadata from the map where we collected it into a vector, - // then write it as a .note. - PALMD::Metadata PALMetadataVector; - for (auto i : PALMetadataMap) { - PALMetadataVector.push_back(i.first); - PALMetadataVector.push_back(i.second); - } - getTargetStreamer()->EmitPALMetadata(PALMetadataVector); - } - } } bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( @@ -225,7 +212,8 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); if (!MFI.isEntryFunction()) return; - if (!IsaInfo::hasCodeObjectV3(getSTI()) || + + if (!IsaInfo::hasCodeObjectV3(getGlobalSTI()) || TM.getTargetTriple().getOS() != Triple::AMDHSA) return; @@ -243,23 +231,25 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { if (ReadOnlySection.getAlignment() < 64) ReadOnlySection.setAlignment(64); + const MCSubtargetInfo &STI = MF->getSubtarget(); + SmallString<128> KernelName; getNameWithPrefix(KernelName, &MF->getFunction()); getTargetStreamer()->EmitAmdhsaKernelDescriptor( - *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), + STI, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), CurrentProgramInfo.NumVGPRsForWavesPerEU, CurrentProgramInfo.NumSGPRsForWavesPerEU - - IsaInfo::getNumExtraSGPRs(getSTI(), + IsaInfo::getNumExtraSGPRs(&STI, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed), CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, - hasXNACK(*getSTI())); + hasXNACK(STI)); Streamer.PopSection(); } void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { - if (IsaInfo::hasCodeObjectV3(getSTI()) && + if (IsaInfo::hasCodeObjectV3(getGlobalSTI()) && TM.getTargetTriple().getOS() == Triple::AMDHSA) { AsmPrinter::EmitFunctionEntryLabel(); return; @@ -273,8 +263,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { getTargetStreamer()->EmitAMDGPUSymbolType( SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); } - const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>(); - if (STI.dumpCode()) { + if (DumpCodeInstEmitter) { // Disassemble function name label to text. DisasmLines.push_back(MF->getName().str() + ":"); DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); @@ -285,8 +274,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { } void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { - const GCNSubtarget &STI = MBB.getParent()->getSubtarget<GCNSubtarget>(); - if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) { + if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) { // Write a line for the basic block label if it is not only fallthrough. DisasmLines.push_back( (Twine("BB") + Twine(getFunctionNumber()) @@ -298,38 +286,57 @@ void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { } void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) { + OutContext.reportError({}, + Twine(GV->getName()) + + ": unsupported initializer for address space"); + return; + } + + // LDS variables aren't emitted in HSA or PAL yet. + const Triple::OSType OS = TM.getTargetTriple().getOS(); + if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) + return; - // Group segment variables aren't emitted in HSA. - if (AMDGPU::isGroupSegment(GV)) + MCSymbol *GVSym = getSymbol(GV); + + GVSym->redefineIfPossible(); + if (GVSym->isDefined() || GVSym->isVariable()) + report_fatal_error("symbol '" + Twine(GVSym->getName()) + + "' is already defined"); + + const DataLayout &DL = GV->getParent()->getDataLayout(); + uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); + unsigned Align = GV->getAlignment(); + if (!Align) + Align = 4; + + EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); + EmitLinkage(GV, GVSym); + if (auto TS = getTargetStreamer()) + TS->emitAMDGPULDS(GVSym, Size, Align); return; + } AsmPrinter::EmitGlobalVariable(GV); } bool AMDGPUAsmPrinter::doFinalization(Module &M) { CallGraphResourceInfo.clear(); - return AsmPrinter::doFinalization(M); -} -// For the amdpal OS type, read the amdgpu.pal.metadata supplied by the -// frontend into our PALMetadataMap, ready for per-function modification. It -// is a NamedMD containing an MDTuple containing a number of MDNodes each of -// which is an integer value, and each two integer values forms a key=value -// pair that we store as PALMetadataMap[key]=value in the map. -void AMDGPUAsmPrinter::readPALMetadata(Module &M) { - auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata"); - if (!NamedMD || !NamedMD->getNumOperands()) - return; - auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0)); - if (!Tuple) - return; - for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) { - auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I)); - auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1)); - if (!Key || !Val) - continue; - PALMetadataMap[Key->getZExtValue()] = Val->getZExtValue(); + // Pad with s_code_end to help tools and guard against instruction prefetch + // causing stale data in caches. Arguably this should be done by the linker, + // which is why this isn't done for Mesa. + const MCSubtargetInfo &STI = *getGlobalSTI(); + if (AMDGPU::isGFX10(STI) && + (STI.getTargetTriple().getOS() == Triple::AMDHSA || + STI.getTargetTriple().getOS() == Triple::AMDPAL)) { + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + getTargetStreamer()->EmitCodeEnd(); } + + return AsmPrinter::doFinalization(M); } // Print comments that apply to both callable functions and entry points. @@ -376,6 +383,10 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; } + if (MF.getSubtarget<GCNSubtarget>().isWave32()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; + } return KernelCodeProperties; } @@ -435,6 +446,18 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { EmitProgramInfoSI(MF, CurrentProgramInfo); } + DumpCodeInstEmitter = nullptr; + if (STM.dumpCode()) { + // For -dumpcode, get the assembler out of the streamer, even if it does + // not really want to let us have it. This only works with -filetype=obj. + bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing(); + OutStreamer->setUseAssemblerInfoForParsing(true); + MCAssembler *Assembler = OutStreamer->getAssemblerPtr(); + OutStreamer->setUseAssemblerInfoForParsing(SaveFlag); + if (Assembler) + DumpCodeInstEmitter = Assembler->getEmitterPtr(); + } + DisasmLines.clear(); HexLines.clear(); DisasmLineMaxLen = 0; @@ -486,15 +509,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->emitRawComment( " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); - if (MF.getSubtarget<GCNSubtarget>().debuggerEmitPrologue()) { - OutStreamer->emitRawComment( - " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + - Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false); - OutStreamer->emitRawComment( - " DebuggerPrivateSegmentBufferSGPR: s" + - Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false); - } - OutStreamer->emitRawComment( " COMPUTE_PGM_RSRC2:USER_SGPR: " + Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false); @@ -516,7 +530,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { false); } - if (STM.dumpCode()) { + if (DumpCodeInstEmitter) { OutStreamer->SwitchSection( Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); @@ -620,6 +634,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( HighestVGPRReg = Reg; break; } + MCPhysReg AReg = AMDGPU::AGPR0 + TRI.getHWRegIndex(Reg); + if (MRI.isPhysRegUsed(AReg)) { + HighestVGPRReg = AReg; + break; + } } MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; @@ -665,8 +684,12 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( case AMDGPU::SRC_SHARED_LIMIT: case AMDGPU::SRC_PRIVATE_BASE: case AMDGPU::SRC_PRIVATE_LIMIT: + case AMDGPU::SGPR_NULL: continue; + case AMDGPU::SRC_POPS_EXITING_WAVE_ID: + llvm_unreachable("src_pops_exiting_wave_id should not be used"); + case AMDGPU::NoRegister: assert(MI.isDebugInstr()); continue; @@ -687,6 +710,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( case AMDGPU::XNACK_MASK_HI: llvm_unreachable("xnack_mask registers should not be used"); + case AMDGPU::LDS_DIRECT: + llvm_unreachable("lds_direct register should not be used"); + case AMDGPU::TBA: case AMDGPU::TBA_LO: case AMDGPU::TBA_HI: @@ -695,6 +721,15 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( case AMDGPU::TMA_HI: llvm_unreachable("trap handler registers should not be used"); + case AMDGPU::SRC_VCCZ: + llvm_unreachable("src_vccz register should not be used"); + + case AMDGPU::SRC_EXECZ: + llvm_unreachable("src_execz register should not be used"); + + case AMDGPU::SRC_SCC: + llvm_unreachable("src_scc register should not be used"); + default: break; } @@ -707,6 +742,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) { IsSGPR = false; Width = 1; + } else if (AMDGPU::AGPR_32RegClass.contains(Reg)) { + IsSGPR = false; + Width = 1; } else if (AMDGPU::SReg_64RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_64RegClass.contains(Reg) && "trap handler registers should not be used"); @@ -715,9 +753,14 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::VReg_64RegClass.contains(Reg)) { IsSGPR = false; Width = 2; + } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { + IsSGPR = false; + Width = 2; } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { IsSGPR = false; Width = 3; + } else if (AMDGPU::SReg_96RegClass.contains(Reg)) { + Width = 3; } else if (AMDGPU::SReg_128RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_128RegClass.contains(Reg) && "trap handler registers should not be used"); @@ -726,6 +769,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { IsSGPR = false; Width = 4; + } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { + IsSGPR = false; + Width = 4; } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && "trap handler registers should not be used"); @@ -742,6 +788,18 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { IsSGPR = false; Width = 16; + } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { + IsSGPR = false; + Width = 16; + } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { + IsSGPR = true; + Width = 32; + } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) { + IsSGPR = false; + Width = 32; + } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { + IsSGPR = false; + Width = 32; } else { llvm_unreachable("Unknown register class"); } @@ -767,8 +825,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( // 48 SGPRs - vcc, - flat_scr, -xnack int MaxSGPRGuess = - 47 - IsaInfo::getNumExtraSGPRs(getSTI(), true, - ST.hasFlatAddressSpace()); + 47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace()); MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); MaxVGPR = std::max(MaxVGPR, 23); @@ -779,9 +836,19 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else { // We force CodeGen to run in SCC order, so the callee's register // usage etc. should be the cumulative usage of all callees. + auto I = CallGraphResourceInfo.find(Callee); - assert(I != CallGraphResourceInfo.end() && - "callee should have been handled before caller"); + if (I == CallGraphResourceInfo.end()) { + // Avoid crashing on undefined behavior with an illegal call to a + // kernel. If a callsite's calling convention doesn't match the + // function's, it's undefined behavior. If the callsite calling + // convention does match, that would have errored earlier. + // FIXME: The verifier shouldn't allow this. + if (AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) + report_fatal_error("invalid call to entry function"); + + llvm_unreachable("callee should have been handled before caller"); + } MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); @@ -825,14 +892,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SIInstrInfo *TII = STM.getInstrInfo(); - const SIRegisterInfo *RI = &TII->getRegisterInfo(); // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be // unified. unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs( - getSTI(), ProgInfo.VCCUsed, ProgInfo.FlatUsed); + &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed); // Check the addressable register limit before we add ExtraSGPRs. if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && @@ -918,24 +983,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks( &STM, ProgInfo.NumVGPRsForWavesPerEU); - // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and - // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" - // attribute was requested. - if (STM.debuggerEmitPrologue()) { - ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR = - RI->getHWRegIndex(MFI->getScratchWaveOffsetReg()); - ProgInfo.DebuggerPrivateSegmentBufferSGPR = - RI->getHWRegIndex(MFI->getScratchRSrcReg()); - } - // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode // register. ProgInfo.FloatMode = getFPMode(MF); - ProgInfo.IEEEMode = STM.enableIEEEBit(MF); + const SIModeRegisterDefaults Mode = MFI->getMode(); + ProgInfo.IEEEMode = Mode.IEEE; // Make clamp modifier on NaN input returns 0. - ProgInfo.DX10Clamp = STM.enableDX10Clamp(); + ProgInfo.DX10Clamp = Mode.DX10Clamp; unsigned LDSAlignShift; if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { @@ -963,6 +1019,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, 1ULL << ScratchAlignShift) >> ScratchAlignShift; + if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { + ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1; + ProgInfo.MemOrdered = 1; + } + ProgInfo.ComputePGMRSrc1 = S_00B848_VGPRS(ProgInfo.VGPRBlocks) | S_00B848_SGPRS(ProgInfo.SGPRBlocks) | @@ -971,7 +1032,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, S_00B848_PRIV(ProgInfo.Priv) | S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | - S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + S_00B848_IEEE_MODE(ProgInfo.IEEEMode) | + S_00B848_WGP_MODE(ProgInfo.WgpMode) | + S_00B848_MEM_ORDERED(ProgInfo.MemOrdered); // 0 = X, 1 = XY, 2 = XYZ unsigned TIDIGCompCnt = 0; @@ -1053,71 +1116,38 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, // This is the equivalent of EmitProgramInfoSI above, but for when the OS type // is AMDPAL. It stores each compute/SPI register setting and other PAL -// metadata items into the PALMetadataMap, combining with any provided by the -// frontend as LLVM metadata. Once all functions are written, PALMetadataMap is -// then written as a single block in the .note section. +// metadata items into the PALMD::Metadata, combining with any provided by the +// frontend as LLVM metadata. Once all functions are written, the PAL metadata +// is then written as a single block in the .note section. void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - // Given the calling convention, calculate the register number for rsrc1. In - // principle the register number could change in future hardware, but we know - // it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so - // we can use the same fixed value that .AMDGPU.config has for Mesa. Note - // that we use a register number rather than a byte offset, so we need to - // divide by 4. - unsigned Rsrc1Reg = getRsrcReg(MF.getFunction().getCallingConv()) / 4; - unsigned Rsrc2Reg = Rsrc1Reg + 1; - // Also calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used - // with a constant offset to access any non-register shader-specific PAL - // metadata key. - unsigned ScratchSizeKey = PALMD::Key::CS_SCRATCH_SIZE; - switch (MF.getFunction().getCallingConv()) { - case CallingConv::AMDGPU_PS: - ScratchSizeKey = PALMD::Key::PS_SCRATCH_SIZE; - break; - case CallingConv::AMDGPU_VS: - ScratchSizeKey = PALMD::Key::VS_SCRATCH_SIZE; - break; - case CallingConv::AMDGPU_GS: - ScratchSizeKey = PALMD::Key::GS_SCRATCH_SIZE; - break; - case CallingConv::AMDGPU_ES: - ScratchSizeKey = PALMD::Key::ES_SCRATCH_SIZE; - break; - case CallingConv::AMDGPU_HS: - ScratchSizeKey = PALMD::Key::HS_SCRATCH_SIZE; - break; - case CallingConv::AMDGPU_LS: - ScratchSizeKey = PALMD::Key::LS_SCRATCH_SIZE; - break; - } - unsigned NumUsedVgprsKey = ScratchSizeKey + - PALMD::Key::VS_NUM_USED_VGPRS - PALMD::Key::VS_SCRATCH_SIZE; - unsigned NumUsedSgprsKey = ScratchSizeKey + - PALMD::Key::VS_NUM_USED_SGPRS - PALMD::Key::VS_SCRATCH_SIZE; - PALMetadataMap[NumUsedVgprsKey] = CurrentProgramInfo.NumVGPRsForWavesPerEU; - PALMetadataMap[NumUsedSgprsKey] = CurrentProgramInfo.NumSGPRsForWavesPerEU; + auto CC = MF.getFunction().getCallingConv(); + auto MD = getTargetStreamer()->getPALMetadata(); + + MD->setEntryPoint(CC, MF.getFunction().getName()); + MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU); + MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU); if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { - PALMetadataMap[Rsrc1Reg] |= CurrentProgramInfo.ComputePGMRSrc1; - PALMetadataMap[Rsrc2Reg] |= CurrentProgramInfo.ComputePGMRSrc2; - // ScratchSize is in bytes, 16 aligned. - PALMetadataMap[ScratchSizeKey] |= - alignTo(CurrentProgramInfo.ScratchSize, 16); + MD->setRsrc1(CC, CurrentProgramInfo.ComputePGMRSrc1); + MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2); } else { - PALMetadataMap[Rsrc1Reg] |= S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | - S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks); + MD->setRsrc1(CC, S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | + S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks)); if (CurrentProgramInfo.ScratchBlocks > 0) - PALMetadataMap[Rsrc2Reg] |= S_00B84C_SCRATCH_EN(1); - // ScratchSize is in bytes, 16 aligned. - PALMetadataMap[ScratchSizeKey] |= - alignTo(CurrentProgramInfo.ScratchSize, 16); + MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1)); } + // ScratchSize is in bytes, 16 aligned. + MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16)); if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { - PALMetadataMap[Rsrc2Reg] |= - S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks); - PALMetadataMap[R_0286CC_SPI_PS_INPUT_ENA / 4] |= MFI->getPSInputEnable(); - PALMetadataMap[R_0286D0_SPI_PS_INPUT_ADDR / 4] |= MFI->getPSInputAddr(); + MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks)); + MD->setSpiPsInputEna(MFI->getPSInputEnable()); + MD->setSpiPsInputAddr(MFI->getPSInputAddr()); } + + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); + if (STM.isWave32()) + MD->setWave32(MF.getFunction().getCallingConv()); } // This is supposed to be log2(Size) @@ -1144,12 +1174,12 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); - AMDGPU::initDefaultAMDKernelCodeT(Out, getSTI()); + AMDGPU::initDefaultAMDKernelCodeT(Out, &STM); Out.compute_pgm_resource_registers = CurrentProgramInfo.ComputePGMRSrc1 | (CurrentProgramInfo.ComputePGMRSrc2 << 32); - Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; if (CurrentProgramInfo.DynamicCallStack) Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK; @@ -1181,9 +1211,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, if (MFI->hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; - if (STM.debuggerSupported()) - Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED; - if (STM.isXNACKEnabled()) Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; @@ -1196,22 +1223,14 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, // These alignment values are specified in powers of two, so alignment = // 2^n. The minimum alignment is 2^4 = 16. - Out.kernarg_segment_alignment = std::max((size_t)4, + Out.kernarg_segment_alignment = std::max<size_t>(4, countTrailingZeros(MaxKernArgAlign)); - - if (STM.debuggerEmitPrologue()) { - Out.debug_wavefront_private_segment_offset_sgpr = - CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; - Out.debug_private_segment_buffer_sgpr = - CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR; - } } bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { // First try the generic code, which knows about modifiers like 'c' and 'n'. - if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O)) + if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O)) return false; if (ExtraCode && ExtraCode[0]) { |