diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 201 |
1 files changed, 134 insertions, 67 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 9e07b4d252b7..eef8fe2fc3b7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -49,9 +49,25 @@ using namespace llvm; using namespace llvm::AMDGPU; using namespace llvm::AMDGPU::HSAMD; -// TODO: This should get the default rounding mode from the kernel. We just set -// the default here, but this could change if the OpenCL rounding mode pragmas -// are used. +// We need to tell the runtime some amount ahead of time if we don't know the +// true stack size. Assume a smaller number if this is only due to dynamic / +// non-entry block allocas. +static cl::opt<uint32_t> AssumedStackSizeForExternalCall( + "amdgpu-assume-external-call-stack-size", + cl::desc("Assumed stack use of any external call (in bytes)"), + cl::Hidden, + cl::init(16384)); + +static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects( + "amdgpu-assume-dynamic-stack-object-size", + cl::desc("Assumed extra stack use if there are any " + "variable sized objects (in bytes)"), + cl::Hidden, + cl::init(4096)); + +// This should get the default rounding mode from the kernel. We just set the +// default here, but this could change if the OpenCL rounding mode pragmas are +// used. // // The denormal mode here should match what is reported by the OpenCL runtime // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but @@ -70,18 +86,10 @@ using namespace llvm::AMDGPU::HSAMD; // instructions to run at the double precision rate for the device so it's // probably best to just report no single precision denormals. static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode) { - - // TODO: Is there any real use for the flush in only / flush out only modes? - uint32_t FP32Denormals = - Mode.FP32Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; - - uint32_t FP64Denormals = - Mode.FP64FP16Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; - return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | - FP_DENORM_MODE_SP(FP32Denormals) | - FP_DENORM_MODE_DP(FP64Denormals); + FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) | + FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue()); } static AsmPrinter * @@ -120,7 +128,7 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer()); } -void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { +void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) { std::string ExpectedTarget; raw_string_ostream ExpectedTargetOS(ExpectedTarget); @@ -152,7 +160,7 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU"); } -void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { +void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { // Following code requires TargetStreamer to be present. if (!getTargetStreamer()) return; @@ -188,7 +196,7 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64); } -void AMDGPUAsmPrinter::EmitFunctionBodyStart() { +void AMDGPUAsmPrinter::emitFunctionBodyStart() { const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); if (!MFI.isEntryFunction()) return; @@ -207,7 +215,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo); } -void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { +void AMDGPUAsmPrinter::emitFunctionBodyEnd() { const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); if (!MFI.isEntryFunction()) return; @@ -226,7 +234,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { // CP microcode requires the kernel descriptor to be allocated on 64 byte // alignment. - Streamer.EmitValueToAlignment(64, 0, 1, 0); + Streamer.emitValueToAlignment(64, 0, 1, 0); if (ReadOnlySection.getAlignment() < 64) ReadOnlySection.setAlignment(Align(64)); @@ -247,10 +255,10 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { Streamer.PopSection(); } -void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { +void AMDGPUAsmPrinter::emitFunctionEntryLabel() { if (IsaInfo::hasCodeObjectV3(getGlobalSTI()) && TM.getTargetTriple().getOS() == Triple::AMDHSA) { - AsmPrinter::EmitFunctionEntryLabel(); + AsmPrinter::emitFunctionEntryLabel(); return; } @@ -269,10 +277,10 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { HexLines.push_back(""); } - AsmPrinter::EmitFunctionEntryLabel(); + AsmPrinter::emitFunctionEntryLabel(); } -void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) { +void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) { // Write a line for the basic block label if it is not only fallthrough. DisasmLines.push_back( @@ -281,10 +289,10 @@ void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) { DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); HexLines.push_back(""); } - AsmPrinter::EmitBasicBlockStart(MBB); + AsmPrinter::emitBasicBlockStart(MBB); } -void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { +void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) { OutContext.reportError({}, @@ -307,18 +315,16 @@ void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { const DataLayout &DL = GV->getParent()->getDataLayout(); uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); - unsigned Align = GV->getAlignment(); - if (!Align) - Align = 4; + Align Alignment = GV->getAlign().getValueOr(Align(4)); - EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); - EmitLinkage(GV, GVSym); + emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); + emitLinkage(GV, GVSym); if (auto TS = getTargetStreamer()) - TS->emitAMDGPULDS(GVSym, Size, Align); + TS->emitAMDGPULDS(GVSym, Size, Alignment); return; } - AsmPrinter::EmitGlobalVariable(GV); + AsmPrinter::emitGlobalVariable(GV); } bool AMDGPUAsmPrinter::doFinalization(Module &M) { @@ -468,7 +474,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { HexLines.clear(); DisasmLineMaxLen = 0; - EmitFunctionBody(); + emitFunctionBody(); if (isVerbose()) { MCSectionELF *CommentSection = @@ -549,7 +555,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (DumpCodeInstEmitter) { OutStreamer->SwitchSection( - Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); + Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0)); for (size_t i = 0; i < DisasmLines.size(); ++i) { std::string Comment = "\n"; @@ -558,8 +564,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Comment += " ; " + HexLines[i] + "\n"; } - OutStreamer->EmitBytes(StringRef(DisasmLines[i])); - OutStreamer->EmitBytes(StringRef(Comment)); + OutStreamer->emitBytes(StringRef(DisasmLines[i])); + OutStreamer->emitBytes(StringRef(Comment)); } } @@ -609,6 +615,15 @@ int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs( return std::max(NumVGPR, NumAGPR); } +static const Function *getCalleeFunction(const MachineOperand &Op) { + if (Op.isImm()) { + assert(Op.getImm() == 0); + return nullptr; + } + + return cast<Function>(Op.getGlobal()); +} + AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( const MachineFunction &MF) const { SIFunctionResourceInfo Info; @@ -636,11 +651,15 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Info.UsesFlatScratch = false; } - Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); Info.PrivateSegmentSize = FrameInfo.getStackSize(); - if (MFI->isStackRealigned()) - Info.PrivateSegmentSize += FrameInfo.getMaxAlignment(); + // Assume a big number if there are any unknown sized objects. + Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); + if (Info.HasDynamicallySizedStack) + Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects; + + if (MFI->isStackRealigned()) + Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value(); Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI); @@ -715,6 +734,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( case AMDGPU::SRC_PRIVATE_BASE: case AMDGPU::SRC_PRIVATE_LIMIT: case AMDGPU::SGPR_NULL: + case AMDGPU::MODE: continue; case AMDGPU::SRC_POPS_EXITING_WAVE_ID: @@ -727,6 +747,10 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( case AMDGPU::VCC: case AMDGPU::VCC_LO: case AMDGPU::VCC_HI: + case AMDGPU::VCC_LO_LO16: + case AMDGPU::VCC_LO_HI16: + case AMDGPU::VCC_HI_LO16: + case AMDGPU::VCC_HI_HI16: Info.UsesVCC = true; continue; @@ -764,15 +788,20 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( break; } - if (AMDGPU::SReg_32RegClass.contains(Reg)) { + if (AMDGPU::SReg_32RegClass.contains(Reg) || + AMDGPU::SReg_LO16RegClass.contains(Reg) || + AMDGPU::SGPR_HI16RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_32RegClass.contains(Reg) && "trap handler registers should not be used"); IsSGPR = true; Width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) { + } else if (AMDGPU::VGPR_32RegClass.contains(Reg) || + AMDGPU::VGPR_LO16RegClass.contains(Reg) || + AMDGPU::VGPR_HI16RegClass.contains(Reg)) { IsSGPR = false; Width = 1; - } else if (AMDGPU::AGPR_32RegClass.contains(Reg)) { + } else if (AMDGPU::AGPR_32RegClass.contains(Reg) || + AMDGPU::AGPR_LO16RegClass.contains(Reg)) { IsSGPR = false; IsAGPR = true; Width = 1; @@ -794,6 +823,10 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::SReg_96RegClass.contains(Reg)) { IsSGPR = true; Width = 3; + } else if (AMDGPU::AReg_96RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 3; } else if (AMDGPU::SReg_128RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_128RegClass.contains(Reg) && "trap handler registers should not be used"); @@ -812,6 +845,20 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::SReg_160RegClass.contains(Reg)) { IsSGPR = true; Width = 5; + } else if (AMDGPU::AReg_160RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 5; + } else if (AMDGPU::VReg_192RegClass.contains(Reg)) { + IsSGPR = false; + Width = 6; + } else if (AMDGPU::SReg_192RegClass.contains(Reg)) { + IsSGPR = true; + Width = 6; + } else if (AMDGPU::AReg_192RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 6; } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && "trap handler registers should not be used"); @@ -820,6 +867,10 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { IsSGPR = false; Width = 8; + } else if (AMDGPU::AReg_256RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 8; } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_512RegClass.contains(Reg) && "trap handler registers should not be used"); @@ -862,8 +913,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( const MachineOperand *CalleeOp = TII->getNamedOperand(MI, AMDGPU::OpName::callee); - const Function *Callee = cast<Function>(CalleeOp->getGlobal()); - if (Callee->isDeclaration()) { + + const Function *Callee = getCalleeFunction(*CalleeOp); + if (!Callee || Callee->isDeclaration()) { // If this is a call to an external function, we can't do much. Make // conservative guesses. @@ -874,7 +926,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( MaxVGPR = std::max(MaxVGPR, 23); MaxAGPR = std::max(MaxAGPR, 23); - CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384)); + CalleeFrameSize = std::max(CalleeFrameSize, + static_cast<uint64_t>(AssumedStackSizeForExternalCall)); + Info.UsesVCC = true; Info.UsesFlatScratch = ST.hasFlatAddressSpace(); Info.HasDynamicallySizedStack = true; @@ -906,7 +960,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Info.HasRecursion |= I->second.HasRecursion; } - if (!Callee->doesNotRecurse()) + // FIXME: Call site could have norecurse on it + if (!Callee || !Callee->doesNotRecurse()) Info.HasRecursion = true; } } @@ -1108,7 +1163,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) | S_00B84C_EXCP_EN(0); - ProgInfo.Occupancy = STM.computeOccupancy(MF, ProgInfo.LDSSize, + ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize, ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU); } @@ -1132,40 +1187,41 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv()); if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { - OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); + OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1); - OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4); + OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc1); - OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); - OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4); + OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2); + OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2); - OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); - OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); + OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE); + OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks)); // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = // 0" comment but I don't see a corresponding field in the register spec. } else { - OutStreamer->EmitIntValue(RsrcReg, 4); - OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | + OutStreamer->emitInt32(RsrcReg); + OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); - OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); - OutStreamer->EmitIntValue( + OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE); + OutStreamer->emitIntValue( S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); } if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { - OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); - OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4); - OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); - OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4); - OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); - OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); + OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS); + OutStreamer->emitInt32( + S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks)); + OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA); + OutStreamer->emitInt32(MFI->getPSInputEnable()); + OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR); + OutStreamer->emitInt32(MFI->getPSInputAddr()); } - OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4); - OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4); - OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4); - OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4); + OutStreamer->emitInt32(R_SPILLED_SGPRS); + OutStreamer->emitInt32(MFI->getNumSpilledSGPRs()); + OutStreamer->emitInt32(R_SPILLED_VGPRS); + OutStreamer->emitInt32(MFI->getNumSpilledVGPRs()); } // This is the equivalent of EmitProgramInfoSI above, but for when the OS type @@ -1304,7 +1360,18 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, AMDGPUInstPrinter::printRegOperand(MO.getReg(), O, *MF->getSubtarget().getRegisterInfo()); return false; + } else if (MO.isImm()) { + int64_t Val = MO.getImm(); + if (AMDGPU::isInlinableIntLiteral(Val)) { + O << Val; + } else if (isUInt<16>(Val)) { + O << format("0x%" PRIx16, static_cast<uint16_t>(Val)); + } else if (isUInt<32>(Val)) { + O << format("0x%" PRIx32, static_cast<uint32_t>(Val)); + } else { + O << format("0x%" PRIx64, static_cast<uint64_t>(Val)); + } + return false; } - return true; } |