author     Dimitry Andric <dim@FreeBSD.org>    2021-08-22 19:00:43 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2021-11-13 20:39:49 +0000
commit     fe6060f10f634930ff71b7c50291ddc610da2475 (patch)
tree       1483580c790bd4d27b6500a7542b5ee00534d3cc /contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils
parent     b61bce17f346d79cecfd8f195a64b10f77be43b1 (diff)
parent     344a3780b2e33f6ca763666c380202b18aab72a3 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils')
8 files changed, 848 insertions, 57 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index c8a85d76a55b..0bee9022975e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -15,19 +15,19 @@ namespace AMDGPU {
 namespace SendMsg {
 
 // This must be in sync with llvm::AMDGPU::SendMsg::Id enum members, see SIDefines.h.
-const char* const IdSymbolic[] = {
+const char *const IdSymbolic[ID_GAPS_LAST_] = {
   nullptr,
   "MSG_INTERRUPT",
   "MSG_GS",
   "MSG_GS_DONE",
-  nullptr,
-  nullptr,
-  nullptr,
-  nullptr,
-  nullptr,
+  "MSG_SAVEWAVE",
+  "MSG_STALL_WAVE_GEN",
+  "MSG_HALT_WAVES",
+  "MSG_ORDERED_PS_DONE",
+  "MSG_EARLY_PRIM_DEALLOC",
   "MSG_GS_ALLOC_REQ",
   "MSG_GET_DOORBELL",
-  nullptr,
+  "MSG_GET_DDID",
   nullptr,
   nullptr,
   nullptr,
@@ -35,7 +35,7 @@ const char* const IdSymbolic[] = {
 };
 
 // These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h.
-const char* const OpSysSymbolic[] = {
+const char *const OpSysSymbolic[OP_SYS_LAST_] = {
   nullptr,
   "SYSMSG_OP_ECC_ERR_INTERRUPT",
   "SYSMSG_OP_REG_RD",
@@ -43,7 +43,7 @@ const char* const OpSysSymbolic[] = {
   "SYSMSG_OP_TTRACE_PC"
 };
 
-const char* const OpGsSymbolic[] = {
+const char *const OpGsSymbolic[OP_GS_LAST_] = {
   "GS_OP_NOP",
   "GS_OP_CUT",
   "GS_OP_EMIT",
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index 3eb27c5e5f42..d1deb570a938 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -9,6 +9,8 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
 #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
 
+#include "SIDefines.h"
+
 namespace llvm {
 
 class StringLiteral;
@@ -17,9 +19,9 @@ namespace AMDGPU {
 namespace SendMsg {
 
 // Symbolic names for the sendmsg(...) syntax.
-extern const char* const IdSymbolic[];
-extern const char* const OpSysSymbolic[];
-extern const char* const OpGsSymbolic[];
+extern const char *const IdSymbolic[ID_GAPS_LAST_];
+extern const char *const OpSysSymbolic[OP_SYS_LAST_];
+extern const char *const OpGsSymbolic[OP_GS_LAST_];
 
 } // namespace SendMsg
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4c1e4dec7ecb..29bbf50cbfdc 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -30,7 +30,8 @@
 static llvm::cl::opt<unsigned> AmdhsaCodeObjectVersion(
     "amdhsa-code-object-version", llvm::cl::Hidden,
-    llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(3));
+    llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(4),
+    llvm::cl::ZeroOrMore);
 
 namespace {
@@ -96,23 +97,36 @@ Optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) {
     return ELF::ELFABIVERSION_AMDGPU_HSA_V2;
   case 3:
     return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+  case 4:
+    return ELF::ELFABIVERSION_AMDGPU_HSA_V4;
   default:
-    return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+    report_fatal_error(Twine("Unsupported AMDHSA Code Object Version ") +
+                       Twine(AmdhsaCodeObjectVersion));
   }
 }
 
 bool isHsaAbiVersion2(const MCSubtargetInfo *STI) {
-  if (const auto &&HsaAbiVer = getHsaAbiVersion(STI))
-    return HsaAbiVer.getValue() == ELF::ELFABIVERSION_AMDGPU_HSA_V2;
+  if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+    return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V2;
   return false;
 }
 
 bool isHsaAbiVersion3(const MCSubtargetInfo *STI) {
-  if (const auto &&HsaAbiVer = getHsaAbiVersion(STI))
-    return HsaAbiVer.getValue() == ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+  if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+    return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+  return false;
+}
+
+bool isHsaAbiVersion4(const MCSubtargetInfo *STI) {
+  if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+    return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V4;
   return false;
 }
 
+bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI) {
+  return isHsaAbiVersion3(STI) || isHsaAbiVersion4(STI);
+}
+
 #define GET_MIMGBaseOpcodesTable_IMPL
 #define GET_MIMGDimInfoTable_IMPL
 #define GET_MIMGInfoTable_IMPL
@@ -141,6 +155,34 @@ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
   return NewInfo ? NewInfo->Opcode : -1;
 }
 
+unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode,
+                           const MIMGDimInfo *Dim, bool IsA16,
+                           bool IsG16Supported) {
+  unsigned AddrWords = BaseOpcode->NumExtraArgs;
+  unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
+                            (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+  if (IsA16)
+    AddrWords += divideCeil(AddrComponents, 2);
+  else
+    AddrWords += AddrComponents;
+
+  // Note: For subtargets that support A16 but not G16, enabling A16 also
+  // enables 16 bit gradients.
+  // For subtargets that support A16 (operand) and G16 (done with a different
+  // instruction encoding), they are independent.
+
+  if (BaseOpcode->Gradients) {
+    if ((IsA16 && !IsG16Supported) || BaseOpcode->G16)
+      // There are two gradients per coordinate, we pack them separately.
+      // For the 3d case,
+      // we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv)
+      AddrWords += alignTo<2>(Dim->NumGradients / 2);
+    else
+      AddrWords += Dim->NumGradients;
+  }
+  return AddrWords;
+}
+
 struct MUBUFInfo {
   uint16_t Opcode;
   uint16_t BaseOpcode;
@@ -148,6 +190,7 @@ struct MUBUFInfo {
   bool has_vaddr;
   bool has_srsrc;
   bool has_soffset;
+  bool IsBufferInv;
 };
 
 struct MTBUFInfo {
@@ -164,12 +207,23 @@ struct SMInfo {
   bool IsBuffer;
 };
 
+struct VOPInfo {
+  uint16_t Opcode;
+  bool IsSingle;
+};
+
 #define GET_MTBUFInfoTable_DECL
 #define GET_MTBUFInfoTable_IMPL
 #define GET_MUBUFInfoTable_DECL
 #define GET_MUBUFInfoTable_IMPL
 #define GET_SMInfoTable_DECL
 #define GET_SMInfoTable_IMPL
+#define GET_VOP1InfoTable_DECL
+#define GET_VOP1InfoTable_IMPL
+#define GET_VOP2InfoTable_DECL
+#define GET_VOP2InfoTable_IMPL
+#define GET_VOP3InfoTable_DECL
+#define GET_VOP3InfoTable_IMPL
 #include "AMDGPUGenSearchableTables.inc"
 
 int getMTBUFBaseOpcode(unsigned Opc) {
@@ -232,11 +286,31 @@ bool getMUBUFHasSoffset(unsigned Opc) {
   return Info ? Info->has_soffset : false;
 }
 
+bool getMUBUFIsBufferInv(unsigned Opc) {
+  const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+  return Info ? Info->IsBufferInv : false;
+}
+
 bool getSMEMIsBuffer(unsigned Opc) {
   const SMInfo *Info = getSMEMOpcodeHelper(Opc);
   return Info ? Info->IsBuffer : false;
 }
 
+bool getVOP1IsSingle(unsigned Opc) {
+  const VOPInfo *Info = getVOP1OpcodeHelper(Opc);
+  return Info ? Info->IsSingle : false;
+}
+
+bool getVOP2IsSingle(unsigned Opc) {
+  const VOPInfo *Info = getVOP2OpcodeHelper(Opc);
+  return Info ? Info->IsSingle : false;
+}
+
+bool getVOP3IsSingle(unsigned Opc) {
+  const VOPInfo *Info = getVOP3OpcodeHelper(Opc);
+  return Info ? Info->IsSingle : false;
+}
+
 // Wrapper for Tablegen'd function.  enum Subtarget is not defined in any
 // header files, so we need to wrap it in a function that takes unsigned
 // instead.
@@ -247,7 +321,8 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) {
 namespace IsaInfo {
 
 AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI)
-    : XnackSetting(TargetIDSetting::Any), SramEccSetting(TargetIDSetting::Any) {
+    : STI(STI), XnackSetting(TargetIDSetting::Any),
+      SramEccSetting(TargetIDSetting::Any) {
   if (!STI.getFeatureBits().test(FeatureSupportsXNACK))
     XnackSetting = TargetIDSetting::Unsupported;
   if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC))
@@ -334,25 +409,109 @@ void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) {
   }
 }
 
-void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
-  auto TargetTriple = STI->getTargetTriple();
-  auto Version = getIsaVersion(STI->getCPU());
+std::string AMDGPUTargetID::toString() const {
+  std::string StringRep = "";
+  raw_string_ostream StreamRep(StringRep);
 
-  Stream << TargetTriple.getArchName() << '-'
-         << TargetTriple.getVendorName() << '-'
-         << TargetTriple.getOSName() << '-'
-         << TargetTriple.getEnvironmentName() << '-'
-         << "gfx"
-         << Version.Major
-         << Version.Minor
-         << Version.Stepping;
+  auto TargetTriple = STI.getTargetTriple();
+  auto Version = getIsaVersion(STI.getCPU());
 
-  if (hasXNACK(*STI))
-    Stream << "+xnack";
-  if (hasSRAMECC(*STI))
-    Stream << "+sramecc";
+  StreamRep << TargetTriple.getArchName() << '-'
+            << TargetTriple.getVendorName() << '-'
+            << TargetTriple.getOSName() << '-'
+            << TargetTriple.getEnvironmentName() << '-';
 
-  Stream.flush();
+  std::string Processor = "";
+  // TODO: Following else statement is present here because we used various
+  // alias names for GPUs up until GFX9 (e.g. 'fiji' is same as 'gfx803').
+  // Remove once all aliases are removed from GCNProcessors.td.
+  if (Version.Major >= 9)
+    Processor = STI.getCPU().str();
+  else
+    Processor = (Twine("gfx") + Twine(Version.Major) + Twine(Version.Minor) +
+                 Twine(Version.Stepping))
+                    .str();
+
+  std::string Features = "";
+  if (Optional<uint8_t> HsaAbiVersion = getHsaAbiVersion(&STI)) {
+    switch (*HsaAbiVersion) {
+    case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+      // Code object V2 only supported specific processors and had fixed
+      // settings for the XNACK.
+      if (Processor == "gfx600") {
+      } else if (Processor == "gfx601") {
+      } else if (Processor == "gfx602") {
+      } else if (Processor == "gfx700") {
+      } else if (Processor == "gfx701") {
+      } else if (Processor == "gfx702") {
+      } else if (Processor == "gfx703") {
+      } else if (Processor == "gfx704") {
+      } else if (Processor == "gfx705") {
+      } else if (Processor == "gfx801") {
+        if (!isXnackOnOrAny())
+          report_fatal_error(
+              "AMD GPU code object V2 does not support processor " +
+              Processor + " without XNACK");
+      } else if (Processor == "gfx802") {
+      } else if (Processor == "gfx803") {
+      } else if (Processor == "gfx805") {
+      } else if (Processor == "gfx810") {
+        if (!isXnackOnOrAny())
+          report_fatal_error(
+              "AMD GPU code object V2 does not support processor " +
+              Processor + " without XNACK");
+      } else if (Processor == "gfx900") {
+        if (isXnackOnOrAny())
+          Processor = "gfx901";
+      } else if (Processor == "gfx902") {
+        if (isXnackOnOrAny())
+          Processor = "gfx903";
+      } else if (Processor == "gfx904") {
+        if (isXnackOnOrAny())
+          Processor = "gfx905";
+      } else if (Processor == "gfx906") {
+        if (isXnackOnOrAny())
+          Processor = "gfx907";
+      } else if (Processor == "gfx90c") {
+        if (isXnackOnOrAny())
+          report_fatal_error(
+              "AMD GPU code object V2 does not support processor " +
+              Processor + " with XNACK being ON or ANY");
+      } else {
+        report_fatal_error(
+            "AMD GPU code object V2 does not support processor " + Processor);
+      }
+      break;
+    case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+      // xnack.
+      if (isXnackOnOrAny())
+        Features += "+xnack";
+      // In code object v2 and v3, "sramecc" feature was spelled with a
+      // hyphen ("sram-ecc").
+      if (isSramEccOnOrAny())
+        Features += "+sram-ecc";
+      break;
+    case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+      // sramecc.
+      if (getSramEccSetting() == TargetIDSetting::Off)
+        Features += ":sramecc-";
+      else if (getSramEccSetting() == TargetIDSetting::On)
+        Features += ":sramecc+";
+      // xnack.
+      if (getXnackSetting() == TargetIDSetting::Off)
+        Features += ":xnack-";
+      else if (getXnackSetting() == TargetIDSetting::On)
+        Features += ":xnack+";
+      break;
+    default:
+      break;
+    }
+  }
+
+  StreamRep << Processor << Features;
+
+  StreamRep.flush();
+  return StringRep;
 }
 
 unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
@@ -402,6 +561,8 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
 
 unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
   // FIXME: Need to take scratch memory into account.
+  if (isGFX90A(*STI))
+    return 8;
   if (!isGFX10Plus(*STI))
     return 10;
   return hasGFX10_3Insts(*STI) ? 16 : 20;
@@ -531,6 +692,9 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
 
 unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
                              Optional<bool> EnableWavefrontSize32) {
+  if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+    return 8;
+
   bool IsWave32 = EnableWavefrontSize32 ?
                       *EnableWavefrontSize32 :
                       STI->getFeatureBits().test(FeatureWavefrontSize32);
@@ -543,6 +707,8 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
                                 Optional<bool> EnableWavefrontSize32) {
+  if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+    return 8;
 
   bool IsWave32 = EnableWavefrontSize32 ?
                       *EnableWavefrontSize32 :
@@ -552,12 +718,16 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
 }
 
 unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
+  if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+    return 512;
   if (!isGFX10Plus(*STI))
     return 256;
   return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512;
 }
 
 unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
+  if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+    return 512;
   return 256;
 }
 
@@ -653,6 +823,11 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
     AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
                     amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1);
   }
+  if (AMDGPU::isGFX90A(*STI)) {
+    AMDHSA_BITS_SET(KD.compute_pgm_rsrc3,
+                    amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
+                    STI->getFeatureBits().test(FeatureTgSplit) ? 1 : 0);
+  }
   return KD;
 }
 
@@ -1049,23 +1224,32 @@ int64_t getMsgId(const StringRef Name) {
   return ID_UNKNOWN_;
 }
 
-static bool isValidMsgId(int64_t MsgId) {
-  return (ID_GAPS_FIRST_ <= MsgId && MsgId < ID_GAPS_LAST_) && IdSymbolic[MsgId];
-}
-
 bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict) {
   if (Strict) {
-    if (MsgId == ID_GS_ALLOC_REQ || MsgId == ID_GET_DOORBELL)
+    switch (MsgId) {
+    case ID_SAVEWAVE:
+      return isVI(STI) || isGFX9Plus(STI);
+    case ID_STALL_WAVE_GEN:
+    case ID_HALT_WAVES:
+    case ID_ORDERED_PS_DONE:
+    case ID_GS_ALLOC_REQ:
+    case ID_GET_DOORBELL:
       return isGFX9Plus(STI);
-    else
-      return isValidMsgId(MsgId);
+    case ID_EARLY_PRIM_DEALLOC:
+      return isGFX9(STI);
+    case ID_GET_DDID:
+      return isGFX10Plus(STI);
+    default:
+      return 0 <= MsgId && MsgId < ID_GAPS_LAST_ && IdSymbolic[MsgId];
+    }
   } else {
     return 0 <= MsgId && isUInt<ID_WIDTH_>(MsgId);
   }
 }
 
 StringRef getMsgName(int64_t MsgId) {
-  return isValidMsgId(MsgId)? IdSymbolic[MsgId] : "";
+  assert(0 <= MsgId && MsgId < ID_GAPS_LAST_);
+  return IdSymbolic[MsgId];
 }
 
 int64_t getMsgOpId(int64_t MsgId, const StringRef Name) {
@@ -1080,7 +1264,9 @@ int64_t getMsgOpId(int64_t MsgId, const StringRef Name) {
   return OP_UNKNOWN_;
 }
 
-bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict) {
+bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI,
+                  bool Strict) {
+  assert(isValidMsgId(MsgId, STI, Strict));
 
   if (!Strict)
     return 0 <= OpId && isUInt<OP_WIDTH_>(OpId);
@@ -1103,7 +1289,9 @@ StringRef getMsgOpName(int64_t MsgId, int64_t OpId) {
   return (MsgId == ID_SYSMSG)? OpSysSymbolic[OpId] : OpGsSymbolic[OpId];
 }
 
-bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict) {
+bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId,
+                      const MCSubtargetInfo &STI, bool Strict) {
+  assert(isValidMsgOp(MsgId, OpId, STI, Strict));
 
   if (!Strict)
     return 0 <= StreamId && isUInt<STREAM_ID_WIDTH_>(StreamId);
@@ -1156,6 +1344,17 @@ unsigned getInitialPSInputAddr(const Function &F) {
   return getIntegerAttribute(F, "InitialPSInputAddr", 0);
 }
 
+bool getHasColorExport(const Function &F) {
+  // As a safe default always respond as if PS has color exports.
+  return getIntegerAttribute(
+             F, "amdgpu-color-export",
+             F.getCallingConv() == CallingConv::AMDGPU_PS ?
+                 1 : 0) != 0;
+}
+
+bool getHasDepthExport(const Function &F) {
+  return getIntegerAttribute(F, "amdgpu-depth-export", 0) != 0;
+}
+
 bool isShader(CallingConv::ID cc) {
   switch(cc) {
   case CallingConv::AMDGPU_VS:
@@ -1259,6 +1458,10 @@ bool isGCN3Encoding(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
 }
 
+bool isGFX10_AEncoding(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureGFX10_AEncoding];
+}
+
 bool isGFX10_BEncoding(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding];
 }
 
@@ -1267,6 +1470,14 @@ bool hasGFX10_3Insts(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts];
 }
 
+bool isGFX90A(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
+}
+
+bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
+}
+
 bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
   const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
   const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
@@ -1374,6 +1585,9 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
   case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+  case AMDGPU::OPERAND_REG_IMM_V2FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
     return true;
   default:
     return false;
@@ -1413,41 +1627,67 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::VReg_64RegClassID:
   case AMDGPU::AReg_64RegClassID:
   case AMDGPU::SReg_64_XEXECRegClassID:
+  case AMDGPU::VReg_64_Align2RegClassID:
+  case AMDGPU::AReg_64_Align2RegClassID:
    return 64;
  case AMDGPU::SGPR_96RegClassID:
  case AMDGPU::SReg_96RegClassID:
  case AMDGPU::VReg_96RegClassID:
  case AMDGPU::AReg_96RegClassID:
+  case AMDGPU::VReg_96_Align2RegClassID:
+  case AMDGPU::AReg_96_Align2RegClassID:
+  case AMDGPU::AV_96RegClassID:
    return 96;
  case AMDGPU::SGPR_128RegClassID:
  case AMDGPU::SReg_128RegClassID:
  case AMDGPU::VReg_128RegClassID:
  case AMDGPU::AReg_128RegClassID:
+  case AMDGPU::VReg_128_Align2RegClassID:
+  case AMDGPU::AReg_128_Align2RegClassID:
+  case AMDGPU::AV_128RegClassID:
    return 128;
  case AMDGPU::SGPR_160RegClassID:
  case AMDGPU::SReg_160RegClassID:
  case AMDGPU::VReg_160RegClassID:
  case AMDGPU::AReg_160RegClassID:
+  case AMDGPU::VReg_160_Align2RegClassID:
+  case AMDGPU::AReg_160_Align2RegClassID:
+  case AMDGPU::AV_160RegClassID:
    return 160;
  case AMDGPU::SGPR_192RegClassID:
  case AMDGPU::SReg_192RegClassID:
  case AMDGPU::VReg_192RegClassID:
  case AMDGPU::AReg_192RegClassID:
+  case AMDGPU::VReg_192_Align2RegClassID:
+  case AMDGPU::AReg_192_Align2RegClassID:
    return 192;
+  case AMDGPU::SGPR_224RegClassID:
+  case AMDGPU::SReg_224RegClassID:
+  case AMDGPU::VReg_224RegClassID:
+  case AMDGPU::AReg_224RegClassID:
+  case AMDGPU::VReg_224_Align2RegClassID:
+  case AMDGPU::AReg_224_Align2RegClassID:
+    return 224;
  case AMDGPU::SGPR_256RegClassID:
  case AMDGPU::SReg_256RegClassID:
  case AMDGPU::VReg_256RegClassID:
  case AMDGPU::AReg_256RegClassID:
+  case AMDGPU::VReg_256_Align2RegClassID:
+  case AMDGPU::AReg_256_Align2RegClassID:
    return 256;
  case AMDGPU::SGPR_512RegClassID:
  case AMDGPU::SReg_512RegClassID:
  case AMDGPU::VReg_512RegClassID:
  case AMDGPU::AReg_512RegClassID:
+  case AMDGPU::VReg_512_Align2RegClassID:
+  case AMDGPU::AReg_512_Align2RegClassID:
    return 512;
  case AMDGPU::SGPR_1024RegClassID:
  case AMDGPU::SReg_1024RegClassID:
  case AMDGPU::VReg_1024RegClassID:
  case AMDGPU::AReg_1024RegClassID:
+  case AMDGPU::VReg_1024_Align2RegClassID:
+  case AMDGPU::AReg_1024_Align2RegClassID:
    return 1024;
  default:
    llvm_unreachable("Unexpected register class");
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f9378693cf48..72c872dec5ba 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -44,6 +44,12 @@ bool isHsaAbiVersion2(const MCSubtargetInfo *STI);
 /// \returns True if HSA OS ABI Version identification is 3,
 /// false otherwise.
 bool isHsaAbiVersion3(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 4,
+/// false otherwise.
+bool isHsaAbiVersion4(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 3 or 4,
+/// false otherwise.
+bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI);
 
 struct GcnBufferFormatInfo {
   unsigned Format;
@@ -78,6 +84,7 @@ enum class TargetIDSetting {
 
 class AMDGPUTargetID {
 private:
+  const MCSubtargetInfo &STI;
   TargetIDSetting XnackSetting;
   TargetIDSetting SramEccSetting;
 
@@ -145,10 +152,10 @@ public:
   void setTargetIDFromFeaturesString(StringRef FS);
 
   void setTargetIDFromTargetIDStream(StringRef TargetID);
-};
 
-/// Streams isa version string for given subtarget \p STI into \p Stream.
-void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
+  /// \returns String representation of an object.
+  std::string toString() const;
+};
 
 /// \returns Wavefront size for given subtarget \p STI.
 unsigned getWavefrontSize(const MCSubtargetInfo *STI);
@@ -284,6 +291,7 @@ struct MIMGBaseOpcodeInfo {
   bool Coordinates;
   bool LodOrClampOrMip;
   bool HasD16;
+  bool MSAA;
 };
 
 LLVM_READONLY
@@ -293,6 +301,7 @@ struct MIMGDimInfo {
   MIMGDim Dim;
   uint8_t NumCoords;
   uint8_t NumGradients;
+  bool MSAA;
   bool DA;
   uint8_t Encoding;
   const char *AsmSuffix;
@@ -338,6 +347,11 @@ int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
 LLVM_READONLY
 int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels);
 
+LLVM_READONLY
+unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode,
+                           const MIMGDimInfo *Dim, bool IsA16,
+                           bool IsG16Supported);
+
 struct MIMGInfo {
   uint16_t Opcode;
   uint16_t BaseOpcode;
@@ -386,9 +400,21 @@ LLVM_READONLY
 bool getMUBUFHasSoffset(unsigned Opc);
 
 LLVM_READONLY
+bool getMUBUFIsBufferInv(unsigned Opc);
+
+LLVM_READONLY
 bool getSMEMIsBuffer(unsigned Opc);
 
 LLVM_READONLY
+bool getVOP1IsSingle(unsigned Opc);
+
+LLVM_READONLY
+bool getVOP2IsSingle(unsigned Opc);
+
+LLVM_READONLY
+bool getVOP3IsSingle(unsigned Opc);
+
+LLVM_READONLY
 const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
                                                   uint8_t NumComponents,
                                                   uint8_t NumFormat,
@@ -459,6 +485,14 @@ struct Waitcnt {
     return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
   }
 
+  bool hasWaitExceptVsCnt() const {
+    return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u;
+  }
+
+  bool hasWaitVsCnt() const {
+    return VsCnt != ~0u;
+  }
+
   bool dominates(const Waitcnt &Other) const {
     return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
            LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt;
@@ -627,10 +661,12 @@ LLVM_READNONE
 bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict = true);
 
 LLVM_READNONE
-bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict = true);
+bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI,
+                  bool Strict = true);
 
 LLVM_READNONE
-bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict = true);
+bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId,
+                      const MCSubtargetInfo &STI, bool Strict = true);
 
 LLVM_READNONE
 bool msgRequiresOp(int64_t MsgId);
@@ -653,6 +689,10 @@ uint64_t encodeMsg(uint64_t MsgId,
 
 unsigned getInitialPSInputAddr(const Function &F);
 
+bool getHasColorExport(const Function &F);
+
+bool getHasDepthExport(const Function &F);
+
 LLVM_READNONE
 bool isShader(CallingConv::ID CC);
 
@@ -701,8 +741,11 @@ bool isGFX9Plus(const MCSubtargetInfo &STI);
 bool isGFX10(const MCSubtargetInfo &STI);
 bool isGFX10Plus(const MCSubtargetInfo &STI);
 bool isGCN3Encoding(const MCSubtargetInfo &STI);
+bool isGFX10_AEncoding(const MCSubtargetInfo &STI);
 bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
 bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
+bool isGFX90A(const MCSubtargetInfo &STI);
+bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
 
 /// Is Reg - scalar register
 bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -746,12 +789,17 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
   case AMDGPU::OPERAND_REG_INLINE_C_FP32:
   case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+  case AMDGPU::OPERAND_REG_IMM_V2INT32:
+  case AMDGPU::OPERAND_REG_IMM_V2FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
     return 4;
 
   case AMDGPU::OPERAND_REG_IMM_INT64:
   case AMDGPU::OPERAND_REG_IMM_FP64:
   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
     return 8;
 
   case AMDGPU::OPERAND_REG_IMM_INT16:
@@ -847,6 +895,11 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
                       const GCNSubtarget *Subtarget,
                       Align Alignment = Align(4));
 
+LLVM_READNONE
+inline bool isLegal64BitDPPControl(unsigned DC) {
+  return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
+}
+
 /// \returns true if the intrinsic is divergent
 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
 
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
new file mode 100644
index 000000000000..da8fcf3900bb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -0,0 +1,355 @@
+//===- AMDGPULDSUtils.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU LDS related helper utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULDSUtils.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/ReplaceConstant.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+namespace AMDGPU {
+
+// A helper class for collecting all reachable callees for each kernel defined
+// within the module.
+class CollectReachableCallees {
+  Module &M;
+  CallGraph CG;
+  SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions;
+
+  // Collect all address taken functions within the module.
+  void collectAddressTakenFunctions() {
+    auto *ECNode = CG.getExternalCallingNode();
+
+    for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) {
+      auto *CGN = GI->second;
+      auto *F = CGN->getFunction();
+      if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F))
+        continue;
+      AddressTakenFunctions.insert(CGN);
+    }
+  }
+
+  // For given kernel, collect all its reachable non-kernel functions.
+  SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) {
+    SmallPtrSet<Function *, 8> ReachableCallees;
+
+    // Call graph node which represents this kernel.
+    auto *KCGN = CG[K];
+
+    // Go through all call graph nodes reachable from the node representing this
+    // kernel, visit all their call sites, if the call site is direct, add
+    // corresponding callee to reachable callee set, if it is indirect, resolve
+    // the indirect call site to potential reachable callees, add them to
+    // reachable callee set, and repeat the process for the newly added
+    // potential callee nodes.
+    //
+    // FIXME: Need to handle bit-casted function pointers.
+    //
+    SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN));
+    SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes;
+    while (!CGNStack.empty()) {
+      auto *CGN = CGNStack.pop_back_val();
+
+      if (!VisitedCGNodes.insert(CGN).second)
+        continue;
+
+      for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) {
+        auto *RCB = cast<CallBase>(GI->first.getValue());
+        auto *RCGN = GI->second;
+
+        if (auto *DCallee = RCGN->getFunction()) {
+          ReachableCallees.insert(DCallee);
+        } else if (RCB->isIndirectCall()) {
+          auto *RCBFTy = RCB->getFunctionType();
+          for (auto *ACGN : AddressTakenFunctions) {
+            auto *ACallee = ACGN->getFunction();
+            if (ACallee->getFunctionType() == RCBFTy) {
+              ReachableCallees.insert(ACallee);
+              CGNStack.append(df_begin(ACGN), df_end(ACGN));
+            }
+          }
+        }
+      }
+    }
+
+    return ReachableCallees;
+  }
+
+public:
+  explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) {
+    // Collect address taken functions.
+    collectAddressTakenFunctions();
+  }
+
+  void collectReachableCallees(
+      DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
+    // Collect reachable callee set for each kernel defined in the module.
+    for (Function &F : M.functions()) {
+      if (!AMDGPU::isKernelCC(&F))
+        continue;
+      Function *K = &F;
+      KernelToCallees[K] = collectReachableCallees(K);
+    }
+  }
+};
+
+void collectReachableCallees(
+    Module &M,
+    DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
+  CollectReachableCallees CRC{M};
+  CRC.collectReachableCallees(KernelToCallees);
+}
+
+SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) {
+  SmallPtrSet<Function *, 8> LDSAccessors;
+  SmallVector<User *, 8> UserStack(GV->users());
+  SmallPtrSet<User *, 8> VisitedUsers;
+
+  while (!UserStack.empty()) {
+    auto *U = UserStack.pop_back_val();
+
+    // `U` is already visited? continue to next one.
+    if (!VisitedUsers.insert(U).second)
+      continue;
+
+    // `U` is a global variable which is initialized with LDS. Ignore LDS.
+    if (isa<GlobalValue>(U))
+      return SmallPtrSet<Function *, 8>();
+
+    // Recursively explore constant users.
+    if (isa<Constant>(U)) {
+      append_range(UserStack, U->users());
+      continue;
+    }
+
+    // `U` should be an instruction, if it belongs to a non-kernel function F,
+    // then collect F.
+    Function *F = cast<Instruction>(U)->getFunction();
+    if (!AMDGPU::isKernelCC(F))
+      LDSAccessors.insert(F);
+  }
+
+  return LDSAccessors;
+}
+
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
+getFunctionToInstsMap(User *U, bool CollectKernelInsts) {
+  DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts;
+  SmallVector<User *, 8> UserStack;
+  SmallPtrSet<User *, 8> VisitedUsers;
+
+  UserStack.push_back(U);
+
+  while (!UserStack.empty()) {
+    auto *UU = UserStack.pop_back_val();
+
+    if (!VisitedUsers.insert(UU).second)
+      continue;
+
+    if (isa<GlobalValue>(UU))
+      continue;
+
+    if (isa<Constant>(UU)) {
+      append_range(UserStack, UU->users());
+      continue;
+    }
+
+    auto *I = cast<Instruction>(UU);
+    Function *F = I->getFunction();
+    if (CollectKernelInsts) {
+      if (!AMDGPU::isKernelCC(F)) {
+        continue;
+      }
+    } else {
+      if (AMDGPU::isKernelCC(F)) {
+        continue;
+      }
+    }
+
+    FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>()));
+    FunctionToInsts[F].insert(I);
+  }
+
+  return FunctionToInsts;
+}
+
+bool isKernelCC(const Function *Func) {
+  return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
+}
+
+Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
+  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
+                                       GV->getValueType());
+}
+
+static void collectFunctionUses(User *U, const Function *F,
+                                SetVector<Instruction *> &InstUsers) {
+  SmallVector<User *> Stack{U};
+
+  while (!Stack.empty()) {
+    U = Stack.pop_back_val();
+
+    if (auto *I = dyn_cast<Instruction>(U)) {
+      if (I->getFunction() == F)
+        InstUsers.insert(I);
+      continue;
+    }
+
+    if (!isa<ConstantExpr>(U))
+      continue;
+
+    append_range(Stack, U->users());
+  }
+}
+
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
+  SetVector<Instruction *> InstUsers;
+
+  collectFunctionUses(C, F, InstUsers);
+  for (Instruction *I : InstUsers) {
+    convertConstantExprsToInstructions(I, C);
+  }
+}
+
+bool hasUserInstruction(const GlobalValue *GV) {
+  SmallPtrSet<const User *, 8> Visited;
+  SmallVector<const User *, 16> Stack(GV->users());
+
+  while (!Stack.empty()) {
+    const User *U = Stack.pop_back_val();
+
+    if (!Visited.insert(U).second)
+      continue;
+
+    if (isa<Instruction>(U))
+      return true;
+
+    append_range(Stack, U->users());
+  }
+
+  return false;
+}
+
+bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) {
+  // We are not interested in kernel LDS lowering for module LDS itself.
+  if (F && GV.getName() == "llvm.amdgcn.module.lds")
+    return false;
+
+  bool Ret = false;
+  SmallPtrSet<const User *, 8> Visited;
+  SmallVector<const User *, 16> Stack(GV.users());
+  SmallPtrSet<const GlobalValue *, 8> GlobalUsers;
+
+  assert(!F || isKernelCC(F));
+
+  while (!Stack.empty()) {
+    const User *V = Stack.pop_back_val();
+    Visited.insert(V);
+
+    if (auto *G = dyn_cast<GlobalValue>(V)) {
+      StringRef GName = G->getName();
+      if (F && GName != "llvm.used" && GName != "llvm.compiler.used") {
+        // For kernel LDS lowering, if G is not a compiler.used list, then we
+        // cannot lower the lds GV since we cannot replace the use of GV within
+        // G.
+        return false;
+      }
+      GlobalUsers.insert(G);
+      continue;
+    }
+
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      const Function *UF = I->getFunction();
+      if (UF == F) {
+        // Used from this kernel, we want to put it into the structure.
+        Ret = true;
+      } else if (!F) {
+        // For module LDS lowering, lowering is required if the user instruction
+        // is from non-kernel function.
+        Ret |= !isKernelCC(UF);
+      }
+      continue;
+    }
+
+    // User V should be a constant, recursively visit users of V.
+    assert(isa<Constant>(V) && "Expected a constant.");
+    append_range(Stack, V->users());
+  }
+
+  if (!F && !Ret) {
+    // For module LDS lowering, we have not yet decided if we should lower GV
+    // or not. Explore all global users of GV, and check if at least one of
+    // these global users appears as a use within an instruction (possibly
+    // nested use via constant expression); if so, then conservatively lower
+    // LDS.
+    for (auto *G : GlobalUsers)
+      Ret |= hasUserInstruction(G);
+  }
+
+  return Ret;
+}
+
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+                                                   const Function *F) {
+  std::vector<llvm::GlobalVariable *> LocalVars;
+  for (auto &GV : M.globals()) {
+    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+      continue;
+    }
+    if (!GV.hasInitializer()) {
+      // addrspace(3) without initializer implies cuda/hip extern __shared__
+      // the semantics for such a variable appears to be that all extern
+      // __shared__ variables alias one another, in which case this transform
+      // is not required
+      continue;
+    }
+    if (!isa<UndefValue>(GV.getInitializer())) {
+      // Initializers are unimplemented for local address space.
+      // Leave such variables in place for consistent error reporting.
+      continue;
+    }
+    if (GV.isConstant()) {
+      // A constant undef variable can't be written to, and any load is
+      // undef, so it should be eliminated by the optimizer. It could be
+      // dropped by the back end if not. This pass skips over it.
+      continue;
+    }
+    if (!shouldLowerLDSToStruct(GV, F)) {
+      continue;
+    }
+    LocalVars.push_back(&GV);
+  }
+  return LocalVars;
+}
+
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
+  SmallPtrSet<GlobalValue *, 32> UsedList;
+
+  SmallVector<GlobalValue *, 32> TmpVec;
+  collectUsedGlobalVariables(M, TmpVec, true);
+  UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+  TmpVec.clear();
+  collectUsedGlobalVariables(M, TmpVec, false);
+  UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+  return UsedList;
+}
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
new file mode 100644
index 000000000000..ffcafb9b76ce
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -0,0 +1,70 @@
+//===- AMDGPULDSUtils.h - LDS related helper functions -*- C++ -*----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU LDS related helper utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Constants.h"
+
+namespace llvm {
+
+class ConstantExpr;
+
+namespace AMDGPU {
+
+/// Collect reachable callees for each kernel defined in the module \p M and
+/// return collected callees at \p KernelToCallees.
+void collectReachableCallees(
+    Module &M,
+    DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees);
+
+/// For the given LDS global \p GV, visit all its users and collect all
+/// non-kernel functions within which \p GV is used and return collected list
+/// of such non-kernel functions.
+SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV);
+
+/// Collect all the instructions where user \p U belongs to. \p U could be
+/// an instruction itself or it could be a constant expression which is used
+/// within an instruction. If \p CollectKernelInsts is true, collect
+/// instructions only from kernels, otherwise collect instructions only from
+/// non-kernel functions.
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
+getFunctionToInstsMap(User *U, bool CollectKernelInsts);
+
+bool isKernelCC(const Function *Func);
+
+Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
+
+/// \returns true if a given global variable \p GV (or its global users) appear
+/// as a use within some instruction (either from kernel or from non-kernel).
+bool hasUserInstruction(const GlobalValue *GV);
+
+/// \returns true if an LDS global requires lowering to a module LDS structure
+/// if \p F is not given. If \p F is given it must be a kernel and function
+/// \returns true if an LDS global is directly used from that kernel and it
+/// is safe to replace its uses with a kernel LDS structure member.
+bool shouldLowerLDSToStruct(const GlobalVariable &GV,
+                            const Function *F = nullptr);
+
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+                                                   const Function *F = nullptr);
+
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
+
+/// Replace all uses of constant \p C with instructions in \p F.
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index b7dd757a8af3..f6b5975f1934 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -41,7 +41,7 @@ void AMDGPUPALMetadata::readFromIR(Module &M) {
     }
     return;
   }
-  BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
+  BlobType = ELF::NT_AMD_PAL_METADATA;
   NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
   if (!NamedMD || !NamedMD->getNumOperands()) {
     // Emit msgpack metadata by default
@@ -69,7 +69,7 @@ void AMDGPUPALMetadata::readFromIR(Module &M) {
 // Metadata.
 bool AMDGPUPALMetadata::setFromBlob(unsigned Type, StringRef Blob) {
   BlobType = Type;
-  if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+  if (Type == ELF::NT_AMD_PAL_METADATA)
     return setFromLegacyBlob(Blob);
   return setFromMsgPackBlob(Blob);
 }
@@ -243,6 +243,27 @@ void AMDGPUPALMetadata::setFunctionScratchSize(const MachineFunction &MF,
   Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val);
 }
 
+// Set the amount of LDS used in bytes in the metadata.
+void AMDGPUPALMetadata::setFunctionLdsSize(const MachineFunction &MF,
+                                           unsigned Val) {
+  auto Node = getShaderFunction(MF.getFunction().getName());
+  Node[".lds_size"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the number of used vgprs in the metadata.
+void AMDGPUPALMetadata::setFunctionNumUsedVgprs(const MachineFunction &MF,
+                                                unsigned Val) {
+  auto Node = getShaderFunction(MF.getFunction().getName());
+  Node[".vgpr_count"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the number of used sgprs in the metadata.
+void AMDGPUPALMetadata::setFunctionNumUsedSgprs(const MachineFunction &MF,
+                                                unsigned Val) {
+  auto Node = getShaderFunction(MF.getFunction().getName());
+  Node[".sgpr_count"] = MsgPackDoc.getNode(Val);
+}
+
 // Set the hardware register bit in PAL metadata to enable wave32 on the
 // shader of the given calling convention.
 void AMDGPUPALMetadata::setWave32(unsigned CC) {
@@ -592,6 +613,41 @@ static const char *getRegisterName(unsigned RegNum) {
     {0xa2c1, "VGT_STRMOUT_VTX_STRIDE_3"},
     {0xa316, "VGT_VERTEX_REUSE_BLOCK_CNTL"},
 
+    {0x2e28, "COMPUTE_PGM_RSRC3"},
+    {0x2e2a, "COMPUTE_SHADER_CHKSUM"},
+    {0x2e24, "COMPUTE_USER_ACCUM_0"},
+    {0x2e25, "COMPUTE_USER_ACCUM_1"},
+    {0x2e26, "COMPUTE_USER_ACCUM_2"},
+    {0x2e27, "COMPUTE_USER_ACCUM_3"},
+    {0xa1ff, "GE_MAX_OUTPUT_PER_SUBGROUP"},
+    {0xa2d3, "GE_NGG_SUBGRP_CNTL"},
+    {0xc25f, "GE_STEREO_CNTL"},
+    {0xc262, "GE_USER_VGPR_EN"},
+    {0xc258, "IA_MULTI_VGT_PARAM_PIPED"},
+    {0xa210, "PA_STEREO_CNTL"},
+    {0xa1c2, "SPI_SHADER_IDX_FORMAT"},
+    {0x2c80, "SPI_SHADER_PGM_CHKSUM_GS"},
+    {0x2d00, "SPI_SHADER_PGM_CHKSUM_HS"},
+    {0x2c06, "SPI_SHADER_PGM_CHKSUM_PS"},
+    {0x2c45, "SPI_SHADER_PGM_CHKSUM_VS"},
+    {0x2c88, "SPI_SHADER_PGM_LO_GS"},
+    {0x2cb2, "SPI_SHADER_USER_ACCUM_ESGS_0"},
+    {0x2cb3, "SPI_SHADER_USER_ACCUM_ESGS_1"},
+    {0x2cb4, "SPI_SHADER_USER_ACCUM_ESGS_2"},
+    {0x2cb5, "SPI_SHADER_USER_ACCUM_ESGS_3"},
+    {0x2d32, "SPI_SHADER_USER_ACCUM_LSHS_0"},
+    {0x2d33, "SPI_SHADER_USER_ACCUM_LSHS_1"},
+    {0x2d34, "SPI_SHADER_USER_ACCUM_LSHS_2"},
+    {0x2d35, "SPI_SHADER_USER_ACCUM_LSHS_3"},
+    {0x2c32, "SPI_SHADER_USER_ACCUM_PS_0"},
+    {0x2c33, "SPI_SHADER_USER_ACCUM_PS_1"},
+    {0x2c34, "SPI_SHADER_USER_ACCUM_PS_2"},
+    {0x2c35, "SPI_SHADER_USER_ACCUM_PS_3"},
+    {0x2c72, "SPI_SHADER_USER_ACCUM_VS_0"},
+    {0x2c73, "SPI_SHADER_USER_ACCUM_VS_1"},
+    {0x2c74, "SPI_SHADER_USER_ACCUM_VS_2"},
+    {0x2c75, "SPI_SHADER_USER_ACCUM_VS_3"},
+
     {0, nullptr}};
   auto Entry = RegInfoTable;
   for (; Entry->Num && Entry->Num != RegNum; ++Entry)
@@ -653,7 +709,7 @@ void AMDGPUPALMetadata::toString(std::string &String) {
 // a .note record of the specified AMD type. Returns an empty blob if
 // there is no PAL metadata,
 void AMDGPUPALMetadata::toBlob(unsigned Type, std::string &Blob) {
-  if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+  if (Type == ELF::NT_AMD_PAL_METADATA)
     toLegacyBlob(Blob);
   else if (Type)
     toMsgPackBlob(Blob);
@@ -790,7 +846,7 @@ const char *AMDGPUPALMetadata::getVendor() const {
 }
 
 // Get .note record type of metadata blob to be emitted:
-// ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
+// ELF::NT_AMD_PAL_METADATA (legacy key=val format), or
 // ELF::NT_AMDGPU_METADATA (MsgPack format), or
 // 0 (no PAL metadata).
 unsigned AMDGPUPALMetadata::getType() const {
@@ -799,12 +855,12 @@ unsigned AMDGPUPALMetadata::getType() const {
 
 // Return whether the blob type is legacy PAL metadata.
 bool AMDGPUPALMetadata::isLegacy() const {
-  return BlobType == ELF::NT_AMD_AMDGPU_PAL_METADATA;
+  return BlobType == ELF::NT_AMD_PAL_METADATA;
}
 
 // Set legacy PAL metadata format.
 void AMDGPUPALMetadata::setLegacy() {
-  BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
+  BlobType = ELF::NT_AMD_PAL_METADATA;
 }
 
 // Erase all PAL metadata.
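The three setters added to AMDGPUPALMetadata.cpp above share one pattern: look up the shader-function node for the machine function's name and store a single scalar under a well-known key. A minimal sketch of the record shape they produce, using a plain std::map as a stand-in for LLVM's msgpack::Document (the key names come from the diff; the sample values are invented):

```cpp
#include <iostream>
#include <map>
#include <string>

int main() {
  // Conceptual location in the PAL metadata document:
  // amdpal.pipelines -> ".shader_functions" -> "<function name>".
  std::map<std::string, unsigned> Record;
  Record[".lds_size"] = 4096; // setFunctionLdsSize: LDS bytes used
  Record[".vgpr_count"] = 64; // setFunctionNumUsedVgprs
  Record[".sgpr_count"] = 32; // setFunctionNumUsedSgprs
  for (const auto &KV : Record)
    std::cout << KV.first << " = " << KV.second << '\n';
}
```

As the matching header comments in AMDGPUPALMetadata.h below note, these records are advisory (for logging and tooling); wave dispatch reads the authoritative counts from the rsrc1 register fields.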
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 8fa1f738487c..7fdd9a8429c1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -80,6 +80,21 @@ public:
   // Set the stack frame size of a function in the metadata.
   void setFunctionScratchSize(const MachineFunction &MF, unsigned Val);
 
+  // Set the amount of LDS used in bytes in the metadata. This is an optional
+  // advisory record for logging etc; wave dispatch actually uses the rsrc1
+  // register for the shader stage to determine the amount of LDS to allocate.
+  void setFunctionLdsSize(const MachineFunction &MF, unsigned Val);
+
+  // Set the number of used vgprs in the metadata. This is an optional advisory
+  // record for logging etc; wave dispatch actually uses the rsrc1 register for
+  // the shader stage to determine the number of vgprs to allocate.
+  void setFunctionNumUsedVgprs(const MachineFunction &MF, unsigned Val);
+
+  // Set the number of used sgprs in the metadata. This is an optional advisory
+  // record for logging etc; wave dispatch actually uses the rsrc1 register for
+  // the shader stage to determine the number of sgprs to allocate.
+  void setFunctionNumUsedSgprs(const MachineFunction &MF, unsigned Val);
+
   // Set the hardware register bit in PAL metadata to enable wave32 on the
   // shader of the given calling convention.
   void setWave32(unsigned CC);
@@ -95,7 +110,7 @@ public:
   const char *getVendor() const;
 
   // Get .note record type of metadata blob to be emitted:
-  // ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
+  // ELF::NT_AMD_PAL_METADATA (legacy key=val format), or
   // ELF::NT_AMDGPU_METADATA (MsgPack format), or
   // 0 (no PAL metadata).
   unsigned getType() const;
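The rewritten getHsaAbiVersion() in the AMDGPUBaseInfo.cpp hunk above raises the default -amdhsa-code-object-version from 3 to 4 and turns an unsupported value into a fatal error instead of silently mapping it to V3. A hedged sketch of that mapping, with local constants assumed to mirror ELF::ELFABIVERSION_AMDGPU_HSA_V2..V4:

```cpp
#include <cstdint>
#include <optional>
#include <stdexcept>

// Assumed stand-ins for ELF::ELFABIVERSION_AMDGPU_HSA_V2..V4.
constexpr uint8_t HSA_V2 = 0, HSA_V3 = 1, HSA_V4 = 2;

// Mirrors the rewritten getHsaAbiVersion(): no value for non-AMDHSA targets,
// a hard error for versions outside 2..4 (default is now 4).
std::optional<uint8_t> hsaAbiVersion(bool IsAmdHsaOS,
                                     unsigned CodeObjectVersion = 4) {
  if (!IsAmdHsaOS)
    return std::nullopt;
  switch (CodeObjectVersion) {
  case 2: return HSA_V2;
  case 3: return HSA_V3;
  case 4: return HSA_V4;
  default:
    throw std::runtime_error("Unsupported AMDHSA Code Object Version");
  }
}
```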
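The new AMDGPUTargetID::toString() builds the code object V4 target ID: the four triple components, the processor, then explicit :sramecc+/- and :xnack+/- suffixes only when the corresponding setting is pinned On or Off (Any and Unsupported contribute nothing). A sketch of just the V4 formatting step, with a local enum standing in for TargetIDSetting:

```cpp
#include <iostream>
#include <string>

enum class Setting { Unsupported, Any, Off, On }; // stand-in for TargetIDSetting

std::string targetIdV4(const std::string &TriplePrefix,
                       const std::string &Processor,
                       Setting SramEcc, Setting Xnack) {
  std::string Features;
  if (SramEcc == Setting::Off)      Features += ":sramecc-";
  else if (SramEcc == Setting::On)  Features += ":sramecc+";
  if (Xnack == Setting::Off)        Features += ":xnack-";
  else if (Xnack == Setting::On)    Features += ":xnack+";
  return TriplePrefix + Processor + Features;
}

int main() {
  // An empty environment component yields the double dash, as in toString(),
  // which streams each triple component followed by '-'.
  std::cout << targetIdV4("amdgcn-amd-amdhsa--", "gfx906",
                          Setting::On, Setting::Off) << '\n';
  // -> amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
}
```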
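The address-word counting rules encoded by the new getAddrSizeMIMGOp() are: 16-bit addresses (A16) pack two components per dword, and gradients become 16-bit either when the subtarget lacks G16 or when the opcode's G16 flag is set, in which case the du and dv gradient groups are packed separately. A standalone sketch with simplified stand-ins for MIMGBaseOpcodeInfo and MIMGDimInfo (the field names follow the diff; the structs and helpers are local, not LLVM's):

```cpp
#include <cstdio>

struct BaseOp {
  unsigned NumExtraArgs;
  bool Coordinates, LodOrClampOrMip, Gradients, G16;
};
struct DimInfo { unsigned NumCoords, NumGradients; };

static unsigned divideCeil(unsigned A, unsigned B) { return (A + B - 1) / B; }
static unsigned alignTo2(unsigned V) { return (V + 1) & ~1u; }

unsigned addrWords(const BaseOp &B, const DimInfo &D, bool IsA16,
                   bool IsG16Supported) {
  unsigned Words = B.NumExtraArgs;
  unsigned Comps = (B.Coordinates ? D.NumCoords : 0) +
                   (B.LodOrClampOrMip ? 1 : 0);
  Words += IsA16 ? divideCeil(Comps, 2) : Comps; // A16: two coords per dword
  if (B.Gradients) {
    if ((IsA16 && !IsG16Supported) || B.G16)
      Words += alignTo2(D.NumGradients / 2); // 16-bit, du/dv packed separately
    else
      Words += D.NumGradients;               // one dword per 32-bit gradient
  }
  return Words;
}

int main() {
  BaseOp SampleD{0, true, false, true, false}; // e.g. a sample_d-style opcode
  DimInfo Dim2D{2, 4};                         // 2D: 2 coords, 4 gradients
  printf("32-bit: %u, A16 without G16 support: %u, A16 with G16 support: %u\n",
         addrWords(SampleD, Dim2D, false, false),
         addrWords(SampleD, Dim2D, true, false),
         addrWords(SampleD, Dim2D, true, true));
  // -> 32-bit: 6, A16 without G16 support: 3, A16 with G16 support: 5
}
```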
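Finally, the strict form of isValidMsgId() now gates each message ID on the ISA generation rather than only consulting the symbolic-name table; the newly named IdSymbolic entries make the IDs visible (MSG_SAVEWAVE at index 4 through MSG_GET_DDID at index 11). A condensed sketch with a linear generation enum in place of the isVI/isGFX9/isGFX9Plus/isGFX10Plus subtarget queries; the SYSMSG value of 15 is assumed from SIDefines.h rather than shown in this diff:

```cpp
// Condensed model of the strict MsgId gating added above.
enum class Gen { SI, CI, VI, GFX9, GFX10 };

bool isValidStrictMsgId(int Id, Gen G) {
  switch (Id) {
  case 4:                 // ID_SAVEWAVE: VI or newer (isVI || isGFX9Plus)
    return G >= Gen::VI;
  case 5: case 6: case 7: // STALL_WAVE_GEN, HALT_WAVES, ORDERED_PS_DONE
  case 9: case 10:        // GS_ALLOC_REQ, GET_DOORBELL
    return G >= Gen::GFX9;
  case 8:                 // ID_EARLY_PRIM_DEALLOC: GFX9 only
    return G == Gen::GFX9;
  case 11:                // ID_GET_DDID: GFX10 and up
    return G >= Gen::GFX10;
  default:                // any remaining ID with a symbolic name
    return Id == 1 || Id == 2 || Id == 3 || Id == 15; // INTERRUPT, GS,
                                                      // GS_DONE, SYSMSG
  }
}
```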