author     Dimitry Andric <dim@FreeBSD.org>    2021-08-22 19:00:43 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2021-11-13 20:39:49 +0000
commit     fe6060f10f634930ff71b7c50291ddc610da2475 (patch)
tree       1483580c790bd4d27b6500a7542b5ee00534d3cc /contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils
parent     b61bce17f346d79cecfd8f195a64b10f77be43b1 (diff)
parent     344a3780b2e33f6ca763666c380202b18aab72a3 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp     18
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h        8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    306
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h       63
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp    355
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h       70
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp   68
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h     17
8 files changed, 848 insertions, 57 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index c8a85d76a55b..0bee9022975e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -15,19 +15,19 @@ namespace AMDGPU {
namespace SendMsg {
// This must be in sync with llvm::AMDGPU::SendMsg::Id enum members, see SIDefines.h.
-const char* const IdSymbolic[] = {
+const char *const IdSymbolic[ID_GAPS_LAST_] = {
nullptr,
"MSG_INTERRUPT",
"MSG_GS",
"MSG_GS_DONE",
- nullptr,
- nullptr,
- nullptr,
- nullptr,
- nullptr,
+ "MSG_SAVEWAVE",
+ "MSG_STALL_WAVE_GEN",
+ "MSG_HALT_WAVES",
+ "MSG_ORDERED_PS_DONE",
+ "MSG_EARLY_PRIM_DEALLOC",
"MSG_GS_ALLOC_REQ",
"MSG_GET_DOORBELL",
- nullptr,
+ "MSG_GET_DDID",
nullptr,
nullptr,
nullptr,
@@ -35,7 +35,7 @@ const char* const IdSymbolic[] = {
};
// These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h.
-const char* const OpSysSymbolic[] = {
+const char *const OpSysSymbolic[OP_SYS_LAST_] = {
nullptr,
"SYSMSG_OP_ECC_ERR_INTERRUPT",
"SYSMSG_OP_REG_RD",
@@ -43,7 +43,7 @@ const char* const OpSysSymbolic[] = {
"SYSMSG_OP_TTRACE_PC"
};
-const char* const OpGsSymbolic[] = {
+const char *const OpGsSymbolic[OP_GS_LAST_] = {
"GS_OP_NOP",
"GS_OP_CUT",
"GS_OP_EMIT",
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index 3eb27c5e5f42..d1deb570a938 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
+#include "SIDefines.h"
+
namespace llvm {
class StringLiteral;
@@ -17,9 +19,9 @@ namespace AMDGPU {
namespace SendMsg { // Symbolic names for the sendmsg(...) syntax.
-extern const char* const IdSymbolic[];
-extern const char* const OpSysSymbolic[];
-extern const char* const OpGsSymbolic[];
+extern const char *const IdSymbolic[ID_GAPS_LAST_];
+extern const char *const OpSysSymbolic[OP_SYS_LAST_];
+extern const char *const OpGsSymbolic[OP_GS_LAST_];
} // namespace SendMsg
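
Sizing the extern symbolic-name arrays with the SIDefines.h sentinels (as in the hunk above) lets other translation units bounds-check the tables. A minimal sketch under that assumption (lookupMsgName is a hypothetical helper, not part of this patch):

    #include "AMDGPUAsmUtils.h" // now pulls in SIDefines.h for the array bounds
    const char *lookupMsgName(int64_t MsgId) {
      using namespace llvm::AMDGPU::SendMsg;
      if (MsgId < 0 || MsgId >= ID_GAPS_LAST_)
        return nullptr;
      return IdSymbolic[MsgId]; // may still be nullptr for reserved gaps
    }
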
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4c1e4dec7ecb..29bbf50cbfdc 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -30,7 +30,8 @@
static llvm::cl::opt<unsigned> AmdhsaCodeObjectVersion(
"amdhsa-code-object-version", llvm::cl::Hidden,
- llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(3));
+ llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(4),
+ llvm::cl::ZeroOrMore);
namespace {
@@ -96,23 +97,36 @@ Optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) {
return ELF::ELFABIVERSION_AMDGPU_HSA_V2;
case 3:
return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ case 4:
+ return ELF::ELFABIVERSION_AMDGPU_HSA_V4;
default:
- return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ report_fatal_error(Twine("Unsupported AMDHSA Code Object Version ") +
+ Twine(AmdhsaCodeObjectVersion));
}
}
bool isHsaAbiVersion2(const MCSubtargetInfo *STI) {
- if (const auto &&HsaAbiVer = getHsaAbiVersion(STI))
- return HsaAbiVer.getValue() == ELF::ELFABIVERSION_AMDGPU_HSA_V2;
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+ return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V2;
return false;
}
bool isHsaAbiVersion3(const MCSubtargetInfo *STI) {
- if (const auto &&HsaAbiVer = getHsaAbiVersion(STI))
- return HsaAbiVer.getValue() == ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+ return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ return false;
+}
+
+bool isHsaAbiVersion4(const MCSubtargetInfo *STI) {
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+ return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V4;
return false;
}
+bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI) {
+ return isHsaAbiVersion3(STI) || isHsaAbiVersion4(STI);
+}
+
#define GET_MIMGBaseOpcodesTable_IMPL
#define GET_MIMGDimInfoTable_IMPL
#define GET_MIMGInfoTable_IMPL
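
The hunk above bumps the default code-object version to 4, turns an unknown version into a fatal error, and adds isHsaAbiVersion4/isHsaAbiVersion3Or4. A hedged sketch of how a caller can branch on them (pickMetadataLayout is a hypothetical function; only the isHsaAbiVersion* helpers come from this patch):

    #include "Utils/AMDGPUBaseInfo.h"
    void pickMetadataLayout(const llvm::MCSubtargetInfo *STI) {
      using namespace llvm::AMDGPU;
      if (isHsaAbiVersion2(STI)) {
        // legacy code object V2 handling
      } else if (isHsaAbiVersion3Or4(STI)) {
        // V3 and V4 are mostly handled together; use isHsaAbiVersion4(STI)
        // where they differ (e.g. target-id feature suffixes).
      }
    }
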
@@ -141,6 +155,34 @@ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
return NewInfo ? NewInfo->Opcode : -1;
}
+unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode,
+ const MIMGDimInfo *Dim, bool IsA16,
+ bool IsG16Supported) {
+ unsigned AddrWords = BaseOpcode->NumExtraArgs;
+ unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ if (IsA16)
+ AddrWords += divideCeil(AddrComponents, 2);
+ else
+ AddrWords += AddrComponents;
+
+ // Note: For subtargets that support A16 but not G16, enabling A16 also
+ // enables 16 bit gradients.
+ // For subtargets that support A16 (operand) and G16 (done with a different
+ // instruction encoding), they are independent.
+
+ if (BaseOpcode->Gradients) {
+ if ((IsA16 && !IsG16Supported) || BaseOpcode->G16)
+ // There are two gradients per coordinate; we pack them separately.
+ // For the 3d case we get
+ // (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv).
+ AddrWords += alignTo<2>(Dim->NumGradients / 2);
+ else
+ AddrWords += Dim->NumGradients;
+ }
+ return AddrWords;
+}
+
struct MUBUFInfo {
uint16_t Opcode;
uint16_t BaseOpcode;
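
A worked example of the getAddrSizeMIMGOp computation added above, with the table fields filled in by hand (the concrete values are assumptions for illustration, not read from the generated MIMG tables):

    // Assume a 2D gradient sample: NumExtraArgs = 0, Coordinates = true,
    // NumCoords = 2, LodOrClampOrMip = false, Gradients = true, NumGradients = 4.
    //   IsA16 = false, no G16:  0 + 2 + 4                                = 6 address words
    //   IsA16 = true,  no G16:  0 + divideCeil(2, 2) + alignTo<2>(4 / 2) = 3 address words
    // (With A16 but no G16 support the gradients are packed as well, per the
    // note in the function above.)
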
@@ -148,6 +190,7 @@ struct MUBUFInfo {
bool has_vaddr;
bool has_srsrc;
bool has_soffset;
+ bool IsBufferInv;
};
struct MTBUFInfo {
@@ -164,12 +207,23 @@ struct SMInfo {
bool IsBuffer;
};
+struct VOPInfo {
+ uint16_t Opcode;
+ bool IsSingle;
+};
+
#define GET_MTBUFInfoTable_DECL
#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
#define GET_MUBUFInfoTable_IMPL
#define GET_SMInfoTable_DECL
#define GET_SMInfoTable_IMPL
+#define GET_VOP1InfoTable_DECL
+#define GET_VOP1InfoTable_IMPL
+#define GET_VOP2InfoTable_DECL
+#define GET_VOP2InfoTable_IMPL
+#define GET_VOP3InfoTable_DECL
+#define GET_VOP3InfoTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMTBUFBaseOpcode(unsigned Opc) {
@@ -232,11 +286,31 @@ bool getMUBUFHasSoffset(unsigned Opc) {
return Info ? Info->has_soffset : false;
}
+bool getMUBUFIsBufferInv(unsigned Opc) {
+ const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+ return Info ? Info->IsBufferInv : false;
+}
+
bool getSMEMIsBuffer(unsigned Opc) {
const SMInfo *Info = getSMEMOpcodeHelper(Opc);
return Info ? Info->IsBuffer : false;
}
+bool getVOP1IsSingle(unsigned Opc) {
+ const VOPInfo *Info = getVOP1OpcodeHelper(Opc);
+ return Info ? Info->IsSingle : false;
+}
+
+bool getVOP2IsSingle(unsigned Opc) {
+ const VOPInfo *Info = getVOP2OpcodeHelper(Opc);
+ return Info ? Info->IsSingle : false;
+}
+
+bool getVOP3IsSingle(unsigned Opc) {
+ const VOPInfo *Info = getVOP3OpcodeHelper(Opc);
+ return Info ? Info->IsSingle : false;
+}
+
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
// header files, so we need to wrap it in a function that takes unsigned
// instead.
@@ -247,7 +321,8 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) {
namespace IsaInfo {
AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI)
- : XnackSetting(TargetIDSetting::Any), SramEccSetting(TargetIDSetting::Any) {
+ : STI(STI), XnackSetting(TargetIDSetting::Any),
+ SramEccSetting(TargetIDSetting::Any) {
if (!STI.getFeatureBits().test(FeatureSupportsXNACK))
XnackSetting = TargetIDSetting::Unsupported;
if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC))
@@ -334,25 +409,109 @@ void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) {
}
}
-void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
- auto TargetTriple = STI->getTargetTriple();
- auto Version = getIsaVersion(STI->getCPU());
+std::string AMDGPUTargetID::toString() const {
+ std::string StringRep = "";
+ raw_string_ostream StreamRep(StringRep);
- Stream << TargetTriple.getArchName() << '-'
- << TargetTriple.getVendorName() << '-'
- << TargetTriple.getOSName() << '-'
- << TargetTriple.getEnvironmentName() << '-'
- << "gfx"
- << Version.Major
- << Version.Minor
- << Version.Stepping;
+ auto TargetTriple = STI.getTargetTriple();
+ auto Version = getIsaVersion(STI.getCPU());
- if (hasXNACK(*STI))
- Stream << "+xnack";
- if (hasSRAMECC(*STI))
- Stream << "+sramecc";
+ StreamRep << TargetTriple.getArchName() << '-'
+ << TargetTriple.getVendorName() << '-'
+ << TargetTriple.getOSName() << '-'
+ << TargetTriple.getEnvironmentName() << '-';
- Stream.flush();
+ std::string Processor = "";
+ // TODO: The following else branch is present because we used various alias
+ // names for GPUs up until GFX9 (e.g. 'fiji' is the same as 'gfx803').
+ // Remove it once all aliases are removed from GCNProcessors.td.
+ if (Version.Major >= 9)
+ Processor = STI.getCPU().str();
+ else
+ Processor = (Twine("gfx") + Twine(Version.Major) + Twine(Version.Minor) +
+ Twine(Version.Stepping))
+ .str();
+
+ std::string Features = "";
+ if (Optional<uint8_t> HsaAbiVersion = getHsaAbiVersion(&STI)) {
+ switch (*HsaAbiVersion) {
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+ // Code object V2 only supported specific processors and had fixed
+ // settings for the XNACK.
+ if (Processor == "gfx600") {
+ } else if (Processor == "gfx601") {
+ } else if (Processor == "gfx602") {
+ } else if (Processor == "gfx700") {
+ } else if (Processor == "gfx701") {
+ } else if (Processor == "gfx702") {
+ } else if (Processor == "gfx703") {
+ } else if (Processor == "gfx704") {
+ } else if (Processor == "gfx705") {
+ } else if (Processor == "gfx801") {
+ if (!isXnackOnOrAny())
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor +
+ " without XNACK");
+ } else if (Processor == "gfx802") {
+ } else if (Processor == "gfx803") {
+ } else if (Processor == "gfx805") {
+ } else if (Processor == "gfx810") {
+ if (!isXnackOnOrAny())
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor +
+ " without XNACK");
+ } else if (Processor == "gfx900") {
+ if (isXnackOnOrAny())
+ Processor = "gfx901";
+ } else if (Processor == "gfx902") {
+ if (isXnackOnOrAny())
+ Processor = "gfx903";
+ } else if (Processor == "gfx904") {
+ if (isXnackOnOrAny())
+ Processor = "gfx905";
+ } else if (Processor == "gfx906") {
+ if (isXnackOnOrAny())
+ Processor = "gfx907";
+ } else if (Processor == "gfx90c") {
+ if (isXnackOnOrAny())
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor +
+ " with XNACK being ON or ANY");
+ } else {
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor);
+ }
+ break;
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+ // xnack.
+ if (isXnackOnOrAny())
+ Features += "+xnack";
+ // In code object v2 and v3, "sramecc" feature was spelled with a
+ // hyphen ("sram-ecc").
+ if (isSramEccOnOrAny())
+ Features += "+sram-ecc";
+ break;
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ // sramecc.
+ if (getSramEccSetting() == TargetIDSetting::Off)
+ Features += ":sramecc-";
+ else if (getSramEccSetting() == TargetIDSetting::On)
+ Features += ":sramecc+";
+ // xnack.
+ if (getXnackSetting() == TargetIDSetting::Off)
+ Features += ":xnack-";
+ else if (getXnackSetting() == TargetIDSetting::On)
+ Features += ":xnack+";
+ break;
+ default:
+ break;
+ }
+ }
+
+ StreamRep << Processor << Features;
+
+ StreamRep.flush();
+ return StringRep;
}
unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
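
The streamIsaVersion helper is replaced by AMDGPUTargetID::toString above, which also encodes the xnack/sramecc settings according to the selected code-object version. A hedged usage sketch (the subtarget configuration and the resulting string are illustrative):

    // STI is a const MCSubtargetInfo & and FS a feature string (assumed here).
    llvm::AMDGPU::IsaInfo::AMDGPUTargetID TargetID(STI);
    TargetID.setTargetIDFromFeaturesString(FS); // e.g. "+sramecc,-xnack"
    std::string S = TargetID.toString();
    // Under code object V4 this yields something like
    // "amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack-".
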
@@ -402,6 +561,8 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
// FIXME: Need to take scratch memory into account.
+ if (isGFX90A(*STI))
+ return 8;
if (!isGFX10Plus(*STI))
return 10;
return hasGFX10_3Insts(*STI) ? 16 : 20;
@@ -531,6 +692,9 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
Optional<bool> EnableWavefrontSize32) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 8;
+
bool IsWave32 = EnableWavefrontSize32 ?
*EnableWavefrontSize32 :
STI->getFeatureBits().test(FeatureWavefrontSize32);
@@ -543,6 +707,8 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
Optional<bool> EnableWavefrontSize32) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 8;
bool IsWave32 = EnableWavefrontSize32 ?
*EnableWavefrontSize32 :
@@ -552,12 +718,16 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
}
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 512;
if (!isGFX10Plus(*STI))
return 256;
return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512;
}
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 512;
return 256;
}
@@ -653,6 +823,11 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1);
}
+ if (AMDGPU::isGFX90A(*STI)) {
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc3,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
+ STI->getFeatureBits().test(FeatureTgSplit) ? 1 : 0);
+ }
return KD;
}
@@ -1049,23 +1224,32 @@ int64_t getMsgId(const StringRef Name) {
return ID_UNKNOWN_;
}
-static bool isValidMsgId(int64_t MsgId) {
- return (ID_GAPS_FIRST_ <= MsgId && MsgId < ID_GAPS_LAST_) && IdSymbolic[MsgId];
-}
-
bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict) {
if (Strict) {
- if (MsgId == ID_GS_ALLOC_REQ || MsgId == ID_GET_DOORBELL)
+ switch (MsgId) {
+ case ID_SAVEWAVE:
+ return isVI(STI) || isGFX9Plus(STI);
+ case ID_STALL_WAVE_GEN:
+ case ID_HALT_WAVES:
+ case ID_ORDERED_PS_DONE:
+ case ID_GS_ALLOC_REQ:
+ case ID_GET_DOORBELL:
return isGFX9Plus(STI);
- else
- return isValidMsgId(MsgId);
+ case ID_EARLY_PRIM_DEALLOC:
+ return isGFX9(STI);
+ case ID_GET_DDID:
+ return isGFX10Plus(STI);
+ default:
+ return 0 <= MsgId && MsgId < ID_GAPS_LAST_ && IdSymbolic[MsgId];
+ }
} else {
return 0 <= MsgId && isUInt<ID_WIDTH_>(MsgId);
}
}
StringRef getMsgName(int64_t MsgId) {
- return isValidMsgId(MsgId)? IdSymbolic[MsgId] : "";
+ assert(0 <= MsgId && MsgId < ID_GAPS_LAST_);
+ return IdSymbolic[MsgId];
}
int64_t getMsgOpId(int64_t MsgId, const StringRef Name) {
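
Strict sendmsg validation is now subtarget-dependent, as the switch above shows. A small sketch of the strict/relaxed split (STI is an assumed const MCSubtargetInfo reference):

    using namespace llvm::AMDGPU;
    using namespace llvm::AMDGPU::SendMsg;
    // Strict: the id must name a message the subtarget actually supports.
    bool KnownOnTarget = isValidMsgId(ID_GET_DDID, STI, /*Strict=*/true); // GFX10+ only
    // Relaxed: any value that fits the encoding field is accepted.
    bool FitsField = isValidMsgId(42, STI, /*Strict=*/false);
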
@@ -1080,7 +1264,9 @@ int64_t getMsgOpId(int64_t MsgId, const StringRef Name) {
return OP_UNKNOWN_;
}
-bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict) {
+bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI,
+ bool Strict) {
+ assert(isValidMsgId(MsgId, STI, Strict));
if (!Strict)
return 0 <= OpId && isUInt<OP_WIDTH_>(OpId);
@@ -1103,7 +1289,9 @@ StringRef getMsgOpName(int64_t MsgId, int64_t OpId) {
return (MsgId == ID_SYSMSG)? OpSysSymbolic[OpId] : OpGsSymbolic[OpId];
}
-bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict) {
+bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId,
+ const MCSubtargetInfo &STI, bool Strict) {
+ assert(isValidMsgOp(MsgId, OpId, STI, Strict));
if (!Strict)
return 0 <= StreamId && isUInt<STREAM_ID_WIDTH_>(StreamId);
@@ -1156,6 +1344,17 @@ unsigned getInitialPSInputAddr(const Function &F) {
return getIntegerAttribute(F, "InitialPSInputAddr", 0);
}
+bool getHasColorExport(const Function &F) {
+ // As a safe default, always respond as if PS has color exports.
+ return getIntegerAttribute(
+ F, "amdgpu-color-export",
+ F.getCallingConv() == CallingConv::AMDGPU_PS ? 1 : 0) != 0;
+}
+
+bool getHasDepthExport(const Function &F) {
+ return getIntegerAttribute(F, "amdgpu-depth-export", 0) != 0;
+}
+
bool isShader(CallingConv::ID cc) {
switch(cc) {
case CallingConv::AMDGPU_VS:
@@ -1259,6 +1458,10 @@ bool isGCN3Encoding(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
}
+bool isGFX10_AEncoding(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX10_AEncoding];
+}
+
bool isGFX10_BEncoding(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding];
}
@@ -1267,6 +1470,14 @@ bool hasGFX10_3Insts(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts];
}
+bool isGFX90A(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
+}
+
+bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
+}
+
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
@@ -1374,6 +1585,9 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return true;
default:
return false;
@@ -1413,41 +1627,67 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::VReg_64RegClassID:
case AMDGPU::AReg_64RegClassID:
case AMDGPU::SReg_64_XEXECRegClassID:
+ case AMDGPU::VReg_64_Align2RegClassID:
+ case AMDGPU::AReg_64_Align2RegClassID:
return 64;
case AMDGPU::SGPR_96RegClassID:
case AMDGPU::SReg_96RegClassID:
case AMDGPU::VReg_96RegClassID:
case AMDGPU::AReg_96RegClassID:
+ case AMDGPU::VReg_96_Align2RegClassID:
+ case AMDGPU::AReg_96_Align2RegClassID:
+ case AMDGPU::AV_96RegClassID:
return 96;
case AMDGPU::SGPR_128RegClassID:
case AMDGPU::SReg_128RegClassID:
case AMDGPU::VReg_128RegClassID:
case AMDGPU::AReg_128RegClassID:
+ case AMDGPU::VReg_128_Align2RegClassID:
+ case AMDGPU::AReg_128_Align2RegClassID:
+ case AMDGPU::AV_128RegClassID:
return 128;
case AMDGPU::SGPR_160RegClassID:
case AMDGPU::SReg_160RegClassID:
case AMDGPU::VReg_160RegClassID:
case AMDGPU::AReg_160RegClassID:
+ case AMDGPU::VReg_160_Align2RegClassID:
+ case AMDGPU::AReg_160_Align2RegClassID:
+ case AMDGPU::AV_160RegClassID:
return 160;
case AMDGPU::SGPR_192RegClassID:
case AMDGPU::SReg_192RegClassID:
case AMDGPU::VReg_192RegClassID:
case AMDGPU::AReg_192RegClassID:
+ case AMDGPU::VReg_192_Align2RegClassID:
+ case AMDGPU::AReg_192_Align2RegClassID:
return 192;
+ case AMDGPU::SGPR_224RegClassID:
+ case AMDGPU::SReg_224RegClassID:
+ case AMDGPU::VReg_224RegClassID:
+ case AMDGPU::AReg_224RegClassID:
+ case AMDGPU::VReg_224_Align2RegClassID:
+ case AMDGPU::AReg_224_Align2RegClassID:
+ return 224;
case AMDGPU::SGPR_256RegClassID:
case AMDGPU::SReg_256RegClassID:
case AMDGPU::VReg_256RegClassID:
case AMDGPU::AReg_256RegClassID:
+ case AMDGPU::VReg_256_Align2RegClassID:
+ case AMDGPU::AReg_256_Align2RegClassID:
return 256;
case AMDGPU::SGPR_512RegClassID:
case AMDGPU::SReg_512RegClassID:
case AMDGPU::VReg_512RegClassID:
case AMDGPU::AReg_512RegClassID:
+ case AMDGPU::VReg_512_Align2RegClassID:
+ case AMDGPU::AReg_512_Align2RegClassID:
return 512;
case AMDGPU::SGPR_1024RegClassID:
case AMDGPU::SReg_1024RegClassID:
case AMDGPU::VReg_1024RegClassID:
case AMDGPU::AReg_1024RegClassID:
+ case AMDGPU::VReg_1024_Align2RegClassID:
+ case AMDGPU::AReg_1024_Align2RegClassID:
return 1024;
default:
llvm_unreachable("Unexpected register class");
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f9378693cf48..72c872dec5ba 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -44,6 +44,12 @@ bool isHsaAbiVersion2(const MCSubtargetInfo *STI);
/// \returns True if HSA OS ABI Version identification is 3,
/// false otherwise.
bool isHsaAbiVersion3(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 4,
+/// false otherwise.
+bool isHsaAbiVersion4(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 3 or 4,
+/// false otherwise.
+bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI);
struct GcnBufferFormatInfo {
unsigned Format;
@@ -78,6 +84,7 @@ enum class TargetIDSetting {
class AMDGPUTargetID {
private:
+ const MCSubtargetInfo &STI;
TargetIDSetting XnackSetting;
TargetIDSetting SramEccSetting;
@@ -145,10 +152,10 @@ public:
void setTargetIDFromFeaturesString(StringRef FS);
void setTargetIDFromTargetIDStream(StringRef TargetID);
-};
-/// Streams isa version string for given subtarget \p STI into \p Stream.
-void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
+ /// \returns String representation of the target ID.
+ std::string toString() const;
+};
/// \returns Wavefront size for given subtarget \p STI.
unsigned getWavefrontSize(const MCSubtargetInfo *STI);
@@ -284,6 +291,7 @@ struct MIMGBaseOpcodeInfo {
bool Coordinates;
bool LodOrClampOrMip;
bool HasD16;
+ bool MSAA;
};
LLVM_READONLY
@@ -293,6 +301,7 @@ struct MIMGDimInfo {
MIMGDim Dim;
uint8_t NumCoords;
uint8_t NumGradients;
+ bool MSAA;
bool DA;
uint8_t Encoding;
const char *AsmSuffix;
@@ -338,6 +347,11 @@ int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
LLVM_READONLY
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels);
+LLVM_READONLY
+unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode,
+ const MIMGDimInfo *Dim, bool IsA16,
+ bool IsG16Supported);
+
struct MIMGInfo {
uint16_t Opcode;
uint16_t BaseOpcode;
@@ -386,9 +400,21 @@ LLVM_READONLY
bool getMUBUFHasSoffset(unsigned Opc);
LLVM_READONLY
+bool getMUBUFIsBufferInv(unsigned Opc);
+
+LLVM_READONLY
bool getSMEMIsBuffer(unsigned Opc);
LLVM_READONLY
+bool getVOP1IsSingle(unsigned Opc);
+
+LLVM_READONLY
+bool getVOP2IsSingle(unsigned Opc);
+
+LLVM_READONLY
+bool getVOP3IsSingle(unsigned Opc);
+
+LLVM_READONLY
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,
uint8_t NumFormat,
@@ -459,6 +485,14 @@ struct Waitcnt {
return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
}
+ bool hasWaitExceptVsCnt() const {
+ return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u;
+ }
+
+ bool hasWaitVsCnt() const {
+ return VsCnt != ~0u;
+ }
+
bool dominates(const Waitcnt &Other) const {
return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt;
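
hasWaitExceptVsCnt and hasWaitVsCnt split the pending counters into the part expressed by a regular s_waitcnt and the store counter, which GFX10+ waits on with a separate instruction. A hedged sketch of the intended split (the emit* helpers are hypothetical):

    void emitWaits(const llvm::AMDGPU::Waitcnt &Wait) {
      if (Wait.hasWaitExceptVsCnt())
        emitSWaitcnt(Wait.VmCnt, Wait.ExpCnt, Wait.LgkmCnt); // s_waitcnt
      if (Wait.hasWaitVsCnt())
        emitSWaitcntVsCnt(Wait.VsCnt);                       // s_waitcnt_vscnt
    }
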
@@ -627,10 +661,12 @@ LLVM_READNONE
bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict = true);
LLVM_READNONE
-bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict = true);
+bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI,
+ bool Strict = true);
LLVM_READNONE
-bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict = true);
+bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId,
+ const MCSubtargetInfo &STI, bool Strict = true);
LLVM_READNONE
bool msgRequiresOp(int64_t MsgId);
@@ -653,6 +689,10 @@ uint64_t encodeMsg(uint64_t MsgId,
unsigned getInitialPSInputAddr(const Function &F);
+bool getHasColorExport(const Function &F);
+
+bool getHasDepthExport(const Function &F);
+
LLVM_READNONE
bool isShader(CallingConv::ID CC);
@@ -701,8 +741,11 @@ bool isGFX9Plus(const MCSubtargetInfo &STI);
bool isGFX10(const MCSubtargetInfo &STI);
bool isGFX10Plus(const MCSubtargetInfo &STI);
bool isGCN3Encoding(const MCSubtargetInfo &STI);
+bool isGFX10_AEncoding(const MCSubtargetInfo &STI);
bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
+bool isGFX90A(const MCSubtargetInfo &STI);
+bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
/// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -746,12 +789,17 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
return 4;
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return 8;
case AMDGPU::OPERAND_REG_IMM_INT16:
@@ -847,6 +895,11 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
const GCNSubtarget *Subtarget,
Align Alignment = Align(4));
+LLVM_READNONE
+inline bool isLegal64BitDPPControl(unsigned DC) {
+ return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
+}
+
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
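
isLegal64BitDPPControl restricts 64-bit DPP operands to the row_newbcast control range defined in SIDefines.h. A brief sketch of the intended check (the surrounding validation code and the Is64Bit/DppCtrl names are assumptions):

    if (Is64Bit && !llvm::AMDGPU::isLegal64BitDPPControl(DppCtrl))
      return false; // only ROW_NEWBCAST_FIRST..ROW_NEWBCAST_LAST is legal here
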
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
new file mode 100644
index 000000000000..da8fcf3900bb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -0,0 +1,355 @@
+//===- AMDGPULDSUtils.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU LDS related helper utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULDSUtils.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/ReplaceConstant.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+namespace AMDGPU {
+
+// A helper class for collecting all reachable callees for each kernel defined
+// within the module.
+class CollectReachableCallees {
+ Module &M;
+ CallGraph CG;
+ SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions;
+
+ // Collect all address taken functions within the module.
+ void collectAddressTakenFunctions() {
+ auto *ECNode = CG.getExternalCallingNode();
+
+ for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) {
+ auto *CGN = GI->second;
+ auto *F = CGN->getFunction();
+ if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F))
+ continue;
+ AddressTakenFunctions.insert(CGN);
+ }
+ }
+
+ // For given kernel, collect all its reachable non-kernel functions.
+ SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) {
+ SmallPtrSet<Function *, 8> ReachableCallees;
+
+ // Call graph node which represents this kernel.
+ auto *KCGN = CG[K];
+
+ // Go through all call graph nodes reachable from the node representing this
+ // kernel and visit all their call sites. If a call site is direct, add the
+ // corresponding callee to the reachable callee set; if it is indirect,
+ // resolve it to the potentially reachable callees, add them to the
+ // reachable callee set, and repeat the process for the newly added
+ // potential callee nodes.
+ //
+ // FIXME: Need to handle bit-casted function pointers.
+ //
+ SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN));
+ SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes;
+ while (!CGNStack.empty()) {
+ auto *CGN = CGNStack.pop_back_val();
+
+ if (!VisitedCGNodes.insert(CGN).second)
+ continue;
+
+ for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) {
+ auto *RCB = cast<CallBase>(GI->first.getValue());
+ auto *RCGN = GI->second;
+
+ if (auto *DCallee = RCGN->getFunction()) {
+ ReachableCallees.insert(DCallee);
+ } else if (RCB->isIndirectCall()) {
+ auto *RCBFTy = RCB->getFunctionType();
+ for (auto *ACGN : AddressTakenFunctions) {
+ auto *ACallee = ACGN->getFunction();
+ if (ACallee->getFunctionType() == RCBFTy) {
+ ReachableCallees.insert(ACallee);
+ CGNStack.append(df_begin(ACGN), df_end(ACGN));
+ }
+ }
+ }
+ }
+ }
+
+ return ReachableCallees;
+ }
+
+public:
+ explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) {
+ // Collect address taken functions.
+ collectAddressTakenFunctions();
+ }
+
+ void collectReachableCallees(
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
+ // Collect reachable callee set for each kernel defined in the module.
+ for (Function &F : M.functions()) {
+ if (!AMDGPU::isKernelCC(&F))
+ continue;
+ Function *K = &F;
+ KernelToCallees[K] = collectReachableCallees(K);
+ }
+ }
+};
+
+void collectReachableCallees(
+ Module &M,
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
+ CollectReachableCallees CRC{M};
+ CRC.collectReachableCallees(KernelToCallees);
+}
+
+SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) {
+ SmallPtrSet<Function *, 8> LDSAccessors;
+ SmallVector<User *, 8> UserStack(GV->users());
+ SmallPtrSet<User *, 8> VisitedUsers;
+
+ while (!UserStack.empty()) {
+ auto *U = UserStack.pop_back_val();
+
+ // If `U` has already been visited, continue to the next one.
+ if (!VisitedUsers.insert(U).second)
+ continue;
+
+ // `U` is a global variable which is initialized with the LDS variable; in
+ // that case, ignore the LDS variable and report no accessors.
+ if (isa<GlobalValue>(U))
+ return SmallPtrSet<Function *, 8>();
+
+ // Recursively explore constant users.
+ if (isa<Constant>(U)) {
+ append_range(UserStack, U->users());
+ continue;
+ }
+
+ // `U` should be an instruction; if it belongs to a non-kernel function F,
+ // then collect F.
+ Function *F = cast<Instruction>(U)->getFunction();
+ if (!AMDGPU::isKernelCC(F))
+ LDSAccessors.insert(F);
+ }
+
+ return LDSAccessors;
+}
+
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
+getFunctionToInstsMap(User *U, bool CollectKernelInsts) {
+ DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts;
+ SmallVector<User *, 8> UserStack;
+ SmallPtrSet<User *, 8> VisitedUsers;
+
+ UserStack.push_back(U);
+
+ while (!UserStack.empty()) {
+ auto *UU = UserStack.pop_back_val();
+
+ if (!VisitedUsers.insert(UU).second)
+ continue;
+
+ if (isa<GlobalValue>(UU))
+ continue;
+
+ if (isa<Constant>(UU)) {
+ append_range(UserStack, UU->users());
+ continue;
+ }
+
+ auto *I = cast<Instruction>(UU);
+ Function *F = I->getFunction();
+ if (CollectKernelInsts) {
+ if (!AMDGPU::isKernelCC(F)) {
+ continue;
+ }
+ } else {
+ if (AMDGPU::isKernelCC(F)) {
+ continue;
+ }
+ }
+
+ FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>()));
+ FunctionToInsts[F].insert(I);
+ }
+
+ return FunctionToInsts;
+}
+
+bool isKernelCC(const Function *Func) {
+ return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
+}
+
+Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
+ return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
+ GV->getValueType());
+}
+
+static void collectFunctionUses(User *U, const Function *F,
+ SetVector<Instruction *> &InstUsers) {
+ SmallVector<User *> Stack{U};
+
+ while (!Stack.empty()) {
+ U = Stack.pop_back_val();
+
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ if (I->getFunction() == F)
+ InstUsers.insert(I);
+ continue;
+ }
+
+ if (!isa<ConstantExpr>(U))
+ continue;
+
+ append_range(Stack, U->users());
+ }
+}
+
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
+ SetVector<Instruction *> InstUsers;
+
+ collectFunctionUses(C, F, InstUsers);
+ for (Instruction *I : InstUsers) {
+ convertConstantExprsToInstructions(I, C);
+ }
+}
+
+bool hasUserInstruction(const GlobalValue *GV) {
+ SmallPtrSet<const User *, 8> Visited;
+ SmallVector<const User *, 16> Stack(GV->users());
+
+ while (!Stack.empty()) {
+ const User *U = Stack.pop_back_val();
+
+ if (!Visited.insert(U).second)
+ continue;
+
+ if (isa<Instruction>(U))
+ return true;
+
+ append_range(Stack, U->users());
+ }
+
+ return false;
+}
+
+bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) {
+ // We are not interested in kernel LDS lowering for module LDS itself.
+ if (F && GV.getName() == "llvm.amdgcn.module.lds")
+ return false;
+
+ bool Ret = false;
+ SmallPtrSet<const User *, 8> Visited;
+ SmallVector<const User *, 16> Stack(GV.users());
+ SmallPtrSet<const GlobalValue *, 8> GlobalUsers;
+
+ assert(!F || isKernelCC(F));
+
+ while (!Stack.empty()) {
+ const User *V = Stack.pop_back_val();
+ Visited.insert(V);
+
+ if (auto *G = dyn_cast<GlobalValue>(V)) {
+ StringRef GName = G->getName();
+ if (F && GName != "llvm.used" && GName != "llvm.compiler.used") {
+ // For kernel LDS lowering, if G is neither llvm.used nor llvm.compiler.used,
+ // then we cannot lower the LDS GV since we cannot replace the use of GV
+ // within G.
+ return false;
+ }
+ GlobalUsers.insert(G);
+ continue;
+ }
+
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ const Function *UF = I->getFunction();
+ if (UF == F) {
+ // Used from this kernel, we want to put it into the structure.
+ Ret = true;
+ } else if (!F) {
+ // For module LDS lowering, lowering is required if the user instruction
+ // is from a non-kernel function.
+ Ret |= !isKernelCC(UF);
+ }
+ continue;
+ }
+
+ // User V should be a constant, recursively visit users of V.
+ assert(isa<Constant>(V) && "Expected a constant.");
+ append_range(Stack, V->users());
+ }
+
+ if (!F && !Ret) {
+ // For module LDS lowering, we have not yet decided whether we should lower
+ // GV or not. Explore all global users of GV, and check if at least one of
+ // these global users appears as a use within an instruction (possibly a
+ // nested use via a constant expression); if so, conservatively lower the LDS.
+ for (auto *G : GlobalUsers)
+ Ret |= hasUserInstruction(G);
+ }
+
+ return Ret;
+}
+
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+ const Function *F) {
+ std::vector<llvm::GlobalVariable *> LocalVars;
+ for (auto &GV : M.globals()) {
+ if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+ continue;
+ }
+ if (!GV.hasInitializer()) {
+ // An addrspace(3) global without an initializer implies CUDA/HIP extern
+ // __shared__. The semantics for such a variable appear to be that all
+ // extern __shared__ variables alias one another, in which case this
+ // transform is not required.
+ continue;
+ }
+ if (!isa<UndefValue>(GV.getInitializer())) {
+ // Initializers are unimplemented for local address space.
+ // Leave such variables in place for consistent error reporting.
+ continue;
+ }
+ if (GV.isConstant()) {
+ // A constant undef variable can't be written to, and any load is
+ // undef, so it should be eliminated by the optimizer. It could be
+ // dropped by the back end if not. This pass skips over it.
+ continue;
+ }
+ if (!shouldLowerLDSToStruct(GV, F)) {
+ continue;
+ }
+ LocalVars.push_back(&GV);
+ }
+ return LocalVars;
+}
+
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
+ SmallPtrSet<GlobalValue *, 32> UsedList;
+
+ SmallVector<GlobalValue *, 32> TmpVec;
+ collectUsedGlobalVariables(M, TmpVec, true);
+ UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+ TmpVec.clear();
+ collectUsedGlobalVariables(M, TmpVec, false);
+ UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+ return UsedList;
+}
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
new file mode 100644
index 000000000000..ffcafb9b76ce
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -0,0 +1,70 @@
+//===- AMDGPULDSUtils.h - LDS related helper functions -*- C++ -*----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU LDS related helper utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Constants.h"
+
+namespace llvm {
+
+class ConstantExpr;
+
+namespace AMDGPU {
+
+/// Collect reachable callees for each kernel defined in the module \p M and
+/// return collected callees at \p KernelToCallees.
+void collectReachableCallees(
+ Module &M,
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees);
+
+/// For the given LDS global \p GV, visit all its users, collect all
+/// non-kernel functions within which \p GV is used, and return the collected
+/// list of such non-kernel functions.
+SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV);
+
+/// Collect all the instructions that user \p U belongs to. \p U could be an
+/// instruction itself, or it could be a constant expression which is used
+/// within an instruction. If \p CollectKernelInsts is true, collect
+/// instructions only from kernels; otherwise, collect instructions only from
+/// non-kernel functions.
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
+getFunctionToInstsMap(User *U, bool CollectKernelInsts);
+
+bool isKernelCC(const Function *Func);
+
+Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
+
+/// \returns true if a given global variable \p GV (or one of its global users)
+/// appears as a use within some instruction (either from a kernel or from a
+/// non-kernel function).
+bool hasUserInstruction(const GlobalValue *GV);
+
+/// \returns true if an LDS global requires lowering to a module LDS structure
+/// if \p F is not given. If \p F is given, it must be a kernel, and the function
+/// returns true if the LDS global is directly used from that kernel and it
+/// is safe to replace its uses with a kernel LDS structure member.
+bool shouldLowerLDSToStruct(const GlobalVariable &GV,
+ const Function *F = nullptr);
+
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+ const Function *F = nullptr);
+
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
+
+/// Replace all uses of constant \p C with instructions in \p F.
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
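
The helpers above feed the AMDGPU LDS lowering passes. A hedged driver sketch for the module-LDS flavour (no kernel is passed, so shouldLowerLDSToStruct runs in its module-lowering mode; the loop body is illustrative):

    std::vector<llvm::GlobalVariable *> Vars =
        llvm::AMDGPU::findVariablesToLower(M);
    for (llvm::GlobalVariable *GV : Vars) {
      // Each GV is an addrspace(3) global with an undef initializer whose uses
      // require lowering; the actual rewriting is done by the LDS passes.
    }
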
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index b7dd757a8af3..f6b5975f1934 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -41,7 +41,7 @@ void AMDGPUPALMetadata::readFromIR(Module &M) {
}
return;
}
- BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
+ BlobType = ELF::NT_AMD_PAL_METADATA;
NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
if (!NamedMD || !NamedMD->getNumOperands()) {
// Emit msgpack metadata by default
@@ -69,7 +69,7 @@ void AMDGPUPALMetadata::readFromIR(Module &M) {
// Metadata.
bool AMDGPUPALMetadata::setFromBlob(unsigned Type, StringRef Blob) {
BlobType = Type;
- if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+ if (Type == ELF::NT_AMD_PAL_METADATA)
return setFromLegacyBlob(Blob);
return setFromMsgPackBlob(Blob);
}
@@ -243,6 +243,27 @@ void AMDGPUPALMetadata::setFunctionScratchSize(const MachineFunction &MF,
Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val);
}
+// Set the amount of LDS used in bytes in the metadata.
+void AMDGPUPALMetadata::setFunctionLdsSize(const MachineFunction &MF,
+ unsigned Val) {
+ auto Node = getShaderFunction(MF.getFunction().getName());
+ Node[".lds_size"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the number of used vgprs in the metadata.
+void AMDGPUPALMetadata::setFunctionNumUsedVgprs(const MachineFunction &MF,
+ unsigned Val) {
+ auto Node = getShaderFunction(MF.getFunction().getName());
+ Node[".vgpr_count"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the number of used sgprs in the metadata.
+void AMDGPUPALMetadata::setFunctionNumUsedSgprs(const MachineFunction &MF,
+ unsigned Val) {
+ auto Node = getShaderFunction(MF.getFunction().getName());
+ Node[".sgpr_count"] = MsgPackDoc.getNode(Val);
+}
+
// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
void AMDGPUPALMetadata::setWave32(unsigned CC) {
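
setFunctionLdsSize and setFunctionNumUsed{V,S}gprs above record per-function advisory values keyed by the function name. A hedged sketch of a caller (the PALMetadata pointer and the Info fields are assumptions; only the setFunction* calls come from this patch):

    PALMetadata->setFunctionLdsSize(MF, Info.LDSSize);
    PALMetadata->setFunctionNumUsedVgprs(MF, Info.NumVGPR);
    PALMetadata->setFunctionNumUsedSgprs(MF, Info.NumSGPR);
    // Wave dispatch still reads the rsrc1 registers; these entries are for
    // logging and tooling only, per the header comments further below.
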
@@ -592,6 +613,41 @@ static const char *getRegisterName(unsigned RegNum) {
{0xa2c1, "VGT_STRMOUT_VTX_STRIDE_3"},
{0xa316, "VGT_VERTEX_REUSE_BLOCK_CNTL"},
+ {0x2e28, "COMPUTE_PGM_RSRC3"},
+ {0x2e2a, "COMPUTE_SHADER_CHKSUM"},
+ {0x2e24, "COMPUTE_USER_ACCUM_0"},
+ {0x2e25, "COMPUTE_USER_ACCUM_1"},
+ {0x2e26, "COMPUTE_USER_ACCUM_2"},
+ {0x2e27, "COMPUTE_USER_ACCUM_3"},
+ {0xa1ff, "GE_MAX_OUTPUT_PER_SUBGROUP"},
+ {0xa2d3, "GE_NGG_SUBGRP_CNTL"},
+ {0xc25f, "GE_STEREO_CNTL"},
+ {0xc262, "GE_USER_VGPR_EN"},
+ {0xc258, "IA_MULTI_VGT_PARAM_PIPED"},
+ {0xa210, "PA_STEREO_CNTL"},
+ {0xa1c2, "SPI_SHADER_IDX_FORMAT"},
+ {0x2c80, "SPI_SHADER_PGM_CHKSUM_GS"},
+ {0x2d00, "SPI_SHADER_PGM_CHKSUM_HS"},
+ {0x2c06, "SPI_SHADER_PGM_CHKSUM_PS"},
+ {0x2c45, "SPI_SHADER_PGM_CHKSUM_VS"},
+ {0x2c88, "SPI_SHADER_PGM_LO_GS"},
+ {0x2cb2, "SPI_SHADER_USER_ACCUM_ESGS_0"},
+ {0x2cb3, "SPI_SHADER_USER_ACCUM_ESGS_1"},
+ {0x2cb4, "SPI_SHADER_USER_ACCUM_ESGS_2"},
+ {0x2cb5, "SPI_SHADER_USER_ACCUM_ESGS_3"},
+ {0x2d32, "SPI_SHADER_USER_ACCUM_LSHS_0"},
+ {0x2d33, "SPI_SHADER_USER_ACCUM_LSHS_1"},
+ {0x2d34, "SPI_SHADER_USER_ACCUM_LSHS_2"},
+ {0x2d35, "SPI_SHADER_USER_ACCUM_LSHS_3"},
+ {0x2c32, "SPI_SHADER_USER_ACCUM_PS_0"},
+ {0x2c33, "SPI_SHADER_USER_ACCUM_PS_1"},
+ {0x2c34, "SPI_SHADER_USER_ACCUM_PS_2"},
+ {0x2c35, "SPI_SHADER_USER_ACCUM_PS_3"},
+ {0x2c72, "SPI_SHADER_USER_ACCUM_VS_0"},
+ {0x2c73, "SPI_SHADER_USER_ACCUM_VS_1"},
+ {0x2c74, "SPI_SHADER_USER_ACCUM_VS_2"},
+ {0x2c75, "SPI_SHADER_USER_ACCUM_VS_3"},
+
{0, nullptr}};
auto Entry = RegInfoTable;
for (; Entry->Num && Entry->Num != RegNum; ++Entry)
@@ -653,7 +709,7 @@ void AMDGPUPALMetadata::toString(std::string &String) {
// a .note record of the specified AMD type. Returns an empty blob if
// there is no PAL metadata,
void AMDGPUPALMetadata::toBlob(unsigned Type, std::string &Blob) {
- if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+ if (Type == ELF::NT_AMD_PAL_METADATA)
toLegacyBlob(Blob);
else if (Type)
toMsgPackBlob(Blob);
@@ -790,7 +846,7 @@ const char *AMDGPUPALMetadata::getVendor() const {
}
// Get .note record type of metadata blob to be emitted:
-// ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
+// ELF::NT_AMD_PAL_METADATA (legacy key=val format), or
// ELF::NT_AMDGPU_METADATA (MsgPack format), or
// 0 (no PAL metadata).
unsigned AMDGPUPALMetadata::getType() const {
@@ -799,12 +855,12 @@ unsigned AMDGPUPALMetadata::getType() const {
// Return whether the blob type is legacy PAL metadata.
bool AMDGPUPALMetadata::isLegacy() const {
- return BlobType == ELF::NT_AMD_AMDGPU_PAL_METADATA;
+ return BlobType == ELF::NT_AMD_PAL_METADATA;
}
// Set legacy PAL metadata format.
void AMDGPUPALMetadata::setLegacy() {
- BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
+ BlobType = ELF::NT_AMD_PAL_METADATA;
}
// Erase all PAL metadata.
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 8fa1f738487c..7fdd9a8429c1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -80,6 +80,21 @@ public:
// Set the stack frame size of a function in the metadata.
void setFunctionScratchSize(const MachineFunction &MF, unsigned Val);
+ // Set the amount of LDS used in bytes in the metadata. This is an optional
+ // advisory record for logging etc; wave dispatch actually uses the rsrc1
+ // register for the shader stage to determine the amount of LDS to allocate.
+ void setFunctionLdsSize(const MachineFunction &MF, unsigned Val);
+
+ // Set the number of used vgprs in the metadata. This is an optional advisory
+ // record for logging etc; wave dispatch actually uses the rsrc1 register for
+ // the shader stage to determine the number of vgprs to allocate.
+ void setFunctionNumUsedVgprs(const MachineFunction &MF, unsigned Val);
+
+ // Set the number of used sgprs in the metadata. This is an optional advisory
+ // record for logging etc; wave dispatch actually uses the rsrc1 register for
+ // the shader stage to determine the number of sgprs to allocate.
+ void setFunctionNumUsedSgprs(const MachineFunction &MF, unsigned Val);
+
// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
void setWave32(unsigned CC);
@@ -95,7 +110,7 @@ public:
const char *getVendor() const;
// Get .note record type of metadata blob to be emitted:
- // ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
+ // ELF::NT_AMD_PAL_METADATA (legacy key=val format), or
// ELF::NT_AMDGPU_METADATA (MsgPack format), or
// 0 (no PAL metadata).
unsigned getType() const;