author     Dimitry Andric <dim@FreeBSD.org>    2021-08-22 19:00:43 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2021-11-13 20:39:49 +0000
commit     fe6060f10f634930ff71b7c50291ddc610da2475 (patch)
tree       1483580c790bd4d27b6500a7542b5ee00534d3cc /contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils
parent     b61bce17f346d79cecfd8f195a64b10f77be43b1 (diff)
parent     344a3780b2e33f6ca763666c380202b18aab72a3 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp     18
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h        8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    306
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h       63
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp    355
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h       70
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp   68
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h     17
8 files changed, 848 insertions, 57 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index c8a85d76a55b..0bee9022975e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -15,19 +15,19 @@ namespace AMDGPU {
namespace SendMsg {
// This must be in sync with llvm::AMDGPU::SendMsg::Id enum members, see SIDefines.h.
-const char* const IdSymbolic[] = {
+const char *const IdSymbolic[ID_GAPS_LAST_] = {
nullptr,
"MSG_INTERRUPT",
"MSG_GS",
"MSG_GS_DONE",
- nullptr,
- nullptr,
- nullptr,
- nullptr,
- nullptr,
+ "MSG_SAVEWAVE",
+ "MSG_STALL_WAVE_GEN",
+ "MSG_HALT_WAVES",
+ "MSG_ORDERED_PS_DONE",
+ "MSG_EARLY_PRIM_DEALLOC",
"MSG_GS_ALLOC_REQ",
"MSG_GET_DOORBELL",
- nullptr,
+ "MSG_GET_DDID",
nullptr,
nullptr,
nullptr,
@@ -35,7 +35,7 @@ const char* const IdSymbolic[] = {
};
// These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h.
-const char* const OpSysSymbolic[] = {
+const char *const OpSysSymbolic[OP_SYS_LAST_] = {
nullptr,
"SYSMSG_OP_ECC_ERR_INTERRUPT",
"SYSMSG_OP_REG_RD",
@@ -43,7 +43,7 @@ const char* const OpSysSymbolic[] = {
"SYSMSG_OP_TTRACE_PC"
};
-const char* const OpGsSymbolic[] = {
+const char *const OpGsSymbolic[OP_GS_LAST_] = {
"GS_OP_NOP",
"GS_OP_CUT",
"GS_OP_EMIT",
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index 3eb27c5e5f42..d1deb570a938 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
+#include "SIDefines.h"
+
namespace llvm {
class StringLiteral;
@@ -17,9 +19,9 @@ namespace AMDGPU {
namespace SendMsg { // Symbolic names for the sendmsg(...) syntax.
-extern const char* const IdSymbolic[];
-extern const char* const OpSysSymbolic[];
-extern const char* const OpGsSymbolic[];
+extern const char *const IdSymbolic[ID_GAPS_LAST_];
+extern const char *const OpSysSymbolic[OP_SYS_LAST_];
+extern const char *const OpGsSymbolic[OP_GS_LAST_];
} // namespace SendMsg
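
Sizing the extern symbolic-name arrays with the SIDefines.h sentinels (as in the hunk above) lets other translation units bounds-check the tables. A minimal sketch under that assumption (lookupMsgName is a hypothetical helper, not part of this patch):

    #include "AMDGPUAsmUtils.h" // now pulls in SIDefines.h for the array bounds
    const char *lookupMsgName(int64_t MsgId) {
      using namespace llvm::AMDGPU::SendMsg;
      if (MsgId < 0 || MsgId >= ID_GAPS_LAST_)
        return nullptr;
      return IdSymbolic[MsgId]; // may still be nullptr for reserved gaps
    }
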
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4c1e4dec7ecb..29bbf50cbfdc 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -30,7 +30,8 @@
static llvm::cl::opt<unsigned> AmdhsaCodeObjectVersion(
"amdhsa-code-object-version", llvm::cl::Hidden,
- llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(3));
+ llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(4),
+ llvm::cl::ZeroOrMore);
namespace {
@@ -96,23 +97,36 @@ Optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) {
return ELF::ELFABIVERSION_AMDGPU_HSA_V2;
case 3:
return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ case 4:
+ return ELF::ELFABIVERSION_AMDGPU_HSA_V4;
default:
- return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ report_fatal_error(Twine("Unsupported AMDHSA Code Object Version ") +
+ Twine(AmdhsaCodeObjectVersion));
}
}
bool isHsaAbiVersion2(const MCSubtargetInfo *STI) {
- if (const auto &&HsaAbiVer = getHsaAbiVersion(STI))
- return HsaAbiVer.getValue() == ELF::ELFABIVERSION_AMDGPU_HSA_V2;
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+ return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V2;
return false;
}
bool isHsaAbiVersion3(const MCSubtargetInfo *STI) {
- if (const auto &&HsaAbiVer = getHsaAbiVersion(STI))
- return HsaAbiVer.getValue() == ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+ return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ return false;
+}
+
+bool isHsaAbiVersion4(const MCSubtargetInfo *STI) {
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+ return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V4;
return false;
}
+bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI) {
+ return isHsaAbiVersion3(STI) || isHsaAbiVersion4(STI);
+}
+
#define GET_MIMGBaseOpcodesTable_IMPL
#define GET_MIMGDimInfoTable_IMPL
#define GET_MIMGInfoTable_IMPL
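
The hunk above bumps the default code-object version to 4, turns an unknown version into a fatal error, and adds isHsaAbiVersion4/isHsaAbiVersion3Or4. A hedged sketch of how a caller can branch on them (pickMetadataLayout is a hypothetical function; only the isHsaAbiVersion* helpers come from this patch):

    #include "Utils/AMDGPUBaseInfo.h"
    void pickMetadataLayout(const llvm::MCSubtargetInfo *STI) {
      using namespace llvm::AMDGPU;
      if (isHsaAbiVersion2(STI)) {
        // legacy code object V2 handling
      } else if (isHsaAbiVersion3Or4(STI)) {
        // V3 and V4 are mostly handled together; use isHsaAbiVersion4(STI)
        // where they differ (e.g. target-id feature suffixes).
      }
    }
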
@@ -141,6 +155,34 @@ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
return NewInfo ? NewInfo->Opcode : -1;
}
+unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode,
+ const MIMGDimInfo *Dim, bool IsA16,
+ bool IsG16Supported) {
+ unsigned AddrWords = BaseOpcode->NumExtraArgs;
+ unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ if (IsA16)
+ AddrWords += divideCeil(AddrComponents, 2);
+ else
+ AddrWords += AddrComponents;
+
+ // Note: For subtargets that support A16 but not G16, enabling A16 also
+ // enables 16 bit gradients.
+ // For subtargets that support A16 (operand) and G16 (done with a different
+ // instruction encoding), they are independent.
+
+ if (BaseOpcode->Gradients) {
+ if ((IsA16 && !IsG16Supported) || BaseOpcode->G16)
+ // There are two gradients per coordinate; we pack them separately.
+ // For the 3d case we get
+ // (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv).
+ AddrWords += alignTo<2>(Dim->NumGradients / 2);
+ else
+ AddrWords += Dim->NumGradients;
+ }
+ return AddrWords;
+}
+
struct MUBUFInfo {
uint16_t Opcode;
uint16_t BaseOpcode;
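
A worked example of the getAddrSizeMIMGOp computation added above, with the table fields filled in by hand (the concrete values are assumptions for illustration, not read from the generated MIMG tables):

    // Assume a 2D gradient sample: NumExtraArgs = 0, Coordinates = true,
    // NumCoords = 2, LodOrClampOrMip = false, Gradients = true, NumGradients = 4.
    //   IsA16 = false, no G16:  0 + 2 + 4                                = 6 address words
    //   IsA16 = true,  no G16:  0 + divideCeil(2, 2) + alignTo<2>(4 / 2) = 3 address words
    // (With A16 but no G16 support the gradients are packed as well, per the
    // note in the function above.)
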
@@ -148,6 +190,7 @@ struct MUBUFInfo {
bool has_vaddr;
bool has_srsrc;
bool has_soffset;
+ bool IsBufferInv;
};
struct MTBUFInfo {
@@ -164,12 +207,23 @@ struct SMInfo {
bool IsBuffer;
};
+struct VOPInfo {
+ uint16_t Opcode;
+ bool IsSingle;
+};
+
#define GET_MTBUFInfoTable_DECL
#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
#define GET_MUBUFInfoTable_IMPL
#define GET_SMInfoTable_DECL
#define GET_SMInfoTable_IMPL
+#define GET_VOP1InfoTable_DECL
+#define GET_VOP1InfoTable_IMPL
+#define GET_VOP2InfoTable_DECL
+#define GET_VOP2InfoTable_IMPL
+#define GET_VOP3InfoTable_DECL
+#define GET_VOP3InfoTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMTBUFBaseOpcode(unsigned Opc) {
@@ -232,11 +286,31 @@ bool getMUBUFHasSoffset(unsigned Opc) {
return Info ? Info->has_soffset : false;
}
+bool getMUBUFIsBufferInv(unsigned Opc) {
+ const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+ return Info ? Info->IsBufferInv : false;
+}
+
bool getSMEMIsBuffer(unsigned Opc) {
const SMInfo *Info = getSMEMOpcodeHelper(Opc);
return Info ? Info->IsBuffer : false;
}
+bool getVOP1IsSingle(unsigned Opc) {
+ const VOPInfo *Info = getVOP1OpcodeHelper(Opc);
+ return Info ? Info->IsSingle : false;
+}
+
+bool getVOP2IsSingle(unsigned Opc) {
+ const VOPInfo *Info = getVOP2OpcodeHelper(Opc);
+ return Info ? Info->IsSingle : false;
+}
+
+bool getVOP3IsSingle(unsigned Opc) {
+ const VOPInfo *Info = getVOP3OpcodeHelper(Opc);
+ return Info ? Info->IsSingle : false;
+}
+
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
// header files, so we need to wrap it in a function that takes unsigned
// instead.
@@ -247,7 +321,8 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) {
namespace IsaInfo {
AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI)
- : XnackSetting(TargetIDSetting::Any), SramEccSetting(TargetIDSetting::Any) {
+ : STI(STI), XnackSetting(TargetIDSetting::Any),
+ SramEccSetting(TargetIDSetting::Any) {
if (!STI.getFeatureBits().test(FeatureSupportsXNACK))
XnackSetting = TargetIDSetting::Unsupported;
if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC))
@@ -334,25 +409,109 @@ void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) {
}
}
-void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
- auto TargetTriple = STI->getTargetTriple();
- auto Version = getIsaVersion(STI->getCPU());
+std::string AMDGPUTargetID::toString() const {
+ std::string StringRep = "";
+ raw_string_ostream StreamRep(StringRep);
- Stream << TargetTriple.getArchName() << '-'
- << TargetTriple.getVendorName() << '-'
- << TargetTriple.getOSName() << '-'
- << TargetTriple.getEnvironmentName() << '-'
- << "gfx"
- << Version.Major
- << Version.Minor
- << Version.Stepping;
+ auto TargetTriple = STI.getTargetTriple();
+ auto Version = getIsaVersion(STI.getCPU());
- if (hasXNACK(*STI))
- Stream << "+xnack";
- if (hasSRAMECC(*STI))
- Stream << "+sramecc";
+ StreamRep << TargetTriple.getArchName() << '-'
+ << TargetTriple.getVendorName() << '-'
+ << TargetTriple.getOSName() << '-'
+ << TargetTriple.getEnvironmentName() << '-';
- Stream.flush();
+ std::string Processor = "";
+ // TODO: The following else branch is present because we used various alias
+ // names for GPUs up until GFX9 (e.g. 'fiji' is the same as 'gfx803').
+ // Remove it once all aliases are removed from GCNProcessors.td.
+ if (Version.Major >= 9)
+ Processor = STI.getCPU().str();
+ else
+ Processor = (Twine("gfx") + Twine(Version.Major) + Twine(Version.Minor) +
+ Twine(Version.Stepping))
+ .str();
+
+ std::string Features = "";
+ if (Optional<uint8_t> HsaAbiVersion = getHsaAbiVersion(&STI)) {
+ switch (*HsaAbiVersion) {
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+ // Code object V2 only supported specific processors and had fixed
+ // settings for the XNACK.
+ if (Processor == "gfx600") {
+ } else if (Processor == "gfx601") {
+ } else if (Processor == "gfx602") {
+ } else if (Processor == "gfx700") {
+ } else if (Processor == "gfx701") {
+ } else if (Processor == "gfx702") {
+ } else if (Processor == "gfx703") {
+ } else if (Processor == "gfx704") {
+ } else if (Processor == "gfx705") {
+ } else if (Processor == "gfx801") {
+ if (!isXnackOnOrAny())
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor +
+ " without XNACK");
+ } else if (Processor == "gfx802") {
+ } else if (Processor == "gfx803") {
+ } else if (Processor == "gfx805") {
+ } else if (Processor == "gfx810") {
+ if (!isXnackOnOrAny())
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor +
+ " without XNACK");
+ } else if (Processor == "gfx900") {
+ if (isXnackOnOrAny())
+ Processor = "gfx901";
+ } else if (Processor == "gfx902") {
+ if (isXnackOnOrAny())
+ Processor = "gfx903";
+ } else if (Processor == "gfx904") {
+ if (isXnackOnOrAny())
+ Processor = "gfx905";
+ } else if (Processor == "gfx906") {
+ if (isXnackOnOrAny())
+ Processor = "gfx907";
+ } else if (Processor == "gfx90c") {
+ if (isXnackOnOrAny())
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor +
+ " with XNACK being ON or ANY");
+ } else {
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor);
+ }
+ break;
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+ // xnack.
+ if (isXnackOnOrAny())
+ Features += "+xnack";
+ // In code object v2 and v3, "sramecc" feature was spelled with a
+ // hyphen ("sram-ecc").
+ if (isSramEccOnOrAny())
+ Features += "+sram-ecc";
+ break;
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ // sramecc.
+ if (getSramEccSetting() == TargetIDSetting::Off)
+ Features += ":sramecc-";
+ else if (getSramEccSetting() == TargetIDSetting::On)
+ Features += ":sramecc+";
+ // xnack.
+ if (getXnackSetting() == TargetIDSetting::Off)
+ Features += ":xnack-";
+ else if (getXnackSetting() == TargetIDSetting::On)
+ Features += ":xnack+";
+ break;
+ default:
+ break;
+ }
+ }
+
+ StreamRep << Processor << Features;
+
+ StreamRep.flush();
+ return StringRep;
}
unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
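
The streamIsaVersion helper is replaced by AMDGPUTargetID::toString above, which also encodes the xnack/sramecc settings according to the selected code-object version. A hedged usage sketch (the subtarget configuration and the resulting string are illustrative):

    // STI is a const MCSubtargetInfo & and FS a feature string (assumed here).
    llvm::AMDGPU::IsaInfo::AMDGPUTargetID TargetID(STI);
    TargetID.setTargetIDFromFeaturesString(FS); // e.g. "+sramecc,-xnack"
    std::string S = TargetID.toString();
    // Under code object V4 this yields something like
    // "amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack-".
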
@@ -402,6 +561,8 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
// FIXME: Need to take scratch memory into account.
+ if (isGFX90A(*STI))
+ return 8;
if (!isGFX10Plus(*STI))
return 10;
return hasGFX10_3Insts(*STI) ? 16 : 20;
@@ -531,6 +692,9 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
Optional<bool> EnableWavefrontSize32) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 8;
+
bool IsWave32 = EnableWavefrontSize32 ?
*EnableWavefrontSize32 :
STI->getFeatureBits().test(FeatureWavefrontSize32);
@@ -543,6 +707,8 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
Optional<bool> EnableWavefrontSize32) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 8;
bool IsWave32 = EnableWavefrontSize32 ?
*EnableWavefrontSize32 :
@@ -552,12 +718,16 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
}
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 512;
if (!isGFX10Plus(*STI))
return 256;
return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512;
}
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 512;
return 256;
}
@@ -653,6 +823,11 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1);
}
+ if (AMDGPU::isGFX90A(*STI)) {
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc3,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
+ STI->getFeatureBits().test(FeatureTgSplit) ? 1 : 0);
+ }
return KD;
}
@@ -1049,23 +1224,32 @@ int64_t getMsgId(const StringRef Name) {
return ID_UNKNOWN_;
}
-static bool isValidMsgId(int64_t MsgId) {
- return (ID_GAPS_FIRST_ <= MsgId && MsgId < ID_GAPS_LAST_) && IdSymbolic[MsgId];
-}
-
bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict) {
if (Strict) {
- if (MsgId == ID_GS_ALLOC_REQ || MsgId == ID_GET_DOORBELL)
+ switch (MsgId) {
+ case ID_SAVEWAVE:
+ return isVI(STI) || isGFX9Plus(STI);
+ case ID_STALL_WAVE_GEN:
+ case ID_HALT_WAVES:
+ case ID_ORDERED_PS_DONE:
+ case ID_GS_ALLOC_REQ:
+ case ID_GET_DOORBELL:
return isGFX9Plus(STI);
- else
- return isValidMsgId(MsgId);
+ case ID_EARLY_PRIM_DEALLOC:
+ return isGFX9(STI);
+ case ID_GET_DDID:
+ return isGFX10Plus(STI);
+ default:
+ return 0 <= MsgId && MsgId < ID_GAPS_LAST_ && IdSymbolic[MsgId];
+ }
} else {
return 0 <= MsgId && isUInt<ID_WIDTH_>(MsgId);
}
}
StringRef getMsgName(int64_t MsgId) {
- return isValidMsgId(MsgId)? IdSymbolic[MsgId] : "";
+ assert(0 <= MsgId && MsgId < ID_GAPS_LAST_);
+ return IdSymbolic[MsgId];
}
int64_t getMsgOpId(int64_t MsgId, const StringRef Name) {
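
Strict sendmsg validation is now subtarget-dependent, as the switch above shows. A small sketch of the strict/relaxed split (STI is an assumed const MCSubtargetInfo reference):

    using namespace llvm::AMDGPU;
    using namespace llvm::AMDGPU::SendMsg;
    // Strict: the id must name a message the subtarget actually supports.
    bool KnownOnTarget = isValidMsgId(ID_GET_DDID, STI, /*Strict=*/true); // GFX10+ only
    // Relaxed: any value that fits the encoding field is accepted.
    bool FitsField = isValidMsgId(42, STI, /*Strict=*/false);
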
@@ -1080,7 +1264,9 @@ int64_t getMsgOpId(int64_t MsgId, const StringRef Name) {
return OP_UNKNOWN_;
}
-bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict) {
+bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI,
+ bool Strict) {
+ assert(isValidMsgId(MsgId, STI, Strict));
if (!Strict)
return 0 <= OpId && isUInt<OP_WIDTH_>(OpId);
@@ -1103,7 +1289,9 @@ StringRef getMsgOpName(int64_t MsgId, int64_t OpId) {
return (MsgId == ID_SYSMSG)? OpSysSymbolic[OpId] : OpGsSymbolic[OpId];
}
-bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict) {
+bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId,
+ const MCSubtargetInfo &STI, bool Strict) {
+ assert(isValidMsgOp(MsgId, OpId, STI, Strict));
if (!Strict)
return 0 <= StreamId && isUInt<STREAM_ID_WIDTH_>(StreamId);
@@ -1156,6 +1344,17 @@ unsigned getInitialPSInputAddr(const Function &F) {
return getIntegerAttribute(F, "InitialPSInputAddr", 0);
}
+bool getHasColorExport(const Function &F) {
+ // As a safe default, always respond as if PS has color exports.
+ return getIntegerAttribute(
+ F, "amdgpu-color-export",
+ F.getCallingConv() == CallingConv::AMDGPU_PS ? 1 : 0) != 0;
+}
+
+bool getHasDepthExport(const Function &F) {
+ return getIntegerAttribute(F, "amdgpu-depth-export", 0) != 0;
+}
+
bool isShader(CallingConv::ID cc) {
switch(cc) {
case CallingConv::AMDGPU_VS:
@@ -1259,6 +1458,10 @@ bool isGCN3Encoding(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
}
+bool isGFX10_AEncoding(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX10_AEncoding];
+}
+
bool isGFX10_BEncoding(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding];
}
@@ -1267,6 +1470,14 @@ bool hasGFX10_3Insts(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts];
}
+bool isGFX90A(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
+}
+
+bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
+}
+
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
@@ -1374,6 +1585,9 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return true;
default:
return false;
@@ -1413,41 +1627,67 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::VReg_64RegClassID:
case AMDGPU::AReg_64RegClassID:
case AMDGPU::SReg_64_XEXECRegClassID:
+ case AMDGPU::VReg_64_Align2RegClassID:
+ case AMDGPU::AReg_64_Align2RegClassID:
return 64;
case AMDGPU::SGPR_96RegClassID:
case AMDGPU::SReg_96RegClassID:
case AMDGPU::VReg_96RegClassID:
case AMDGPU::AReg_96RegClassID:
+ case AMDGPU::VReg_96_Align2RegClassID:
+ case AMDGPU::AReg_96_Align2RegClassID:
+ case AMDGPU::AV_96RegClassID:
return 96;
case AMDGPU::SGPR_128RegClassID:
case AMDGPU::SReg_128RegClassID:
case AMDGPU::VReg_128RegClassID:
case AMDGPU::AReg_128RegClassID:
+ case AMDGPU::VReg_128_Align2RegClassID:
+ case AMDGPU::AReg_128_Align2RegClassID:
+ case AMDGPU::AV_128RegClassID:
return 128;
case AMDGPU::SGPR_160RegClassID:
case AMDGPU::SReg_160RegClassID:
case AMDGPU::VReg_160RegClassID:
case AMDGPU::AReg_160RegClassID:
+ case AMDGPU::VReg_160_Align2RegClassID:
+ case AMDGPU::AReg_160_Align2RegClassID:
+ case AMDGPU::AV_160RegClassID:
return 160;
case AMDGPU::SGPR_192RegClassID:
case AMDGPU::SReg_192RegClassID:
case AMDGPU::VReg_192RegClassID:
case AMDGPU::AReg_192RegClassID:
+ case AMDGPU::VReg_192_Align2RegClassID:
+ case AMDGPU::AReg_192_Align2RegClassID:
return 192;
+ case AMDGPU::SGPR_224RegClassID:
+ case AMDGPU::SReg_224RegClassID:
+ case AMDGPU::VReg_224RegClassID:
+ case AMDGPU::AReg_224RegClassID:
+ case AMDGPU::VReg_224_Align2RegClassID:
+ case AMDGPU::AReg_224_Align2RegClassID:
+ return 224;
case AMDGPU::SGPR_256RegClassID:
case AMDGPU::SReg_256RegClassID:
case AMDGPU::VReg_256RegClassID:
case AMDGPU::AReg_256RegClassID:
+ case AMDGPU::VReg_256_Align2RegClassID:
+ case AMDGPU::AReg_256_Align2RegClassID:
return 256;
case AMDGPU::SGPR_512RegClassID:
case AMDGPU::SReg_512RegClassID:
case AMDGPU::VReg_512RegClassID:
case AMDGPU::AReg_512RegClassID:
+ case AMDGPU::VReg_512_Align2RegClassID:
+ case AMDGPU::AReg_512_Align2RegClassID:
return 512;
case AMDGPU::SGPR_1024RegClassID:
case AMDGPU::SReg_1024RegClassID:
case AMDGPU::VReg_1024RegClassID:
case AMDGPU::AReg_1024RegClassID:
+ case AMDGPU::VReg_1024_Align2RegClassID:
+ case AMDGPU::AReg_1024_Align2RegClassID:
return 1024;
default:
llvm_unreachable("Unexpected register class");
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f9378693cf48..72c872dec5ba 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -44,6 +44,12 @@ bool isHsaAbiVersion2(const MCSubtargetInfo *STI);
/// \returns True if HSA OS ABI Version identification is 3,
/// false otherwise.
bool isHsaAbiVersion3(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 4,
+/// false otherwise.
+bool isHsaAbiVersion4(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 3 or 4,
+/// false otherwise.
+bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI);
struct GcnBufferFormatInfo {
unsigned Format;
@@ -78,6 +84,7 @@ enum class TargetIDSetting {
class AMDGPUTargetID {
private:
+ const MCSubtargetInfo &STI;
TargetIDSetting XnackSetting;
TargetIDSetting SramEccSetting;
@@ -145,10 +152,10 @@ public:
void setTargetIDFromFeaturesString(StringRef FS);
void setTargetIDFromTargetIDStream(StringRef TargetID);
-};
-/// Streams isa version string for given subtarget \p STI into \p Stream.
-void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
+ /// \returns String representation of the target ID.
+ std::string toString() const;
+};
/// \returns Wavefront size for given subtarget \p STI.
unsigned getWavefrontSize(const MCSubtargetInfo *STI);
@@ -284,6 +291,7 @@ struct MIMGBaseOpcodeInfo {
bool Coordinates;
bool LodOrClampOrMip;
bool HasD16;
+ bool MSAA;
};
LLVM_READONLY
@@ -293,6 +301,7 @@ struct MIMGDimInfo {
MIMGDim Dim;
uint8_t NumCoords;
uint8_t NumGradients;
+ bool MSAA;
bool DA;
uint8_t Encoding;
const char *AsmSuffix;
@@ -338,6 +347,11 @@ int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
LLVM_READONLY
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels);
+LLVM_READONLY
+unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode,
+ const MIMGDimInfo *Dim, bool IsA16,
+ bool IsG16Supported);
+
struct MIMGInfo {
uint16_t Opcode;
uint16_t BaseOpcode;
@@ -386,9 +400,21 @@ LLVM_READONLY
bool getMUBUFHasSoffset(unsigned Opc);
LLVM_READONLY
+bool getMUBUFIsBufferInv(unsigned Opc);
+
+LLVM_READONLY
bool getSMEMIsBuffer(unsigned Opc);
LLVM_READONLY
+bool getVOP1IsSingle(unsigned Opc);
+
+LLVM_READONLY
+bool getVOP2IsSingle(unsigned Opc);
+
+LLVM_READONLY
+bool getVOP3IsSingle(unsigned Opc);
+
+LLVM_READONLY
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,
uint8_t NumFormat,
@@ -459,6 +485,14 @@ struct Waitcnt {
return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
}
+ bool hasWaitExceptVsCnt() const {
+ return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u;
+ }
+
+ bool hasWaitVsCnt() const {
+ return VsCnt != ~0u;
+ }
+
bool dominates(const Waitcnt &Other) const {
return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt;
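
hasWaitExceptVsCnt and hasWaitVsCnt split the pending counters into the part expressed by a regular s_waitcnt and the store counter, which GFX10+ waits on with a separate instruction. A hedged sketch of the intended split (the emit* helpers are hypothetical):

    void emitWaits(const llvm::AMDGPU::Waitcnt &Wait) {
      if (Wait.hasWaitExceptVsCnt())
        emitSWaitcnt(Wait.VmCnt, Wait.ExpCnt, Wait.LgkmCnt); // s_waitcnt
      if (Wait.hasWaitVsCnt())
        emitSWaitcntVsCnt(Wait.VsCnt);                       // s_waitcnt_vscnt
    }
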
@@ -627,10 +661,12 @@ LLVM_READNONE
bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict = true);
LLVM_READNONE
-bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict = true);
+bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI,
+ bool Strict = true);
LLVM_READNONE
-bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict = true);
+bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId,
+ const MCSubtargetInfo &STI, bool Strict = true);
LLVM_READNONE
bool msgRequiresOp(int64_t MsgId);
@@ -653,6 +689,10 @@ uint64_t encodeMsg(uint64_t MsgId,
unsigned getInitialPSInputAddr(const Function &F);
+bool getHasColorExport(const Function &F);
+
+bool getHasDepthExport(const Function &F);
+
LLVM_READNONE
bool isShader(CallingConv::ID CC);
@@ -701,8 +741,11 @@ bool isGFX9Plus(const MCSubtargetInfo &STI);
bool isGFX10(const MCSubtargetInfo &STI);
bool isGFX10Plus(const MCSubtargetInfo &STI);
bool isGCN3Encoding(const MCSubtargetInfo &STI);
+bool isGFX10_AEncoding(const MCSubtargetInfo &STI);
bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
+bool isGFX90A(const MCSubtargetInfo &STI);
+bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
/// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -746,12 +789,17 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
return 4;
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return 8;
case AMDGPU::OPERAND_REG_IMM_INT16:
@@ -847,6 +895,11 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
const GCNSubtarget *Subtarget,
Align Alignment = Align(4));
+LLVM_READNONE
+inline bool isLegal64BitDPPControl(unsigned DC) {
+ return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
+}
+
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
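
isLegal64BitDPPControl restricts 64-bit DPP operands to the row_newbcast control range defined in SIDefines.h. A brief sketch of the intended check (the surrounding validation code and the Is64Bit/DppCtrl names are assumptions):

    if (Is64Bit && !llvm::AMDGPU::isLegal64BitDPPControl(DppCtrl))
      return false; // only ROW_NEWBCAST_FIRST..ROW_NEWBCAST_LAST is legal here
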
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
new file mode 100644
index 000000000000..da8fcf3900bb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -0,0 +1,355 @@
+//===- AMDGPULDSUtils.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU LDS related helper utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULDSUtils.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/ReplaceConstant.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+namespace AMDGPU {
+
+// A helper class for collecting all reachable callees for each kernel defined
+// within the module.
+class CollectReachableCallees {
+ Module &M;
+ CallGraph CG;
+ SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions;
+
+ // Collect all address taken functions within the module.
+ void collectAddressTakenFunctions() {
+ auto *ECNode = CG.getExternalCallingNode();
+
+ for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) {
+ auto *CGN = GI->second;
+ auto *F = CGN->getFunction();
+ if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F))
+ continue;
+ AddressTakenFunctions.insert(CGN);
+ }
+ }
+
+ // For given kernel, collect all its reachable non-kernel functions.
+ SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) {
+ SmallPtrSet<Function *, 8> ReachableCallees;
+
+ // Call graph node which represents this kernel.
+ auto *KCGN = CG[K];
+
+ // Go through all call graph nodes reachable from the node representing this
+ // kernel and visit all their call sites. If a call site is direct, add the
+ // corresponding callee to the reachable callee set; if it is indirect,
+ // resolve it to the potentially reachable callees, add them to the
+ // reachable callee set, and repeat the process for the newly added
+ // potential callee nodes.
+ //
+ // FIXME: Need to handle bit-casted function pointers.
+ //
+ SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN));
+ SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes;
+ while (!CGNStack.empty()) {
+ auto *CGN = CGNStack.pop_back_val();
+
+ if (!VisitedCGNodes.insert(CGN).second)
+ continue;
+
+ for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) {
+ auto *RCB = cast<CallBase>(GI->first.getValue());
+ auto *RCGN = GI->second;
+
+ if (auto *DCallee = RCGN->getFunction()) {
+ ReachableCallees.insert(DCallee);
+ } else if (RCB->isIndirectCall()) {
+ auto *RCBFTy = RCB->getFunctionType();
+ for (auto *ACGN : AddressTakenFunctions) {
+ auto *ACallee = ACGN->getFunction();
+ if (ACallee->getFunctionType() == RCBFTy) {
+ ReachableCallees.insert(ACallee);
+ CGNStack.append(df_begin(ACGN), df_end(ACGN));
+ }
+ }
+ }
+ }
+ }
+
+ return ReachableCallees;
+ }
+
+public:
+ explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) {
+ // Collect address taken functions.
+ collectAddressTakenFunctions();
+ }
+
+ void collectReachableCallees(
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
+ // Collect reachable callee set for each kernel defined in the module.
+ for (Function &F : M.functions()) {
+ if (!AMDGPU::isKernelCC(&F))
+ continue;
+ Function *K = &F;
+ KernelToCallees[K] = collectReachableCallees(K);
+ }
+ }
+};
+
+void collectReachableCallees(
+ Module &M,
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
+ CollectReachableCallees CRC{M};
+ CRC.collectReachableCallees(KernelToCallees);
+}
+
+SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) {
+ SmallPtrSet<Function *, 8> LDSAccessors;
+ SmallVector<User *, 8> UserStack(GV->users());
+ SmallPtrSet<User *, 8> VisitedUsers;
+
+ while (!UserStack.empty()) {
+ auto *U = UserStack.pop_back_val();
+
+ // If `U` has already been visited, continue to the next one.
+ if (!VisitedUsers.insert(U).second)
+ continue;
+
+ // `U` is a global variable which is initialized with the LDS variable; in
+ // that case, ignore the LDS variable and report no accessors.
+ if (isa<GlobalValue>(U))
+ return SmallPtrSet<Function *, 8>();
+
+ // Recursively explore constant users.
+ if (isa<Constant>(U)) {
+ append_range(UserStack, U->users());
+ continue;
+ }
+
+ // `U` should be an instruction; if it belongs to a non-kernel function F,
+ // then collect F.
+ Function *F = cast<Instruction>(U)->getFunction();
+ if (!AMDGPU::isKernelCC(F))
+ LDSAccessors.insert(F);
+ }
+
+ return LDSAccessors;
+}
+
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
+getFunctionToInstsMap(User *U, bool CollectKernelInsts) {
+ DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts;
+ SmallVector<User *, 8> UserStack;
+ SmallPtrSet<User *, 8> VisitedUsers;
+
+ UserStack.push_back(U);
+
+ while (!UserStack.empty()) {
+ auto *UU = UserStack.pop_back_val();
+
+ if (!VisitedUsers.insert(UU).second)
+ continue;
+
+ if (isa<GlobalValue>(UU))
+ continue;
+
+ if (isa<Constant>(UU)) {
+ append_range(UserStack, UU->users());
+ continue;
+ }
+
+ auto *I = cast<Instruction>(UU);
+ Function *F = I->getFunction();
+ if (CollectKernelInsts) {
+ if (!AMDGPU::isKernelCC(F)) {
+ continue;
+ }
+ } else {
+ if (AMDGPU::isKernelCC(F)) {
+ continue;
+ }
+ }
+
+ FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>()));
+ FunctionToInsts[F].insert(I);
+ }
+
+ return FunctionToInsts;
+}
+
+bool isKernelCC(const Function *Func) {
+ return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
+}
+
+Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
+ return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
+ GV->getValueType());
+}
+
+static void collectFunctionUses(User *U, const Function *F,
+ SetVector<Instruction *> &InstUsers) {
+ SmallVector<User *> Stack{U};
+
+ while (!Stack.empty()) {
+ U = Stack.pop_back_val();
+
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ if (I->getFunction() == F)
+ InstUsers.insert(I);
+ continue;
+ }
+
+ if (!isa<ConstantExpr>(U))
+ continue;
+
+ append_range(Stack, U->users());
+ }
+}
+
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
+ SetVector<Instruction *> InstUsers;
+
+ collectFunctionUses(C, F, InstUsers);
+ for (Instruction *I : InstUsers) {
+ convertConstantExprsToInstructions(I, C);
+ }
+}
+
+bool hasUserInstruction(const GlobalValue *GV) {
+ SmallPtrSet<const User *, 8> Visited;
+ SmallVector<const User *, 16> Stack(GV->users());
+
+ while (!Stack.empty()) {
+ const User *U = Stack.pop_back_val();
+
+ if (!Visited.insert(U).second)
+ continue;
+
+ if (isa<Instruction>(U))
+ return true;
+
+ append_range(Stack, U->users());
+ }
+
+ return false;
+}
+
+bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) {
+ // We are not interested in kernel LDS lowering for module LDS itself.
+ if (F && GV.getName() == "llvm.amdgcn.module.lds")
+ return false;
+
+ bool Ret = false;
+ SmallPtrSet<const User *, 8> Visited;
+ SmallVector<const User *, 16> Stack(GV.users());
+ SmallPtrSet<const GlobalValue *, 8> GlobalUsers;
+
+ assert(!F || isKernelCC(F));
+
+ while (!Stack.empty()) {
+ const User *V = Stack.pop_back_val();
+ Visited.insert(V);
+
+ if (auto *G = dyn_cast<GlobalValue>(V)) {
+ StringRef GName = G->getName();
+ if (F && GName != "llvm.used" && GName != "llvm.compiler.used") {
+ // For kernel LDS lowering, if G is neither llvm.used nor llvm.compiler.used,
+ // then we cannot lower the LDS GV since we cannot replace the use of GV
+ // within G.
+ return false;
+ }
+ GlobalUsers.insert(G);
+ continue;
+ }
+
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ const Function *UF = I->getFunction();
+ if (UF == F) {
+ // Used from this kernel, we want to put it into the structure.
+ Ret = true;
+ } else if (!F) {
+ // For module LDS lowering, lowering is required if the user instruction
+ // is from a non-kernel function.
+ Ret |= !isKernelCC(UF);
+ }
+ continue;
+ }
+
+ // User V should be a constant, recursively visit users of V.
+ assert(isa<Constant>(V) && "Expected a constant.");
+ append_range(Stack, V->users());
+ }
+
+ if (!F && !Ret) {
+ // For module LDS lowering, we have not yet decided whether we should lower
+ // GV or not. Explore all global users of GV, and check if at least one of
+ // these global users appears as a use within an instruction (possibly a
+ // nested use via a constant expression); if so, conservatively lower the LDS.
+ for (auto *G : GlobalUsers)
+ Ret |= hasUserInstruction(G);
+ }
+
+ return Ret;
+}
+
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+ const Function *F) {
+ std::vector<llvm::GlobalVariable *> LocalVars;
+ for (auto &GV : M.globals()) {
+ if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+ continue;
+ }
+ if (!GV.hasInitializer()) {
+ // An addrspace(3) global without an initializer implies CUDA/HIP extern
+ // __shared__. The semantics for such a variable appear to be that all
+ // extern __shared__ variables alias one another, in which case this
+ // transform is not required.
+ continue;
+ }
+ if (!isa<UndefValue>(GV.getInitializer())) {
+ // Initializers are unimplemented for local address space.
+ // Leave such variables in place for consistent error reporting.
+ continue;
+ }
+ if (GV.isConstant()) {
+ // A constant undef variable can't be written to, and any load is
+ // undef, so it should be eliminated by the optimizer. It could be
+ // dropped by the back end if not. This pass skips over it.
+ continue;
+ }
+ if (!shouldLowerLDSToStruct(GV, F)) {
+ continue;
+ }
+ LocalVars.push_back(&GV);
+ }
+ return LocalVars;
+}
+
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
+ SmallPtrSet<GlobalValue *, 32> UsedList;
+
+ SmallVector<GlobalValue *, 32> TmpVec;
+ collectUsedGlobalVariables(M, TmpVec, true);
+ UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+ TmpVec.clear();
+ collectUsedGlobalVariables(M, TmpVec, false);
+ UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+ return UsedList;
+}
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
new file mode 100644
index 000000000000..ffcafb9b76ce
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -0,0 +1,70 @@
+//===- AMDGPULDSUtils.h - LDS related helper functions -*- C++ -*----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU LDS related helper utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Constants.h"
+
+namespace llvm {
+
+class ConstantExpr;
+
+namespace AMDGPU {
+
+/// Collect reachable callees for each kernel defined in the module \p M and
+/// return collected callees at \p KernelToCallees.
+void collectReachableCallees(
+ Module &M,
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees);
+
+/// For the given LDS global \p GV, visit all its users, collect all
+/// non-kernel functions within which \p GV is used, and return the collected
+/// list of such non-kernel functions.
+SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV);
+
+/// Collect all the instructions that user \p U belongs to. \p U could be an
+/// instruction itself, or it could be a constant expression which is used
+/// within an instruction. If \p CollectKernelInsts is true, collect
+/// instructions only from kernels; otherwise, collect instructions only from
+/// non-kernel functions.
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
+getFunctionToInstsMap(User *U, bool CollectKernelInsts);
+
+bool isKernelCC(const Function *Func);
+
+Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
+
+/// \returns true if a given global variable \p GV (or one of its global users)
+/// appears as a use within some instruction (either from a kernel or from a
+/// non-kernel function).
+bool hasUserInstruction(const GlobalValue *GV);
+
+/// \returns true if an LDS global requires lowering to a module LDS structure
+/// if \p F is not given. If \p F is given, it must be a kernel, and the function
+/// returns true if the LDS global is directly used from that kernel and it
+/// is safe to replace its uses with a kernel LDS structure member.
+bool shouldLowerLDSToStruct(const GlobalVariable &GV,
+ const Function *F = nullptr);
+
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+ const Function *F = nullptr);
+
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
+
+/// Replace all uses of constant \p C with instructions in \p F.
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
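
The helpers above feed the AMDGPU LDS lowering passes. A hedged driver sketch for the module-LDS flavour (no kernel is passed, so shouldLowerLDSToStruct runs in its module-lowering mode; the loop body is illustrative):

    std::vector<llvm::GlobalVariable *> Vars =
        llvm::AMDGPU::findVariablesToLower(M);
    for (llvm::GlobalVariable *GV : Vars) {
      // Each GV is an addrspace(3) global with an undef initializer whose uses
      // require lowering; the actual rewriting is done by the LDS passes.
    }
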
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index b7dd757a8af3..f6b5975f1934 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -41,7 +41,7 @@ void AMDGPUPALMetadata::readFromIR(Module &M) {
}
return;
}
- BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
+ BlobType = ELF::NT_AMD_PAL_METADATA;
NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
if (!NamedMD || !NamedMD->getNumOperands()) {
// Emit msgpack metadata by default
@@ -69,7 +69,7 @@ void AMDGPUPALMetadata::readFromIR(Module &M) {
// Metadata.
bool AMDGPUPALMetadata::setFromBlob(unsigned Type, StringRef Blob) {
BlobType = Type;
- if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+ if (Type == ELF::NT_AMD_PAL_METADATA)
return setFromLegacyBlob(Blob);
return setFromMsgPackBlob(Blob);
}
@@ -243,6 +243,27 @@ void AMDGPUPALMetadata::setFunctionScratchSize(const MachineFunction &MF,
Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val);
}
+// Set the amount of LDS used in bytes in the metadata.
+void AMDGPUPALMetadata::setFunctionLdsSize(const MachineFunction &MF,
+ unsigned Val) {
+ auto Node = getShaderFunction(MF.getFunction().getName());
+ Node[".lds_size"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the number of used vgprs in the metadata.
+void AMDGPUPALMetadata::setFunctionNumUsedVgprs(const MachineFunction &MF,
+ unsigned Val) {
+ auto Node = getShaderFunction(MF.getFunction().getName());
+ Node[".vgpr_count"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the number of used sgprs in the metadata.
+void AMDGPUPALMetadata::setFunctionNumUsedSgprs(const MachineFunction &MF,
+ unsigned Val) {
+ auto Node = getShaderFunction(MF.getFunction().getName());
+ Node[".sgpr_count"] = MsgPackDoc.getNode(Val);
+}
+
// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
void AMDGPUPALMetadata::setWave32(unsigned CC) {
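
setFunctionLdsSize and setFunctionNumUsed{V,S}gprs above record per-function advisory values keyed by the function name. A hedged sketch of a caller (the PALMetadata pointer and the Info fields are assumptions; only the setFunction* calls come from this patch):

    PALMetadata->setFunctionLdsSize(MF, Info.LDSSize);
    PALMetadata->setFunctionNumUsedVgprs(MF, Info.NumVGPR);
    PALMetadata->setFunctionNumUsedSgprs(MF, Info.NumSGPR);
    // Wave dispatch still reads the rsrc1 registers; these entries are for
    // logging and tooling only, per the header comments further below.
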
@@ -592,6 +613,41 @@ static const char *getRegisterName(unsigned RegNum) {
{0xa2c1, "VGT_STRMOUT_VTX_STRIDE_3"},
{0xa316, "VGT_VERTEX_REUSE_BLOCK_CNTL"},
+ {0x2e28, "COMPUTE_PGM_RSRC3"},
+ {0x2e2a, "COMPUTE_SHADER_CHKSUM"},
+ {0x2e24, "COMPUTE_USER_ACCUM_0"},
+ {0x2e25, "COMPUTE_USER_ACCUM_1"},
+ {0x2e26, "COMPUTE_USER_ACCUM_2"},
+ {0x2e27, "COMPUTE_USER_ACCUM_3"},
+ {0xa1ff, "GE_MAX_OUTPUT_PER_SUBGROUP"},
+ {0xa2d3, "GE_NGG_SUBGRP_CNTL"},
+ {0xc25f, "GE_STEREO_CNTL"},
+ {0xc262, "GE_USER_VGPR_EN"},
+ {0xc258, "IA_MULTI_VGT_PARAM_PIPED"},
+ {0xa210, "PA_STEREO_CNTL"},
+ {0xa1c2, "SPI_SHADER_IDX_FORMAT"},
+ {0x2c80, "SPI_SHADER_PGM_CHKSUM_GS"},
+ {0x2d00, "SPI_SHADER_PGM_CHKSUM_HS"},
+ {0x2c06, "SPI_SHADER_PGM_CHKSUM_PS"},
+ {0x2c45, "SPI_SHADER_PGM_CHKSUM_VS"},
+ {0x2c88, "SPI_SHADER_PGM_LO_GS"},
+ {0x2cb2, "SPI_SHADER_USER_ACCUM_ESGS_0"},
+ {0x2cb3, "SPI_SHADER_USER_ACCUM_ESGS_1"},
+ {0x2cb4, "SPI_SHADER_USER_ACCUM_ESGS_2"},
+ {0x2cb5, "SPI_SHADER_USER_ACCUM_ESGS_3"},
+ {0x2d32, "SPI_SHADER_USER_ACCUM_LSHS_0"},
+ {0x2d33, "SPI_SHADER_USER_ACCUM_LSHS_1"},
+ {0x2d34, "SPI_SHADER_USER_ACCUM_LSHS_2"},
+ {0x2d35, "SPI_SHADER_USER_ACCUM_LSHS_3"},
+ {0x2c32, "SPI_SHADER_USER_ACCUM_PS_0"},
+ {0x2c33, "SPI_SHADER_USER_ACCUM_PS_1"},
+ {0x2c34, "SPI_SHADER_USER_ACCUM_PS_2"},
+ {0x2c35, "SPI_SHADER_USER_ACCUM_PS_3"},
+ {0x2c72, "SPI_SHADER_USER_ACCUM_VS_0"},
+ {0x2c73, "SPI_SHADER_USER_ACCUM_VS_1"},
+ {0x2c74, "SPI_SHADER_USER_ACCUM_VS_2"},
+ {0x2c75, "SPI_SHADER_USER_ACCUM_VS_3"},
+
{0, nullptr}};
auto Entry = RegInfoTable;
for (; Entry->Num && Entry->Num != RegNum; ++Entry)
@@ -653,7 +709,7 @@ void AMDGPUPALMetadata::toString(std::string &String) {
// a .note record of the specified AMD type. Returns an empty blob if
// there is no PAL metadata,
void AMDGPUPALMetadata::toBlob(unsigned Type, std::string &Blob) {
- if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+ if (Type == ELF::NT_AMD_PAL_METADATA)
toLegacyBlob(Blob);
else if (Type)
toMsgPackBlob(Blob);
@@ -790,7 +846,7 @@ const char *AMDGPUPALMetadata::getVendor() const {
}
// Get .note record type of metadata blob to be emitted:
-// ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
+// ELF::NT_AMD_PAL_METADATA (legacy key=val format), or
// ELF::NT_AMDGPU_METADATA (MsgPack format), or
// 0 (no PAL metadata).
unsigned AMDGPUPALMetadata::getType() const {
@@ -799,12 +855,12 @@ unsigned AMDGPUPALMetadata::getType() const {
// Return whether the blob type is legacy PAL metadata.
bool AMDGPUPALMetadata::isLegacy() const {
- return BlobType == ELF::NT_AMD_AMDGPU_PAL_METADATA;
+ return BlobType == ELF::NT_AMD_PAL_METADATA;
}
// Set legacy PAL metadata format.
void AMDGPUPALMetadata::setLegacy() {
- BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
+ BlobType = ELF::NT_AMD_PAL_METADATA;
}
// Erase all PAL metadata.
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 8fa1f738487c..7fdd9a8429c1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -80,6 +80,21 @@ public:
// Set the stack frame size of a function in the metadata.
void setFunctionScratchSize(const MachineFunction &MF, unsigned Val);
+ // Set the amount of LDS used in bytes in the metadata. This is an optional
+ // advisory record for logging etc; wave dispatch actually uses the rsrc1
+ // register for the shader stage to determine the amount of LDS to allocate.
+ void setFunctionLdsSize(const MachineFunction &MF, unsigned Val);
+
+ // Set the number of used vgprs in the metadata. This is an optional advisory
+ // record for logging etc; wave dispatch actually uses the rsrc1 register for
+ // the shader stage to determine the number of vgprs to allocate.
+ void setFunctionNumUsedVgprs(const MachineFunction &MF, unsigned Val);
+
+ // Set the number of used sgprs in the metadata. This is an optional advisory
+ // record for logging etc; wave dispatch actually uses the rsrc1 register for
+ // the shader stage to determine the number of sgprs to allocate.
+ void setFunctionNumUsedSgprs(const MachineFunction &MF, unsigned Val);
+
// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
void setWave32(unsigned CC);
@@ -95,7 +110,7 @@ public:
const char *getVendor() const;
// Get .note record type of metadata blob to be emitted:
- // ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
+ // ELF::NT_AMD_PAL_METADATA (legacy key=val format), or
// ELF::NT_AMDGPU_METADATA (MsgPack format), or
// 0 (no PAL metadata).
unsigned getType() const;