path: root/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
author     Dimitry Andric <dim@FreeBSD.org>    2023-12-18 20:30:12 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2024-04-06 20:11:55 +0000
commit     5f757f3ff9144b609b3c433dfd370cc6bdc191ad (patch)
tree       1b4e980b866cd26a00af34c0a653eb640bd09caf /contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
parent     3e1c8a35f741a5d114d0ba670b15191355711fe9 (diff)
parent     312c0ed19cc5276a17bacf2120097bec4515b0f1 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 93
1 file changed, 77 insertions(+), 16 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index c252d30e250e..9c85ff3c43e2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -161,8 +161,10 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
if (!AddrOp->isReg())
return false;
- // TODO: We should be able to merge physical reg addresses.
- if (AddrOp->getReg().isPhysical())
+ // TODO: We should be able to merge instructions with other physical reg
+ // addresses too.
+ if (AddrOp->getReg().isPhysical() &&
+ AddrOp->getReg() != AMDGPU::SGPR_NULL)
return false;
// If an address has only one use then there will be no other
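Note on the hunk above: the check still rejects arbitrary physical address registers, but now carves out AMDGPU::SGPR_NULL, which reads as a constant zero and so behaves like an immediate base rather than a mutable register. A minimal sketch of the resulting predicate, with a hypothetical helper name (the pass performs this check inline):

#include "MCTargetDesc/AMDGPUMCTargetDesc.h" // AMDGPU::SGPR_NULL
#include "llvm/CodeGen/MachineOperand.h"

// Hypothetical helper mirroring the address filter above: virtual registers
// are always merge candidates; among physical registers only the read-only,
// constant-zero SGPR_NULL is accepted.
static bool isMergeableAddrReg(const llvm::MachineOperand &AddrOp) {
  if (!AddrOp.isReg())
    return false;
  if (AddrOp.getReg().isPhysical() &&
      AddrOp.getReg() != llvm::AMDGPU::SGPR_NULL)
    return false;
  return true;
}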
@@ -320,7 +322,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
// FIXME: Handle d16 correctly
return AMDGPU::getMUBUFElements(Opc);
}
- if (TII.isMIMG(MI)) {
+ if (TII.isImage(MI)) {
uint64_t DMaskImm =
TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
return llvm::popcount(DMaskImm);
@@ -350,6 +352,9 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::FLAT_LOAD_DWORDX2:
case AMDGPU::FLAT_STORE_DWORDX2:
return 2;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
+ case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::GLOBAL_LOAD_DWORDX3:
case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -398,15 +403,23 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
+ case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
+ case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
+ case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
+ case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
return BUFFER_LOAD;
case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
+ case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
+ case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
+ case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
+ case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
return BUFFER_STORE;
}
}
- if (TII.isMIMG(Opc)) {
+ if (TII.isImage(Opc)) {
// Ignore instructions encoded without vaddr.
if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
@@ -424,35 +437,50 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
default:
return UNKNOWN;
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
- case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
- case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
- case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
- case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
return TBUFFER_LOAD;
case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
+ case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
+ case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
+ case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
+ case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
return TBUFFER_STORE;
}
}
return UNKNOWN;
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return S_BUFFER_LOAD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
return S_BUFFER_LOAD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
return S_LOAD_IMM;
@@ -505,7 +533,7 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
default:
if (TII.isMUBUF(Opc))
return AMDGPU::getMUBUFBaseOpcode(Opc);
- if (TII.isMIMG(Opc)) {
+ if (TII.isImage(Opc)) {
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
assert(Info);
return Info->BaseOpcode;
@@ -524,16 +552,19 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
return Opc;
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
return AMDGPU::S_LOAD_DWORD_IMM;
@@ -600,11 +631,13 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
return Result;
}
- if (TII.isMIMG(Opc)) {
+ if (TII.isImage(Opc)) {
int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
if (VAddr0Idx >= 0) {
- int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
- Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
+ int RsrcName =
+ TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
+ int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
+ Result.NumVAddrs = RsrcIdx - VAddr0Idx;
} else {
Result.VAddr = true;
}
@@ -631,16 +664,19 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
return Result;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
Result.SOffset = true;
[[fallthrough]];
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
Result.SBase = true;
@@ -739,6 +775,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
}
AddressRegs Regs = getRegs(Opc, *LSO.TII);
+ bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
NumAddresses = 0;
for (unsigned J = 0; J < Regs.NumVAddrs; J++)
@@ -751,8 +788,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
AddrIdx[NumAddresses++] =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
if (Regs.SRsrc)
- AddrIdx[NumAddresses++] =
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
+ Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
if (Regs.SOffset)
AddrIdx[NumAddresses++] =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
@@ -763,8 +800,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
AddrIdx[NumAddresses++] =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
if (Regs.SSamp)
- AddrIdx[NumAddresses++] =
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
+ AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
+ Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
assert(NumAddresses <= MaxAddressRegs);
for (unsigned J = 0; J < NumAddresses; J++)
@@ -871,6 +908,9 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
unsigned MinMask = std::min(CI.DMask, Paired.DMask);
+ if (!MaxMask)
+ return false;
+
unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
if ((1u << AllowedBitsForMin) <= MinMask)
return false;
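The new "if (!MaxMask)" guard in this hunk does two things: a zero dmask selects no image channels, so there is nothing to merge, and without the early return llvm::countr_zero(0) evaluates to 32, which would make the following "1u << AllowedBitsForMin" shift undefined behavior. A self-contained sketch of the check, using the C++20 <bit> equivalent of the LLVM helper (hypothetical free function, not part of the pass):

#include <algorithm>
#include <bit>

// Sketch of the dmask-compatibility test above; DMaskA/DMaskB are the small
// unsigned channel masks of the two image loads being considered.
static bool dmasksCanCombine(unsigned DMaskA, unsigned DMaskB) {
  unsigned MaxMask = std::max(DMaskA, DMaskB);
  unsigned MinMask = std::min(DMaskA, DMaskB);

  // New guard: a zero mask enables no channels, and countr_zero(0) == 32
  // would make the shift below undefined behavior.
  if (!MaxMask)
    return false;

  // Every bit of the smaller mask must lie strictly below the lowest set bit
  // of the larger mask, i.e. the two loads read disjoint channel ranges.
  unsigned AllowedBitsForMin = std::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}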
@@ -964,6 +1004,17 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
return false;
if (CI.CPol != Paired.CPol)
return false;
+ if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
+ CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
+ // Reject cases like:
+ // dword + dwordx2 -> dwordx3
+ // dword + dwordx3 -> dwordx4
+ // If we tried to combine these cases, we would fail to extract a subreg
+ // for the result of the second load due to SGPR alignment requirements.
+ if (CI.Width != Paired.Width &&
+ (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
+ return false;
+ }
return true;
}
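The width/offset check added in this hunk encodes an SGPR alignment constraint: if the narrower of the two scalar loads sits at the lower offset, the wider load's data starts at an odd dword inside the merged result, and no suitably aligned subregister exists to extract it. For example, a DWORD at offset 0 merged with a DWORDX2 at offset 4 would need the x2 half of an SGPR_96 starting at its second dword, which SGPR pair alignment forbids, while the opposite order (DWORDX2 at 0, DWORD at 8) is fine. A small sketch of just the predicate, with hypothetical struct and function names:

// Width is in dwords; Offset is in the units of the instruction's offset
// operand (only the relative order matters here).
struct Cand {
  unsigned Width;
  unsigned Offset;
};

// Reject merges where the narrower load comes first in memory: the second
// load's result would then have to be extracted from an unaligned position
// inside the merged SGPR tuple.
static bool scalarOrderAllowsMerge(const Cand &CI, const Cand &Paired) {
  if (CI.Width != Paired.Width &&
      (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
    return false;
  return true;
}

With CI = {1, 0} and Paired = {2, 4} this returns false (dword + dwordx2 -> dwordx3 is rejected); with CI = {2, 0} and Paired = {1, 8} it returns true.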
@@ -1043,6 +1094,8 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
case 4:
case 8:
return true;
+ case 3:
+ return STM.hasScalarDwordx3Loads();
}
}
}
@@ -1671,6 +1724,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
return 0;
case 2:
return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ case 3:
+ return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
case 4:
return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
case 8:
@@ -1682,6 +1737,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
return 0;
case 2:
return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
+ case 3:
+ return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
case 4:
return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
case 8:
@@ -1693,6 +1750,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
return 0;
case 2:
return AMDGPU::S_LOAD_DWORDX2_IMM;
+ case 3:
+ return AMDGPU::S_LOAD_DWORDX3_IMM;
case 4:
return AMDGPU::S_LOAD_DWORDX4_IMM;
case 8:
@@ -1814,6 +1873,8 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
return nullptr;
case 2:
return &AMDGPU::SReg_64_XEXECRegClass;
+ case 3:
+ return &AMDGPU::SGPR_96RegClass;
case 4:
return &AMDGPU::SGPR_128RegClass;
case 8: