author | Dimitry Andric <dim@FreeBSD.org> | 2023-12-18 20:30:12 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2024-04-06 20:11:55 +0000 |
commit | 5f757f3ff9144b609b3c433dfd370cc6bdc191ad (patch) | |
tree | 1b4e980b866cd26a00af34c0a653eb640bd09caf /contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | |
parent | 3e1c8a35f741a5d114d0ba670b15191355711fe9 (diff) | |
parent | 312c0ed19cc5276a17bacf2120097bec4515b0f1 (diff) | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 93 |
1 files changed, 77 insertions, 16 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index c252d30e250e..9c85ff3c43e2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -161,8 +161,10 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
       if (!AddrOp->isReg())
         return false;
 
-      // TODO: We should be able to merge physical reg addresses.
-      if (AddrOp->getReg().isPhysical())
+      // TODO: We should be able to merge instructions with other physical reg
+      // addresses too.
+      if (AddrOp->getReg().isPhysical() &&
+          AddrOp->getReg() != AMDGPU::SGPR_NULL)
         return false;
 
       // If an address has only one use then there will be no other
@@ -320,7 +322,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
     // FIXME: Handle d16 correctly
     return AMDGPU::getMUBUFElements(Opc);
   }
-  if (TII.isMIMG(MI)) {
+  if (TII.isImage(MI)) {
     uint64_t DMaskImm =
         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
     return llvm::popcount(DMaskImm);
@@ -350,6 +352,9 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::FLAT_LOAD_DWORDX2:
   case AMDGPU::FLAT_STORE_DWORDX2:
     return 2;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
+  case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -398,15 +403,23 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
+      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
+      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
+      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
+      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_LOAD;
       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
+      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
+      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
+      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
+      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_STORE;
       }
     }
-    if (TII.isMIMG(Opc)) {
+    if (TII.isImage(Opc)) {
       // Ignore instructions encoded without vaddr.
       if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
           !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
@@ -424,35 +437,50 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
       default:
        return UNKNOWN;
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
-      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
-      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
-      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
-      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_LOAD;
       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
+      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
+      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
+      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
+      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_STORE;
       }
     }
     return UNKNOWN;
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
     return S_BUFFER_LOAD_IMM;
   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
     return S_BUFFER_LOAD_SGPR_IMM;
   case AMDGPU::S_LOAD_DWORD_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
     return S_LOAD_IMM;
@@ -505,7 +533,7 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   default:
     if (TII.isMUBUF(Opc))
       return AMDGPU::getMUBUFBaseOpcode(Opc);
-    if (TII.isMIMG(Opc)) {
+    if (TII.isImage(Opc)) {
       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
       assert(Info);
       return Info->BaseOpcode;
@@ -524,16 +552,19 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
     return Opc;
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
     return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
   case AMDGPU::S_LOAD_DWORD_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
     return AMDGPU::S_LOAD_DWORD_IMM;
@@ -600,11 +631,13 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
     return Result;
   }
 
-  if (TII.isMIMG(Opc)) {
+  if (TII.isImage(Opc)) {
     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
     if (VAddr0Idx >= 0) {
-      int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
-      Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
+      int RsrcName =
+          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
+      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
+      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
     } else {
       Result.VAddr = true;
     }
@@ -631,16 +664,19 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
     return Result;
   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
     Result.SOffset = true;
     [[fallthrough]];
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
   case AMDGPU::S_LOAD_DWORD_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
     Result.SBase = true;
@@ -739,6 +775,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
   }
 
   AddressRegs Regs = getRegs(Opc, *LSO.TII);
+  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
 
   NumAddresses = 0;
   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
@@ -751,8 +788,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
     AddrIdx[NumAddresses++] =
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
   if (Regs.SRsrc)
-    AddrIdx[NumAddresses++] =
-        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
+        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
   if (Regs.SOffset)
     AddrIdx[NumAddresses++] =
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
@@ -763,8 +800,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
     AddrIdx[NumAddresses++] =
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
   if (Regs.SSamp)
-    AddrIdx[NumAddresses++] =
-        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
+    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
+        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
   assert(NumAddresses <= MaxAddressRegs);
 
   for (unsigned J = 0; J < NumAddresses; J++)
@@ -871,6 +908,9 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
 
+  if (!MaxMask)
+    return false;
+
   unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
   if ((1u << AllowedBitsForMin) <= MinMask)
     return false;
@@ -964,6 +1004,17 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
     return false;
   if (CI.CPol != Paired.CPol)
     return false;
+  if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
+      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
+    // Reject cases like:
+    // dword + dwordx2 -> dwordx3
+    // dword + dwordx3 -> dwordx4
+    // If we tried to combine these cases, we would fail to extract a subreg
+    // for the result of the second load due to SGPR alignment requirements.
+    if (CI.Width != Paired.Width &&
+        (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
+      return false;
+  }
   return true;
 }
 
@@ -1043,6 +1094,8 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
     case 4:
     case 8:
       return true;
+    case 3:
+      return STM.hasScalarDwordx3Loads();
     }
   }
 }
@@ -1671,6 +1724,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
       return 0;
     case 2:
       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+    case 3:
+      return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
     case 4:
       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
     case 8:
@@ -1682,6 +1737,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
+    case 3:
+      return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
    case 8:
@@ -1693,6 +1750,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
      return 0;
    case 2:
      return AMDGPU::S_LOAD_DWORDX2_IMM;
+    case 3:
+      return AMDGPU::S_LOAD_DWORDX3_IMM;
    case 4:
      return AMDGPU::S_LOAD_DWORDX4_IMM;
    case 8:
@@ -1814,6 +1873,8 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
+    case 3:
+      return &AMDGPU::SGPR_96RegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
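A standalone sketch (not part of the commit) restating the rejection rule the diff adds to offsetsCanBeCombined() for scalar loads: a merge is skipped when the narrower of the two loads sits at the lower offset, since the second load's value could then not be extracted as a properly aligned SGPR subregister of the merged result. The helper name and the plain unsigned parameters are hypothetical, used only to isolate the condition.

```cpp
#include <cassert>

// Mirrors: CI.Width != Paired.Width &&
//          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset)
// expressed as "the merge is allowed" instead of "the merge is rejected".
static bool scalarWidthsAllowMerge(unsigned WidthA, unsigned OffsetA,
                                   unsigned WidthB, unsigned OffsetB) {
  if (WidthA == WidthB)
    return true;
  // Reject only when the narrower load is at the lower offset.
  return (WidthA < WidthB) != (OffsetA < OffsetB);
}

int main() {
  assert(scalarWidthsAllowMerge(2, 0, 2, 2));  // dwordx2 + dwordx2 -> dwordx4
  assert(scalarWidthsAllowMerge(2, 0, 1, 2));  // dwordx2 + dword   -> dwordx3
  assert(!scalarWidthsAllowMerge(1, 0, 2, 1)); // dword + dwordx2   -> rejected
  assert(!scalarWidthsAllowMerge(1, 0, 3, 1)); // dword + dwordx3   -> rejected
  return 0;
}
```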