| author | Dimitry Andric <dim@FreeBSD.org> | 2022-03-20 11:40:34 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-06-04 11:58:51 +0000 |
| commit | 4b6eb0e63c698094db5506763df44cc83c19f643 | |
| tree | f1d30b8c10bc6db323b91538745ae8ab8b593910 | |
| parent | 76886853f03395abb680824bcc74e98f83bd477a | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
| -rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 81 |
1 file changed, 49 insertions(+), 32 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 493c1ad87f93..34cbb49dcd16 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -146,7 +146,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
       if (!AddrOp->isReg())
         return false;
 
-      // TODO: We should be able to merge physical reg addreses.
+      // TODO: We should be able to merge physical reg addresses.
       if (AddrOp->getReg().isPhysical())
         return false;
 
@@ -303,6 +303,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
     return 2;
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
     return 4;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+    return 8;
   case AMDGPU::DS_READ_B32:      LLVM_FALLTHROUGH;
   case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
   case AMDGPU::DS_WRITE_B32:     LLVM_FALLTHROUGH;
@@ -343,6 +345,9 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
     if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
       return UNKNOWN;
+    // Ignore BVH instructions
+    if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
+      return UNKNOWN;
     // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
     if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
         TII.isGather4(Opc))
@@ -369,6 +374,7 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
@@ -380,15 +386,6 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa:
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa:
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa:
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa:
-   return UNKNOWN;
  }
 }
 
@@ -419,6 +416,7 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  }
 }
@@ -469,6 +467,7 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
@@ -653,7 +652,7 @@ static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
 }
 
 // This function assumes that \p A and \p B have are identical except for
-// size and offset, and they referecne adjacent memory.
+// size and offset, and they reference adjacent memory.
 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
                                                    const MachineMemOperand *A,
                                                    const MachineMemOperand *B) {
@@ -863,6 +862,7 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
    return false;
  case 2:
  case 4:
+ case 8:
    return true;
  }
 }
@@ -1529,45 +1529,62 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+   case 8:
+     return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  case MIMG:
-   assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width));
+   assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
+          "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
 }
 
 std::pair<unsigned, unsigned>
-SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) {
+SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
+                                    const CombineInfo &Paired) {
-  if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4)
-    return std::make_pair(0, 0);
+  assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero");
 
   bool ReverseOrder;
   if (CI.InstClass == MIMG) {
-    assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
-           "No overlaps");
+    assert(
+        (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
+        "No overlaps");
     ReverseOrder = CI.DMask > Paired.DMask;
   } else
     ReverseOrder = CI.Offset > Paired.Offset;
 
-  static const unsigned Idxs[4][4] = {
-      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
-      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
-      {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
-      {AMDGPU::sub3, 0, 0, 0},
-  };
   unsigned Idx0;
   unsigned Idx1;
 
-  assert(CI.Width >= 1 && CI.Width <= 3);
-  assert(Paired.Width >= 1 && Paired.Width <= 3);
+  if (CI.Width + Paired.Width > 4) {
+    assert(CI.Width == 4 && Paired.Width == 4);
 
-  if (ReverseOrder) {
-    Idx1 = Idxs[0][Paired.Width - 1];
-    Idx0 = Idxs[Paired.Width][CI.Width - 1];
+    if (ReverseOrder) {
+      Idx1 = AMDGPU::sub0_sub1_sub2_sub3;
+      Idx0 = AMDGPU::sub4_sub5_sub6_sub7;
+    } else {
+      Idx0 = AMDGPU::sub0_sub1_sub2_sub3;
+      Idx1 = AMDGPU::sub4_sub5_sub6_sub7;
+    }
   } else {
-    Idx0 = Idxs[0][CI.Width - 1];
-    Idx1 = Idxs[CI.Width][Paired.Width - 1];
+    static const unsigned Idxs[4][4] = {
+        {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
+        {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
+        {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
+        {AMDGPU::sub3, 0, 0, 0},
+    };
+
+    assert(CI.Width >= 1 && CI.Width <= 3);
+    assert(Paired.Width >= 1 && Paired.Width <= 3);
+
+    if (ReverseOrder) {
+      Idx1 = Idxs[0][Paired.Width - 1];
+      Idx0 = Idxs[Paired.Width][CI.Width - 1];
+    } else {
+      Idx0 = Idxs[0][CI.Width - 1];
+      Idx1 = Idxs[CI.Width][Paired.Width - 1];
+    }
   }
 
   return std::make_pair(Idx0, Idx1);
@@ -2048,7 +2065,7 @@ SILoadStoreOptimizer::collectMergeableInsts(
     // adjacent to each other in the list, which will make it easier to find
     // matches.
     MergeList.sort(
-        [] (const CombineInfo &A, CombineInfo &B) {
+        [] (const CombineInfo &A, const CombineInfo &B) {
           return A.Offset < B.Offset;
         });
     ++I;
@@ -2140,7 +2157,7 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
       MachineBasicBlock::iterator NewMI =
           mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
       CI.setMI(NewMI, *TII, *STM);
-      OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
+      OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
      break;
    }
    case BUFFER_LOAD: {
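The trickiest piece of this change is the new subregister pairing in `getSubRegIdxs`: when two 4-dword loads are merged into a single `S_BUFFER_LOAD_DWORDX8_IMM`, the users of the two original results must be rewritten to the low and high 4-dword halves of the wider destination. Below is a minimal standalone sketch of that selection logic. It assumes a simplified signature (plain widths instead of `CombineInfo`), and the `sub*` enumerators are hypothetical stand-ins for LLVM's TableGen-generated `AMDGPU::sub*` indices; only the control flow mirrors the patched function.

```cpp
#include <cassert>
#include <cstdio>
#include <utility>

// Hypothetical stand-ins for the TableGen-generated AMDGPU::sub* indices.
enum SubRegIdx : unsigned {
  INVALID = 0,
  sub0, sub1, sub2, sub3,
  sub0_sub1, sub1_sub2, sub2_sub3,
  sub0_sub1_sub2, sub1_sub2_sub3,
  sub0_sub1_sub2_sub3,
  sub4_sub5_sub6_sub7,
};

// Mirrors the patched SILoadStoreOptimizer::getSubRegIdxs: returns the
// subregister ranges that the two merged loads' results occupy within the
// new, wider destination register.
std::pair<unsigned, unsigned> getSubRegIdxs(unsigned Width0, unsigned Width1,
                                            bool ReverseOrder) {
  assert(Width0 != 0 && Width1 != 0 && "Width cannot be zero");

  unsigned Idx0, Idx1;

  if (Width0 + Width1 > 4) {
    // New in this patch: a combined width above 4 is only produced by a
    // 4+4 s_buffer_load merge, split into two 4-dword halves.
    assert(Width0 == 4 && Width1 == 4);
    Idx0 = ReverseOrder ? sub4_sub5_sub6_sub7 : sub0_sub1_sub2_sub3;
    Idx1 = ReverseOrder ? sub0_sub1_sub2_sub3 : sub4_sub5_sub6_sub7;
  } else {
    // Pre-existing lookup: Idxs[Start][W - 1] names dwords
    // [Start, Start + W) of a result that is at most 4 dwords wide.
    static const unsigned Idxs[4][4] = {
        {sub0, sub0_sub1, sub0_sub1_sub2, sub0_sub1_sub2_sub3},
        {sub1, sub1_sub2, sub1_sub2_sub3, INVALID},
        {sub2, sub2_sub3, INVALID, INVALID},
        {sub3, INVALID, INVALID, INVALID},
    };

    assert(Width0 >= 1 && Width0 <= 3);
    assert(Width1 >= 1 && Width1 <= 3);

    if (ReverseOrder) {
      Idx1 = Idxs[0][Width1 - 1];
      Idx0 = Idxs[Width1][Width0 - 1];
    } else {
      Idx0 = Idxs[0][Width0 - 1];
      Idx1 = Idxs[Width0][Width1 - 1];
    }
  }

  return std::make_pair(Idx0, Idx1);
}

int main() {
  // Two DWORDX4 loads merged into one DWORDX8: the lower-offset load keeps
  // dwords 0..3, the other gets dwords 4..7.
  auto [A, B] = getSubRegIdxs(4, 4, /*ReverseOrder=*/false);
  assert(A == sub0_sub1_sub2_sub3 && B == sub4_sub5_sub6_sub7);

  // A 1-dword and a 2-dword load merged into a 3-dword result: the first
  // maps to sub0, the second to sub1_sub2 (the old <=4-dword path).
  auto [C, D] = getSubRegIdxs(1, 2, /*ReverseOrder=*/false);
  assert(C == sub0 && D == sub1_sub2);

  std::printf("subreg pairs: (%u,%u) and (%u,%u)\n", A, B, C, D);
}
```

This also explains the `OptimizeListAgain` bound dropping from 16 to 8: merging is pairwise, so a 2+2 merge produces a width-4 load worth re-scanning because it can still pair into a DWORDX8, while a width-8 result is already the widest `S_BUFFER_LOAD_DWORD*_IMM` this path emits.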
