path: root/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
author     Dimitry Andric <dim@FreeBSD.org>    2022-03-20 11:40:34 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2022-06-04 11:58:51 +0000
commit     4b6eb0e63c698094db5506763df44cc83c19f643 (patch)
tree       f1d30b8c10bc6db323b91538745ae8ab8b593910 /contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
parent     76886853f03395abb680824bcc74e98f83bd477a (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r--    contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp    81
1 files changed, 49 insertions, 32 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 493c1ad87f93..34cbb49dcd16 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -146,7 +146,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
if (!AddrOp->isReg())
return false;
- // TODO: We should be able to merge physical reg addreses.
+ // TODO: We should be able to merge physical reg addresses.
if (AddrOp->getReg().isPhysical())
return false;
@@ -303,6 +303,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 2;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
return 4;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ return 8;
case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
@@ -343,6 +345,9 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
return UNKNOWN;
+ // Ignore BVH instructions
+ if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
+ return UNKNOWN;
// TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
TII.isGather4(Opc))
@@ -369,6 +374,7 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return S_BUFFER_LOAD_IMM;
case AMDGPU::DS_READ_B32:
case AMDGPU::DS_READ_B32_gfx9:
@@ -380,15 +386,6 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::DS_WRITE_B64:
case AMDGPU::DS_WRITE_B64_gfx9:
return DS_WRITE;
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa:
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa:
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa:
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa:
- return UNKNOWN;
}
}
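
Not part of the upstream diff — a minimal, self-contained sketch of the classification style these two hunks move to: instead of enumerating every IMAGE_BVH*_INTERSECT_RAY opcode variant, the pass now asks the MIMG base-opcode table whether an instruction is a BVH op (the earlier hunk adds `if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) return UNKNOWN;`). The struct, enum, and lookup below are simplified stand-ins for the TableGen-generated AMDGPU tables, used only for illustration.

#include <cstdio>

// Simplified stand-in for a generated MIMG base-opcode table entry; the real
// table is produced by TableGen and queried via AMDGPU::getMIMGBaseOpcode().
struct BaseOpcodeInfo {
  unsigned BaseOpcode;
  bool BVH;      // true for the IMAGE_BVH*_INTERSECT_RAY family
  bool Gather4;
};

enum FakeOpcode : unsigned { IMAGE_SAMPLE = 0, IMAGE_BVH_INTERSECT_RAY = 1 };

static const BaseOpcodeInfo BaseOpcodes[] = {
    {IMAGE_SAMPLE, /*BVH=*/false, /*Gather4=*/false},
    {IMAGE_BVH_INTERSECT_RAY, /*BVH=*/true, /*Gather4=*/false},
};

static const BaseOpcodeInfo *getBaseOpcode(unsigned Opc) {
  for (const BaseOpcodeInfo &Info : BaseOpcodes)
    if (Info.BaseOpcode == Opc)
      return &Info;
  return nullptr;
}

// One flag check rejects every BVH variant (sa/nsa, a16, 64-bit) instead of
// keeping a hard-coded opcode list in sync with the instruction definitions.
static bool isMergeCandidate(unsigned Opc) {
  const BaseOpcodeInfo *Info = getBaseOpcode(Opc);
  return Info && !Info->BVH;
}

int main() {
  std::printf("IMAGE_SAMPLE mergeable: %d\n", isMergeCandidate(IMAGE_SAMPLE));
  std::printf("BVH intersect mergeable: %d\n",
              isMergeCandidate(IMAGE_BVH_INTERSECT_RAY));
  return 0;
}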
@@ -419,6 +416,7 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
}
}
@@ -469,6 +467,7 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
Result.SBase = true;
return Result;
case AMDGPU::DS_READ_B32:
@@ -653,7 +652,7 @@ static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
}
// This function assumes that \p A and \p B have are identical except for
-// size and offset, and they referecne adjacent memory.
+// size and offset, and they reference adjacent memory.
static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
const MachineMemOperand *A,
const MachineMemOperand *B) {
@@ -863,6 +862,7 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
return false;
case 2:
case 4:
+ case 8:
return true;
}
}
@@ -1529,45 +1529,62 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
case 4:
return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+ case 8:
+ return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
}
case MIMG:
- assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width));
+ assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
+ "No overlaps");
return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
}
}
std::pair<unsigned, unsigned>
-SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) {
+SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
+ const CombineInfo &Paired) {
- if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4)
- return std::make_pair(0, 0);
+ assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero");
bool ReverseOrder;
if (CI.InstClass == MIMG) {
- assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
- "No overlaps");
+ assert(
+ (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
+ "No overlaps");
ReverseOrder = CI.DMask > Paired.DMask;
} else
ReverseOrder = CI.Offset > Paired.Offset;
- static const unsigned Idxs[4][4] = {
- {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
- {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
- {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
- {AMDGPU::sub3, 0, 0, 0},
- };
unsigned Idx0;
unsigned Idx1;
- assert(CI.Width >= 1 && CI.Width <= 3);
- assert(Paired.Width >= 1 && Paired.Width <= 3);
+ if (CI.Width + Paired.Width > 4) {
+ assert(CI.Width == 4 && Paired.Width == 4);
- if (ReverseOrder) {
- Idx1 = Idxs[0][Paired.Width - 1];
- Idx0 = Idxs[Paired.Width][CI.Width - 1];
+ if (ReverseOrder) {
+ Idx1 = AMDGPU::sub0_sub1_sub2_sub3;
+ Idx0 = AMDGPU::sub4_sub5_sub6_sub7;
+ } else {
+ Idx0 = AMDGPU::sub0_sub1_sub2_sub3;
+ Idx1 = AMDGPU::sub4_sub5_sub6_sub7;
+ }
} else {
- Idx0 = Idxs[0][CI.Width - 1];
- Idx1 = Idxs[CI.Width][Paired.Width - 1];
+ static const unsigned Idxs[4][4] = {
+ {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
+ {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
+ {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
+ {AMDGPU::sub3, 0, 0, 0},
+ };
+
+ assert(CI.Width >= 1 && CI.Width <= 3);
+ assert(Paired.Width >= 1 && Paired.Width <= 3);
+
+ if (ReverseOrder) {
+ Idx1 = Idxs[0][Paired.Width - 1];
+ Idx0 = Idxs[Paired.Width][CI.Width - 1];
+ } else {
+ Idx0 = Idxs[0][CI.Width - 1];
+ Idx1 = Idxs[CI.Width][Paired.Width - 1];
+ }
}
return std::make_pair(Idx0, Idx1);
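
Not part of the upstream diff — a standalone sketch of the sub-register selection that the hunk above introduces, including the new 4 + 4 dword case that pairs the sub0_sub1_sub2_sub3 and sub4_sub5_sub6_sub7 halves of a merged DWORDX8 result. The SubRegIdx values here are placeholders for the real AMDGPU::subN enumerators, which come from the generated register info.

#include <cassert>
#include <cstdio>
#include <utility>

// Placeholder values for the AMDGPU::subN sub-register indices.
enum SubRegIdx : unsigned {
  SUB0 = 1, SUB0_SUB1, SUB0_SUB1_SUB2, SUB0_SUB1_SUB2_SUB3,
  SUB1, SUB1_SUB2, SUB1_SUB2_SUB3,
  SUB2, SUB2_SUB3,
  SUB3,
  SUB4_SUB5_SUB6_SUB7
};

// Mirrors the post-patch getSubRegIdxs(): given the widths (in dwords) of the
// two loads being merged and whether their order is reversed, return the
// sub-register index each original result occupies in the merged register.
static std::pair<unsigned, unsigned> getSubRegIdxs(unsigned Width0,
                                                   unsigned Width1,
                                                   bool ReverseOrder) {
  assert(Width0 != 0 && Width1 != 0 && "Width cannot be zero");

  if (Width0 + Width1 > 4) {
    // New case in this commit: two 4-dword loads merged into one 8-dword
    // load; each half takes one 4-dword sub-register group.
    assert(Width0 == 4 && Width1 == 4);
    return ReverseOrder
               ? std::make_pair(SUB4_SUB5_SUB6_SUB7, SUB0_SUB1_SUB2_SUB3)
               : std::make_pair(SUB0_SUB1_SUB2_SUB3, SUB4_SUB5_SUB6_SUB7);
  }

  // Pre-existing table for merges totalling at most 4 dwords:
  // Idxs[Start][Width - 1] covers dwords [Start, Start + Width).
  static const unsigned Idxs[4][4] = {
      {SUB0, SUB0_SUB1, SUB0_SUB1_SUB2, SUB0_SUB1_SUB2_SUB3},
      {SUB1, SUB1_SUB2, SUB1_SUB2_SUB3, 0},
      {SUB2, SUB2_SUB3, 0, 0},
      {SUB3, 0, 0, 0},
  };
  assert(Width0 >= 1 && Width0 <= 3 && Width1 >= 1 && Width1 <= 3);

  if (ReverseOrder)
    return {Idxs[Width1][Width0 - 1], Idxs[0][Width1 - 1]};
  return {Idxs[0][Width0 - 1], Idxs[Width0][Width1 - 1]};
}

int main() {
  // A 2-dword load followed by a 1-dword load at a higher offset.
  auto P = getSubRegIdxs(2, 1, /*ReverseOrder=*/false);
  std::printf("first=%u second=%u\n", P.first, P.second);
  // Two 4-dword S_BUFFER_LOADs merged into one DWORDX8.
  auto Q = getSubRegIdxs(4, 4, /*ReverseOrder=*/false);
  std::printf("lo=%u hi=%u\n", Q.first, Q.second);
  return 0;
}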
@@ -2048,7 +2065,7 @@ SILoadStoreOptimizer::collectMergeableInsts(
// adjacent to each other in the list, which will make it easier to find
// matches.
MergeList.sort(
- [] (const CombineInfo &A, CombineInfo &B) {
+ [] (const CombineInfo &A, const CombineInfo &B) {
return A.Offset < B.Offset;
});
++I;
@@ -2140,7 +2157,7 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
MachineBasicBlock::iterator NewMI =
mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
- OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
+ OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
break;
}
case BUFFER_LOAD: {