| author | Dimitry Andric <dim@FreeBSD.org> | 2022-03-20 11:40:34 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-06-04 11:58:51 +0000 |
| commit | 4b6eb0e63c698094db5506763df44cc83c19f643 | |
| tree | f1d30b8c10bc6db323b91538745ae8ab8b593910 | |
| parent | 76886853f03395abb680824bcc74e98f83bd477a | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
| -rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 81 |
1 file changed, 49 insertions(+), 32 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 493c1ad87f93..34cbb49dcd16 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -146,7 +146,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
       if (!AddrOp->isReg())
         return false;
 
-      // TODO: We should be able to merge physical reg addreses.
+      // TODO: We should be able to merge physical reg addresses.
       if (AddrOp->getReg().isPhysical())
         return false;
 
@@ -303,6 +303,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
     return 2;
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
     return 4;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+    return 8;
   case AMDGPU::DS_READ_B32:      LLVM_FALLTHROUGH;
   case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
   case AMDGPU::DS_WRITE_B32:     LLVM_FALLTHROUGH;
@@ -343,6 +345,9 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
     if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
       return UNKNOWN;
+    // Ignore BVH instructions
+    if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
+      return UNKNOWN;
     // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
     if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
         TII.isGather4(Opc))
@@ -369,6 +374,7 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
@@ -380,15 +386,6 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa:
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa:
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa:
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa:
-   return UNKNOWN;
  }
 }
 
@@ -419,6 +416,7 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  }
 }
@@ -469,6 +467,7 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
@@ -653,7 +652,7 @@ static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
 }
 
 // This function assumes that \p A and \p B have are identical except for
-// size and offset, and they referecne adjacent memory.
+// size and offset, and they reference adjacent memory.
 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
                                                    const MachineMemOperand *A,
                                                    const MachineMemOperand *B) {
@@ -863,6 +862,7 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
    return false;
  case 2:
  case 4:
+ case 8:
    return true;
  }
 }
@@ -1529,45 +1529,62 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+   case 8:
+     return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  case MIMG:
-   assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width));
+   assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
+          "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
 }
 
 std::pair<unsigned, unsigned>
-SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) {
+SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
+                                    const CombineInfo &Paired) {
-  if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4)
-    return std::make_pair(0, 0);
+  assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero");
 
   bool ReverseOrder;
   if (CI.InstClass == MIMG) {
-    assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
-           "No overlaps");
+    assert(
+        (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
+        "No overlaps");
     ReverseOrder = CI.DMask > Paired.DMask;
   } else
     ReverseOrder = CI.Offset > Paired.Offset;
 
-  static const unsigned Idxs[4][4] = {
-      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
-      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
-      {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
-      {AMDGPU::sub3, 0, 0, 0},
-  };
   unsigned Idx0;
   unsigned Idx1;
 
-  assert(CI.Width >= 1 && CI.Width <= 3);
-  assert(Paired.Width >= 1 && Paired.Width <= 3);
+  if (CI.Width + Paired.Width > 4) {
+    assert(CI.Width == 4 && Paired.Width == 4);
 
-  if (ReverseOrder) {
-    Idx1 = Idxs[0][Paired.Width - 1];
-    Idx0 = Idxs[Paired.Width][CI.Width - 1];
+    if (ReverseOrder) {
+      Idx1 = AMDGPU::sub0_sub1_sub2_sub3;
+      Idx0 = AMDGPU::sub4_sub5_sub6_sub7;
+    } else {
+      Idx0 = AMDGPU::sub0_sub1_sub2_sub3;
+      Idx1 = AMDGPU::sub4_sub5_sub6_sub7;
+    }
   } else {
-    Idx0 = Idxs[0][CI.Width - 1];
-    Idx1 = Idxs[CI.Width][Paired.Width - 1];
+    static const unsigned Idxs[4][4] = {
+        {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
+        {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
+        {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
+        {AMDGPU::sub3, 0, 0, 0},
+    };
+
+    assert(CI.Width >= 1 && CI.Width <= 3);
+    assert(Paired.Width >= 1 && Paired.Width <= 3);
+
+    if (ReverseOrder) {
+      Idx1 = Idxs[0][Paired.Width - 1];
+      Idx0 = Idxs[Paired.Width][CI.Width - 1];
+    } else {
+      Idx0 = Idxs[0][CI.Width - 1];
+      Idx1 = Idxs[CI.Width][Paired.Width - 1];
+    }
   }
 
   return std::make_pair(Idx0, Idx1);
@@ -2048,7 +2065,7 @@ SILoadStoreOptimizer::collectMergeableInsts(
     // adjacent to each other in the list, which will make it easier to find
     // matches.
     MergeList.sort(
-        [] (const CombineInfo &A, CombineInfo &B) {
+        [] (const CombineInfo &A, const CombineInfo &B) {
           return A.Offset < B.Offset;
         });
     ++I;
@@ -2140,7 +2157,7 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
       MachineBasicBlock::iterator NewMI =
           mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
       CI.setMI(NewMI, *TII, *STM);
-      OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
+      OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
      break;
    }
    case BUFFER_LOAD: {
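The trickiest piece of this change is the new subregister pairing in `getSubRegIdxs`: when two 4-dword loads are merged into a single `S_BUFFER_LOAD_DWORDX8_IMM`, the users of the two original results must be rewritten to the low and high 4-dword halves of the wider destination. Below is a minimal standalone sketch of that selection logic. It assumes a simplified signature (plain widths instead of `CombineInfo`), and the `sub*` enumerators are hypothetical stand-ins for LLVM's TableGen-generated `AMDGPU::sub*` indices; only the control flow mirrors the patched function.

```cpp
#include <cassert>
#include <cstdio>
#include <utility>

// Hypothetical stand-ins for the TableGen-generated AMDGPU::sub* indices.
enum SubRegIdx : unsigned {
  INVALID = 0,
  sub0, sub1, sub2, sub3,
  sub0_sub1, sub1_sub2, sub2_sub3,
  sub0_sub1_sub2, sub1_sub2_sub3,
  sub0_sub1_sub2_sub3,
  sub4_sub5_sub6_sub7,
};

// Mirrors the patched SILoadStoreOptimizer::getSubRegIdxs: returns the
// subregister ranges that the two merged loads' results occupy within the
// new, wider destination register.
std::pair<unsigned, unsigned> getSubRegIdxs(unsigned Width0, unsigned Width1,
                                            bool ReverseOrder) {
  assert(Width0 != 0 && Width1 != 0 && "Width cannot be zero");

  unsigned Idx0, Idx1;

  if (Width0 + Width1 > 4) {
    // New in this patch: a combined width above 4 is only produced by a
    // 4+4 s_buffer_load merge, split into two 4-dword halves.
    assert(Width0 == 4 && Width1 == 4);
    Idx0 = ReverseOrder ? sub4_sub5_sub6_sub7 : sub0_sub1_sub2_sub3;
    Idx1 = ReverseOrder ? sub0_sub1_sub2_sub3 : sub4_sub5_sub6_sub7;
  } else {
    // Pre-existing lookup: Idxs[Start][W - 1] names dwords
    // [Start, Start + W) of a result that is at most 4 dwords wide.
    static const unsigned Idxs[4][4] = {
        {sub0, sub0_sub1, sub0_sub1_sub2, sub0_sub1_sub2_sub3},
        {sub1, sub1_sub2, sub1_sub2_sub3, INVALID},
        {sub2, sub2_sub3, INVALID, INVALID},
        {sub3, INVALID, INVALID, INVALID},
    };

    assert(Width0 >= 1 && Width0 <= 3);
    assert(Width1 >= 1 && Width1 <= 3);

    if (ReverseOrder) {
      Idx1 = Idxs[0][Width1 - 1];
      Idx0 = Idxs[Width1][Width0 - 1];
    } else {
      Idx0 = Idxs[0][Width0 - 1];
      Idx1 = Idxs[Width0][Width1 - 1];
    }
  }

  return std::make_pair(Idx0, Idx1);
}

int main() {
  // Two DWORDX4 loads merged into one DWORDX8: the lower-offset load keeps
  // dwords 0..3, the other gets dwords 4..7.
  auto [A, B] = getSubRegIdxs(4, 4, /*ReverseOrder=*/false);
  assert(A == sub0_sub1_sub2_sub3 && B == sub4_sub5_sub6_sub7);

  // A 1-dword and a 2-dword load merged into a 3-dword result: the first
  // maps to sub0, the second to sub1_sub2 (the old <=4-dword path).
  auto [C, D] = getSubRegIdxs(1, 2, /*ReverseOrder=*/false);
  assert(C == sub0 && D == sub1_sub2);

  std::printf("subreg pairs: (%u,%u) and (%u,%u)\n", A, B, C, D);
}
```

This also explains the `OptimizeListAgain` bound dropping from 16 to 8: merging is pairwise, so a 2+2 merge produces a width-4 load worth re-scanning because it can still pair into a DWORDX8, while a width-8 result is already the widest `S_BUFFER_LOAD_DWORD*_IMM` this path emits.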
