Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 173
1 file changed, 135 insertions, 38 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 6d4e1d2c898b..2b5ca33b0e4f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -74,6 +74,8 @@ enum InstClassEnum {
   DS_READ,
   DS_WRITE,
   S_BUFFER_LOAD_IMM,
+  S_BUFFER_LOAD_SGPR_IMM,
+  S_LOAD_IMM,
   BUFFER_LOAD,
   BUFFER_STORE,
   MIMG,
@@ -120,7 +122,11 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
     unsigned NumAddresses;
     unsigned Order;

-    bool hasSameBaseAddress(const MachineInstr &MI) {
+    bool hasSameBaseAddress(const CombineInfo &CI) {
+      if (NumAddresses != CI.NumAddresses)
+        return false;
+
+      const MachineInstr &MI = *CI.I;
       for (unsigned i = 0; i < NumAddresses; i++) {
         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

@@ -159,7 +165,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
         if (AddrOp->getReg().isPhysical())
           return false;

-        // If an address has only one use then there will be on other
+        // If an address has only one use then there will be no other
         // instructions with the same address, so we can't merge this one.
         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
           return false;
@@ -232,8 +238,8 @@ private:
   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
   MachineBasicBlock::iterator
-  mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
-                          MachineBasicBlock::iterator InsertBefore);
+  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
+                       MachineBasicBlock::iterator InsertBefore);
   MachineBasicBlock::iterator
   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
@@ -257,7 +263,7 @@ private:
                            int32_t NewOffset) const;
   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
-  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
+  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
   void processBaseWithConstOffset(const MachineOperand &Base,
                                   MemAddress &Addr) const;
   /// Promotes constant offset to the immediate by adjusting the base. It
   /// tries to use a base from the nearby instructions that allows it to have
@@ -317,7 +323,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   if (TII.isMIMG(MI)) {
     uint64_t DMaskImm =
         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
-    return countPopulation(DMaskImm);
+    return llvm::popcount(DMaskImm);
   }
   if (TII.isMTBUF(Opc)) {
     return AMDGPU::getMTBUFElements(Opc);
@@ -325,6 +331,9 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {

   switch (Opc) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM:
   case AMDGPU::GLOBAL_LOAD_DWORD:
   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORD:
@@ -333,6 +342,9 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::FLAT_STORE_DWORD:
     return 1;
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_IMM:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX2:
@@ -348,6 +360,9 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::FLAT_STORE_DWORDX3:
     return 3;
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
+  case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::GLOBAL_LOAD_DWORDX4:
   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX4:
@@ -356,15 +371,18 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::FLAT_STORE_DWORDX4:
     return 4;
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+  case AMDGPU::S_LOAD_DWORDX8_IMM:
     return 8;
-  case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
-  case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
-  case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
+  case AMDGPU::DS_READ_B32: [[fallthrough]];
+  case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
+  case AMDGPU::DS_WRITE_B32: [[fallthrough]];
   case AMDGPU::DS_WRITE_B32_gfx9:
     return 1;
-  case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH;
-  case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
-  case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH;
+  case AMDGPU::DS_READ_B64: [[fallthrough]];
+  case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
+  case AMDGPU::DS_WRITE_B64: [[fallthrough]];
   case AMDGPU::DS_WRITE_B64_gfx9:
     return 2;
   default:
@@ -394,8 +412,8 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   }
   if (TII.isMIMG(Opc)) {
     // Ignore instructions encoded without vaddr.
-    if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
-        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
+    if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
+        !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
       return UNKNOWN;
     // Ignore BVH instructions
     if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
@@ -428,6 +446,22 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
     return S_BUFFER_LOAD_IMM;
+  // For the purposes of this optimization SGPR variants of buffer loads
+  // are considered to be zero-offsetted SGPR_IMM loads.
+  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+    return S_BUFFER_LOAD_SGPR_IMM;
+  case AMDGPU::S_LOAD_DWORD_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_LOAD_DWORDX8_IMM:
+    return S_LOAD_IMM;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
   case AMDGPU::DS_READ_B64:
@@ -499,6 +533,22 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
+  // For the purposes of this optimization SGPR variants of buffer loads
+  // are considered to be zero-offsetted SGPR_IMM loads.
+  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
+  case AMDGPU::S_LOAD_DWORD_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_LOAD_DWORDX8_IMM:
+    return AMDGPU::S_LOAD_DWORD_IMM;
   case AMDGPU::GLOBAL_LOAD_DWORD:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
@@ -591,10 +641,24 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   switch (Opc) {
   default:
     return Result;
+  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+    Result.SOffset = true;
+    [[fallthrough]];
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_LOAD_DWORDX8_IMM:
     Result.SBase = true;
     return Result;
   case AMDGPU::DS_READ_B32:
@@ -616,7 +680,7 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
     Result.SAddr = true;
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   case AMDGPU::GLOBAL_LOAD_DWORD:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
@@ -661,6 +725,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
             : 4;
     break;
   case S_BUFFER_LOAD_IMM:
+  case S_BUFFER_LOAD_SGPR_IMM:
+  case S_LOAD_IMM:
     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
     break;
   default:
@@ -674,7 +740,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
     Offset = 0;
   } else {
     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
-    Offset = I->getOperand(OffsetIdx).getImm();
+    Offset = OffsetIdx == -1 ? 0 : I->getOperand(OffsetIdx).getImm();
   }

   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
@@ -981,6 +1047,8 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
   default:
     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
   case S_BUFFER_LOAD_IMM:
+  case S_BUFFER_LOAD_SGPR_IMM:
+  case S_LOAD_IMM:
     switch (Width) {
     default:
       return false;
@@ -1293,7 +1361,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
   return New;
 }

-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
     CombineInfo &CI, CombineInfo &Paired,
     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
@@ -1310,12 +1378,16 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
   // will return true if this is the case.
   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

-  MachineInstr *New =
+  MachineInstrBuilder New =
       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
-          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
-          .addImm(MergedOffset) // offset
-          .addImm(CI.CPol) // cpol
-          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
+          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
+  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
+    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
+  // For convenience, when SGPR_IMM buffer loads are merged into a
+  // zero-offset load, we generate its SGPR variant.
+  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::offset))
+    New.addImm(MergedOffset);
+  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -1369,7 +1441,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
           .addImm(MergedOffset) // offset
           .addImm(CI.CPol) // cpol
-          .addImm(0) // tfe
           .addImm(0) // swz
           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

@@ -1429,7 +1500,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
           .addImm(MergedOffset) // offset
           .addImm(JoinedFormat) // format
           .addImm(CI.CPol) // cpol
-          .addImm(0) // tfe
           .addImm(0) // swz
           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

@@ -1501,7 +1571,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
           .addImm(JoinedFormat) // format
           .addImm(CI.CPol) // cpol
-          .addImm(0) // tfe
           .addImm(0) // swz
           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

@@ -1623,6 +1692,31 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
     case 8:
       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
     }
+  case S_BUFFER_LOAD_SGPR_IMM:
+    switch (Width) {
+    default:
+      return 0;
+    case 2:
+      return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR
+                            : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
+    case 4:
+      return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR
+                            : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
+    case 8:
+      return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR
+                            : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
+    }
+  case S_LOAD_IMM:
+    switch (Width) {
+    default:
+      return 0;
+    case 2:
+      return AMDGPU::S_LOAD_DWORDX2_IMM;
+    case 4:
+      return AMDGPU::S_LOAD_DWORDX4_IMM;
+    case 8:
+      return AMDGPU::S_LOAD_DWORDX8_IMM;
+    }
   case GLOBAL_LOAD:
     switch (Width) {
     default:
@@ -1690,7 +1784,7 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
     return AMDGPU::FLAT_STORE_DWORDX4;
   }
   case MIMG:
-    assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
+    assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
   }
@@ -1699,8 +1793,9 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
 std::pair<unsigned, unsigned>
 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                     const CombineInfo &Paired) {
-  assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) ==
-                                   CI.Width + Paired.Width)) &&
+  assert((CI.InstClass != MIMG ||
+          ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
+           CI.Width + Paired.Width)) &&
          "No overlaps");

   unsigned Idx0;
@@ -1725,13 +1820,14 @@ SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
     Idx1 = Idxs[CI.Width][Paired.Width - 1];
   }

-  return std::make_pair(Idx0, Idx1);
+  return std::pair(Idx0, Idx1);
 }

 const TargetRegisterClass *
 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                              const CombineInfo &Paired) {
-  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
+  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
+      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
     switch (CI.Width + Paired.Width) {
     default:
       return nullptr;
@@ -1796,7 +1892,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
           .addImm(CI.CPol) // cpol
-          .addImm(0) // tfe
           .addImm(0) // swz
           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

@@ -1889,18 +1984,18 @@ void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
 }

-Optional<int32_t>
+std::optional<int32_t>
 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
   if (Op.isImm())
     return Op.getImm();

   if (!Op.isReg())
-    return None;
+    return std::nullopt;

   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
       !Def->getOperand(1).isImm())
-    return None;
+    return std::nullopt;

   return Def->getOperand(1).getImm();
 }
@@ -2072,7 +2167,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
       continue;

-    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
+    InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));

     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
     TargetLoweringBase::AddrMode AM;
@@ -2123,7 +2218,7 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
     if (AddrList.front().InstClass == CI.InstClass &&
         AddrList.front().IsAGPR == CI.IsAGPR &&
-        AddrList.front().hasSameBaseAddress(*CI.I)) {
+        AddrList.front().hasSameBaseAddress(CI)) {
       AddrList.emplace_back(CI);
       return;
     }
@@ -2222,7 +2317,7 @@ SILoadStoreOptimizer::collectMergeableInsts(
     ++I;
   }

-  return std::make_pair(BlockI, Modified);
+  return std::pair(BlockI, Modified);
 }

 // Scan through looking for adjacent LDS operations with constant offsets from
@@ -2300,7 +2395,9 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
       break;
     case S_BUFFER_LOAD_IMM:
-      NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
+    case S_BUFFER_LOAD_SGPR_IMM:
+    case S_LOAD_IMM:
+      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
       OptimizeListAgain |= CI.Width + Paired.Width < 8;
       break;
     case BUFFER_LOAD:
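
For context, the merge criterion that SILoadStoreOptimizer applies, and that this change extends to S_LOAD_*_IMM and to the SGPR/SGPR_IMM forms of S_BUFFER_LOAD, can be sketched in a few lines of standalone C++. This is only an illustration of the idea under simplified assumptions (same base register, adjacent dword ranges, merged width of 2, 4 or 8 dwords); the names ScalarLoad and tryMerge are hypothetical and are not part of the patch.

#include <cstdint>
#include <optional>

// Simplified model of one scalar load: a byte offset from a common base
// register and a width in dwords (1, 2, 4 or 8).
struct ScalarLoad {
  uint32_t Offset;
  uint32_t Width;
};

// Returns the combined load if the two loads cover adjacent dword ranges and
// the combined width maps to a DWORDXn opcode; std::nullopt otherwise. This
// is a sketch of the pass's criterion, not the LLVM implementation.
std::optional<ScalarLoad> tryMerge(ScalarLoad A, ScalarLoad B) {
  const ScalarLoad &Lo = A.Offset <= B.Offset ? A : B;
  const ScalarLoad &Hi = A.Offset <= B.Offset ? B : A;

  // The higher load must start exactly where the lower one ends.
  if (Lo.Offset + Lo.Width * 4 != Hi.Offset)
    return std::nullopt;

  uint32_t Width = Lo.Width + Hi.Width;
  if (Width != 2 && Width != 4 && Width != 8)
    return std::nullopt;

  return ScalarLoad{Lo.Offset, Width};
}

// Example: tryMerge({0x10, 1}, {0x14, 1}) yields {0x10, 2}, i.e. two
// single-dword loads become one two-dword load at the lower offset.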