author | Dimitry Andric <dim@FreeBSD.org> | 2019-01-19 10:01:25 +0000
---|---|---
committer | Dimitry Andric <dim@FreeBSD.org> | 2019-01-19 10:01:25 +0000
commit | d8e91e46262bc44006913e6796843909f1ac7bcd (patch) |
tree | 7d0c143d9b38190e0fa0180805389da22cd834c5 /lib/Target/AMDGPU/SILoadStoreOptimizer.cpp |
parent | b7eb8e35e481a74962664b63dfb09483b200209a (diff) |
Diffstat (limited to 'lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r-- | lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 1062 |
1 file changed, 816 insertions(+), 246 deletions(-)
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 4b537540046f..be291b127301 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -20,6 +20,26 @@ // ==> // s_buffer_load_dwordx2 s[4:5], s[0:3], 4 // +// This pass also tries to promote constant offset to the immediate by +// adjusting the base. It tries to use a base from the nearby instructions that +// allows it to have a 13bit constant offset and then promotes the 13bit offset +// to the immediate. +// E.g. +// s_movk_i32 s0, 0x1800 +// v_add_co_u32_e32 v0, vcc, s0, v2 +// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +// +// s_movk_i32 s0, 0x1000 +// v_add_co_u32_e32 v5, vcc, s0, v2 +// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +// global_load_dwordx2 v[5:6], v[5:6], off +// global_load_dwordx2 v[0:1], v[0:1], off +// => +// s_movk_i32 s0, 0x1000 +// v_add_co_u32_e32 v5, vcc, s0, v2 +// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +// global_load_dwordx2 v[5:6], v[5:6], off +// global_load_dwordx2 v[0:1], v[5:6], off offset:2048 // // Future improvements: // @@ -43,9 +63,9 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" @@ -74,23 +94,38 @@ using namespace llvm; #define DEBUG_TYPE "si-load-store-opt" namespace { +enum InstClassEnum { + UNKNOWN, + DS_READ, + DS_WRITE, + S_BUFFER_LOAD_IMM, + BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN, + BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN, + BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET, + BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact, + BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact, + BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact, + BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact, +}; -class SILoadStoreOptimizer : public MachineFunctionPass { - enum InstClassEnum { - DS_READ_WRITE, - S_BUFFER_LOAD_IMM, - BUFFER_LOAD_OFFEN, - BUFFER_LOAD_OFFSET, - BUFFER_STORE_OFFEN, - BUFFER_STORE_OFFSET, - }; +enum RegisterEnum { + SBASE = 0x1, + SRSRC = 0x2, + SOFFSET = 0x4, + VADDR = 0x8, + ADDR = 0x10, +}; +class SILoadStoreOptimizer : public MachineFunctionPass { struct CombineInfo { MachineBasicBlock::iterator I; MachineBasicBlock::iterator Paired; unsigned EltSize; unsigned Offset0; unsigned Offset1; + unsigned Width0; + unsigned Width1; unsigned BaseOff; InstClassEnum InstClass; bool GLC0; @@ -98,9 +133,23 @@ class SILoadStoreOptimizer : public MachineFunctionPass { bool SLC0; bool SLC1; bool UseST64; - bool IsX2; - SmallVector<MachineInstr*, 8> InstsToMove; - }; + SmallVector<MachineInstr *, 8> InstsToMove; + }; + + struct BaseRegisters { + unsigned LoReg = 0; + unsigned HiReg = 0; + + unsigned LoSubReg = 0; + unsigned HiSubReg = 0; + }; + + struct MemAddress { + BaseRegisters Base; + int64_t Offset = 0; + }; + + using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; private: const GCNSubtarget *STM = nullptr; @@ -108,9 +157,16 @@ private: const SIRegisterInfo *TRI = nullptr; MachineRegisterInfo *MRI = nullptr; AliasAnalysis *AA = nullptr; - unsigned CreatedX2; + bool OptimizeAgain; static bool offsetsCanBeCombined(CombineInfo &CI); + static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI); + 
static unsigned getNewOpcode(const CombineInfo &CI); + static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI); + const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI); + unsigned getOpcodeWidth(const MachineInstr &MI); + InstClassEnum getInstClass(unsigned Opc); + unsigned getRegs(unsigned Opc); bool findMatchingInst(CombineInfo &CI); @@ -123,10 +179,21 @@ private: MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); - unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2, - bool &IsOffen) const; MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); + void updateBaseAndOffset(MachineInstr &I, unsigned NewBase, + int32_t NewOffset); + unsigned computeBase(MachineInstr &MI, const MemAddress &Addr); + MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI); + Optional<int32_t> extractConstOffset(const MachineOperand &Op); + void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr); + /// Promotes constant offset to the immediate by adjusting the base. It + /// tries to use a base from the nearby instructions that allows it to have + /// a 13bit constant offset which gets promoted to the immediate. + bool promoteConstantOffsetToImm(MachineInstr &CI, + MemInfoMap &Visited, + SmallPtrSet<MachineInstr *, 4> &Promoted); + public: static char ID; @@ -153,8 +220,8 @@ public: INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, - "SI Load Store Optimizer", false, false) +INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", + false, false) char SILoadStoreOptimizer::ID = 0; @@ -165,7 +232,7 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass() { } static void moveInstsAfter(MachineBasicBlock::iterator I, - ArrayRef<MachineInstr*> InstsToMove) { + ArrayRef<MachineInstr *> InstsToMove) { MachineBasicBlock *MBB = I->getParent(); ++I; for (MachineInstr *MI : InstsToMove) { @@ -191,21 +258,19 @@ static void addDefsUsesToList(const MachineInstr &MI, static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, MachineBasicBlock::iterator B, const SIInstrInfo *TII, - AliasAnalysis * AA) { + AliasAnalysis *AA) { // RAW or WAR - cannot reorder // WAW - cannot reorder // RAR - safe to reorder return !(A->mayStore() || B->mayStore()) || - TII->areMemAccessesTriviallyDisjoint(*A, *B, AA); + TII->areMemAccessesTriviallyDisjoint(*A, *B, AA); } // Add MI and its defs to the lists if MI reads one of the defs that are // already in the list. Returns true in that case. -static bool -addToListsIfDependent(MachineInstr &MI, - DenseSet<unsigned> &RegDefs, - DenseSet<unsigned> &PhysRegUses, - SmallVectorImpl<MachineInstr*> &Insts) { +static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs, + DenseSet<unsigned> &PhysRegUses, + SmallVectorImpl<MachineInstr *> &Insts) { for (MachineOperand &Use : MI.operands()) { // If one of the defs is read, then there is a use of Def between I and the // instruction that I will potentially be merged with. 
We will need to move @@ -228,18 +293,16 @@ addToListsIfDependent(MachineInstr &MI, return false; } -static bool -canMoveInstsAcrossMemOp(MachineInstr &MemOp, - ArrayRef<MachineInstr*> InstsToMove, - const SIInstrInfo *TII, - AliasAnalysis *AA) { +static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, + ArrayRef<MachineInstr *> InstsToMove, + const SIInstrInfo *TII, AliasAnalysis *AA) { assert(MemOp.mayLoadOrStore()); for (MachineInstr *InstToMove : InstsToMove) { if (!InstToMove->mayLoadOrStore()) continue; if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA)) - return false; + return false; } return true; } @@ -260,10 +323,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { CI.BaseOff = 0; // Handle SMEM and VMEM instructions. - if (CI.InstClass != DS_READ_WRITE) { - unsigned Diff = CI.IsX2 ? 2 : 1; - return (EltOffset0 + Diff == EltOffset1 || - EltOffset1 + Diff == EltOffset0) && + if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { + return (EltOffset0 + CI.Width0 == EltOffset1 || + EltOffset1 + CI.Width1 == EltOffset0) && CI.GLC0 == CI.GLC1 && (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1); } @@ -305,42 +367,176 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { return false; } +bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, + const CombineInfo &CI) { + const unsigned Width = (CI.Width0 + CI.Width1); + switch (CI.InstClass) { + default: + return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); + case S_BUFFER_LOAD_IMM: + switch (Width) { + default: + return false; + case 2: + case 4: + return true; + } + } +} + +unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) { + const unsigned Opc = MI.getOpcode(); + + if (TII->isMUBUF(MI)) { + return AMDGPU::getMUBUFDwords(Opc); + } + + switch (Opc) { + default: + return 0; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + return 1; + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + return 2; + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return 4; + } +} + +InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) { + if (TII->isMUBUF(Opc)) { + const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc); + + // If we couldn't identify the opcode, bail out. 
+ if (baseOpcode == -1) { + return UNKNOWN; + } + + switch (baseOpcode) { + default: + return UNKNOWN; + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: + return BUFFER_LOAD_OFFEN; + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: + return BUFFER_LOAD_OFFSET; + case AMDGPU::BUFFER_STORE_DWORD_OFFEN: + return BUFFER_STORE_OFFEN; + case AMDGPU::BUFFER_STORE_DWORD_OFFSET: + return BUFFER_STORE_OFFSET; + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: + return BUFFER_LOAD_OFFEN_exact; + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: + return BUFFER_LOAD_OFFSET_exact; + case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: + return BUFFER_STORE_OFFEN_exact; + case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: + return BUFFER_STORE_OFFSET_exact; + } + } + + switch (Opc) { + default: + return UNKNOWN; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return S_BUFFER_LOAD_IMM; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64_gfx9: + return DS_READ; + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64_gfx9: + return DS_WRITE; + } +} + +unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) { + if (TII->isMUBUF(Opc)) { + unsigned result = 0; + + if (AMDGPU::getMUBUFHasVAddr(Opc)) { + result |= VADDR; + } + + if (AMDGPU::getMUBUFHasSrsrc(Opc)) { + result |= SRSRC; + } + + if (AMDGPU::getMUBUFHasSoffset(Opc)) { + result |= SOFFSET; + } + + return result; + } + + switch (Opc) { + default: + return 0; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return SBASE; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64_gfx9: + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64_gfx9: + return ADDR; + } +} + bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator MBBI = CI.I; - unsigned AddrOpName[3] = {0}; - int AddrIdx[3]; - const MachineOperand *AddrReg[3]; + const unsigned Opc = CI.I->getOpcode(); + const InstClassEnum InstClass = getInstClass(Opc); + + if (InstClass == UNKNOWN) { + return false; + } + + const unsigned Regs = getRegs(Opc); + + unsigned AddrOpName[5] = {0}; + int AddrIdx[5]; + const MachineOperand *AddrReg[5]; unsigned NumAddresses = 0; - switch (CI.InstClass) { - case DS_READ_WRITE: + if (Regs & ADDR) { AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; - break; - case S_BUFFER_LOAD_IMM: + } + + if (Regs & SBASE) { AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; - break; - case BUFFER_LOAD_OFFEN: - case BUFFER_STORE_OFFEN: - AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; - AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; - AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; - break; - case BUFFER_LOAD_OFFSET: - case BUFFER_STORE_OFFSET: + } + + if (Regs & SRSRC) { AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; + } + + if (Regs & SOFFSET) { AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; - break; + } + + if (Regs & VADDR) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; } for (unsigned i = 0; i < NumAddresses; i++) { AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]); AddrReg[i] = &CI.I->getOperand(AddrIdx[i]); - // We only ever merge operations 
with the same base address register, so don't - // bother scanning forward if there are no other uses. + // We only ever merge operations with the same base address register, so + // don't bother scanning forward if there are no other uses. if (AddrReg[i]->isReg() && (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) || MRI->hasOneNonDBGUse(AddrReg[i]->getReg()))) @@ -353,8 +549,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { DenseSet<unsigned> PhysRegUsesToMove; addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); - for ( ; MBBI != E; ++MBBI) { - if (MBBI->getOpcode() != CI.I->getOpcode()) { + for (; MBBI != E; ++MBBI) { + const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE); + + if ((getInstClass(MBBI->getOpcode()) != InstClass) || + (IsDS && (MBBI->getOpcode() != Opc))) { // This is not a matching DS instruction, but we can keep looking as // long as one of these conditions are met: // 1. It is safe to move I down past MBBI. @@ -368,8 +567,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { } if (MBBI->mayLoadOrStore() && - (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || - !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { + (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { // We fail condition #1, but we may still be able to satisfy condition // #2. Add this instruction to the move list and then we will check // if condition #2 holds once we have selected the matching instruction. @@ -413,8 +612,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { continue; } - // Check same base pointer. Be careful of subregisters, which can occur with - // vectors of pointers. + // Check same base pointer. Be careful of subregisters, which can occur + // with vectors of pointers. if (AddrReg[i]->getReg() != AddrRegNext.getReg() || AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { Match = false; @@ -423,13 +622,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { } if (Match) { - int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), - AMDGPU::OpName::offset); + int OffsetIdx = + AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset); CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm(); + CI.Width0 = getOpcodeWidth(*CI.I); CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm(); + CI.Width1 = getOpcodeWidth(*MBBI); CI.Paired = MBBI; - if (CI.InstClass == DS_READ_WRITE) { + if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) { CI.Offset0 &= 0xffff; CI.Offset1 &= 0xffff; } else { @@ -445,7 +646,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { // We also need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. - if (offsetsCanBeCombined(CI)) + if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI)) if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) return true; } @@ -472,12 +673,12 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { if (STM->ldsRequiresM0Init()) return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; - return (EltSize == 4) ? - AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9; + return (EltSize == 4) ? 
AMDGPU::DS_READ2ST64_B32_gfx9 + : AMDGPU::DS_READ2ST64_B64_gfx9; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird @@ -489,8 +690,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( unsigned NewOffset0 = CI.Offset0; unsigned NewOffset1 = CI.Offset1; - unsigned Opc = CI.UseST64 ? - read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); + unsigned Opc = + CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; @@ -502,39 +703,40 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( } assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && - (NewOffset0 != NewOffset1) && - "Computed offset doesn't fit"); + (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Read2Desc = TII->get(Opc); - const TargetRegisterClass *SuperRC - = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; + const TargetRegisterClass *SuperRC = + (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; unsigned DestReg = MRI->createVirtualRegister(SuperRC); DebugLoc DL = CI.I->getDebugLoc(); unsigned BaseReg = AddrReg->getReg(); + unsigned BaseSubReg = AddrReg->getSubReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) - .addImm(CI.BaseOff); + .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) - .addReg(ImmReg) - .addReg(AddrReg->getReg()); + .addReg(ImmReg) + .addReg(AddrReg->getReg(), 0, BaseSubReg); + BaseSubReg = 0; } MachineInstrBuilder Read2 = - BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) - .addReg(BaseReg, BaseRegFlags) // addr - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) + .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); (void)Read2; @@ -561,32 +763,36 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { if (STM->ldsRequiresM0Init()) return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; - return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9; + return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 + : AMDGPU::DS_WRITE2_B64_gfx9; } unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { if (STM->ldsRequiresM0Init()) - return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; + return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 + : AMDGPU::DS_WRITE2ST64_B64; - return (EltSize == 4) ? - AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9; + return (EltSize == 4) ? 
AMDGPU::DS_WRITE2ST64_B32_gfx9 + : AMDGPU::DS_WRITE2ST64_B64_gfx9; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. We want to be // sure we preserve the subregister index and any register flags set on them. - const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); - const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); - const MachineOperand *Data1 - = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); + const MachineOperand *AddrReg = + TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); + const MachineOperand *Data0 = + TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); + const MachineOperand *Data1 = + TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); unsigned NewOffset0 = CI.Offset0; unsigned NewOffset1 = CI.Offset1; - unsigned Opc = CI.UseST64 ? - write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); + unsigned Opc = + CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); if (NewOffset0 > NewOffset1) { // Canonicalize the merged instruction so the smaller offset comes first. @@ -595,36 +801,37 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( } assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && - (NewOffset0 != NewOffset1) && - "Computed offset doesn't fit"); + (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Write2Desc = TII->get(Opc); DebugLoc DL = CI.I->getDebugLoc(); unsigned BaseReg = AddrReg->getReg(); + unsigned BaseSubReg = AddrReg->getSubReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) - .addImm(CI.BaseOff); + .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) - .addReg(ImmReg) - .addReg(AddrReg->getReg()); + .addReg(ImmReg) + .addReg(AddrReg->getReg(), 0, BaseSubReg); + BaseSubReg = 0; } MachineInstrBuilder Write2 = - BuildMI(*MBB, CI.Paired, DL, Write2Desc) - .addReg(BaseReg, BaseRegFlags) // addr - .add(*Data0) // data0 - .add(*Data1) // data1 - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + BuildMI(*MBB, CI.Paired, DL, Write2Desc) + .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr + .add(*Data0) // data0 + .add(*Data1) // data1 + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); moveInstsAfter(Write2, CI.InstsToMove); @@ -636,15 +843,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( return Next; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM : - AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + const unsigned Opcode = getNewOpcode(CI); + + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); - const TargetRegisterClass *SuperRC = - CI.IsX2 ? 
&AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass; unsigned DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); @@ -652,14 +858,11 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) .addImm(MergedOffset) // offset .addImm(CI.GLC0) // glc - .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); - - unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); - // Handle descending offsets - if (CI.Offset0 > CI.Offset1) - std::swap(SubRegIdx0, SubRegIdx1); + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); // Copy to the old destination registers. const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); @@ -681,29 +884,25 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( return Next; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - unsigned Opcode; - if (CI.InstClass == BUFFER_LOAD_OFFEN) { - Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN : - AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; - } else { - Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET : - AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; - } + const unsigned Opcode = getNewOpcode(CI); - const TargetRegisterClass *SuperRC = - CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); + + // Copy to the new source register. unsigned DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); - if (CI.InstClass == BUFFER_LOAD_OFFEN) - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + const unsigned Regs = getRegs(Opcode); + + if (Regs & VADDR) + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -711,14 +910,11 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( .addImm(CI.GLC0) // glc .addImm(CI.SLC0) // slc .addImm(0) // tfe - .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); - - unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); - // Handle descending offsets - if (CI.Offset0 > CI.Offset1) - std::swap(SubRegIdx0, SubRegIdx1); + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); // Copy to the old destination registers. 
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); @@ -740,57 +936,137 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( return Next; } -unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode( - const MachineInstr &I, bool &IsX2, bool &IsOffen) const { - IsX2 = false; - IsOffen = false; - - switch (I.getOpcode()) { - case AMDGPU::BUFFER_STORE_DWORD_OFFEN: - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; - case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: - IsX2 = true; - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact: - IsX2 = true; - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET: - return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: - return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: - IsX2 = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact: - IsX2 = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact; - } - return 0; -} - -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( - CombineInfo &CI) { +unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { + const unsigned Width = CI.Width0 + CI.Width1; + + switch (CI.InstClass) { + default: + return AMDGPU::getMUBUFOpcode(CI.InstClass, Width); + case UNKNOWN: + llvm_unreachable("Unknown instruction class"); + case S_BUFFER_LOAD_IMM: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + case 4: + return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; + } + } +} + +std::pair<unsigned, unsigned> +SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) { + if (CI.Offset0 > CI.Offset1) { + switch (CI.Width0) { + default: + return std::make_pair(0, 0); + case 1: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub1, AMDGPU::sub0); + case 2: + return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1); + case 3: + return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2); + } + case 2: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0); + case 2: + return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1); + } + case 3: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0); + } + } + } else { + switch (CI.Width0) { + default: + return std::make_pair(0, 0); + case 1: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub0, AMDGPU::sub1); + case 2: + return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2); + case 3: + return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3); + } + case 2: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2); + case 2: + return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3); + } + case 3: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3); + } + } + } +} + +const TargetRegisterClass * +SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) { + if (CI.InstClass == S_BUFFER_LOAD_IMM) { + 
switch (CI.Width0 + CI.Width1) { + default: + return nullptr; + case 2: + return &AMDGPU::SReg_64_XEXECRegClass; + case 4: + return &AMDGPU::SReg_128RegClass; + case 8: + return &AMDGPU::SReg_256RegClass; + case 16: + return &AMDGPU::SReg_512RegClass; + } + } else { + switch (CI.Width0 + CI.Width1) { + default: + return nullptr; + case 2: + return &AMDGPU::VReg_64RegClass; + case 3: + return &AMDGPU::VReg_96RegClass; + case 4: + return &AMDGPU::VReg_128RegClass; + } + } +} + +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - bool Unused1, Unused2; - unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2); - unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; + const unsigned Opcode = getNewOpcode(CI); - // Handle descending offsets - if (CI.Offset0 > CI.Offset1) - std::swap(SubRegIdx0, SubRegIdx1); + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); // Copy to the new source register. - const TargetRegisterClass *SuperRC = - CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); unsigned SrcReg = MRI->createVirtualRegister(SuperRC); const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); @@ -803,18 +1079,20 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( .addImm(SubRegIdx1); auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode)) - .addReg(SrcReg, RegState::Kill); + .addReg(SrcReg, RegState::Kill); - if (CI.InstClass == BUFFER_STORE_OFFEN) - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + const unsigned Regs = getRegs(Opcode); + + if (Regs & VADDR) + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) .addImm(std::min(CI.Offset0, CI.Offset1)) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe - .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); moveInstsAfter(MIB, CI.InstsToMove); @@ -824,105 +1102,399 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( return Next; } +MachineOperand +SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) { + APInt V(32, Val, true); + if (TII->isInlineConstant(V)) + return MachineOperand::CreateImm(Val); + + unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + MachineInstr *Mov = + BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), + TII->get(AMDGPU::S_MOV_B32), Reg) + .addImm(Val); + (void)Mov; + LLVM_DEBUG(dbgs() << " "; Mov->dump()); + return MachineOperand::CreateReg(Reg, false); +} + +// Compute base address using Addr and return the final register. 
+unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, + const MemAddress &Addr) { + MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::iterator MBBI = MI.getIterator(); + DebugLoc DL = MI.getDebugLoc(); + + assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || + Addr.Base.LoSubReg) && + "Expected 32-bit Base-Register-Low!!"); + + assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || + Addr.Base.HiSubReg) && + "Expected 32-bit Base-Register-Hi!!"); + + LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); + MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); + MachineOperand OffsetHi = + createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); + unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned DeadCarryReg = + MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + + unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MachineInstr *LoHalf = + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0) + .addReg(CarryReg, RegState::Define) + .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) + .add(OffsetLo); + (void)LoHalf; + LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); + + MachineInstr *HiHalf = + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) + .addReg(DeadCarryReg, RegState::Define | RegState::Dead) + .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) + .add(OffsetHi) + .addReg(CarryReg, RegState::Kill); + (void)HiHalf; + LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); + + unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); + MachineInstr *FullBase = + BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + (void)FullBase; + LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); + + return FullDestReg; +} + +// Update base and offset with the NewBase and NewOffset in MI. 
+void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, + unsigned NewBase, + int32_t NewOffset) { + TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase); + TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); +} + +Optional<int32_t> +SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) { + if (Op.isImm()) + return Op.getImm(); + + if (!Op.isReg()) + return None; + + MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg()); + if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || + !Def->getOperand(1).isImm()) + return None; + + return Def->getOperand(1).getImm(); +} + +// Analyze Base and extracts: +// - 32bit base registers, subregisters +// - 64bit constant offset +// Expecting base computation as: +// %OFFSET0:sgpr_32 = S_MOV_B32 8000 +// %LO:vgpr_32, %c:sreg_64_xexec = +// V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32, +// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec +// %Base:vreg_64 = +// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 +void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, + MemAddress &Addr) { + if (!Base.isReg()) + return; + + MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg()); + if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE + || Def->getNumOperands() != 5) + return; + + MachineOperand BaseLo = Def->getOperand(1); + MachineOperand BaseHi = Def->getOperand(3); + if (!BaseLo.isReg() || !BaseHi.isReg()) + return; + + MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg()); + MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg()); + + if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 || + !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) + return; + + const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); + const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); + + auto Offset0P = extractConstOffset(*Src0); + if (Offset0P) + BaseLo = *Src1; + else { + if (!(Offset0P = extractConstOffset(*Src1))) + return; + BaseLo = *Src0; + } + + Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0); + Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1); + + if (Src0->isImm()) + std::swap(Src0, Src1); + + if (!Src1->isImm()) + return; + + uint64_t Offset1 = Src1->getImm(); + BaseHi = *Src0; + + Addr.Base.LoReg = BaseLo.getReg(); + Addr.Base.HiReg = BaseHi.getReg(); + Addr.Base.LoSubReg = BaseLo.getSubReg(); + Addr.Base.HiSubReg = BaseHi.getSubReg(); + Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); +} + +bool SILoadStoreOptimizer::promoteConstantOffsetToImm( + MachineInstr &MI, + MemInfoMap &Visited, + SmallPtrSet<MachineInstr *, 4> &AnchorList) { + + // TODO: Support flat and scratch. + if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 || + TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL) + return false; + + // TODO: Support Store. + if (!MI.mayLoad()) + return false; + + if (AnchorList.count(&MI)) + return false; + + LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump()); + + if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) { + LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";); + return false; + } + + // Step1: Find the base-registers and a 64bit constant offset. 
+ MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); + MemAddress MAddr; + if (Visited.find(&MI) == Visited.end()) { + processBaseWithConstOffset(Base, MAddr); + Visited[&MI] = MAddr; + } else + MAddr = Visited[&MI]; + + if (MAddr.Offset == 0) { + LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no" + " constant offsets that can be promoted.\n";); + return false; + } + + LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", " + << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";); + + // Step2: Traverse through MI's basic block and find an anchor(that has the + // same base-registers) with the highest 13bit distance from MI's offset. + // E.g. (64bit loads) + // bb: + // addr1 = &a + 4096; load1 = load(addr1, 0) + // addr2 = &a + 6144; load2 = load(addr2, 0) + // addr3 = &a + 8192; load3 = load(addr3, 0) + // addr4 = &a + 10240; load4 = load(addr4, 0) + // addr5 = &a + 12288; load5 = load(addr5, 0) + // + // Starting from the first load, the optimization will try to find a new base + // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192 + // has 13bit distance from &a + 4096. The heuristic considers &a + 8192 + // as the new-base(anchor) because of the maximum distance which can + // accomodate more intermediate bases presumeably. + // + // Step3: move (&a + 8192) above load1. Compute and promote offsets from + // (&a + 8192) for load1, load2, load4. + // addr = &a + 8192 + // load1 = load(addr, -4096) + // load2 = load(addr, -2048) + // load3 = load(addr, 0) + // load4 = load(addr, 2048) + // addr5 = &a + 12288; load5 = load(addr5, 0) + // + MachineInstr *AnchorInst = nullptr; + MemAddress AnchorAddr; + uint32_t MaxDist = std::numeric_limits<uint32_t>::min(); + SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase; + + MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::iterator E = MBB->end(); + MachineBasicBlock::iterator MBBI = MI.getIterator(); + ++MBBI; + const SITargetLowering *TLI = + static_cast<const SITargetLowering *>(STM->getTargetLowering()); + + for ( ; MBBI != E; ++MBBI) { + MachineInstr &MINext = *MBBI; + // TODO: Support finding an anchor(with same base) from store addresses or + // any other load addresses where the opcodes are different. 
+ if (MINext.getOpcode() != MI.getOpcode() || + TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm()) + continue; + + const MachineOperand &BaseNext = + *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); + MemAddress MAddrNext; + if (Visited.find(&MINext) == Visited.end()) { + processBaseWithConstOffset(BaseNext, MAddrNext); + Visited[&MINext] = MAddrNext; + } else + MAddrNext = Visited[&MINext]; + + if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || + MAddrNext.Base.HiReg != MAddr.Base.HiReg || + MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || + MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) + continue; + + InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset)); + + int64_t Dist = MAddr.Offset - MAddrNext.Offset; + TargetLoweringBase::AddrMode AM; + AM.HasBaseReg = true; + AM.BaseOffs = Dist; + if (TLI->isLegalGlobalAddressingMode(AM) && + (uint32_t)std::abs(Dist) > MaxDist) { + MaxDist = std::abs(Dist); + + AnchorAddr = MAddrNext; + AnchorInst = &MINext; + } + } + + if (AnchorInst) { + LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): "; + AnchorInst->dump()); + LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " + << AnchorAddr.Offset << "\n\n"); + + // Instead of moving up, just re-compute anchor-instruction's base address. + unsigned Base = computeBase(MI, AnchorAddr); + + updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); + LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();); + + for (auto P : InstsWCommonBase) { + TargetLoweringBase::AddrMode AM; + AM.HasBaseReg = true; + AM.BaseOffs = P.second - AnchorAddr.Offset; + + if (TLI->isLegalGlobalAddressingMode(AM)) { + LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second; + dbgs() << ")"; P.first->dump()); + updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset); + LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump()); + } + } + AnchorList.insert(AnchorInst); + return true; + } + + return false; +} + // Scan through looking for adjacent LDS operations with constant offsets from // the same base register. We rely on the scheduler to do the hard work of // clustering nearby loads, and assume these are all adjacent. bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { bool Modified = false; + // Contain the list + MemInfoMap Visited; + // Contains the list of instructions for which constant offsets are being + // promoted to the IMM. + SmallPtrSet<MachineInstr *, 4> AnchorList; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { MachineInstr &MI = *I; + if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) + Modified = true; + // Don't combine if volatile. if (MI.hasOrderedMemoryRef()) { ++I; continue; } + const unsigned Opc = MI.getOpcode(); + CombineInfo CI; CI.I = I; - unsigned Opc = MI.getOpcode(); - if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 || - Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) { + CI.InstClass = getInstClass(Opc); - CI.InstClass = DS_READ_WRITE; + switch (CI.InstClass) { + default: + break; + case DS_READ: CI.EltSize = - (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4; - + (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 
8 + : 4; if (findMatchingInst(CI)) { Modified = true; I = mergeRead2Pair(CI); } else { ++I; } - continue; - } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 || - Opc == AMDGPU::DS_WRITE_B32_gfx9 || - Opc == AMDGPU::DS_WRITE_B64_gfx9) { - CI.InstClass = DS_READ_WRITE; - CI.EltSize - = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4; - + case DS_WRITE: + CI.EltSize = + (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 + : 4; if (findMatchingInst(CI)) { Modified = true; I = mergeWrite2Pair(CI); } else { ++I; } - continue; - } - if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM || - Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) { - // EltSize is in units of the offset encoding. - CI.InstClass = S_BUFFER_LOAD_IMM; + case S_BUFFER_LOAD_IMM: CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4); - CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; if (findMatchingInst(CI)) { Modified = true; I = mergeSBufferLoadImmPair(CI); - if (!CI.IsX2) - CreatedX2++; + OptimizeAgain |= (CI.Width0 + CI.Width1) < 16; } else { ++I; } continue; - } - if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) { - if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN) - CI.InstClass = BUFFER_LOAD_OFFEN; - else - CI.InstClass = BUFFER_LOAD_OFFSET; - + case BUFFER_LOAD_OFFEN: + case BUFFER_LOAD_OFFSET: + case BUFFER_LOAD_OFFEN_exact: + case BUFFER_LOAD_OFFSET_exact: CI.EltSize = 4; - CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; if (findMatchingInst(CI)) { Modified = true; I = mergeBufferLoadPair(CI); - if (!CI.IsX2) - CreatedX2++; + OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; } else { ++I; } continue; - } - - bool StoreIsX2, IsOffen; - if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) { - CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET; + case BUFFER_STORE_OFFEN: + case BUFFER_STORE_OFFSET: + case BUFFER_STORE_OFFEN_exact: + case BUFFER_STORE_OFFSET_exact: CI.EltSize = 4; - CI.IsX2 = StoreIsX2; if (findMatchingInst(CI)) { Modified = true; I = mergeBufferStorePair(CI); - if (!CI.IsX2) - CreatedX2++; + OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; } else { ++I; } @@ -956,12 +1528,10 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { bool Modified = false; for (MachineBasicBlock &MBB : MF) { - CreatedX2 = 0; - Modified |= optimizeBlock(MBB); - - // Run again to convert x2 to x4. - if (CreatedX2 >= 1) + do { + OptimizeAgain = false; Modified |= optimizeBlock(MBB); + } while (OptimizeAgain); } return Modified; |
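The bulk of this change is the new `promoteConstantOffsetToImm` path described in the header comment above: for a load whose address is base plus a large constant, the pass scans the block for another load on the same base whose offset differs by the largest amount that still fits the 13-bit immediate field, recomputes that "anchor" address once, and folds the per-load differences into the instruction offsets. As a reading aid for the diff, below is a minimal, self-contained C++ sketch of just that anchor-selection heuristic. It is not the pass itself: the `Load` struct, the `fitsSigned13` check, and the offsets are illustrative stand-ins for `MachineInstr`, `SITargetLowering::isLegalGlobalAddressingMode`, and the real address operands.

```cpp
// Sketch of the anchor-selection heuristic from promoteConstantOffsetToImm,
// under simplifying assumptions: every load shares one base register, a load
// is just (name, byte offset from that base), and an immediate is legal iff
// it fits a signed 13-bit field. The real pass works on MachineInstrs.
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

struct Load {
  std::string Name;
  int64_t Offset; // byte offset from the common base register
};

static bool fitsSigned13(int64_t Imm) { return Imm >= -4096 && Imm <= 4095; }

int main() {
  // The example from the pass comment: five loads at &a + 4096 ... 12288.
  std::vector<Load> Loads = {{"load1", 4096},  {"load2", 6144},
                             {"load3", 8192},  {"load4", 10240},
                             {"load5", 12288}};

  const Load &Cur = Loads.front(); // the load being rewritten
  const Load *Anchor = nullptr;
  int64_t MaxDist = -1;

  // Choose the candidate farthest from Cur whose distance is still
  // encodable; a farther anchor can cover more neighbouring loads.
  for (const Load &Cand : Loads) {
    int64_t Dist = Cur.Offset - Cand.Offset;
    if (fitsSigned13(Dist) && std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);
      Anchor = &Cand;
    }
  }
  if (!Anchor)
    return 0;

  std::cout << "anchor base = &a + " << Anchor->Offset << "\n";
  // Re-express every load whose distance from the anchor is encodable.
  for (const Load &L : Loads) {
    int64_t Imm = L.Offset - Anchor->Offset;
    if (fitsSigned13(Imm))
      std::cout << "  " << L.Name << " -> load(anchor, offset:" << Imm << ")\n";
    else
      std::cout << "  " << L.Name << " keeps its original base\n";
  }
}
```

Run on the offsets above, the sketch picks &a + 8192 as the anchor and rewrites load1 through load4 with offsets -4096, -2048, 0, and 2048, while load5 (distance 4096, one past the signed 13-bit range) keeps its own base, matching the worked example in the pass comment.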