diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 842 |
1 file changed, 506 insertions(+), 336 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 44bdbe37dec0..6d4e1d2c898b 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -79,6 +79,13 @@ enum InstClassEnum { MIMG, TBUFFER_LOAD, TBUFFER_STORE, + GLOBAL_LOAD_SADDR, + GLOBAL_STORE_SADDR, + FLAT_LOAD, + FLAT_STORE, + GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of + GLOBAL_STORE // any CombineInfo, they are only ever returned by + // getCommonInstClass. }; struct AddressRegs { @@ -86,6 +93,7 @@ struct AddressRegs { bool SBase = false; bool SRsrc = false; bool SOffset = false; + bool SAddr = false; bool VAddr = false; bool Addr = false; bool SSamp = false; @@ -160,6 +168,11 @@ class SILoadStoreOptimizer : public MachineFunctionPass { } void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); + + // Compare by pointer order. + bool operator<(const CombineInfo& Other) const { + return (InstClass == MIMG) ? 
DMask < Other.DMask : Offset < Other.Offset; + } }; struct BaseRegisters { @@ -185,6 +198,9 @@ private: AliasAnalysis *AA = nullptr; bool OptimizeAgain; + bool canSwapInstructions(const DenseSet<Register> &ARegDefs, + const DenseSet<Register> &ARegUses, + const MachineInstr &A, const MachineInstr &B) const; static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII, const CombineInfo &Paired); @@ -199,38 +215,43 @@ private: const CombineInfo &Paired); const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; - bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, - SmallVectorImpl<MachineInstr *> &InstsToMove); + CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); unsigned read2Opcode(unsigned EltSize) const; unsigned read2ST64Opcode(unsigned EltSize) const; - MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, - CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator + mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore); unsigned write2Opcode(unsigned EltSize) const; unsigned write2ST64Opcode(unsigned EltSize) const; MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator 
mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); + MachineBasicBlock::iterator + mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore); + MachineBasicBlock::iterator + mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore); void updateBaseAndOffset(MachineInstr &I, Register NewBase, int32_t NewOffset) const; @@ -252,6 +273,12 @@ private: MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, std::list<std::list<CombineInfo>> &MergeableInsts) const; + static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, + const CombineInfo &Paired); + + static InstClassEnum getCommonInstClass(const CombineInfo &CI, + const CombineInfo &Paired); + public: static char ID; @@ -298,10 +325,35 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { switch (Opc) { case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_STORE_DWORD: return 1; case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX2: return 2; + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case 
AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX3: + return 3; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX4: + case AMDGPU::FLAT_STORE_DWORDX4: return 4; case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return 8; @@ -386,11 +438,40 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64: case AMDGPU::DS_WRITE_B64_gfx9: return DS_WRITE; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + return FLAT_LOAD; + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + return GLOBAL_LOAD_SADDR; + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX4: + return FLAT_STORE; + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + return GLOBAL_STORE_SADDR; } } /// Determines instruction subclass from opcode. Only instructions -/// of the same subclass can be merged together. +/// of the same subclass can be merged together. The merged instruction may have +/// a different subclass but must have the same class. 
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { switch (Opc) { default: @@ -418,9 +499,55 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + return AMDGPU::FLAT_LOAD_DWORD; + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX4: + return AMDGPU::FLAT_STORE_DWORD; + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + return AMDGPU::GLOBAL_STORE_DWORD_SADDR; } } +// GLOBAL loads and stores are classified as FLAT initially. If both combined +// instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE. +// If either or both instructions are non segment specific FLAT the resulting +// combined operation will be FLAT, potentially promoting one of the GLOBAL +// operations to FLAT. +// For other instructions return the original unmodified class. 
+InstClassEnum +SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI, + const CombineInfo &Paired) { + assert(CI.InstClass == Paired.InstClass); + + if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) && + SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I)) + return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD; + + return CI.InstClass; +} + static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { AddressRegs Result; @@ -480,6 +607,34 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64_gfx9: Result.Addr = true; return Result; + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + Result.SAddr = true; + LLVM_FALLTHROUGH; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX4: + Result.VAddr = true; + return Result; } } @@ -551,6 +706,9 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, if (Regs.SOffset) AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); + if (Regs.SAddr) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); if (Regs.VAddr) AddrIdx[NumAddresses++] = 
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); @@ -579,92 +737,58 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass() { return new SILoadStoreOptimizer(); } -static void moveInstsAfter(MachineBasicBlock::iterator I, - ArrayRef<MachineInstr *> InstsToMove) { - MachineBasicBlock *MBB = I->getParent(); - ++I; - for (MachineInstr *MI : InstsToMove) { - MI->removeFromParent(); - MBB->insert(I, MI); - } -} - static void addDefsUsesToList(const MachineInstr &MI, DenseSet<Register> &RegDefs, - DenseSet<Register> &PhysRegUses) { - for (const MachineOperand &Op : MI.operands()) { - if (Op.isReg()) { - if (Op.isDef()) - RegDefs.insert(Op.getReg()); - else if (Op.readsReg() && Op.getReg().isPhysical()) - PhysRegUses.insert(Op.getReg()); - } + DenseSet<Register> &RegUses) { + for (const auto &Op : MI.operands()) { + if (!Op.isReg()) + continue; + if (Op.isDef()) + RegDefs.insert(Op.getReg()); + if (Op.readsReg()) + RegUses.insert(Op.getReg()); } } -static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, - MachineBasicBlock::iterator B, - AliasAnalysis *AA) { - // RAW or WAR - cannot reorder - // WAW - cannot reorder - // RAR - safe to reorder - return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true); -} - -// Add MI and its defs to the lists if MI reads one of the defs that are -// already in the list. Returns true in that case. -static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs, - DenseSet<Register> &PhysRegUses, - SmallVectorImpl<MachineInstr *> &Insts) { - for (MachineOperand &Use : MI.operands()) { - // If one of the defs is read, then there is a use of Def between I and the - // instruction that I will potentially be merged with. We will need to move - // this instruction after the merged instructions. - // - // Similarly, if there is a def which is read by an instruction that is to - // be moved for merging, then we need to move the def-instruction as well. 
- // This can only happen for physical registers such as M0; virtual - // registers are in SSA form. - if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) || - (Use.isDef() && RegDefs.count(Use.getReg())) || - (Use.isDef() && Use.getReg().isPhysical() && - PhysRegUses.count(Use.getReg())))) { - Insts.push_back(&MI); - addDefsUsesToList(MI, RegDefs, PhysRegUses); - return true; - } - } - - return false; -} - -static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, - ArrayRef<MachineInstr *> InstsToMove, - AliasAnalysis *AA) { - assert(MemOp.mayLoadOrStore()); - - for (MachineInstr *InstToMove : InstsToMove) { - if (!InstToMove->mayLoadOrStore()) +bool SILoadStoreOptimizer::canSwapInstructions( + const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses, + const MachineInstr &A, const MachineInstr &B) const { + if (A.mayLoadOrStore() && B.mayLoadOrStore() && + (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true)) + return false; + for (const auto &BOp : B.operands()) { + if (!BOp.isReg()) continue; - if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA)) + if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg())) + return false; + if (BOp.isDef() && ARegUses.contains(BOp.getReg())) return false; } return true; } -// This function assumes that \p A and \p B have are identical except for -// size and offset, and they reference adjacent memory. -static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, - const MachineMemOperand *A, - const MachineMemOperand *B) { - unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); - unsigned Size = A->getSize() + B->getSize(); - // This function adds the offset parameter to the existing offset for A, - // so we pass 0 here as the offset and then manually set it to the correct - // value after the call. 
- MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); - MMO->setOffset(MinOffset); - return MMO; +// Given that \p CI and \p Paired are adjacent memory operations produce a new +// MMO for the combined operation with a new access size. +MachineMemOperand * +SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI, + const CombineInfo &Paired) { + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); + + unsigned Size = MMOa->getSize() + MMOb->getSize(); + + // A base pointer for the combined operation is the same as the leading + // operation's pointer. + if (Paired < CI) + std::swap(MMOa, MMOb); + + MachinePointerInfo PtrInfo(MMOa->getPointerInfo()); + // If merging FLAT and GLOBAL set address space to FLAT. + if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) + PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS; + + MachineFunction *MF = CI.I->getMF(); + return MF->getMachineMemOperand(MMOa, PtrInfo, Size); } bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, @@ -787,8 +911,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { return (EltOffset0 + CI.Width == EltOffset1 || EltOffset1 + Paired.Width == EltOffset0) && - CI.CPol == Paired.CPol && - (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol); + CI.CPol == Paired.CPol; } // If the offset in elements doesn't fit in 8-bits, we might be able to use @@ -889,111 +1012,59 @@ SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { return nullptr; } -/// This function assumes that CI comes before Paired in a basic block. -bool SILoadStoreOptimizer::checkAndPrepareMerge( - CombineInfo &CI, CombineInfo &Paired, - SmallVectorImpl<MachineInstr *> &InstsToMove) { +/// This function assumes that CI comes before Paired in a basic block. Return +/// an insertion point for the merged instruction or nullptr on failure. 
+SILoadStoreOptimizer::CombineInfo * +SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, + CombineInfo &Paired) { + // If another instruction has already been merged into CI, it may now be a + // type that we can't do any further merging into. + if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) + return nullptr; + assert(CI.InstClass == Paired.InstClass); + + if (getInstSubclass(CI.I->getOpcode(), *TII) != + getInstSubclass(Paired.I->getOpcode(), *TII)) + return nullptr; // Check both offsets (or masks for MIMG) can be combined and fit in the // reduced range. - if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired)) - return false; - - if (CI.InstClass != MIMG && - (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))) - return false; - - const unsigned Opc = CI.I->getOpcode(); - const InstClassEnum InstClass = getInstClass(Opc, *TII); - - if (InstClass == UNKNOWN) { - return false; + if (CI.InstClass == MIMG) { + if (!dmasksCanBeCombined(CI, *TII, Paired)) + return nullptr; + } else { + if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)) + return nullptr; } - const unsigned InstSubclass = getInstSubclass(Opc, *TII); - - DenseSet<Register> RegDefsToMove; - DenseSet<Register> PhysRegUsesToMove; - addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); - - MachineBasicBlock::iterator E = std::next(Paired.I); - MachineBasicBlock::iterator MBBI = std::next(CI.I); - MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); - for (; MBBI != E; ++MBBI) { - - if (MBBI == MBBE) { - // CombineInfo::Order is a hint on the instruction ordering within the - // basic block. This hint suggests that CI precedes Paired, which is - // true most of the time. However, moveInstsAfter() processing a - // previous list may have changed this order in a situation when it - // moves an instruction which exists in some other merge list. - // In this case it must be dependent. 
- return false; - } - if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) || - (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) { - // This is not a matching instruction, but we can keep looking as - // long as one of these conditions are met: - // 1. It is safe to move I down past MBBI. - // 2. It is safe to move MBBI down past the instruction that I will - // be merged into. - - if (MBBI->mayLoadOrStore() && - (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || - !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) { - // We fail condition #1, but we may still be able to satisfy condition - // #2. Add this instruction to the move list and then we will check - // if condition #2 holds once we have selected the matching instruction. - InstsToMove.push_back(&*MBBI); - addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove); - continue; - } - - // When we match I with another DS instruction we will be moving I down - // to the location of the matched instruction any uses of I will need to - // be moved down as well. - addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, - InstsToMove); - continue; + DenseSet<Register> RegDefs; + DenseSet<Register> RegUses; + CombineInfo *Where; + if (CI.I->mayLoad()) { + // Try to hoist Paired up to CI. + addDefsUsesToList(*Paired.I, RegDefs, RegUses); + for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { + if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI)) + return nullptr; } - - // Handle a case like - // DS_WRITE_B32 addr, v, idx0 - // w = DS_READ_B32 addr, idx0 - // DS_WRITE_B32 addr, f(w), idx1 - // where the DS_READ_B32 ends up in InstsToMove and therefore prevents - // merging of the two writes. - if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, - InstsToMove)) - continue; - - if (&*MBBI == &*Paired.I) { - // We need to go through the list of instructions that we plan to - // move and make sure they are all safe to move down past the merged - // instruction. 
- if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) { - - // Call offsetsCanBeCombined with modify = true so that the offsets are - // correct for the new instruction. This should return true, because - // this function should only be called on CombineInfo objects that - // have already been confirmed to be mergeable. - if (CI.InstClass != MIMG) - offsetsCanBeCombined(CI, *STM, Paired, true); - return true; - } - return false; + Where = &CI; + } else { + // Try to sink CI down to Paired. + addDefsUsesToList(*CI.I, RegDefs, RegUses); + for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { + if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI)) + return nullptr; } - - // We've found a load/store that we couldn't merge for some reason. - // We could potentially keep looking, but we'd need to make sure that - // it was safe to move I and also all the instruction in InstsToMove - // down past this instruction. - // check if we can move I across MBBI and if we can move all I's users - if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || - !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) - break; + Where = &Paired; } - return false; + + // Call offsetsCanBeCombined with modify = true so that the offsets are + // correct for the new instruction. This should return true, because + // this function should only be called on CombineInfo objects that + // have already been confirmed to be mergeable. 
+ if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) + offsetsCanBeCombined(CI, *STM, Paired, true); + return Where; } unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { @@ -1012,7 +1083,7 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird @@ -1051,13 +1122,13 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, unsigned BaseRegFlags = 0; if (CI.BaseOff) { Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) + TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) .addReg(ImmReg) .addReg(AddrReg->getReg(), 0, BaseSubReg) .addImm(0); // clamp bit @@ -1065,7 +1136,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, } MachineInstrBuilder Read2 = - BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) + BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg) .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr .addImm(NewOffset0) // offset0 .addImm(NewOffset1) // offset1 @@ -1077,14 +1148,12 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); // Copy to the old destination registers. - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. 
.addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1109,9 +1178,9 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { : AMDGPU::DS_WRITE2ST64_B64_gfx9; } -MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( + CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. We want to be @@ -1145,13 +1214,13 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, unsigned BaseRegFlags = 0; if (CI.BaseOff) { Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) + TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) .addReg(ImmReg) .addReg(AddrReg->getReg(), 0, BaseSubReg) .addImm(0); // clamp bit @@ -1159,7 +1228,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, } MachineInstrBuilder Write2 = - BuildMI(*MBB, Paired.I, DL, Write2Desc) + BuildMI(*MBB, InsertBefore, DL, Write2Desc) .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr .add(*Data0) // data0 .add(*Data1) // data1 @@ -1168,8 +1237,6 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 
.addImm(0) // gds .cloneMergedMemRefs({&*CI.I, &*Paired.I}); - moveInstsAfter(Write2, InstsToMove); - CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1179,7 +1246,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1191,7 +1258,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, unsigned DMaskIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { if (I == DMaskIdx) MIB.addImm(MergedDMask); @@ -1204,10 +1271,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, // will return true if this is the case. 
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - - MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); unsigned SubRegIdx0, SubRegIdx1; std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); @@ -1217,14 +1281,12 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. .addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1233,7 +1295,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1248,15 +1310,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( // will return true if this is the case. 
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = - BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) - .addImm(MergedOffset) // offset - .addImm(CI.CPol) // cpol - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) + .addImm(MergedOffset) // offset + .addImm(CI.CPol) // cpol + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1267,14 +1326,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. 
.addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1283,7 +1340,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1295,7 +1352,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1307,9 +1364,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( // will return true if this is the case. 
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1317,7 +1371,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1328,14 +1382,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. 
.addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1344,7 +1396,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1356,7 +1408,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1371,9 +1423,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( // will return true if this is the case. 
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1382,8 +1431,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand( - combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1394,14 +1442,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. 
.addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1410,7 +1456,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1427,13 +1473,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) .add(*Src0) .addImm(SubRegIdx0) .add(*Src1) .addImm(SubRegIdx1); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1449,9 +1495,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( // will return true if this is the case. 
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1460,10 +1503,92 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand( - combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); + + CI.I->eraseFromParent(); + Paired.I->eraseFromParent(); + return New; +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( + CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); - moveInstsAfter(MIB, InstsToMove); + const unsigned Opcode = getNewOpcode(CI, Paired); + + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); + Register DestReg = MRI->createVirtualRegister(SuperRC); + + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); + + if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) + MIB.add(*SAddr); + + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) + .addImm(std::min(CI.Offset, Paired.Offset)) + .addImm(CI.CPol) + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); + + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); + + // Copy to the old destination registers. 
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); + const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); + const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); + + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest0) // Copy to same destination including flags and sub reg. + .addReg(DestReg, 0, SubRegIdx0); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); + + CI.I->eraseFromParent(); + Paired.I->eraseFromParent(); + return New; +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( + CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); + + const unsigned Opcode = getNewOpcode(CI, Paired); + + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); + + // Copy to the new source register. 
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); + Register SrcReg = MRI->createVirtualRegister(SuperRC); + + const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); + const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); + + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + .add(*Src0) + .addImm(SubRegIdx0) + .add(*Src1) + .addImm(SubRegIdx1); + + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) + .addReg(SrcReg, RegState::Kill); + + if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) + MIB.add(*SAddr); + + MachineInstr *New = + MIB.addImm(std::min(CI.Offset, Paired.Offset)) + .addImm(CI.CPol) + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1474,7 +1599,7 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired) { const unsigned Width = CI.Width + Paired.Width; - switch (CI.InstClass) { + switch (getCommonInstClass(CI, Paired)) { default: assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); // FIXME: Handle d16 correctly @@ -1498,6 +1623,72 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, case 8: return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; } + case GLOBAL_LOAD: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_LOAD_DWORDX2; + case 3: + return AMDGPU::GLOBAL_LOAD_DWORDX3; + case 4: + return AMDGPU::GLOBAL_LOAD_DWORDX4; + } + case GLOBAL_LOAD_SADDR: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; + case 3: + return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; + case 4: + return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; + } + case GLOBAL_STORE: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_STORE_DWORDX2; + case 3: + return AMDGPU::GLOBAL_STORE_DWORDX3; + case 4: + return 
AMDGPU::GLOBAL_STORE_DWORDX4; + } + case GLOBAL_STORE_SADDR: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; + case 3: + return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; + case 4: + return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; + } + case FLAT_LOAD: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::FLAT_LOAD_DWORDX2; + case 3: + return AMDGPU::FLAT_LOAD_DWORDX3; + case 4: + return AMDGPU::FLAT_LOAD_DWORDX4; + } + case FLAT_STORE: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::FLAT_STORE_DWORDX2; + case 3: + return AMDGPU::FLAT_STORE_DWORDX3; + case 4: + return AMDGPU::FLAT_STORE_DWORDX4; + } case MIMG: assert((countPopulation(CI.DMask | Paired.DMask) == Width) && "No overlaps"); @@ -1508,15 +1699,9 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, std::pair<unsigned, unsigned> SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) { - bool ReverseOrder; - if (CI.InstClass == MIMG) { - assert( - (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && - "No overlaps"); - ReverseOrder = CI.DMask > Paired.DMask; - } else { - ReverseOrder = CI.Offset > Paired.Offset; - } + assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) == + CI.Width + Paired.Width)) && + "No overlaps"); unsigned Idx0; unsigned Idx1; @@ -1532,7 +1717,7 @@ SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, assert(CI.Width >= 1 && CI.Width <= 4); assert(Paired.Width >= 1 && Paired.Width <= 4); - if (ReverseOrder) { + if (Paired < CI) { Idx1 = Idxs[0][Paired.Width - 1]; Idx0 = Idxs[Paired.Width][CI.Width - 1]; } else { @@ -1569,7 +1754,7 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { 
MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1586,13 +1771,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) .add(*Src0) .addImm(SubRegIdx0) .add(*Src1) .addImm(SubRegIdx1); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1606,9 +1791,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( // will return true if this is the case. assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1616,9 +1798,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); - - moveInstsAfter(MIB, InstsToMove); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1846,7 +2026,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192 // has 13bit distance from &a + 4096. The heuristic considers &a + 8192 // as the new-base(anchor) because of the maximum distance which can - // accomodate more intermediate bases presumeably. + // accommodate more intermediate bases presumably. 
// // Step3: move (&a + 8192) above load1. Compute and promote offsets from // (&a + 8192) for load1, load2, load4. @@ -2098,8 +2278,8 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( CombineInfo &CI = *First; CombineInfo &Paired = *Second; - SmallVector<MachineInstr *, 8> InstsToMove; - if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) { + CombineInfo *Where = checkAndPrepareMerge(CI, Paired); + if (!Where) { ++I; continue; } @@ -2108,66 +2288,56 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); + MachineBasicBlock::iterator NewMI; switch (CI.InstClass) { default: llvm_unreachable("unknown InstClass"); break; - case DS_READ: { - MachineBasicBlock::iterator NewMI = - mergeRead2Pair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); + case DS_READ: + NewMI = mergeRead2Pair(CI, Paired, Where->I); break; - } - case DS_WRITE: { - MachineBasicBlock::iterator NewMI = - mergeWrite2Pair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); + case DS_WRITE: + NewMI = mergeWrite2Pair(CI, Paired, Where->I); break; - } - case S_BUFFER_LOAD_IMM: { - MachineBasicBlock::iterator NewMI = - mergeSBufferLoadImmPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 8; + case S_BUFFER_LOAD_IMM: + NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 8; break; - } - case BUFFER_LOAD: { - MachineBasicBlock::iterator NewMI = - mergeBufferLoadPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case BUFFER_LOAD: + NewMI = mergeBufferLoadPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case BUFFER_STORE: { - MachineBasicBlock::iterator NewMI = - mergeBufferStorePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case BUFFER_STORE: + NewMI = 
mergeBufferStorePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case MIMG: { - MachineBasicBlock::iterator NewMI = - mergeImagePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case MIMG: + NewMI = mergeImagePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case TBUFFER_LOAD: { - MachineBasicBlock::iterator NewMI = - mergeTBufferLoadPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case TBUFFER_LOAD: + NewMI = mergeTBufferLoadPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case TBUFFER_STORE: { - MachineBasicBlock::iterator NewMI = - mergeTBufferStorePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case TBUFFER_STORE: + NewMI = mergeTBufferStorePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; + break; + case FLAT_LOAD: + case GLOBAL_LOAD: + case GLOBAL_LOAD_SADDR: + NewMI = mergeFlatLoadPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; + break; + case FLAT_STORE: + case GLOBAL_STORE: + case GLOBAL_STORE_SADDR: + NewMI = mergeFlatStorePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; } - } - CI.Order = Paired.Order; + CI.setMI(NewMI, *this); + CI.Order = Where->Order; if (I == Second) I = Next; |
