aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 842
1 files changed, 506 insertions, 336 deletions
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 44bdbe37dec0..6d4e1d2c898b 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -79,6 +79,13 @@ enum InstClassEnum {
MIMG,
TBUFFER_LOAD,
TBUFFER_STORE,
+ GLOBAL_LOAD_SADDR,
+ GLOBAL_STORE_SADDR,
+ FLAT_LOAD,
+ FLAT_STORE,
+ GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
+ GLOBAL_STORE // any CombineInfo, they are only ever returned by
+ // getCommonInstClass.
};
struct AddressRegs {
@@ -86,6 +93,7 @@ struct AddressRegs {
bool SBase = false;
bool SRsrc = false;
bool SOffset = false;
+ bool SAddr = false;
bool VAddr = false;
bool Addr = false;
bool SSamp = false;
@@ -160,6 +168,11 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
}
void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
+
+  // Order by DMask for MIMG instructions, otherwise by offset.
+ bool operator<(const CombineInfo& Other) const {
+ return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
+ }
};
struct BaseRegisters {
@@ -185,6 +198,9 @@ private:
AliasAnalysis *AA = nullptr;
bool OptimizeAgain;
+ bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
+ const DenseSet<Register> &ARegUses,
+ const MachineInstr &A, const MachineInstr &B) const;
static bool dmasksCanBeCombined(const CombineInfo &CI,
const SIInstrInfo &TII,
const CombineInfo &Paired);
@@ -199,38 +215,43 @@ private:
const CombineInfo &Paired);
const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
- bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
- SmallVectorImpl<MachineInstr *> &InstsToMove);
+ CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
unsigned read2Opcode(unsigned EltSize) const;
unsigned read2ST64Opcode(unsigned EltSize) const;
- MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
- CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator
+ mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
+ MachineBasicBlock::iterator InsertBefore);
unsigned write2Opcode(unsigned EltSize) const;
unsigned write2ST64Opcode(unsigned EltSize) const;
MachineBasicBlock::iterator
mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator InsertBefore);
MachineBasicBlock::iterator
mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator InsertBefore);
MachineBasicBlock::iterator
mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator InsertBefore);
MachineBasicBlock::iterator
mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator InsertBefore);
MachineBasicBlock::iterator
mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator InsertBefore);
MachineBasicBlock::iterator
mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator InsertBefore);
MachineBasicBlock::iterator
mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator InsertBefore);
+ MachineBasicBlock::iterator
+ mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
+ MachineBasicBlock::iterator InsertBefore);
+ MachineBasicBlock::iterator
+ mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
+ MachineBasicBlock::iterator InsertBefore);
void updateBaseAndOffset(MachineInstr &I, Register NewBase,
int32_t NewOffset) const;
@@ -252,6 +273,12 @@ private:
MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
std::list<std::list<CombineInfo>> &MergeableInsts) const;
+ static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
+ const CombineInfo &Paired);
+
+ static InstClassEnum getCommonInstClass(const CombineInfo &CI,
+ const CombineInfo &Paired);
+
public:
static char ID;
@@ -298,10 +325,35 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
switch (Opc) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ case AMDGPU::GLOBAL_LOAD_DWORD:
+ case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORD:
+ case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORD:
+ case AMDGPU::FLAT_STORE_DWORD:
return 1;
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX2:
+ case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX2:
+ case AMDGPU::FLAT_STORE_DWORDX2:
return 2;
+ case AMDGPU::GLOBAL_LOAD_DWORDX3:
+ case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX3:
+ case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX3:
+ case AMDGPU::FLAT_STORE_DWORDX3:
+ return 3;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX4:
+ case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX4:
+ case AMDGPU::FLAT_STORE_DWORDX4:
return 4;
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return 8;
@@ -386,11 +438,40 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::DS_WRITE_B64:
case AMDGPU::DS_WRITE_B64_gfx9:
return DS_WRITE;
+ case AMDGPU::GLOBAL_LOAD_DWORD:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2:
+ case AMDGPU::GLOBAL_LOAD_DWORDX3:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4:
+ case AMDGPU::FLAT_LOAD_DWORD:
+ case AMDGPU::FLAT_LOAD_DWORDX2:
+ case AMDGPU::FLAT_LOAD_DWORDX3:
+ case AMDGPU::FLAT_LOAD_DWORDX4:
+ return FLAT_LOAD;
+ case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
+ return GLOBAL_LOAD_SADDR;
+ case AMDGPU::GLOBAL_STORE_DWORD:
+ case AMDGPU::GLOBAL_STORE_DWORDX2:
+ case AMDGPU::GLOBAL_STORE_DWORDX3:
+ case AMDGPU::GLOBAL_STORE_DWORDX4:
+ case AMDGPU::FLAT_STORE_DWORD:
+ case AMDGPU::FLAT_STORE_DWORDX2:
+ case AMDGPU::FLAT_STORE_DWORDX3:
+ case AMDGPU::FLAT_STORE_DWORDX4:
+ return FLAT_STORE;
+ case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
+ return GLOBAL_STORE_SADDR;
}
}
/// Determines instruction subclass from opcode. Only instructions
-/// of the same subclass can be merged together.
+/// of the same subclass can be merged together. The merged instruction may have
+/// a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
switch (Opc) {
default:
@@ -418,9 +499,55 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
+ case AMDGPU::GLOBAL_LOAD_DWORD:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2:
+ case AMDGPU::GLOBAL_LOAD_DWORDX3:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4:
+ case AMDGPU::FLAT_LOAD_DWORD:
+ case AMDGPU::FLAT_LOAD_DWORDX2:
+ case AMDGPU::FLAT_LOAD_DWORDX3:
+ case AMDGPU::FLAT_LOAD_DWORDX4:
+ return AMDGPU::FLAT_LOAD_DWORD;
+ case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
+ return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
+ case AMDGPU::GLOBAL_STORE_DWORD:
+ case AMDGPU::GLOBAL_STORE_DWORDX2:
+ case AMDGPU::GLOBAL_STORE_DWORDX3:
+ case AMDGPU::GLOBAL_STORE_DWORDX4:
+ case AMDGPU::FLAT_STORE_DWORD:
+ case AMDGPU::FLAT_STORE_DWORDX2:
+ case AMDGPU::FLAT_STORE_DWORDX3:
+ case AMDGPU::FLAT_STORE_DWORDX4:
+ return AMDGPU::FLAT_STORE_DWORD;
+ case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
+ return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
}
}
+// GLOBAL loads and stores are classified as FLAT initially. If both combined
+// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
+// If either or both instructions are non segment specific FLAT the resulting
+// combined operation will be FLAT, potentially promoting one of the GLOBAL
+// operations to FLAT.
+// For other instructions return the original unmodified class.
+InstClassEnum
+SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
+ const CombineInfo &Paired) {
+ assert(CI.InstClass == Paired.InstClass);
+
+ if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
+ SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
+ return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
+
+ return CI.InstClass;
+}
+
static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
AddressRegs Result;
@@ -480,6 +607,34 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::DS_WRITE_B64_gfx9:
Result.Addr = true;
return Result;
+ case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
+ case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
+ Result.SAddr = true;
+ LLVM_FALLTHROUGH;
+ case AMDGPU::GLOBAL_LOAD_DWORD:
+ case AMDGPU::GLOBAL_LOAD_DWORDX2:
+ case AMDGPU::GLOBAL_LOAD_DWORDX3:
+ case AMDGPU::GLOBAL_LOAD_DWORDX4:
+ case AMDGPU::GLOBAL_STORE_DWORD:
+ case AMDGPU::GLOBAL_STORE_DWORDX2:
+ case AMDGPU::GLOBAL_STORE_DWORDX3:
+ case AMDGPU::GLOBAL_STORE_DWORDX4:
+ case AMDGPU::FLAT_LOAD_DWORD:
+ case AMDGPU::FLAT_LOAD_DWORDX2:
+ case AMDGPU::FLAT_LOAD_DWORDX3:
+ case AMDGPU::FLAT_LOAD_DWORDX4:
+ case AMDGPU::FLAT_STORE_DWORD:
+ case AMDGPU::FLAT_STORE_DWORDX2:
+ case AMDGPU::FLAT_STORE_DWORDX3:
+ case AMDGPU::FLAT_STORE_DWORDX4:
+ Result.VAddr = true;
+ return Result;
}
}
@@ -551,6 +706,9 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
if (Regs.SOffset)
AddrIdx[NumAddresses++] =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
+ if (Regs.SAddr)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
if (Regs.VAddr)
AddrIdx[NumAddresses++] =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
@@ -579,92 +737,58 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass() {
return new SILoadStoreOptimizer();
}
-static void moveInstsAfter(MachineBasicBlock::iterator I,
- ArrayRef<MachineInstr *> InstsToMove) {
- MachineBasicBlock *MBB = I->getParent();
- ++I;
- for (MachineInstr *MI : InstsToMove) {
- MI->removeFromParent();
- MBB->insert(I, MI);
- }
-}
-
static void addDefsUsesToList(const MachineInstr &MI,
DenseSet<Register> &RegDefs,
- DenseSet<Register> &PhysRegUses) {
- for (const MachineOperand &Op : MI.operands()) {
- if (Op.isReg()) {
- if (Op.isDef())
- RegDefs.insert(Op.getReg());
- else if (Op.readsReg() && Op.getReg().isPhysical())
- PhysRegUses.insert(Op.getReg());
- }
+ DenseSet<Register> &RegUses) {
+ for (const auto &Op : MI.operands()) {
+ if (!Op.isReg())
+ continue;
+ if (Op.isDef())
+ RegDefs.insert(Op.getReg());
+ if (Op.readsReg())
+ RegUses.insert(Op.getReg());
}
}
-static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
- MachineBasicBlock::iterator B,
- AliasAnalysis *AA) {
- // RAW or WAR - cannot reorder
- // WAW - cannot reorder
- // RAR - safe to reorder
- return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
-}
-
-// Add MI and its defs to the lists if MI reads one of the defs that are
-// already in the list. Returns true in that case.
-static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
- DenseSet<Register> &PhysRegUses,
- SmallVectorImpl<MachineInstr *> &Insts) {
- for (MachineOperand &Use : MI.operands()) {
- // If one of the defs is read, then there is a use of Def between I and the
- // instruction that I will potentially be merged with. We will need to move
- // this instruction after the merged instructions.
- //
- // Similarly, if there is a def which is read by an instruction that is to
- // be moved for merging, then we need to move the def-instruction as well.
- // This can only happen for physical registers such as M0; virtual
- // registers are in SSA form.
- if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
- (Use.isDef() && RegDefs.count(Use.getReg())) ||
- (Use.isDef() && Use.getReg().isPhysical() &&
- PhysRegUses.count(Use.getReg())))) {
- Insts.push_back(&MI);
- addDefsUsesToList(MI, RegDefs, PhysRegUses);
- return true;
- }
- }
-
- return false;
-}
-
-static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
- ArrayRef<MachineInstr *> InstsToMove,
- AliasAnalysis *AA) {
- assert(MemOp.mayLoadOrStore());
-
- for (MachineInstr *InstToMove : InstsToMove) {
- if (!InstToMove->mayLoadOrStore())
+bool SILoadStoreOptimizer::canSwapInstructions(
+ const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
+ const MachineInstr &A, const MachineInstr &B) const {
+ if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
+ (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
+ return false;
+ for (const auto &BOp : B.operands()) {
+ if (!BOp.isReg())
continue;
- if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
+ if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
+ return false;
+ if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
return false;
}
return true;
}
-// This function assumes that \p A and \p B have are identical except for
-// size and offset, and they reference adjacent memory.
-static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
- const MachineMemOperand *A,
- const MachineMemOperand *B) {
- unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
- unsigned Size = A->getSize() + B->getSize();
- // This function adds the offset parameter to the existing offset for A,
- // so we pass 0 here as the offset and then manually set it to the correct
- // value after the call.
- MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
- MMO->setOffset(MinOffset);
- return MMO;
+// Given that \p CI and \p Paired are adjacent memory operations produce a new
+// MMO for the combined operation with a new access size.
+MachineMemOperand *
+SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
+ const CombineInfo &Paired) {
+ const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+ const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
+
+ unsigned Size = MMOa->getSize() + MMOb->getSize();
+
+ // A base pointer for the combined operation is the same as the leading
+ // operation's pointer.
+ if (Paired < CI)
+ std::swap(MMOa, MMOb);
+
+ MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
+ // If merging FLAT and GLOBAL set address space to FLAT.
+ if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
+ PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
+
+ MachineFunction *MF = CI.I->getMF();
+ return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}
bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
@@ -787,8 +911,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
return (EltOffset0 + CI.Width == EltOffset1 ||
EltOffset1 + Paired.Width == EltOffset0) &&
- CI.CPol == Paired.CPol &&
- (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol);
+ CI.CPol == Paired.CPol;
}
// If the offset in elements doesn't fit in 8-bits, we might be able to use
@@ -889,111 +1012,59 @@ SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
return nullptr;
}
-/// This function assumes that CI comes before Paired in a basic block.
-bool SILoadStoreOptimizer::checkAndPrepareMerge(
- CombineInfo &CI, CombineInfo &Paired,
- SmallVectorImpl<MachineInstr *> &InstsToMove) {
+/// This function assumes that CI comes before Paired in a basic block. Return
+/// an insertion point for the merged instruction or nullptr on failure.
+SILoadStoreOptimizer::CombineInfo *
+SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
+ CombineInfo &Paired) {
+ // If another instruction has already been merged into CI, it may now be a
+ // type that we can't do any further merging into.
+ if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
+ return nullptr;
+ assert(CI.InstClass == Paired.InstClass);
+
+ if (getInstSubclass(CI.I->getOpcode(), *TII) !=
+ getInstSubclass(Paired.I->getOpcode(), *TII))
+ return nullptr;
// Check both offsets (or masks for MIMG) can be combined and fit in the
// reduced range.
- if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
- return false;
-
- if (CI.InstClass != MIMG &&
- (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
- return false;
-
- const unsigned Opc = CI.I->getOpcode();
- const InstClassEnum InstClass = getInstClass(Opc, *TII);
-
- if (InstClass == UNKNOWN) {
- return false;
+ if (CI.InstClass == MIMG) {
+ if (!dmasksCanBeCombined(CI, *TII, Paired))
+ return nullptr;
+ } else {
+ if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
+ return nullptr;
}
- const unsigned InstSubclass = getInstSubclass(Opc, *TII);
-
- DenseSet<Register> RegDefsToMove;
- DenseSet<Register> PhysRegUsesToMove;
- addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
-
- MachineBasicBlock::iterator E = std::next(Paired.I);
- MachineBasicBlock::iterator MBBI = std::next(CI.I);
- MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
- for (; MBBI != E; ++MBBI) {
-
- if (MBBI == MBBE) {
- // CombineInfo::Order is a hint on the instruction ordering within the
- // basic block. This hint suggests that CI precedes Paired, which is
- // true most of the time. However, moveInstsAfter() processing a
- // previous list may have changed this order in a situation when it
- // moves an instruction which exists in some other merge list.
- // In this case it must be dependent.
- return false;
- }
- if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
- (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
- // This is not a matching instruction, but we can keep looking as
- // long as one of these conditions are met:
- // 1. It is safe to move I down past MBBI.
- // 2. It is safe to move MBBI down past the instruction that I will
- // be merged into.
-
- if (MBBI->mayLoadOrStore() &&
- (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
- // We fail condition #1, but we may still be able to satisfy condition
- // #2. Add this instruction to the move list and then we will check
- // if condition #2 holds once we have selected the matching instruction.
- InstsToMove.push_back(&*MBBI);
- addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
- continue;
- }
-
- // When we match I with another DS instruction we will be moving I down
- // to the location of the matched instruction any uses of I will need to
- // be moved down as well.
- addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
- InstsToMove);
- continue;
+ DenseSet<Register> RegDefs;
+ DenseSet<Register> RegUses;
+ CombineInfo *Where;
+ if (CI.I->mayLoad()) {
+ // Try to hoist Paired up to CI.
+ addDefsUsesToList(*Paired.I, RegDefs, RegUses);
+ for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
+ if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
+ return nullptr;
}
-
- // Handle a case like
- // DS_WRITE_B32 addr, v, idx0
- // w = DS_READ_B32 addr, idx0
- // DS_WRITE_B32 addr, f(w), idx1
- // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
- // merging of the two writes.
- if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
- InstsToMove))
- continue;
-
- if (&*MBBI == &*Paired.I) {
- // We need to go through the list of instructions that we plan to
- // move and make sure they are all safe to move down past the merged
- // instruction.
- if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {
-
- // Call offsetsCanBeCombined with modify = true so that the offsets are
- // correct for the new instruction. This should return true, because
- // this function should only be called on CombineInfo objects that
- // have already been confirmed to be mergeable.
- if (CI.InstClass != MIMG)
- offsetsCanBeCombined(CI, *STM, Paired, true);
- return true;
- }
- return false;
+ Where = &CI;
+ } else {
+ // Try to sink CI down to Paired.
+ addDefsUsesToList(*CI.I, RegDefs, RegUses);
+ for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
+ if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
+ return nullptr;
}
-
- // We've found a load/store that we couldn't merge for some reason.
- // We could potentially keep looking, but we'd need to make sure that
- // it was safe to move I and also all the instruction in InstsToMove
- // down past this instruction.
- // check if we can move I across MBBI and if we can move all I's users
- if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
- break;
+ Where = &Paired;
}
- return false;
+
+ // Call offsetsCanBeCombined with modify = true so that the offsets are
+ // correct for the new instruction. This should return true, because
+ // this function should only be called on CombineInfo objects that
+ // have already been confirmed to be mergeable.
+ if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
+ offsetsCanBeCombined(CI, *STM, Paired, true);
+ return Where;
}
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
@@ -1012,7 +1083,7 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
// Be careful, since the addresses could be subregisters themselves in weird
@@ -1051,13 +1122,13 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+ BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
.addImm(CI.BaseOff);
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
+ TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
.addReg(ImmReg)
.addReg(AddrReg->getReg(), 0, BaseSubReg)
.addImm(0); // clamp bit
@@ -1065,7 +1136,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
}
MachineInstrBuilder Read2 =
- BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg)
+ BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
.addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
.addImm(NewOffset0) // offset0
.addImm(NewOffset1) // offset1
@@ -1077,14 +1148,12 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
// Copy to the old destination registers.
- BuildMI(*MBB, Paired.I, DL, CopyDesc)
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
- .add(*Dest1)
- .addReg(DestReg, RegState::Kill, SubRegIdx1);
-
- moveInstsAfter(Copy1, InstsToMove);
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1109,9 +1178,9 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
: AMDGPU::DS_WRITE2ST64_B64_gfx9;
}
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
+ CombineInfo &CI, CombineInfo &Paired,
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
@@ -1145,13 +1214,13 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+ BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
.addImm(CI.BaseOff);
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
+ TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
.addReg(ImmReg)
.addReg(AddrReg->getReg(), 0, BaseSubReg)
.addImm(0); // clamp bit
@@ -1159,7 +1228,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
}
MachineInstrBuilder Write2 =
- BuildMI(*MBB, Paired.I, DL, Write2Desc)
+ BuildMI(*MBB, InsertBefore, DL, Write2Desc)
.addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
.add(*Data0) // data0
.add(*Data1) // data1
@@ -1168,8 +1237,6 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
.addImm(0) // gds
.cloneMergedMemRefs({&*CI.I, &*Paired.I});
- moveInstsAfter(Write2, InstsToMove);
-
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1179,7 +1246,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1191,7 +1258,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
unsigned DMaskIdx =
AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
- auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
+ auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
if (I == DMaskIdx)
MIB.addImm(MergedDMask);
@@ -1204,10 +1271,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
// will return true if this is the case.
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
- const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
- MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+ MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
unsigned SubRegIdx0, SubRegIdx1;
std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
@@ -1217,14 +1281,12 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
- BuildMI(*MBB, Paired.I, DL, CopyDesc)
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
- .add(*Dest1)
- .addReg(DestReg, RegState::Kill, SubRegIdx1);
-
- moveInstsAfter(Copy1, InstsToMove);
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1233,7 +1295,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1248,15 +1310,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
// will return true if this is the case.
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
- const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
MachineInstr *New =
- BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
- .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
- .addImm(MergedOffset) // offset
- .addImm(CI.CPol) // cpol
- .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+ BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
+ .addImm(MergedOffset) // offset
+ .addImm(CI.CPol) // cpol
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -1267,14 +1326,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
- BuildMI(*MBB, Paired.I, DL, CopyDesc)
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
- .add(*Dest1)
- .addReg(DestReg, RegState::Kill, SubRegIdx1);
-
- moveInstsAfter(Copy1, InstsToMove);
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1283,7 +1340,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1295,7 +1352,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
Register DestReg = MRI->createVirtualRegister(SuperRC);
unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
- auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
+ auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
AddressRegs Regs = getRegs(Opcode, *TII);
@@ -1307,9 +1364,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
// will return true if this is the case.
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
- const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
MachineInstr *New =
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -1317,7 +1371,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
.addImm(CI.CPol) // cpol
.addImm(0) // tfe
.addImm(0) // swz
- .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -1328,14 +1382,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
- BuildMI(*MBB, Paired.I, DL, CopyDesc)
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
- .add(*Dest1)
- .addReg(DestReg, RegState::Kill, SubRegIdx1);
-
- moveInstsAfter(Copy1, InstsToMove);
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1344,7 +1396,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1356,7 +1408,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
Register DestReg = MRI->createVirtualRegister(SuperRC);
unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
- auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
+ auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
AddressRegs Regs = getRegs(Opcode, *TII);
@@ -1371,9 +1423,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
// will return true if this is the case.
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
- const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
MachineInstr *New =
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -1382,8 +1431,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
.addImm(CI.CPol) // cpol
.addImm(0) // tfe
.addImm(0) // swz
- .addMemOperand(
- combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -1394,14 +1442,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
- BuildMI(*MBB, Paired.I, DL, CopyDesc)
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
- .add(*Dest1)
- .addReg(DestReg, RegState::Kill, SubRegIdx1);
-
- moveInstsAfter(Copy1, InstsToMove);
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1410,7 +1456,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1427,13 +1473,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
- BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+ BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
.add(*Src0)
.addImm(SubRegIdx0)
.add(*Src1)
.addImm(SubRegIdx1);
- auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
+ auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
AddressRegs Regs = getRegs(Opcode, *TII);
@@ -1449,9 +1495,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
// will return true if this is the case.
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
- const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
MachineInstr *New =
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -1460,10 +1503,92 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
.addImm(CI.CPol) // cpol
.addImm(0) // tfe
.addImm(0) // swz
- .addMemOperand(
- combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
+
+ CI.I->eraseFromParent();
+ Paired.I->eraseFromParent();
+ return New;
+}
+
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
+ CombineInfo &CI, CombineInfo &Paired,
+ MachineBasicBlock::iterator InsertBefore) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
- moveInstsAfter(MIB, InstsToMove);
+ const unsigned Opcode = getNewOpcode(CI, Paired);
+
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
+ Register DestReg = MRI->createVirtualRegister(SuperRC);
+
+ auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
+
+ if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
+ MIB.add(*SAddr);
+
+ MachineInstr *New =
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
+ .addImm(std::min(CI.Offset, Paired.Offset))
+ .addImm(CI.CPol)
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
+
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
+ const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+ const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
+
+ // Copy to the old destination registers.
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+ const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
+ const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
+
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+ .add(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+ CI.I->eraseFromParent();
+ Paired.I->eraseFromParent();
+ return New;
+}
+
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
+ CombineInfo &CI, CombineInfo &Paired,
+ MachineBasicBlock::iterator InsertBefore) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
+
+ const unsigned Opcode = getNewOpcode(CI, Paired);
+
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
+ const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+ const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
+
+ // Copy to the new source register.
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
+ Register SrcReg = MRI->createVirtualRegister(SuperRC);
+
+ const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+ const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
+
+ BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+ .add(*Src0)
+ .addImm(SubRegIdx0)
+ .add(*Src1)
+ .addImm(SubRegIdx1);
+
+ auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
+ .addReg(SrcReg, RegState::Kill);
+
+ if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
+ MIB.add(*SAddr);
+
+ MachineInstr *New =
+ MIB.addImm(std::min(CI.Offset, Paired.Offset))
+ .addImm(CI.CPol)
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1474,7 +1599,7 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
const CombineInfo &Paired) {
const unsigned Width = CI.Width + Paired.Width;
- switch (CI.InstClass) {
+ switch (getCommonInstClass(CI, Paired)) {
default:
assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
// FIXME: Handle d16 correctly
@@ -1498,6 +1623,72 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
case 8:
return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
}
+ case GLOBAL_LOAD:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::GLOBAL_LOAD_DWORDX2;
+ case 3:
+ return AMDGPU::GLOBAL_LOAD_DWORDX3;
+ case 4:
+ return AMDGPU::GLOBAL_LOAD_DWORDX4;
+ }
+ case GLOBAL_LOAD_SADDR:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
+ case 3:
+ return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
+ case 4:
+ return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
+ }
+ case GLOBAL_STORE:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::GLOBAL_STORE_DWORDX2;
+ case 3:
+ return AMDGPU::GLOBAL_STORE_DWORDX3;
+ case 4:
+ return AMDGPU::GLOBAL_STORE_DWORDX4;
+ }
+ case GLOBAL_STORE_SADDR:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
+ case 3:
+ return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
+ case 4:
+ return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
+ }
+ case FLAT_LOAD:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::FLAT_LOAD_DWORDX2;
+ case 3:
+ return AMDGPU::FLAT_LOAD_DWORDX3;
+ case 4:
+ return AMDGPU::FLAT_LOAD_DWORDX4;
+ }
+ case FLAT_STORE:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::FLAT_STORE_DWORDX2;
+ case 3:
+ return AMDGPU::FLAT_STORE_DWORDX3;
+ case 4:
+ return AMDGPU::FLAT_STORE_DWORDX4;
+ }
case MIMG:
assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
"No overlaps");
@@ -1508,15 +1699,9 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
const CombineInfo &Paired) {
- bool ReverseOrder;
- if (CI.InstClass == MIMG) {
- assert(
- (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
- "No overlaps");
- ReverseOrder = CI.DMask > Paired.DMask;
- } else {
- ReverseOrder = CI.Offset > Paired.Offset;
- }
+ assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) ==
+ CI.Width + Paired.Width)) &&
+ "No overlaps");
unsigned Idx0;
unsigned Idx1;
@@ -1532,7 +1717,7 @@ SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
assert(CI.Width >= 1 && CI.Width <= 4);
assert(Paired.Width >= 1 && Paired.Width <= 4);
- if (ReverseOrder) {
+ if (Paired < CI) {
Idx1 = Idxs[0][Paired.Width - 1];
Idx0 = Idxs[Paired.Width][CI.Width - 1];
} else {
@@ -1569,7 +1754,7 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
CombineInfo &CI, CombineInfo &Paired,
- const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+ MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1586,13 +1771,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
- BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+ BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
.add(*Src0)
.addImm(SubRegIdx0)
.add(*Src1)
.addImm(SubRegIdx1);
- auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
+ auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
AddressRegs Regs = getRegs(Opcode, *TII);
@@ -1606,9 +1791,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
// will return true if this is the case.
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
- const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
MachineInstr *New =
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -1616,9 +1798,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
.addImm(CI.CPol) // cpol
.addImm(0) // tfe
.addImm(0) // swz
- .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
-
- moveInstsAfter(MIB, InstsToMove);
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1846,7 +2026,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
// from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
// has 13bit distance from &a + 4096. The heuristic considers &a + 8192
// as the new-base(anchor) because of the maximum distance which can
- // accomodate more intermediate bases presumeably.
+ // accommodate more intermediate bases presumably.
//
// Step3: move (&a + 8192) above load1. Compute and promote offsets from
// (&a + 8192) for load1, load2, load4.
@@ -2098,8 +2278,8 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
CombineInfo &CI = *First;
CombineInfo &Paired = *Second;
- SmallVector<MachineInstr *, 8> InstsToMove;
- if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
+ CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
+ if (!Where) {
++I;
continue;
}
@@ -2108,66 +2288,56 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
+ MachineBasicBlock::iterator NewMI;
switch (CI.InstClass) {
default:
llvm_unreachable("unknown InstClass");
break;
- case DS_READ: {
- MachineBasicBlock::iterator NewMI =
- mergeRead2Pair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
+ case DS_READ:
+ NewMI = mergeRead2Pair(CI, Paired, Where->I);
break;
- }
- case DS_WRITE: {
- MachineBasicBlock::iterator NewMI =
- mergeWrite2Pair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
+ case DS_WRITE:
+ NewMI = mergeWrite2Pair(CI, Paired, Where->I);
break;
- }
- case S_BUFFER_LOAD_IMM: {
- MachineBasicBlock::iterator NewMI =
- mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
- OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
+ case S_BUFFER_LOAD_IMM:
+ NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 8;
break;
- }
- case BUFFER_LOAD: {
- MachineBasicBlock::iterator NewMI =
- mergeBufferLoadPair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
- OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
+ case BUFFER_LOAD:
+ NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 4;
break;
- }
- case BUFFER_STORE: {
- MachineBasicBlock::iterator NewMI =
- mergeBufferStorePair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
- OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
+ case BUFFER_STORE:
+ NewMI = mergeBufferStorePair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 4;
break;
- }
- case MIMG: {
- MachineBasicBlock::iterator NewMI =
- mergeImagePair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
- OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
+ case MIMG:
+ NewMI = mergeImagePair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 4;
break;
- }
- case TBUFFER_LOAD: {
- MachineBasicBlock::iterator NewMI =
- mergeTBufferLoadPair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
- OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
+ case TBUFFER_LOAD:
+ NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 4;
break;
- }
- case TBUFFER_STORE: {
- MachineBasicBlock::iterator NewMI =
- mergeTBufferStorePair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *this);
- OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
+ case TBUFFER_STORE:
+ NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 4;
+ break;
+ case FLAT_LOAD:
+ case GLOBAL_LOAD:
+ case GLOBAL_LOAD_SADDR:
+ NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 4;
+ break;
+ case FLAT_STORE:
+ case GLOBAL_STORE:
+ case GLOBAL_STORE_SADDR:
+ NewMI = mergeFlatStorePair(CI, Paired, Where->I);
+ OptimizeListAgain |= CI.Width + Paired.Width < 4;
break;
}
- }
- CI.Order = Paired.Order;
+ CI.setMI(NewMI, *this);
+ CI.Order = Where->Order;
if (I == Second)
I = Next;