author    Dimitry Andric <dim@FreeBSD.org>    2017-12-20 14:16:56 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2017-12-20 14:16:56 +0000
commit    2cab237b5dbfe1b3e9c7aa7a3c02d2b98fcf7462 (patch)
tree      524fe828571f81358bba62fdb6d04c6e5e96a2a4 /contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
parent    6c7828a2807ea5e50c79ca42dbedf2b589ce63b2 (diff)
parent    044eb2f6afba375a914ac9d8024f8f5142bb912e (diff)
Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp  532
1 file changed, 463 insertions, 69 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index c6ad61a325cc..84cd47a101a8 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -1,4 +1,4 @@
-//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
+//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,6 +14,12 @@
// ==>
// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
+// The same is done for certain SMEM and VMEM opcodes, e.g.:
+// s_buffer_load_dword s4, s[0:3], 4
+// s_buffer_load_dword s5, s[0:3], 8
+// ==>
+// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
+//
//
// Future improvements:
//
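Note: besides the DS and SMEM cases shown in the header comment, this commit also teaches the pass to merge MUBUF buffer loads and stores. A plausible before/after in the same style as the header comment, with made-up register numbers:

  // buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
  // buffer_load_dword v5, v0, s[0:3], 0 offen offset:8
  // ==>
  // buffer_load_dwordx2 v[4:5], v0, s[0:3], 0 offen offset:4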
@@ -56,8 +62,9 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
#include <cassert>
+#include <cstdlib>
#include <iterator>
#include <utility>
@@ -68,31 +75,56 @@ using namespace llvm;
namespace {
class SILoadStoreOptimizer : public MachineFunctionPass {
-
- typedef struct {
+ enum InstClassEnum {
+ DS_READ_WRITE,
+ S_BUFFER_LOAD_IMM,
+ BUFFER_LOAD_OFFEN,
+ BUFFER_LOAD_OFFSET,
+ BUFFER_STORE_OFFEN,
+ BUFFER_STORE_OFFSET,
+ };
+
+ struct CombineInfo {
MachineBasicBlock::iterator I;
MachineBasicBlock::iterator Paired;
unsigned EltSize;
unsigned Offset0;
unsigned Offset1;
unsigned BaseOff;
+ InstClassEnum InstClass;
+ bool GLC0;
+ bool GLC1;
+ bool SLC0;
+ bool SLC1;
bool UseST64;
+ bool IsX2;
SmallVector<MachineInstr*, 8> InstsToMove;
- } CombineInfo;
+ };
private:
+ const SISubtarget *STM = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
MachineRegisterInfo *MRI = nullptr;
AliasAnalysis *AA = nullptr;
+ unsigned CreatedX2;
static bool offsetsCanBeCombined(CombineInfo &CI);
- bool findMatchingDSInst(CombineInfo &CI);
+ bool findMatchingInst(CombineInfo &CI);
+ unsigned read2Opcode(unsigned EltSize) const;
+ unsigned read2ST64Opcode(unsigned EltSize) const;
MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
+ unsigned write2Opcode(unsigned EltSize) const;
+ unsigned write2ST64Opcode(unsigned EltSize) const;
MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
+ MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
+ MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
+ unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
+ bool &IsOffen) const;
+ MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
public:
static char ID;
@@ -141,36 +173,35 @@ static void moveInstsAfter(MachineBasicBlock::iterator I,
}
}
-static void addDefsToList(const MachineInstr &MI,
- SmallVectorImpl<const MachineOperand *> &Defs) {
- for (const MachineOperand &Def : MI.defs()) {
- Defs.push_back(&Def);
- }
+static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
+ // XXX: Should this be looking for implicit defs?
+ for (const MachineOperand &Def : MI.defs())
+ Defs.insert(Def.getReg());
}
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
MachineBasicBlock::iterator B,
const SIInstrInfo *TII,
AliasAnalysis * AA) {
- return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) ||
- // RAW or WAR - cannot reorder
- // WAW - cannot reorder
- // RAR - safe to reorder
- !(A->mayStore() || B->mayStore()));
+ // RAW or WAR - cannot reorder
+ // WAW - cannot reorder
+ // RAR - safe to reorder
+ return !(A->mayStore() || B->mayStore()) ||
+ TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}
// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
- SmallVectorImpl<const MachineOperand *> &Defs,
+ DenseSet<unsigned> &Defs,
SmallVectorImpl<MachineInstr*> &Insts) {
- for (const MachineOperand *Def : Defs) {
- bool ReadDef = MI.readsVirtualRegister(Def->getReg());
- // If ReadDef is true, then there is a use of Def between I
- // and the instruction that I will potentially be merged with. We
- // will need to move this instruction after the merged instructions.
- if (ReadDef) {
+ for (MachineOperand &Use : MI.operands()) {
+ // If one of the defs is read, then there is a use of Def between I and the
+ // instruction that I will potentially be merged with. We will need to move
+ // this instruction after the merged instructions.
+
+ if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
Insts.push_back(&MI);
addDefsToList(MI, Defs);
return true;
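Note: the rewritten memAccessesCanBeReordered keeps the same rule, just checking the cheap mayStore() condition before the alias query. Roughly, for two memory operations A and B (a sketch of the rule, not code from the pass):

  // A = load,  B = load                      -> reorderable (RAR)
  // A = load,  B = store, provably disjoint  -> reorderable (trivially disjoint)
  // A = store, B = load or store, may alias  -> not reorderable (RAW/WAR/WAW)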
@@ -211,6 +242,15 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
CI.UseST64 = false;
CI.BaseOff = 0;
+ // Handle SMEM and VMEM instructions.
+ if (CI.InstClass != DS_READ_WRITE) {
+ unsigned Diff = CI.IsX2 ? 2 : 1;
+ return (EltOffset0 + Diff == EltOffset1 ||
+ EltOffset1 + Diff == EltOffset0) &&
+ CI.GLC0 == CI.GLC1 &&
+ (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
+ }
+
// If the offset in elements doesn't fit in 8-bits, we might be able to use
// the stride 64 versions.
if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
@@ -248,30 +288,70 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
return false;
}
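Note: for the new SMEM/VMEM classes, offsetsCanBeCombined reduces to an adjacency-plus-cache-bits test. A minimal standalone restatement (names are mine; offsets assumed already scaled to element units, as in the pass):

  // Two accesses can be merged when the second starts exactly where the
  // first ends, GLC matches, and SLC matches for everything except SMEM.
  static bool smemVmemOffsetsCombine(unsigned EltOff0, unsigned EltOff1,
                                     bool IsX2, bool GLC0, bool GLC1,
                                     bool CheckSLC, bool SLC0, bool SLC1) {
    unsigned Width = IsX2 ? 2 : 1; // an x2 access covers two elements
    bool Adjacent = (EltOff0 + Width == EltOff1) ||
                    (EltOff1 + Width == EltOff0);
    return Adjacent && GLC0 == GLC1 && (!CheckSLC || SLC0 == SLC1);
  }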
-bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
- MachineBasicBlock::iterator E = CI.I->getParent()->end();
+bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator MBBI = CI.I;
+
+ unsigned AddrOpName[3] = {0};
+ int AddrIdx[3];
+ const MachineOperand *AddrReg[3];
+ unsigned NumAddresses = 0;
+
+ switch (CI.InstClass) {
+ case DS_READ_WRITE:
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
+ break;
+ case S_BUFFER_LOAD_IMM:
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
+ break;
+ case BUFFER_LOAD_OFFEN:
+ case BUFFER_STORE_OFFEN:
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
+ break;
+ case BUFFER_LOAD_OFFSET:
+ case BUFFER_STORE_OFFSET:
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
+ break;
+ }
+
+ for (unsigned i = 0; i < NumAddresses; i++) {
+ AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
+ AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
+
+ // We only ever merge operations with the same base address register, so don't
+ // bother scanning forward if there are no other uses.
+ if (AddrReg[i]->isReg() &&
+ (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
+ MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
+ return false;
+ }
+
++MBBI;
- SmallVector<const MachineOperand *, 8> DefsToMove;
+ DenseSet<unsigned> DefsToMove;
addDefsToList(*CI.I, DefsToMove);
for ( ; MBBI != E; ++MBBI) {
if (MBBI->getOpcode() != CI.I->getOpcode()) {
-
// This is not a matching DS instruction, but we can keep looking as
// long as one of these conditions are met:
// 1. It is safe to move I down past MBBI.
// 2. It is safe to move MBBI down past the instruction that I will
// be merged into.
- if (MBBI->hasUnmodeledSideEffects())
+ if (MBBI->hasUnmodeledSideEffects()) {
// We can't re-order this instruction with respect to other memory
- // opeations, so we fail both conditions mentioned above.
+ // operations, so we fail both conditions mentioned above.
return false;
+ }
if (MBBI->mayLoadOrStore() &&
- !memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA)) {
+ (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
+ !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
// We fail condition #1, but we may still be able to satisfy condition
// #2. Add this instruction to the move list and then we will check
// if condition #2 holds once we have selected the matching instruction.
@@ -300,21 +380,47 @@ bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
continue;
- int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
- AMDGPU::OpName::addr);
- const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx);
- const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
+ bool Match = true;
+ for (unsigned i = 0; i < NumAddresses; i++) {
+ const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
+
+ if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
+ if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
+ AddrReg[i]->getImm() != AddrRegNext.getImm()) {
+ Match = false;
+ break;
+ }
+ continue;
+ }
+
+ // Check same base pointer. Be careful of subregisters, which can occur with
+ // vectors of pointers.
+ if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
+ AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
+ Match = false;
+ break;
+ }
+ }
- // Check same base pointer. Be careful of subregisters, which can occur with
- // vectors of pointers.
- if (AddrReg0.getReg() == AddrReg1.getReg() &&
- AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
+ if (Match) {
int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
AMDGPU::OpName::offset);
- CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm() & 0xffff;
- CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+ CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
+ CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
CI.Paired = MBBI;
+ if (CI.InstClass == DS_READ_WRITE) {
+ CI.Offset0 &= 0xffff;
+ CI.Offset1 &= 0xffff;
+ } else {
+ CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
+ CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
+ if (CI.InstClass != S_BUFFER_LOAD_IMM) {
+ CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
+ CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
+ }
+ }
+
// Check both offsets fit in the reduced range.
// We also need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
@@ -336,6 +442,20 @@ bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
return false;
}
+unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
+ if (STM->ldsRequiresM0Init())
+ return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
+ return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
+}
+
+unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
+ if (STM->ldsRequiresM0Init())
+ return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
+
+ return (EltSize == 4) ?
+ AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
+}
+
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
@@ -349,12 +469,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
unsigned NewOffset0 = CI.Offset0;
unsigned NewOffset1 = CI.Offset1;
- unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32
- : AMDGPU::DS_READ2_B64;
-
- if (CI.UseST64)
- Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32
- : AMDGPU::DS_READ2ST64_B64;
+ unsigned Opc = CI.UseST64 ?
+ read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
@@ -382,9 +498,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
if (CI.BaseOff) {
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
- .addImm(CI.BaseOff)
- .addReg(AddrReg->getReg());
+
+ unsigned AddOpc = STM->hasAddNoCarry() ?
+ AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
+ .addImm(CI.BaseOff)
+ .addReg(AddrReg->getReg());
}
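Note: the add that materializes BaseOff is now chosen per subtarget. As I read it, V_ADD_I32_e32 implicitly defines VCC (its carry-out), while the GFX9 no-carry V_ADD_U32_e32 leaves VCC alone, so the latter is preferred when hasAddNoCarry() is set:

  unsigned AddOpc = STM->hasAddNoCarry()
                        ? AMDGPU::V_ADD_U32_e32   // no carry-out, VCC untouched
                        : AMDGPU::V_ADD_I32_e32;  // implicitly defines VCC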
MachineInstrBuilder Read2 =
@@ -417,6 +536,20 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
return Next;
}
+unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
+ if (STM->ldsRequiresM0Init())
+ return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
+ return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
+}
+
+unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
+ if (STM->ldsRequiresM0Init())
+ return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
+
+ return (EltSize == 4) ?
+ AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
+}
+
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
@@ -430,12 +563,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
unsigned NewOffset0 = CI.Offset0;
unsigned NewOffset1 = CI.Offset1;
- unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32
- : AMDGPU::DS_WRITE2_B64;
-
- if (CI.UseST64)
- Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
- : AMDGPU::DS_WRITE2ST64_B64;
+ unsigned Opc = CI.UseST64 ?
+ write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
if (NewOffset0 > NewOffset1) {
// Canonicalize the merged instruction so the smaller offset comes first.
@@ -455,9 +584,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
if (CI.BaseOff) {
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
- .addImm(CI.BaseOff)
- .addReg(Addr->getReg());
+
+ unsigned AddOpc = STM->hasAddNoCarry() ?
+ AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
+ .addImm(CI.BaseOff)
+ .addReg(Addr->getReg());
}
MachineInstrBuilder Write2 =
@@ -480,6 +612,194 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
return Next;
}
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
+ CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
+ unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
+ AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+
+ const TargetRegisterClass *SuperRC =
+ CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
+ unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+ unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
+
+ BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
+ .addImm(MergedOffset) // offset
+ .addImm(CI.GLC0) // glc
+ .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+
+ unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+ unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+
+ // Handle descending offsets
+ if (CI.Offset0 > CI.Offset1)
+ std::swap(SubRegIdx0, SubRegIdx1);
+
+ // Copy to the old destination registers.
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+ const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
+ const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
+
+ BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+ moveInstsAfter(Copy1, CI.InstsToMove);
+
+ MachineBasicBlock::iterator Next = std::next(CI.I);
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
+ return Next;
+}
+
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
+ CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
+ unsigned Opcode;
+
+ if (CI.InstClass == BUFFER_LOAD_OFFEN) {
+ Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
+ AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
+ } else {
+ Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
+ AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
+ }
+
+ const TargetRegisterClass *SuperRC =
+ CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+ unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+ unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
+
+ auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
+
+ if (CI.InstClass == BUFFER_LOAD_OFFEN)
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+ .addImm(MergedOffset) // offset
+ .addImm(CI.GLC0) // glc
+ .addImm(CI.SLC0) // slc
+ .addImm(0) // tfe
+ .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+
+ unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+ unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+
+ // Handle descending offsets
+ if (CI.Offset0 > CI.Offset1)
+ std::swap(SubRegIdx0, SubRegIdx1);
+
+ // Copy to the old destination registers.
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+ const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+ const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+
+ BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+ moveInstsAfter(Copy1, CI.InstsToMove);
+
+ MachineBasicBlock::iterator Next = std::next(CI.I);
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
+ return Next;
+}
+
+unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
+ const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
+ IsX2 = false;
+ IsOffen = false;
+
+ switch (I.getOpcode()) {
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
+ IsOffen = true;
+ return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
+ IsOffen = true;
+ return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
+ case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
+ IsX2 = true;
+ IsOffen = true;
+ return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
+ case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
+ IsX2 = true;
+ IsOffen = true;
+ return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
+ return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
+ return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
+ case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
+ IsX2 = true;
+ return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
+ case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
+ IsX2 = true;
+ return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
+ }
+ return 0;
+}
+
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
+ CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
+ bool Unused1, Unused2;
+ unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);
+
+ unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+ unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+
+ // Handle descending offsets
+ if (CI.Offset0 > CI.Offset1)
+ std::swap(SubRegIdx0, SubRegIdx1);
+
+ // Copy to the new source register.
+ const TargetRegisterClass *SuperRC =
+ CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+ unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
+
+ const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+ const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+ .add(*Src0)
+ .addImm(SubRegIdx0)
+ .add(*Src1)
+ .addImm(SubRegIdx1);
+
+ auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
+ .addReg(SrcReg, RegState::Kill);
+
+ if (CI.InstClass == BUFFER_STORE_OFFEN)
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+ .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
+ .addImm(CI.GLC0) // glc
+ .addImm(CI.SLC0) // slc
+ .addImm(0) // tfe
+ .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+
+ moveInstsAfter(MIB, CI.InstsToMove);
+
+ MachineBasicBlock::iterator Next = std::next(CI.I);
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
+ return Next;
+}
+
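Note: a rough sketch of what mergeBufferStorePair emits for two adjacent buffer_store_dword (offen) instructions; register names are invented and the MIR is abbreviated:

  //   %merged:vreg_64 = REG_SEQUENCE %data0, %subreg.sub0, %data1, %subreg.sub1
  //   BUFFER_STORE_DWORDX2_OFFEN killed %merged, %vaddr, %srsrc, %soffset,
  //       min(offset0, offset1), glc, slc, 0 /*tfe*/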
// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
@@ -498,9 +818,14 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
CombineInfo CI;
CI.I = I;
unsigned Opc = MI.getOpcode();
- if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
- CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
- if (findMatchingDSInst(CI)) {
+ if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
+ Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
+
+ CI.InstClass = DS_READ_WRITE;
+ CI.EltSize =
+ (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
+
+ if (findMatchingInst(CI)) {
Modified = true;
I = mergeRead2Pair(CI);
} else {
@@ -508,9 +833,14 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
}
continue;
- } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
- CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
- if (findMatchingDSInst(CI)) {
+ } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
+ Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
+ Opc == AMDGPU::DS_WRITE_B64_gfx9) {
+ CI.InstClass = DS_READ_WRITE;
+ CI.EltSize
+ = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
+
+ if (findMatchingInst(CI)) {
Modified = true;
I = mergeWrite2Pair(CI);
} else {
@@ -519,6 +849,62 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
continue;
}
+ if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
+ (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
+ Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
+ // EltSize is in units of the offset encoding.
+ CI.InstClass = S_BUFFER_LOAD_IMM;
+ CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
+ CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ if (findMatchingInst(CI)) {
+ Modified = true;
+ I = mergeSBufferLoadImmPair(CI);
+ if (!CI.IsX2)
+ CreatedX2++;
+ } else {
+ ++I;
+ }
+ continue;
+ }
+ if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
+ Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
+ Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
+ Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
+ if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
+ Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
+ CI.InstClass = BUFFER_LOAD_OFFEN;
+ else
+ CI.InstClass = BUFFER_LOAD_OFFSET;
+
+ CI.EltSize = 4;
+ CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
+ Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
+ if (findMatchingInst(CI)) {
+ Modified = true;
+ I = mergeBufferLoadPair(CI);
+ if (!CI.IsX2)
+ CreatedX2++;
+ } else {
+ ++I;
+ }
+ continue;
+ }
+
+ bool StoreIsX2, IsOffen;
+ if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
+ CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
+ CI.EltSize = 4;
+ CI.IsX2 = StoreIsX2;
+ if (findMatchingInst(CI)) {
+ Modified = true;
+ I = mergeBufferStorePair(CI);
+ if (!CI.IsX2)
+ CreatedX2++;
+ } else {
+ ++I;
+ }
+ continue;
+ }
++I;
}
@@ -527,25 +913,33 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
}
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(*MF.getFunction()))
+ if (skipFunction(MF.getFunction()))
return false;
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
- if (!STM.loadStoreOptEnabled())
+ STM = &MF.getSubtarget<SISubtarget>();
+ if (!STM->loadStoreOptEnabled())
return false;
- TII = STM.getInstrInfo();
+ TII = STM->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ assert(MRI->isSSA() && "Must be run on SSA");
+
DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
bool Modified = false;
- for (MachineBasicBlock &MBB : MF)
+ for (MachineBasicBlock &MBB : MF) {
+ CreatedX2 = 0;
Modified |= optimizeBlock(MBB);
+ // Run again to convert x2 to x4.
+ if (CreatedX2 >= 1)
+ Modified |= optimizeBlock(MBB);
+ }
+
return Modified;
}
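Note: the per-block re-run exists so that pairs created on the first pass can themselves be paired. For example (offsets illustrative only):

  // pass 1: four s_buffer_load_dword at offsets 0, 4, 8, 12
  //         ==> two s_buffer_load_dwordx2 at offsets 0 and 8
  // pass 2: the two s_buffer_load_dwordx2
  //         ==> one s_buffer_load_dwordx4 at offset 0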