summaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp204
1 files changed, 159 insertions, 45 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index b1c73df269fb..0640e24b37ec 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -56,35 +56,17 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SlotIndexes.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <vector>
using namespace llvm;
@@ -154,6 +136,11 @@ private:
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
+ unsigned AndOpc;
+ unsigned XorTermrOpc;
+ unsigned OrSaveExecOpc;
+ unsigned Exec;
+
DenseMap<const MachineInstr *, InstrInfo> Instructions;
MapVector<MachineBasicBlock *, BlockInfo> Blocks;
SmallVector<MachineInstr *, 1> LiveMaskQueries;
@@ -164,6 +151,8 @@ private:
void markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
+ void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
+ unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
void markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
@@ -252,6 +241,8 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
assert(!(Flag & StateExact) && Flag != 0);
+ LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
+
// Remove any disabled states from the flag. The user that required it gets
// an undefined value in the helper lanes. For example, this can happen if
// the result of an atomic is used by instruction that requires WQM, where
@@ -267,9 +258,70 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
Worklist.push_back(&MI);
}
+/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
+void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
+ Register Reg, unsigned SubReg, char Flag,
+ std::vector<WorkItem> &Worklist) {
+ assert(!MRI->isSSA());
+
+ LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
+
+ LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
+ if (!UseLRQ.valueIn())
+ return;
+
+ SmallPtrSet<const VNInfo *, 4> Visited;
+ SmallVector<const VNInfo *, 4> ToProcess;
+ ToProcess.push_back(UseLRQ.valueIn());
+ do {
+ const VNInfo *Value = ToProcess.pop_back_val();
+ Visited.insert(Value);
+
+ if (Value->isPHIDef()) {
+ // Need to mark all defs used in the PHI node
+ const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
+ assert(MBB && "Phi-def has no defining MBB");
+ for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end();
+ PI != PE; ++PI) {
+ if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
+ if (!Visited.count(VN))
+ ToProcess.push_back(VN);
+ }
+ }
+ } else {
+ MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
+ assert(MI && "Def has no defining instruction");
+ markInstruction(*MI, Flag, Worklist);
+
+ // Iterate over all operands to find relevant definitions
+ for (const MachineOperand &Op : MI->operands()) {
+ if (!(Op.isReg() && Op.getReg() == Reg))
+ continue;
+
+ // Does this def cover whole register?
+ bool DefinesFullReg =
+ Op.isUndef() || !Op.getSubReg() || Op.getSubReg() == SubReg;
+ if (!DefinesFullReg) {
+ // Partial definition; need to follow and mark input value
+ LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
+ if (const VNInfo *VN = LRQ.valueIn()) {
+ if (!Visited.count(VN))
+ ToProcess.push_back(VN);
+ }
+ }
+ }
+ }
+ } while (!ToProcess.empty());
+}
+
/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
+
+ LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
+ << MI);
+
for (const MachineOperand &Use : MI.uses()) {
if (!Use.isReg() || !Use.isUse())
continue;
@@ -279,30 +331,39 @@ void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
// Handle physical registers that we need to track; this is mostly relevant
// for VCC, which can appear as the (implicit) input of a uniform branch,
// e.g. when a loop counter is stored in a VGPR.
- if (!Register::isVirtualRegister(Reg)) {
+ if (!Reg.isVirtual()) {
if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
continue;
- for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
+ for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
+ ++RegUnit) {
LiveRange &LR = LIS->getRegUnit(*RegUnit);
const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
if (!Value)
continue;
- // Since we're in machine SSA, we do not need to track physical
- // registers across basic blocks.
- if (Value->isPHIDef())
- continue;
-
- markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
- Worklist);
+ if (MRI->isSSA()) {
+ // Since we're in machine SSA, we do not need to track physical
+ // registers across basic blocks.
+ if (Value->isPHIDef())
+ continue;
+ markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
+ Worklist);
+ } else {
+ markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
+ }
}
continue;
}
- for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
- markInstruction(DefMI, Flag, Worklist);
+ if (MRI->isSSA()) {
+ for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
+ markInstruction(DefMI, Flag, Worklist);
+ } else {
+ LiveRange &LR = LIS->getInterval(Reg);
+ markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist);
+ }
}
}
@@ -363,7 +424,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
LowerToCopyInstrs.push_back(&MI);
} else {
Register Reg = Inactive.getReg();
- if (Register::isVirtualRegister(Reg)) {
+ if (Reg.isVirtual()) {
for (MachineInstr &DefMI : MRI->def_instructions(Reg))
markInstruction(DefMI, StateWWM, Worklist);
}
@@ -393,7 +454,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
Register Reg = MO.getReg();
- if (!Register::isVirtualRegister(Reg) &&
+ if (!Reg.isVirtual() &&
TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
Flags = StateWQM;
break;
@@ -552,7 +613,8 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
if (!SaveSCC)
return PreferLast ? Last : First;
- LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
+ LiveRange &LR =
+ LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
auto MBBE = MBB.end();
SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
: LIS->getMBBEndIdx(&MBB);
@@ -572,7 +634,12 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
break;
Idx = Next;
} else {
- SlotIndex Next = S->end.getNextIndex().getBaseIndex();
+ MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
+ assert(EndMI && "Segment does not end on valid instruction");
+ auto NextI = std::next(EndMI->getIterator());
+ if (NextI == MBB.end())
+ break;
+ SlotIndex Next = LIS->getInstructionIndex(*NextI);
if (Next > LastIdx)
break;
Idx = Next;
@@ -588,6 +655,23 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
MBBI = MBB.end();
}
+ // Move insertion point past any operations modifying EXEC.
+ // This assumes that the value of SCC defined by any of these operations
+ // does not need to be preserved.
+ while (MBBI != Last) {
+ bool IsExecDef = false;
+ for (const MachineOperand &MO : MBBI->operands()) {
+ if (MO.isReg() && MO.isDef()) {
+ IsExecDef |=
+ MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
+ }
+ }
+ if (!IsExecDef)
+ break;
+ MBBI++;
+ S = nullptr;
+ }
+
if (S)
MBBI = saveSCC(MBB, MBBI);
@@ -682,8 +766,11 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
- if (isEntry)
- ++II; // Skip the instruction that saves LiveMask
+ if (isEntry) {
+ // Skip the instruction that saves LiveMask
+ if (II != IE && II->getOpcode() == AMDGPU::COPY)
+ ++II;
+ }
// This stores the first instruction where it's safe to switch from WQM to
// Exact or vice versa.
@@ -694,6 +781,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
// FirstWQM since if it's safe to switch to/from WWM, it must be safe to
// switch to/from WQM as well.
MachineBasicBlock::iterator FirstWWM = IE;
+
for (;;) {
MachineBasicBlock::iterator Next = II;
char Needs = StateExact | StateWQM; // WWM is disabled by default
@@ -730,9 +818,6 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (MI.isTerminator() && OutNeeds == StateExact)
Needs = StateExact;
- if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
- MI.getOperand(3).setImm(1);
-
++Next;
} else {
// End of basic block
@@ -809,6 +894,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (II == IE)
break;
+
II = Next;
}
assert(!SavedWQMReg);
@@ -819,6 +905,7 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
for (MachineInstr *MI : LiveMaskQueries) {
const DebugLoc &DL = MI->getDebugLoc();
Register Dest = MI->getOperand(0).getReg();
+
MachineInstr *Copy =
BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
.addReg(LiveMaskReg);
@@ -833,19 +920,35 @@ void SIWholeQuadMode::lowerCopyInstrs() {
assert(MI->getNumExplicitOperands() == 2);
const Register Reg = MI->getOperand(0).getReg();
+ const unsigned SubReg = MI->getOperand(0).getSubReg();
if (TRI->isVGPR(*MRI, Reg)) {
- const TargetRegisterClass *regClass = Register::isVirtualRegister(Reg)
- ? MRI->getRegClass(Reg)
- : TRI->getPhysRegClass(Reg);
+ const TargetRegisterClass *regClass =
+ Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg);
+ if (SubReg)
+ regClass = TRI->getSubRegClass(regClass, SubReg);
const unsigned MovOp = TII->getMovOpcode(regClass);
MI->setDesc(TII->get(MovOp));
// And make it implicitly depend on exec (like all VALU movs should do).
MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
- } else {
+ } else if (!MRI->isSSA()) {
+ // Remove early-clobber and exec dependency from simple SGPR copies.
+ // This allows some to be eliminated during/post RA.
+ LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
+ if (MI->getOperand(0).isEarlyClobber()) {
+ LIS->removeInterval(Reg);
+ MI->getOperand(0).setIsEarlyClobber(false);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
+ while (Index >= 0) {
+ MI->RemoveOperand(Index);
+ Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
+ }
MI->setDesc(TII->get(AMDGPU::COPY));
+ LLVM_DEBUG(dbgs() << " -> " << *MI);
}
}
for (MachineInstr *MI : LowerToCopyInstrs) {
@@ -881,9 +984,20 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
+ if (ST->isWave32()) {
+ AndOpc = AMDGPU::S_AND_B32;
+ XorTermrOpc = AMDGPU::S_XOR_B32_term;
+ OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
+ Exec = AMDGPU::EXEC_LO;
+ } else {
+ AndOpc = AMDGPU::S_AND_B64;
+ XorTermrOpc = AMDGPU::S_XOR_B64_term;
+ OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
+ Exec = AMDGPU::EXEC;
+ }
+
char GlobalFlags = analyzeFunction(MF);
unsigned LiveMaskReg = 0;
- unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
if (!(GlobalFlags & StateWQM)) {
lowerLiveMaskQueries(Exec);
if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty())
@@ -932,7 +1046,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
// Physical registers like SCC aren't tracked by default anyway, so just
// removing the ranges we computed is the simplest option for maintaining
// the analysis results.
- LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
+ LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
return true;
}