diff options
Diffstat (limited to 'lib/Target/AMDGPU/GCNNSAReassign.cpp')
-rw-r--r-- | lib/Target/AMDGPU/GCNNSAReassign.cpp | 343 |
1 files changed, 343 insertions, 0 deletions
diff --git a/lib/Target/AMDGPU/GCNNSAReassign.cpp b/lib/Target/AMDGPU/GCNNSAReassign.cpp new file mode 100644 index 000000000000..51c4c99cfb18 --- /dev/null +++ b/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -0,0 +1,343 @@ +//===-- GCNNSAReassign.cpp - Reassign registers in NSA unstructions -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential +/// in NSA image instructions. Later SIShrinkInstructions pass will relace NSA +/// with sequential versions where possible. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/Support/MathExtras.h" +#include <algorithm> + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-nsa-reassign" + +STATISTIC(NumNSAInstructions, + "Number of NSA instructions with non-sequential address found"); +STATISTIC(NumNSAConverted, + "Number of NSA instructions changed to sequential"); + +namespace { + +class GCNNSAReassign : public MachineFunctionPass { +public: + static char ID; + + GCNNSAReassign() : MachineFunctionPass(ID) { + initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "GCN NSA Reassign"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.addRequired<VirtRegMap>(); + AU.addRequired<LiveRegMatrix>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + typedef enum { + NOT_NSA, // Not an NSA instruction + FIXED, // NSA which we cannot modify + NON_CONTIGUOUS, // NSA with non-sequential address which we can try + // to optimize. + CONTIGUOUS // NSA with all sequential address registers + } NSA_Status; + + const GCNSubtarget *ST; + + const MachineRegisterInfo *MRI; + + const SIRegisterInfo *TRI; + + VirtRegMap *VRM; + + LiveRegMatrix *LRM; + + LiveIntervals *LIS; + + unsigned MaxNumVGPRs; + + const MCPhysReg *CSRegs; + + NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const; + + bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals, + unsigned StartReg) const; + + bool canAssign(unsigned StartReg, unsigned NumRegs) const; + + bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const; +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign", + false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) +INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign", + false, false) + + +char GCNNSAReassign::ID = 0; + +char &llvm::GCNNSAReassignID = GCNNSAReassign::ID; + +bool +GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals, + unsigned StartReg) const { + unsigned NumRegs = Intervals.size(); + + for (unsigned N = 0; N < NumRegs; ++N) + if (VRM->hasPhys(Intervals[N]->reg)) + LRM->unassign(*Intervals[N]); + + for (unsigned N = 0; N < NumRegs; ++N) + if (LRM->checkInterference(*Intervals[N], StartReg + N)) + return false; + + for (unsigned N = 0; N < NumRegs; ++N) + LRM->assign(*Intervals[N], StartReg + N); + + return true; +} + +bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const { + for (unsigned N = 0; N < NumRegs; ++N) { + unsigned Reg = StartReg + N; + if (!MRI->isAllocatable(Reg)) + return false; + + for (unsigned I = 0; CSRegs[I]; ++I) + if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && + !LRM->isPhysRegUsed(CSRegs[I])) + return false; + } + + return true; +} + +bool +GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const { + unsigned NumRegs = Intervals.size(); + + if (NumRegs > MaxNumVGPRs) + return false; + unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0; + + for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) { + if (!canAssign(Reg, NumRegs)) + continue; + + if (tryAssignRegisters(Intervals, Reg)) + return true; + } + + return false; +} + +GCNNSAReassign::NSA_Status +GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); + if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) + return NSA_Status::NOT_NSA; + + int VAddr0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); + + unsigned VgprBase = 0; + bool NSA = false; + for (unsigned I = 0; I < Info->VAddrDwords; ++I) { + const MachineOperand &Op = MI.getOperand(VAddr0Idx + I); + unsigned Reg = Op.getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) + return NSA_Status::FIXED; + + unsigned PhysReg = VRM->getPhys(Reg); + + if (!Fast) { + if (!PhysReg) + return NSA_Status::FIXED; + + // Bail if address is not a VGPR32. That should be possible to extend the + // optimization to work with subregs of a wider register tuples, but the + // logic to find free registers will be much more complicated with much + // less chances for success. That seems reasonable to assume that in most + // cases a tuple is used because a vector variable contains different + // parts of an address and it is either already consequitive or cannot + // be reassigned if not. If needed it is better to rely on register + // coalescer to process such address tuples. + if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg()) + return NSA_Status::FIXED; + + const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); + + if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) + return NSA_Status::FIXED; + + for (auto U : MRI->use_nodbg_operands(Reg)) { + if (U.isImplicit()) + return NSA_Status::FIXED; + const MachineInstr *UseInst = U.getParent(); + if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) + return NSA_Status::FIXED; + } + + if (!LIS->hasInterval(Reg)) + return NSA_Status::FIXED; + } + + if (I == 0) + VgprBase = PhysReg; + else if (VgprBase + I != PhysReg) + NSA = true; + } + + return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS; +} + +bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { + ST = &MF.getSubtarget<GCNSubtarget>(); + if (ST->getGeneration() < GCNSubtarget::GFX10) + return false; + + MRI = &MF.getRegInfo(); + TRI = ST->getRegisterInfo(); + VRM = &getAnalysis<VirtRegMap>(); + LRM = &getAnalysis<LiveRegMatrix>(); + LIS = &getAnalysis<LiveIntervals>(); + + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + MaxNumVGPRs = ST->getMaxNumVGPRs(MF); + MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs); + CSRegs = MRI->getCalleeSavedRegs(); + + using Candidate = std::pair<const MachineInstr*, bool>; + SmallVector<Candidate, 32> Candidates; + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + switch (CheckNSA(MI)) { + default: + continue; + case NSA_Status::CONTIGUOUS: + Candidates.push_back(std::make_pair(&MI, true)); + break; + case NSA_Status::NON_CONTIGUOUS: + Candidates.push_back(std::make_pair(&MI, false)); + ++NumNSAInstructions; + break; + } + } + } + + bool Changed = false; + for (auto &C : Candidates) { + if (C.second) + continue; + + const MachineInstr *MI = C.first; + if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) { + // Already happen to be fixed. + C.second = true; + ++NumNSAConverted; + continue; + } + + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode()); + int VAddr0Idx = + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0); + + SmallVector<LiveInterval *, 16> Intervals; + SmallVector<unsigned, 16> OrigRegs; + SlotIndex MinInd, MaxInd; + for (unsigned I = 0; I < Info->VAddrDwords; ++I) { + const MachineOperand &Op = MI->getOperand(VAddr0Idx + I); + unsigned Reg = Op.getReg(); + LiveInterval *LI = &LIS->getInterval(Reg); + if (llvm::find(Intervals, LI) != Intervals.end()) { + // Same register used, unable to make sequential + Intervals.clear(); + break; + } + Intervals.push_back(LI); + OrigRegs.push_back(VRM->getPhys(Reg)); + MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex(); + MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex(); + } + + if (Intervals.empty()) + continue; + + LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI + << "\tOriginal allocation:\t"; + for(auto *LI : Intervals) + dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI); + dbgs() << '\n'); + + bool Success = scavengeRegs(Intervals); + if (!Success) { + LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n"); + if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation. + continue; + } else { + // Check we did not make it worse for other instructions. + auto I = std::lower_bound(Candidates.begin(), &C, MinInd, + [this](const Candidate &C, SlotIndex I) { + return LIS->getInstructionIndex(*C.first) < I; + }); + for (auto E = Candidates.end(); Success && I != E && + LIS->getInstructionIndex(*I->first) < MaxInd; ++I) { + if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) { + Success = false; + LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first); + } + } + } + + if (!Success) { + for (unsigned I = 0; I < Info->VAddrDwords; ++I) + if (VRM->hasPhys(Intervals[I]->reg)) + LRM->unassign(*Intervals[I]); + + for (unsigned I = 0; I < Info->VAddrDwords; ++I) + LRM->assign(*Intervals[I], OrigRegs[I]); + + continue; + } + + C.second = true; + ++NumNSAConverted; + LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t [" + << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI) + << " : " + << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI) + << "]\n"); + Changed = true; + } + + return Changed; +} |