diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 439 |
1 files changed, 439 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp new file mode 100644 index 000000000000..5c507ef70a8c --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -0,0 +1,439 @@ +//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file This file defines a set of schedule DAG mutations that can be used to +// override default scheduler behavior to enforce specific scheduling patterns. +// They should be used in cases where runtime performance considerations such as +// inter-wavefront interactions, mean that compile-time heuristics cannot +// predict the optimal instruction ordering, or in kernels where optimum +// instruction scheduling is important enough to warrant manual intervention. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUIGroupLP.h" +#include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/BitmaskEnum.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/TargetOpcodes.h" + +using namespace llvm; + +#define DEBUG_TYPE "machine-scheduler" + +namespace { + +static cl::opt<bool> + EnableIGroupLP("amdgpu-igrouplp", + cl::desc("Enable construction of Instruction Groups and " + "their ordering for scheduling"), + cl::init(false)); + +static cl::opt<Optional<unsigned>> + VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in VMEM group.")); + +static cl::opt<Optional<unsigned>> + MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in MFMA group.")); + +static cl::opt<Optional<unsigned>> + LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in lds/gds read group.")); + +static cl::opt<Optional<unsigned>> + LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in lds/gds write group.")); + +typedef function_ref<bool(const MachineInstr &, const SIInstrInfo *)> + CanAddMIFn; + +// Classify instructions into groups to enable fine tuned control over the +// scheduler. These groups may be more specific than current SchedModel +// instruction classes. +class SchedGroup { +private: + // Function that returns true if a non-bundle MI may be inserted into this + // group. + const CanAddMIFn canAddMI; + + // Maximum number of SUnits that can be added to this group. + Optional<unsigned> MaxSize; + + // Collection of SUnits that are classified as members of this group. + SmallVector<SUnit *, 32> Collection; + + ScheduleDAGInstrs *DAG; + + void tryAddEdge(SUnit *A, SUnit *B) { + if (A != B && DAG->canAddEdge(B, A)) { + DAG->addEdge(B, SDep(A, SDep::Artificial)); + LLVM_DEBUG(dbgs() << "Adding edge...\n" + << "from: SU(" << A->NodeNum << ") " << *A->getInstr() + << "to: SU(" << B->NodeNum << ") " << *B->getInstr()); + } + } + +public: + // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If + // MakePred is true, SU will be a predecessor of the SUnits in this + // SchedGroup, otherwise SU will be a successor. + void link(SUnit &SU, bool MakePred = false) { + for (auto A : Collection) { + SUnit *B = &SU; + if (MakePred) + std::swap(A, B); + + tryAddEdge(A, B); + } + } + + // Add DAG dependencies from all SUnits in this SchedGroup and this SU. Use + // the predicate to determine whether SU should be a predecessor (P = true) + // or a successor (P = false) of this SchedGroup. + void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P) { + for (auto A : Collection) { + SUnit *B = &SU; + if (P(A, B)) + std::swap(A, B); + + tryAddEdge(A, B); + } + } + + // Add DAG dependencies such that SUnits in this group shall be ordered + // before SUnits in OtherGroup. + void link(SchedGroup &OtherGroup) { + for (auto B : OtherGroup.Collection) + link(*B); + } + + // Returns true if no more instructions may be added to this group. + bool isFull() { return MaxSize && Collection.size() >= *MaxSize; } + + // Returns true if SU can be added to this SchedGroup. + bool canAddSU(SUnit &SU, const SIInstrInfo *TII) { + if (isFull()) + return false; + + MachineInstr &MI = *SU.getInstr(); + if (MI.getOpcode() != TargetOpcode::BUNDLE) + return canAddMI(MI, TII); + + // Special case for bundled MIs. + const MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; + while (E != MBB->end() && E->isBundledWithPred()) + ++E; + + // Return true if all of the bundled MIs can be added to this group. + return std::all_of( + B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); }); + } + + void add(SUnit &SU) { Collection.push_back(&SU); } + + SchedGroup(CanAddMIFn canAddMI, Optional<unsigned> MaxSize, + ScheduleDAGInstrs *DAG) + : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {} +}; + +bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isMFMA(MI); +} + +bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isVALU(MI) && !TII->isMFMA(MI); +} + +bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isSALU(MI); +} + +bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)); +} + +bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayLoad() && + (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))); +} + +bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayStore() && + (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))); +} + +bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayStore() && TII->isDS(MI); +} + +bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayLoad() && TII->isDS(MI); +} + +class IGroupLPDAGMutation : public ScheduleDAGMutation { +public: + const SIInstrInfo *TII; + ScheduleDAGMI *DAG; + + IGroupLPDAGMutation() = default; + void apply(ScheduleDAGInstrs *DAGInstrs) override; +}; + +// DAG mutation that coordinates with the SCHED_BARRIER instruction and +// corresponding builtin. The mutation adds edges from specific instruction +// classes determined by the SCHED_BARRIER mask so that they cannot be +// scheduled around the SCHED_BARRIER. +class SchedBarrierDAGMutation : public ScheduleDAGMutation { +private: + const SIInstrInfo *TII; + + ScheduleDAGMI *DAG; + + // Components of the mask that determines which instructions may not be + // scheduled across the SCHED_BARRIER. + enum class SchedBarrierMasks { + NONE = 0u, + ALU = 1u << 0, + VALU = 1u << 1, + SALU = 1u << 2, + MFMA = 1u << 3, + VMEM = 1u << 4, + VMEM_READ = 1u << 5, + VMEM_WRITE = 1u << 6, + DS = 1u << 7, + DS_READ = 1u << 8, + DS_WRITE = 1u << 9, + LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE) + }; + + // Cache SchedGroups of each type if we have multiple SCHED_BARRIERs in a + // region. + // + std::unique_ptr<SchedGroup> MFMASchedGroup = nullptr; + std::unique_ptr<SchedGroup> VALUSchedGroup = nullptr; + std::unique_ptr<SchedGroup> SALUSchedGroup = nullptr; + std::unique_ptr<SchedGroup> VMEMReadSchedGroup = nullptr; + std::unique_ptr<SchedGroup> VMEMWriteSchedGroup = nullptr; + std::unique_ptr<SchedGroup> DSWriteSchedGroup = nullptr; + std::unique_ptr<SchedGroup> DSReadSchedGroup = nullptr; + + // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should + // not be reordered accross the SCHED_BARRIER. + void getSchedGroupsFromMask(int32_t Mask, + SmallVectorImpl<SchedGroup *> &SchedGroups); + + // Add DAG edges that enforce SCHED_BARRIER ordering. + void addSchedBarrierEdges(SUnit &SU); + + // Classify instructions and add them to the SchedGroup. + void initSchedGroup(SchedGroup *SG); + + // Remove all existing edges from a SCHED_BARRIER. + void resetSchedBarrierEdges(SUnit &SU); + +public: + void apply(ScheduleDAGInstrs *DAGInstrs) override; + + SchedBarrierDAGMutation() = default; +}; + +void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { + const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); + TII = ST.getInstrInfo(); + DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); + const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); + if (!TSchedModel || DAG->SUnits.empty()) + return; + + LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n"); + + // The order of InstructionGroups in this vector defines the + // order in which edges will be added. In other words, given the + // present ordering, we will try to make each VMEMRead instruction + // a predecessor of each DSRead instruction, and so on. + SmallVector<SchedGroup, 4> PipelineOrderGroups = { + SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG), + SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG), + SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG), + SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)}; + + for (SUnit &SU : DAG->SUnits) { + LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU)); + for (auto &SG : PipelineOrderGroups) + if (SG.canAddSU(SU, TII)) + SG.add(SU); + } + + for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) { + auto &GroupA = PipelineOrderGroups[i]; + for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) { + auto &GroupB = PipelineOrderGroups[j]; + GroupA.link(GroupB); + } + } +} + +void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { + const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); + if (!TSchedModel || DAGInstrs->SUnits.empty()) + return; + + LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n"); + + const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); + TII = ST.getInstrInfo(); + DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); + for (auto &SU : DAG->SUnits) + if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER) + addSchedBarrierEdges(SU); +} + +void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { + MachineInstr &MI = *SchedBarrier.getInstr(); + assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); + // Remove all existing edges from the SCHED_BARRIER that were added due to the + // instruction having side effects. + resetSchedBarrierEdges(SchedBarrier); + SmallVector<SchedGroup *, 4> SchedGroups; + int32_t Mask = MI.getOperand(0).getImm(); + getSchedGroupsFromMask(Mask, SchedGroups); + for (auto SG : SchedGroups) + SG->link( + SchedBarrier, (function_ref<bool(const SUnit *A, const SUnit *B)>)[]( + const SUnit *A, const SUnit *B) { + return A->NodeNum > B->NodeNum; + }); +} + +void SchedBarrierDAGMutation::getSchedGroupsFromMask( + int32_t Mask, SmallVectorImpl<SchedGroup *> &SchedGroups) { + SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask; + // See IntrinsicsAMDGPU.td for an explanation of these masks and their + // mappings. + // + if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { + if (!VALUSchedGroup) { + VALUSchedGroup = std::make_unique<SchedGroup>(isVALUSGMember, None, DAG); + initSchedGroup(VALUSchedGroup.get()); + } + + SchedGroups.push_back(VALUSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { + if (!SALUSchedGroup) { + SALUSchedGroup = std::make_unique<SchedGroup>(isSALUSGMember, None, DAG); + initSchedGroup(SALUSchedGroup.get()); + } + + SchedGroups.push_back(SALUSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { + if (!MFMASchedGroup) { + MFMASchedGroup = std::make_unique<SchedGroup>(isMFMASGMember, None, DAG); + initSchedGroup(MFMASchedGroup.get()); + } + + SchedGroups.push_back(MFMASchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) { + if (!VMEMReadSchedGroup) { + VMEMReadSchedGroup = + std::make_unique<SchedGroup>(isVMEMReadSGMember, None, DAG); + initSchedGroup(VMEMReadSchedGroup.get()); + } + + SchedGroups.push_back(VMEMReadSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) { + if (!VMEMWriteSchedGroup) { + VMEMWriteSchedGroup = + std::make_unique<SchedGroup>(isVMEMWriteSGMember, None, DAG); + initSchedGroup(VMEMWriteSchedGroup.get()); + } + + SchedGroups.push_back(VMEMWriteSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) { + if (!DSReadSchedGroup) { + DSReadSchedGroup = + std::make_unique<SchedGroup>(isDSReadSGMember, None, DAG); + initSchedGroup(DSReadSchedGroup.get()); + } + + SchedGroups.push_back(DSReadSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) { + if (!DSWriteSchedGroup) { + DSWriteSchedGroup = + std::make_unique<SchedGroup>(isDSWriteSGMember, None, DAG); + initSchedGroup(DSWriteSchedGroup.get()); + } + + SchedGroups.push_back(DSWriteSchedGroup.get()); + } +} + +void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) { + assert(SG); + for (auto &SU : DAG->SUnits) + if (SG->canAddSU(SU, TII)) + SG->add(SU); +} + +void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) { + assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER); + for (auto &P : SU.Preds) + SU.removePred(P); + + for (auto &S : SU.Succs) { + for (auto &SP : S.getSUnit()->Preds) { + if (SP.getSUnit() == &SU) { + S.getSUnit()->removePred(SP); + } + } + } +} + +} // namespace + +namespace llvm { + +std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() { + return EnableIGroupLP ? std::make_unique<IGroupLPDAGMutation>() : nullptr; +} + +std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation() { + return std::make_unique<SchedBarrierDAGMutation>(); +} + +} // end namespace llvm |
