summaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp212
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp818
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.h233
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp16
-rw-r--r--llvm/lib/Target/AMDGPU/R600ISelLowering.h4
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h8
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td56
-rw-r--r--llvm/lib/Target/AMDGPU/VOPCInstructions.td2
11 files changed, 873 insertions, 488 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index b4a8766d682e..56a9a30bc59a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -29,6 +29,8 @@
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPUMemoryUtils.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Constants.h"
@@ -43,6 +45,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/OptimizedStructLayout.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <tuple>
#include <vector>
#define DEBUG_TYPE "amdgpu-lower-module-lds"
@@ -97,6 +100,9 @@ class AMDGPULowerModuleLDS : public ModulePass {
static void
removeFromUsedLists(Module &M,
const std::vector<GlobalVariable *> &LocalVars) {
+ // The verifier rejects used lists containing an inttoptr of a constant
+ // so remove the variables from these lists before replaceAllUsesWith
+
SmallPtrSet<Constant *, 32> LocalVarsSet;
for (GlobalVariable *LocalVar : LocalVars)
if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts()))
@@ -146,12 +152,59 @@ public:
}
bool runOnModule(Module &M) override {
+ LLVMContext &Ctx = M.getContext();
CallGraph CG = CallGraph(M);
bool Changed = superAlignLDSGlobals(M);
+
+ // Move variables used by functions into amdgcn.module.lds
std::vector<GlobalVariable *> ModuleScopeVariables =
AMDGPU::findVariablesToLower(M, nullptr);
- Changed |= processUsedLDS(CG, M, ModuleScopeVariables);
+ if (!ModuleScopeVariables.empty()) {
+ std::string VarName = "llvm.amdgcn.module.lds";
+
+ GlobalVariable *SGV;
+ DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
+ std::tie(SGV, LDSVarsToConstantGEP) =
+ createLDSVariableReplacement(M, VarName, ModuleScopeVariables);
+
+ appendToCompilerUsed(
+ M, {static_cast<GlobalValue *>(
+ ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+ cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+
+ removeFromUsedLists(M, ModuleScopeVariables);
+ replaceLDSVariablesWithStruct(M, ModuleScopeVariables, SGV,
+ LDSVarsToConstantGEP,
+ [](Use &) { return true; });
+
+ // This ensures the variable is allocated when called functions access it.
+ // It also lets other passes, specifically PromoteAlloca, accurately
+ // calculate how much LDS will be used by the kernel after lowering.
+
+ IRBuilder<> Builder(Ctx);
+ for (Function &Func : M.functions()) {
+ if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
+ const CallGraphNode *N = CG[&Func];
+ const bool CalleesRequireModuleLDS = N->size() > 0;
+
+ if (CalleesRequireModuleLDS) {
+ // If a function this kernel might call requires module LDS,
+ // annotate the kernel to let later passes know it will allocate
+ // this structure, even if not apparent from the IR.
+ markUsedByKernel(Builder, &Func, SGV);
+ } else {
+ // However if we are certain this kernel cannot call a function that
+ // requires module LDS, annotate the kernel so the backend can elide
+ // the allocation without repeating callgraph walks.
+ Func.addFnAttr("amdgpu-elide-module-lds");
+ }
+ }
+ }
+
+ Changed = true;
+ }
+ // Move variables used by kernels into per-kernel instances
for (Function &F : M.functions()) {
if (F.isDeclaration())
continue;
@@ -159,9 +212,37 @@ public:
// Only lower compute kernels' LDS.
if (!AMDGPU::isKernel(F.getCallingConv()))
continue;
+
std::vector<GlobalVariable *> KernelUsedVariables =
AMDGPU::findVariablesToLower(M, &F);
- Changed |= processUsedLDS(CG, M, KernelUsedVariables, &F);
+
+ // Replace all constant uses with instructions if they belong to the
+ // current kernel. Unnecessary, removing will cause test churn.
+ for (size_t I = 0; I < KernelUsedVariables.size(); I++) {
+ GlobalVariable *GV = KernelUsedVariables[I];
+ for (User *U : make_early_inc_range(GV->users())) {
+ if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
+ AMDGPU::replaceConstantUsesInFunction(C, &F);
+ }
+ GV->removeDeadConstantUsers();
+ }
+
+ if (!KernelUsedVariables.empty()) {
+ std::string VarName =
+ (Twine("llvm.amdgcn.kernel.") + F.getName() + ".lds").str();
+ GlobalVariable *SGV;
+ DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
+ std::tie(SGV, LDSVarsToConstantGEP) =
+ createLDSVariableReplacement(M, VarName, KernelUsedVariables);
+
+ removeFromUsedLists(M, KernelUsedVariables);
+ replaceLDSVariablesWithStruct(
+ M, KernelUsedVariables, SGV, LDSVarsToConstantGEP, [&F](Use &U) {
+ Instruction *I = dyn_cast<Instruction>(U.getUser());
+ return I && I->getFunction() == &F;
+ });
+ Changed = true;
+ }
}
return Changed;
@@ -212,16 +293,18 @@ private:
return Changed;
}
- bool processUsedLDS(CallGraph const &CG, Module &M,
- std::vector<GlobalVariable *> const &LDSVarsToTransform,
- Function *F = nullptr) {
+ std::tuple<GlobalVariable *, DenseMap<GlobalVariable *, Constant *>>
+ createLDSVariableReplacement(
+ Module &M, std::string VarName,
+ std::vector<GlobalVariable *> const &LDSVarsToTransform) {
+ // Create a struct instance containing LDSVarsToTransform and map from those
+ // variables to ConstantExprGEP
+ // Variables may be introduced to meet alignment requirements. No aliasing
+ // metadata is useful for these as they have no uses. Erased before return.
+
LLVMContext &Ctx = M.getContext();
const DataLayout &DL = M.getDataLayout();
-
- if (LDSVarsToTransform.empty()) {
- // No variables to rewrite, no changes made.
- return false;
- }
+ assert(!LDSVarsToTransform.empty());
SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
LayoutFields.reserve(LDSVarsToTransform.size());
@@ -234,9 +317,10 @@ private:
performOptimizedStructLayout(LayoutFields);
std::vector<GlobalVariable *> LocalVars;
+ BitVector IsPaddingField;
LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
+ IsPaddingField.reserve(LDSVarsToTransform.size());
{
- // This usually won't need to insert any padding, perhaps avoid the alloc
uint64_t CurrentOffset = 0;
for (size_t I = 0; I < LayoutFields.size(); I++) {
GlobalVariable *FGV = static_cast<GlobalVariable *>(
@@ -256,10 +340,12 @@ private:
M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
"", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
false));
+ IsPaddingField.push_back(true);
CurrentOffset += Padding;
}
LocalVars.push_back(FGV);
+ IsPaddingField.push_back(false);
CurrentOffset += LayoutFields[I].Size;
}
}
@@ -270,9 +356,6 @@ private:
LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
[](const GlobalVariable *V) -> Type * { return V->getValueType(); });
- std::string VarName(
- F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str()
- : "llvm.amdgcn.module.lds");
StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
Align StructAlign =
@@ -283,62 +366,65 @@ private:
VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
false);
SGV->setAlignment(StructAlign);
- if (!F) {
- appendToCompilerUsed(
- M, {static_cast<GlobalValue *>(
- ConstantExpr::getPointerBitCastOrAddrSpaceCast(
- cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+
+ DenseMap<GlobalVariable *, Constant *> Map;
+ Type *I32 = Type::getInt32Ty(Ctx);
+ for (size_t I = 0; I < LocalVars.size(); I++) {
+ GlobalVariable *GV = LocalVars[I];
+ Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
+ Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true);
+ if (IsPaddingField[I]) {
+ assert(GV->use_empty());
+ GV->eraseFromParent();
+ } else {
+ Map[GV] = GEP;
+ }
}
+ assert(Map.size() == LDSVarsToTransform.size());
+ return {SGV, std::move(Map)};
+ }
- // The verifier rejects used lists containing an inttoptr of a constant
- // so remove the variables from these lists before replaceAllUsesWith
- removeFromUsedLists(M, LocalVars);
+ template <typename PredicateTy>
+ void replaceLDSVariablesWithStruct(
+ Module &M, std::vector<GlobalVariable *> const &LDSVarsToTransform,
+ GlobalVariable *SGV,
+ DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP,
+ PredicateTy Predicate) {
+ LLVMContext &Ctx = M.getContext();
+ const DataLayout &DL = M.getDataLayout();
// Create alias.scope and their lists. Each field in the new structure
// does not alias with all other fields.
SmallVector<MDNode *> AliasScopes;
SmallVector<Metadata *> NoAliasList;
- if (LocalVars.size() > 1) {
+ const size_t NumberVars = LDSVarsToTransform.size();
+ if (NumberVars > 1) {
MDBuilder MDB(Ctx);
- AliasScopes.reserve(LocalVars.size());
+ AliasScopes.reserve(NumberVars);
MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
- for (size_t I = 0; I < LocalVars.size(); I++) {
+ for (size_t I = 0; I < NumberVars; I++) {
MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
AliasScopes.push_back(Scope);
}
NoAliasList.append(&AliasScopes[1], AliasScopes.end());
}
- // Replace uses of ith variable with a constantexpr to the ith field of the
- // instance that will be allocated by AMDGPUMachineFunction
- Type *I32 = Type::getInt32Ty(Ctx);
- for (size_t I = 0; I < LocalVars.size(); I++) {
- GlobalVariable *GV = LocalVars[I];
- Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
- Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx);
- if (F) {
- // Replace all constant uses with instructions if they belong to the
- // current kernel.
- for (User *U : make_early_inc_range(GV->users())) {
- if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
- AMDGPU::replaceConstantUsesInFunction(C, F);
- }
-
- GV->removeDeadConstantUsers();
+ // Replace uses of ith variable with a constantexpr to the corresponding
+ // field of the instance that will be allocated by AMDGPUMachineFunction
+ for (size_t I = 0; I < NumberVars; I++) {
+ GlobalVariable *GV = LDSVarsToTransform[I];
+ Constant *GEP = LDSVarsToConstantGEP[GV];
- GV->replaceUsesWithIf(GEP, [F](Use &U) {
- Instruction *I = dyn_cast<Instruction>(U.getUser());
- return I && I->getFunction() == F;
- });
- } else {
- GV->replaceAllUsesWith(GEP);
- }
+ GV->replaceUsesWithIf(GEP, Predicate);
if (GV->use_empty()) {
GV->eraseFromParent();
}
- uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
- Align A = commonAlignment(StructAlign, Off);
+ APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+ GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff);
+ uint64_t Offset = APOff.getZExtValue();
+
+ Align A = commonAlignment(SGV->getAlign().valueOrOne(), Offset);
if (I)
NoAliasList[I - 1] = AliasScopes[I - 1];
@@ -349,32 +435,6 @@ private:
refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
}
-
- // This ensures the variable is allocated when called functions access it.
- // It also lets other passes, specifically PromoteAlloca, accurately
- // calculate how much LDS will be used by the kernel after lowering.
- if (!F) {
- IRBuilder<> Builder(Ctx);
- for (Function &Func : M.functions()) {
- if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
- const CallGraphNode *N = CG[&Func];
- const bool CalleesRequireModuleLDS = N->size() > 0;
-
- if (CalleesRequireModuleLDS) {
- // If a function this kernel might call requires module LDS,
- // annotate the kernel to let later passes know it will allocate
- // this structure, even if not apparent from the IR.
- markUsedByKernel(Builder, &Func, SGV);
- } else {
- // However if we are certain this kernel cannot call a function that
- // requires module LDS, annotate the kernel so the backend can elide
- // the allocation without repeating callgraph walks.
- Func.addFnAttr("amdgpu-elide-module-lds");
- }
- }
- }
- }
- return true;
}
void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
index 753f7edc9385..98b5031071cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
@@ -29,7 +29,7 @@ public:
virtual ~AMDGPUMIRFormatter() = default;
/// Implement target specific parsing of target custom pseudo source value.
- virtual bool
+ bool
parseCustomPseudoSourceValue(StringRef Src, MachineFunction &MF,
PerFunctionMIParsingState &PFS,
const PseudoSourceValue *&PSV,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index bfe2e9b66ed4..98e9907068f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -191,8 +191,8 @@ public:
report_fatal_error("Invalid rule identifier");
}
- virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
- MachineIRBuilder &B) const override;
+ bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
+ MachineIRBuilder &B) const override;
};
bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 04da14cc4916..859deae86f35 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -9,6 +9,18 @@
/// \file
/// This contains a MachineSchedStrategy implementation for maximizing wave
/// occupancy on GCN hardware.
+///
+/// This pass will apply multiple scheduling stages to the same function.
+/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
+/// entry point for the scheduling of those regions is
+/// GCNScheduleDAGMILive::runSchedStages.
+///
+/// Generally, the reason for having multiple scheduling stages is to account
+/// for the kernel-wide effect of register usage on occupancy. Usually, only a
+/// few scheduling regions will have register pressure high enough to limit
+/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
+/// other regions.
+///
//===----------------------------------------------------------------------===//
#include "GCNSchedStrategy.h"
@@ -20,9 +32,9 @@
using namespace llvm;
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
- const MachineSchedContext *C) :
- GenericScheduler(C), TargetOccupancy(0), HasClusteredNodes(false),
- HasExcessPressure(false), MF(nullptr) { }
+ const MachineSchedContext *C)
+ : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
+ HasClusteredNodes(false), HasExcessPressure(false) {}
void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
@@ -302,210 +314,30 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
return SU;
}
-GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
- std::unique_ptr<MachineSchedStrategy> S) :
- ScheduleDAGMILive(C, std::move(S)),
- ST(MF.getSubtarget<GCNSubtarget>()),
- MFI(*MF.getInfo<SIMachineFunctionInfo>()),
- StartingOccupancy(MFI.getOccupancy()),
- MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {
+GCNScheduleDAGMILive::GCNScheduleDAGMILive(
+ MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
+ : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
+ MFI(*MF.getInfo<SIMachineFunctionInfo>()),
+ StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {
LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}
void GCNScheduleDAGMILive::schedule() {
- if (Stage == Collect) {
- // Just record regions at the first pass.
- Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
- return;
- }
-
- std::vector<MachineInstr*> Unsched;
- Unsched.reserve(NumRegionInstrs);
- for (auto &I : *this) {
- Unsched.push_back(&I);
- }
-
- GCNRegPressure PressureBefore;
- if (LIS) {
- PressureBefore = Pressure[RegionIdx];
-
- LLVM_DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
- GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
- dbgs() << "Region live-in pressure: ";
- llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
- dbgs() << "Region register pressure: ";
- PressureBefore.print(dbgs()));
- }
-
- GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
- // Set HasClusteredNodes to true for late stages where we have already
- // collected it. That way pickNode() will not scan SDep's when not needed.
- S.HasClusteredNodes = Stage > InitialSchedule;
- S.HasExcessPressure = false;
- ScheduleDAGMILive::schedule();
- Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
- RescheduleRegions[RegionIdx] = false;
- if (Stage == InitialSchedule && S.HasClusteredNodes)
- RegionsWithClusters[RegionIdx] = true;
- if (S.HasExcessPressure)
- RegionsWithHighRP[RegionIdx] = true;
-
- if (!LIS)
- return;
-
- // Check the results of scheduling.
- auto PressureAfter = getRealRegPressure();
-
- LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
- PressureAfter.print(dbgs()));
-
- if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
- PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
- Pressure[RegionIdx] = PressureAfter;
- RegionsWithMinOcc[RegionIdx] =
- PressureAfter.getOccupancy(ST) == MinOccupancy;
-
- LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
- return;
- }
-
- unsigned WavesAfter =
- std::min(S.TargetOccupancy, PressureAfter.getOccupancy(ST));
- unsigned WavesBefore =
- std::min(S.TargetOccupancy, PressureBefore.getOccupancy(ST));
- LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
- << ", after " << WavesAfter << ".\n");
-
- // We may not be able to keep the current target occupancy because of the just
- // scheduled region. We might still be able to revert scheduling if the
- // occupancy before was higher, or if the current schedule has register
- // pressure higher than the excess limits which could lead to more spilling.
- unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
-
- // Allow memory bound functions to drop to 4 waves if not limited by an
- // attribute.
- if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
- WavesAfter >= MFI.getMinAllowedOccupancy()) {
- LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
- << MFI.getMinAllowedOccupancy() << " waves\n");
- NewOccupancy = WavesAfter;
- }
-
- if (NewOccupancy < MinOccupancy) {
- MinOccupancy = NewOccupancy;
- MFI.limitOccupancy(MinOccupancy);
- RegionsWithMinOcc.reset();
- LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
- << MinOccupancy << ".\n");
- }
-
- unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
- unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
- if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
- PressureAfter.getAGPRNum() > MaxVGPRs ||
- PressureAfter.getSGPRNum() > MaxSGPRs) {
- RescheduleRegions[RegionIdx] = true;
- RegionsWithHighRP[RegionIdx] = true;
- }
-
- // If this condition is true, then either the occupancy before and after
- // scheduling is the same, or we are allowing the occupancy to drop because
- // the function is memory bound. Even if we are OK with the current occupancy,
- // we still need to verify that we will not introduce any extra chance of
- // spilling.
- if (WavesAfter >= MinOccupancy) {
- if (Stage == UnclusteredReschedule &&
- !PressureAfter.less(ST, PressureBefore)) {
- LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
- } else if (WavesAfter > MFI.getMinWavesPerEU() ||
- PressureAfter.less(ST, PressureBefore) ||
- !RescheduleRegions[RegionIdx]) {
- Pressure[RegionIdx] = PressureAfter;
- RegionsWithMinOcc[RegionIdx] =
- PressureAfter.getOccupancy(ST) == MinOccupancy;
- if (!RegionsWithClusters[RegionIdx] &&
- (Stage + 1) == UnclusteredReschedule)
- RescheduleRegions[RegionIdx] = false;
- return;
- } else {
- LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
- }
- }
-
- RegionsWithMinOcc[RegionIdx] =
- PressureBefore.getOccupancy(ST) == MinOccupancy;
- LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
- RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] ||
- (Stage + 1) != UnclusteredReschedule;
- RegionEnd = RegionBegin;
- int SkippedDebugInstr = 0;
- for (MachineInstr *MI : Unsched) {
- if (MI->isDebugInstr()) {
- ++SkippedDebugInstr;
- continue;
- }
-
- if (MI->getIterator() != RegionEnd) {
- BB->remove(MI);
- BB->insert(RegionEnd, MI);
- if (!MI->isDebugInstr())
- LIS->handleMove(*MI, true);
- }
- // Reset read-undef flags and update them later.
- for (auto &Op : MI->operands())
- if (Op.isReg() && Op.isDef())
- Op.setIsUndef(false);
- RegisterOperands RegOpers;
- RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
- if (!MI->isDebugInstr()) {
- if (ShouldTrackLaneMasks) {
- // Adjust liveness and add missing dead+read-undef flags.
- SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
- RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
- } else {
- // Adjust for missing dead-def flags.
- RegOpers.detectDeadDefs(*MI, *LIS);
- }
- }
- RegionEnd = MI->getIterator();
- ++RegionEnd;
- LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
- }
-
- // After reverting schedule, debug instrs will now be at the end of the block
- // and RegionEnd will point to the first debug instr. Increment RegionEnd
- // pass debug instrs to the actual end of the scheduling region.
- while (SkippedDebugInstr-- > 0)
- ++RegionEnd;
-
- // If Unsched.front() instruction is a debug instruction, this will actually
- // shrink the region since we moved all debug instructions to the end of the
- // block. Find the first instruction that is not a debug instruction.
- RegionBegin = Unsched.front()->getIterator();
- if (RegionBegin->isDebugInstr()) {
- for (MachineInstr *MI : Unsched) {
- if (MI->isDebugInstr())
- continue;
- RegionBegin = MI->getIterator();
- break;
- }
- }
-
- // Then move the debug instructions back into their correct place and set
- // RegionBegin and RegionEnd if needed.
- placeDebugValues();
-
- Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
+ // Collect all scheduling regions. The actual scheduling is performed in
+ // GCNScheduleDAGMILive::finalizeSchedule.
+ Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
}
-GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const {
+GCNRegPressure
+GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
GCNDownwardRPTracker RPTracker(*LIS);
RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
return RPTracker.moveMaxPressure();
}
-void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
+void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
+ const MachineBasicBlock *MBB) {
GCNDownwardRPTracker RPTracker(*LIS);
// If the block has the only successor then live-ins of that successor are
@@ -542,7 +374,7 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
RPTracker.reset(*I, &LRS);
}
- for ( ; ; ) {
+ for (;;) {
I = RPTracker.getNext();
if (Regions[CurRegion].first == I || NonDbgMI == I) {
@@ -588,8 +420,9 @@ GCNScheduleDAGMILive::getBBLiveInMap() const {
}
void GCNScheduleDAGMILive::finalizeSchedule() {
- LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
-
+ // Start actual scheduling here. This function is called by the base
+ // MachineScheduler after all regions have been recorded by
+ // GCNScheduleDAGMILive::schedule().
LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
RescheduleRegions.resize(Regions.size());
@@ -601,142 +434,470 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
RegionsWithHighRP.reset();
RegionsWithMinOcc.reset();
+ runSchedStages();
+}
+
+void GCNScheduleDAGMILive::runSchedStages() {
+ LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
+ InitialScheduleStage S0(GCNSchedStageID::InitialSchedule, *this);
+ UnclusteredRescheduleStage S1(GCNSchedStageID::UnclusteredReschedule, *this);
+ ClusteredLowOccStage S2(GCNSchedStageID::ClusteredLowOccupancyReschedule,
+ *this);
+ PreRARematStage S3(GCNSchedStageID::PreRARematerialize, *this);
+ GCNSchedStage *SchedStages[] = {&S0, &S1, &S2, &S3};
+
if (!Regions.empty())
BBLiveInMap = getBBLiveInMap();
- std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+ for (auto *Stage : SchedStages) {
+ if (!Stage->initGCNSchedStage())
+ continue;
- do {
- Stage++;
- RegionIdx = 0;
- MachineBasicBlock *MBB = nullptr;
+ for (auto Region : Regions) {
+ RegionBegin = Region.first;
+ RegionEnd = Region.second;
+ // Setup for scheduling the region and check whether it should be skipped.
+ if (!Stage->initGCNRegion()) {
+ Stage->advanceRegion();
+ exitRegion();
+ continue;
+ }
- if (Stage > InitialSchedule) {
- if (!LIS)
- break;
+ ScheduleDAGMILive::schedule();
+ Stage->finalizeGCNRegion();
+ }
- // Retry function scheduling if we found resulting occupancy and it is
- // lower than used for first pass scheduling. This will give more freedom
- // to schedule low register pressure blocks.
- // Code is partially copied from MachineSchedulerBase::scheduleRegions().
+ Stage->finalizeGCNSchedStage();
+ }
+}
- if (Stage == UnclusteredReschedule) {
- if (RescheduleRegions.none())
- continue;
- LLVM_DEBUG(dbgs() <<
- "Retrying function scheduling without clustering.\n");
- }
+#ifndef NDEBUG
+raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
+ switch (StageID) {
+ case GCNSchedStageID::InitialSchedule:
+ OS << "Initial Schedule";
+ break;
+ case GCNSchedStageID::UnclusteredReschedule:
+ OS << "Unclustered Reschedule";
+ break;
+ case GCNSchedStageID::ClusteredLowOccupancyReschedule:
+ OS << "Clustered Low Occupancy Reschedule";
+ break;
+ case GCNSchedStageID::PreRARematerialize:
+ OS << "Pre-RA Rematerialize";
+ break;
+ }
+ return OS;
+}
+#endif
- if (Stage == ClusteredLowOccupancyReschedule) {
- if (StartingOccupancy <= MinOccupancy)
- break;
+GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+ : DAG(DAG), S(static_cast<GCNMaxOccupancySchedStrategy &>(*DAG.SchedImpl)),
+ MF(DAG.MF), MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
- LLVM_DEBUG(
- dbgs()
- << "Retrying function scheduling with lowest recorded occupancy "
- << MinOccupancy << ".\n");
- }
+bool GCNSchedStage::initGCNSchedStage() {
+ if (!DAG.LIS)
+ return false;
- if (Stage == PreRARematerialize) {
- if (RegionsWithMinOcc.none() || Regions.size() == 1)
- break;
+ LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
+ return true;
+}
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
- // Check maximum occupancy
- if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
- MinOccupancy)
- break;
+bool UnclusteredRescheduleStage::initGCNSchedStage() {
+ if (!GCNSchedStage::initGCNSchedStage())
+ return false;
- // FIXME: This pass will invalidate cached MBBLiveIns for regions
- // inbetween the defs and region we sinked the def to. Cached pressure
- // for regions where a def is sinked from will also be invalidated. Will
- // need to be fixed if there is another pass after this pass.
- static_assert(LastStage == PreRARematerialize,
- "Passes after PreRARematerialize are not supported");
+ if (DAG.RescheduleRegions.none())
+ return false;
- collectRematerializableInstructions();
- if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
- break;
+ SavedMutations.swap(DAG.Mutations);
- LLVM_DEBUG(
- dbgs() << "Retrying function scheduling with improved occupancy of "
- << MinOccupancy << " from rematerializing\n");
- }
- }
+ LLVM_DEBUG(dbgs() << "Retrying function scheduling without clustering.\n");
+ return true;
+}
- if (Stage == UnclusteredReschedule)
- SavedMutations.swap(Mutations);
+bool ClusteredLowOccStage::initGCNSchedStage() {
+ if (!GCNSchedStage::initGCNSchedStage())
+ return false;
- for (auto Region : Regions) {
- if (((Stage == UnclusteredReschedule || Stage == PreRARematerialize) &&
- !RescheduleRegions[RegionIdx]) ||
- (Stage == ClusteredLowOccupancyReschedule &&
- !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) {
+ // Don't bother trying to improve ILP in lower RP regions if occupancy has not
+ // been dropped. All regions will have already been scheduled with the ideal
+ // occupancy targets.
+ if (DAG.StartingOccupancy <= DAG.MinOccupancy)
+ return false;
- ++RegionIdx;
- continue;
- }
+ LLVM_DEBUG(
+ dbgs() << "Retrying function scheduling with lowest recorded occupancy "
+ << DAG.MinOccupancy << ".\n");
+ return true;
+}
- RegionBegin = Region.first;
- RegionEnd = Region.second;
+bool PreRARematStage::initGCNSchedStage() {
+ if (!GCNSchedStage::initGCNSchedStage())
+ return false;
- if (RegionBegin->getParent() != MBB) {
- if (MBB) finishBlock();
- MBB = RegionBegin->getParent();
- startBlock(MBB);
- if (Stage == InitialSchedule)
- computeBlockPressure(MBB);
- }
+ if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
+ return false;
- unsigned NumRegionInstrs = std::distance(begin(), end());
- enterRegion(MBB, begin(), end(), NumRegionInstrs);
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ // Check maximum occupancy
+ if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
+ DAG.MinOccupancy)
+ return false;
- // Skip empty scheduling regions (0 or 1 schedulable instructions).
- if (begin() == end() || begin() == std::prev(end())) {
- exitRegion();
- ++RegionIdx;
- continue;
- }
+  // FIXME: This pass will invalidate cached MBBLiveIns for regions
+  // in between the defs and the region we sank the def to. Cached pressure
+  // for regions where a def is sunk from will also be invalidated. Will
+  // need to be fixed if there is another pass after this pass.
- LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
- LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " "
- << MBB->getName() << "\n From: " << *begin()
- << " To: ";
- if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
- else dbgs() << "End";
- dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+ collectRematerializableInstructions();
+ if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
+ return false;
- schedule();
+ LLVM_DEBUG(
+ dbgs() << "Retrying function scheduling with improved occupancy of "
+ << DAG.MinOccupancy << " from rematerializing\n");
+ return true;
+}
+
+void GCNSchedStage::finalizeGCNSchedStage() {
+ DAG.finishBlock();
+ LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
+}
+
+void UnclusteredRescheduleStage::finalizeGCNSchedStage() {
+ SavedMutations.swap(DAG.Mutations);
+
+ GCNSchedStage::finalizeGCNSchedStage();
+}
+
+bool GCNSchedStage::initGCNRegion() {
+ // Check whether this new region is also a new block.
+ if (DAG.RegionBegin->getParent() != CurrentMBB)
+ setupNewBlock();
+
+ unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
+ DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
+
+ // Skip empty scheduling regions (0 or 1 schedulable instructions).
+ if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
+ LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
+ << " " << CurrentMBB->getName()
+ << "\n From: " << *DAG.begin() << " To: ";
+ if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
+ else dbgs() << "End";
+ dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+
+ // Save original instruction order before scheduling for possible revert.
+ Unsched.clear();
+ Unsched.reserve(DAG.NumRegionInstrs);
+ for (auto &I : DAG)
+ Unsched.push_back(&I);
+
+ PressureBefore = DAG.Pressure[RegionIdx];
+
+ LLVM_DEBUG(
+ dbgs() << "Pressure before scheduling:\nRegion live-ins:";
+ GCNRPTracker::printLiveRegs(dbgs(), DAG.LiveIns[RegionIdx], DAG.MRI);
+ dbgs() << "Region live-in pressure: ";
+ llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]).print(dbgs());
+ dbgs() << "Region register pressure: "; PressureBefore.print(dbgs()));
+
+ // Set HasClusteredNodes to true for late stages where we have already
+ // collected it. That way pickNode() will not scan SDep's when not needed.
+ S.HasClusteredNodes = StageID > GCNSchedStageID::InitialSchedule;
+ S.HasExcessPressure = false;
+
+ return true;
+}
+
+bool UnclusteredRescheduleStage::initGCNRegion() {
+ if (!DAG.RescheduleRegions[RegionIdx])
+ return false;
+
+ return GCNSchedStage::initGCNRegion();
+}
+
+bool ClusteredLowOccStage::initGCNRegion() {
+  // We may need to reschedule this region if it doesn't have clusters so it
+  // wasn't rescheduled in the last stage, or if we found it was testing
+  // critical register pressure limits in the unclustered reschedule stage. The
+  // latter is because we may not have been able to raise the min occupancy in
+  // the previous stage so the region may be overly constrained even if it was
+  // already rescheduled.
+ if (!DAG.RegionsWithClusters[RegionIdx] && !DAG.RegionsWithHighRP[RegionIdx])
+ return false;
+
+ return GCNSchedStage::initGCNRegion();
+}
+
+bool PreRARematStage::initGCNRegion() {
+ if (!DAG.RescheduleRegions[RegionIdx])
+ return false;
+
+ return GCNSchedStage::initGCNRegion();
+}
+
+void GCNSchedStage::setupNewBlock() {
+ if (CurrentMBB)
+ DAG.finishBlock();
+
+ CurrentMBB = DAG.RegionBegin->getParent();
+ DAG.startBlock(CurrentMBB);
+  // Get real RP for the region if it hasn't been calculated before. After the
+  // initial schedule stage real RP will be collected after scheduling.
+ if (StageID == GCNSchedStageID::InitialSchedule)
+ DAG.computeBlockPressure(RegionIdx, CurrentMBB);
+}
+
+void GCNSchedStage::finalizeGCNRegion() {
+ DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd);
+ DAG.RescheduleRegions[RegionIdx] = false;
+ if (S.HasExcessPressure)
+ DAG.RegionsWithHighRP[RegionIdx] = true;
+
+ // Revert scheduling if we have dropped occupancy or there is some other
+ // reason that the original schedule is better.
+ checkScheduling();
+
+ DAG.exitRegion();
+ RegionIdx++;
+}
+
+void InitialScheduleStage::finalizeGCNRegion() {
+ // Record which regions have clustered nodes for the next unclustered
+ // reschedule stage.
+ assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule);
+ if (S.HasClusteredNodes)
+ DAG.RegionsWithClusters[RegionIdx] = true;
+
+ GCNSchedStage::finalizeGCNRegion();
+}
+
+void GCNSchedStage::checkScheduling() {
+ // Check the results of scheduling.
+ PressureAfter = DAG.getRealRegPressure(RegionIdx);
+ LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
+ PressureAfter.print(dbgs()));
+
+ if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
+ PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
+ DAG.Pressure[RegionIdx] = PressureAfter;
+ DAG.RegionsWithMinOcc[RegionIdx] =
+ PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
+
+    // Early out if we have achieved the occupancy target.
+ LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
+ return;
+ }
+
+ unsigned WavesAfter =
+ std::min(S.getTargetOccupancy(), PressureAfter.getOccupancy(ST));
+ unsigned WavesBefore =
+ std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
+ LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
+ << ", after " << WavesAfter << ".\n");
+
+ // We may not be able to keep the current target occupancy because of the just
+ // scheduled region. We might still be able to revert scheduling if the
+ // occupancy before was higher, or if the current schedule has register
+ // pressure higher than the excess limits which could lead to more spilling.
+ unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
+
+ // Allow memory bound functions to drop to 4 waves if not limited by an
+ // attribute.
+ if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
+ WavesAfter >= MFI.getMinAllowedOccupancy()) {
+ LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
+ << MFI.getMinAllowedOccupancy() << " waves\n");
+ NewOccupancy = WavesAfter;
+ }
+
+ if (NewOccupancy < DAG.MinOccupancy) {
+ DAG.MinOccupancy = NewOccupancy;
+ MFI.limitOccupancy(DAG.MinOccupancy);
+ DAG.RegionsWithMinOcc.reset();
+ LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
+ << DAG.MinOccupancy << ".\n");
+ }
- exitRegion();
- ++RegionIdx;
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+ unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+ if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
+ PressureAfter.getAGPRNum() > MaxVGPRs ||
+ PressureAfter.getSGPRNum() > MaxSGPRs) {
+ DAG.RescheduleRegions[RegionIdx] = true;
+ DAG.RegionsWithHighRP[RegionIdx] = true;
+ }
+
+ // Revert if this region's schedule would cause a drop in occupancy or
+ // spilling.
+ if (shouldRevertScheduling(WavesAfter)) {
+ revertScheduling();
+ } else {
+ DAG.Pressure[RegionIdx] = PressureAfter;
+ DAG.RegionsWithMinOcc[RegionIdx] =
+ PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
+ }
+}
+
+bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
+ if (WavesAfter < DAG.MinOccupancy)
+ return true;
+
+ return false;
+}
+
+bool InitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+ if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
+ return true;
+
+ if (mayCauseSpilling(WavesAfter))
+ return true;
+
+ assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule);
+ // Don't reschedule the region in the next stage if it doesn't have clusters.
+ if (!DAG.RegionsWithClusters[RegionIdx])
+ DAG.RescheduleRegions[RegionIdx] = false;
+
+ return false;
+}
+
+bool UnclusteredRescheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+ if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
+ return true;
+
+  // If RP is not reduced in the unclustered reschedule stage, revert to the
+  // old schedule.
+ if (!PressureAfter.less(ST, PressureBefore)) {
+ LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
+ return true;
+ }
+
+ return false;
+}
+
+bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
+ if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
+ return true;
+
+ if (mayCauseSpilling(WavesAfter))
+ return true;
+
+ return false;
+}
+
+bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
+ if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
+ return true;
+
+ if (mayCauseSpilling(WavesAfter))
+ return true;
+
+ return false;
+}
+
+bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
+ if (WavesAfter <= MFI.getMinWavesPerEU() &&
+ !PressureAfter.less(ST, PressureBefore) &&
+ DAG.RescheduleRegions[RegionIdx]) {
+ LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
+ return true;
+ }
+
+ return false;
+}
+
+void GCNSchedStage::revertScheduling() {
+ DAG.RegionsWithMinOcc[RegionIdx] =
+ PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
+ LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+ DAG.RescheduleRegions[RegionIdx] =
+ DAG.RegionsWithClusters[RegionIdx] ||
+ (nextStage(StageID)) != GCNSchedStageID::UnclusteredReschedule;
+ DAG.RegionEnd = DAG.RegionBegin;
+ int SkippedDebugInstr = 0;
+ for (MachineInstr *MI : Unsched) {
+ if (MI->isDebugInstr()) {
+ ++SkippedDebugInstr;
+ continue;
+ }
+
+ if (MI->getIterator() != DAG.RegionEnd) {
+ DAG.BB->remove(MI);
+ DAG.BB->insert(DAG.RegionEnd, MI);
+ if (!MI->isDebugInstr())
+ DAG.LIS->handleMove(*MI, true);
+ }
+
+ // Reset read-undef flags and update them later.
+ for (auto &Op : MI->operands())
+ if (Op.isReg() && Op.isDef())
+ Op.setIsUndef(false);
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
+ if (!MI->isDebugInstr()) {
+ if (DAG.ShouldTrackLaneMasks) {
+ // Adjust liveness and add missing dead+read-undef flags.
+ SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
+ RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
+ } else {
+ // Adjust for missing dead-def flags.
+ RegOpers.detectDeadDefs(*MI, *DAG.LIS);
+ }
}
- finishBlock();
+ DAG.RegionEnd = MI->getIterator();
+ ++DAG.RegionEnd;
+ LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
+ }
+
+ // After reverting schedule, debug instrs will now be at the end of the block
+ // and RegionEnd will point to the first debug instr. Increment RegionEnd
+  // past debug instrs to the actual end of the scheduling region.
+ while (SkippedDebugInstr-- > 0)
+ ++DAG.RegionEnd;
+
+ // If Unsched.front() instruction is a debug instruction, this will actually
+ // shrink the region since we moved all debug instructions to the end of the
+ // block. Find the first instruction that is not a debug instruction.
+ DAG.RegionBegin = Unsched.front()->getIterator();
+ if (DAG.RegionBegin->isDebugInstr()) {
+ for (MachineInstr *MI : Unsched) {
+ if (MI->isDebugInstr())
+ continue;
+ DAG.RegionBegin = MI->getIterator();
+ break;
+ }
+ }
+
+ // Then move the debug instructions back into their correct place and set
+ // RegionBegin and RegionEnd if needed.
+ DAG.placeDebugValues();
- if (Stage == UnclusteredReschedule)
- SavedMutations.swap(Mutations);
- } while (Stage != LastStage);
+ DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd);
}
-void GCNScheduleDAGMILive::collectRematerializableInstructions() {
- const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
- for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+void PreRARematStage::collectRematerializableInstructions() {
+ const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
+ for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
Register Reg = Register::index2VirtReg(I);
- if (!LIS->hasInterval(Reg))
+ if (!DAG.LIS->hasInterval(Reg))
continue;
// TODO: Handle AGPR and SGPR rematerialization
- if (!SRI->isVGPRClass(MRI.getRegClass(Reg)) || !MRI.hasOneDef(Reg) ||
- !MRI.hasOneNonDBGUse(Reg))
+ if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
+ !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
continue;
- MachineOperand *Op = MRI.getOneDef(Reg);
+ MachineOperand *Op = DAG.MRI.getOneDef(Reg);
MachineInstr *Def = Op->getParent();
if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
continue;
- MachineInstr *UseI = &*MRI.use_instr_nodbg_begin(Reg);
+ MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
if (Def->getParent() == UseI->getParent())
continue;
@@ -744,10 +905,10 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions() {
// live-through or used inside regions at MinOccupancy. This means that the
// register must be in the live-in set for the region.
bool AddedToRematList = false;
- for (unsigned I = 0, E = Regions.size(); I != E; ++I) {
- auto It = LiveIns[I].find(Reg);
- if (It != LiveIns[I].end() && !It->second.none()) {
- if (RegionsWithMinOcc[I]) {
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ auto It = DAG.LiveIns[I].find(Reg);
+ if (It != DAG.LiveIns[I].end() && !It->second.none()) {
+ if (DAG.RegionsWithMinOcc[I]) {
RematerializableInsts[I][Def] = UseI;
AddedToRematList = true;
}
@@ -762,8 +923,8 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions() {
}
}
-bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
- const TargetInstrInfo *TII) {
+bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
+ const TargetInstrInfo *TII) {
// Temporary copies of cached variables we will be modifying and replacing if
// sinking succeeds.
SmallVector<
@@ -772,9 +933,10 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
DenseMap<unsigned, GCNRegPressure> NewPressure;
BitVector NewRescheduleRegions;
+ LiveIntervals *LIS = DAG.LIS;
- NewRegions.resize(Regions.size());
- NewRescheduleRegions.resize(Regions.size());
+ NewRegions.resize(DAG.Regions.size());
+ NewRescheduleRegions.resize(DAG.Regions.size());
// Collect only regions that has a rematerializable def as a live-in.
SmallSet<unsigned, 16> ImpactedRegions;
@@ -784,16 +946,16 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
// Make copies of register pressure and live-ins cache that will be updated
// as we rematerialize.
for (auto Idx : ImpactedRegions) {
- NewPressure[Idx] = Pressure[Idx];
- NewLiveIns[Idx] = LiveIns[Idx];
+ NewPressure[Idx] = DAG.Pressure[Idx];
+ NewLiveIns[Idx] = DAG.LiveIns[Idx];
}
- NewRegions = Regions;
+ NewRegions = DAG.Regions;
NewRescheduleRegions.reset();
DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
bool Improved = false;
for (auto I : ImpactedRegions) {
- if (!RegionsWithMinOcc[I])
+ if (!DAG.RegionsWithMinOcc[I])
continue;
Improved = false;
@@ -802,12 +964,12 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
// TODO: Handle occupancy drop due to AGPR and SGPR.
// Check if cause of occupancy drop is due to VGPR usage and not SGPR.
- if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy)
+ if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
break;
// The occupancy of this region could have been improved by a previous
// iteration's sinking of defs.
- if (NewPressure[I].getOccupancy(ST) > MinOccupancy) {
+ if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
NewRescheduleRegions[I] = true;
Improved = true;
continue;
@@ -827,7 +989,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
// If in the most optimistic scenario, we cannot improve occupancy, then do
// not attempt to sink any instructions.
- if (OptimisticOccupancy <= MinOccupancy)
+ if (OptimisticOccupancy <= DAG.MinOccupancy)
break;
unsigned ImproveOccupancy = 0;
@@ -842,7 +1004,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
// call LiveRangeEdit::allUsesAvailableAt() and
// LiveRangeEdit::canRematerializeAt().
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
- Def->getOperand(0).getSubReg(), *Def, *TRI);
+ Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
MachineInstr *NewMI = &*(--InsertPos);
LIS->InsertMachineInstrInMaps(*NewMI);
LIS->removeInterval(Reg);
@@ -851,11 +1013,11 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
// Update region boundaries in scheduling region we sinked from since we
// may sink an instruction that was at the beginning or end of its region
- updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
- /*Removing =*/true);
+ DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
+ /*Removing =*/true);
// Update region boundaries in region we sinked to.
- updateRegionBoundaries(NewRegions, InsertPos, NewMI);
+ DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
LaneBitmask PrevMask = NewLiveIns[I][Reg];
// FIXME: Also update cached pressure for where the def was sinked from.
@@ -863,9 +1025,9 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
// the reg from all regions as a live-in.
for (auto Idx : RematDefToLiveInRegions[Def]) {
NewLiveIns[Idx].erase(Reg);
- if (InsertPos->getParent() != Regions[Idx].first->getParent()) {
+ if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
// Def is live-through and not used in this block.
- NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), MRI);
+ NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
} else {
// Def is used and rematerialized into this block.
GCNDownwardRPTracker RPT(*LIS);
@@ -879,7 +1041,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
SinkedDefs.push_back(Def);
ImproveOccupancy = NewPressure[I].getOccupancy(ST);
- if (ImproveOccupancy > MinOccupancy)
+ if (ImproveOccupancy > DAG.MinOccupancy)
break;
}
@@ -888,7 +1050,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
for (auto TrackedIdx : RematDefToLiveInRegions[Def])
RematerializableInsts[TrackedIdx].erase(Def);
- if (ImproveOccupancy <= MinOccupancy)
+ if (ImproveOccupancy <= DAG.MinOccupancy)
break;
NewRescheduleRegions[I] = true;
@@ -917,7 +1079,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
MachineInstr *OldMI = Entry.second;
// Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
- BBLiveInMap.erase(OldMI);
+ DAG.BBLiveInMap.erase(OldMI);
// Remove OldMI and update LIS
Register Reg = MI->getOperand(0).getReg();
@@ -929,22 +1091,22 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
// Update live-ins, register pressure, and regions caches.
for (auto Idx : ImpactedRegions) {
- LiveIns[Idx] = NewLiveIns[Idx];
- Pressure[Idx] = NewPressure[Idx];
- MBBLiveIns.erase(Regions[Idx].first->getParent());
+ DAG.LiveIns[Idx] = NewLiveIns[Idx];
+ DAG.Pressure[Idx] = NewPressure[Idx];
+ DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
}
- Regions = NewRegions;
- RescheduleRegions = NewRescheduleRegions;
+ DAG.Regions = NewRegions;
+ DAG.RescheduleRegions = NewRescheduleRegions;
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
- MFI.increaseOccupancy(MF, ++MinOccupancy);
+ MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
return true;
}
// Copied from MachineLICM
-bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI) {
- if (!TII->isTriviallyReMaterializable(MI))
+bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
+ if (!DAG.TII->isTriviallyReMaterializable(MI))
return false;
for (const MachineOperand &MO : MI.operands())
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index c3db849cf81a..7aadf89e0bf7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -28,8 +28,6 @@ class GCNSubtarget;
/// heuristics to determine excess/critical pressure sets. Its goal is to
/// maximize kernel occupancy (i.e. maximum number of waves per simd).
class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
- friend class GCNScheduleDAGMILive;
-
SUnit *pickNodeBidirectional(bool &IsTopNode);
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
@@ -42,15 +40,18 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
unsigned SGPRPressure, unsigned VGPRPressure);
std::vector<unsigned> Pressure;
+
std::vector<unsigned> MaxPressure;
unsigned SGPRExcessLimit;
+
unsigned VGPRExcessLimit;
- unsigned SGPRCriticalLimit;
- unsigned VGPRCriticalLimit;
unsigned TargetOccupancy;
+ MachineFunction *MF;
+
+public:
// schedule() have seen a clustered memory operation. Set it to false
// before a region scheduling to know if the region had such clusters.
bool HasClusteredNodes;
@@ -59,28 +60,53 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
// register pressure for actual scheduling heuristics.
bool HasExcessPressure;
- MachineFunction *MF;
+ unsigned SGPRCriticalLimit;
+
+ unsigned VGPRCriticalLimit;
-public:
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
SUnit *pickNode(bool &IsTopNode) override;
void initialize(ScheduleDAGMI *DAG) override;
+ unsigned getTargetOccupancy() { return TargetOccupancy; }
+
void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
};
-class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
+enum class GCNSchedStageID : unsigned {
+ InitialSchedule = 0,
+ UnclusteredReschedule = 1,
+ ClusteredLowOccupancyReschedule = 2,
+ PreRARematerialize = 3,
+ LastStage = PreRARematerialize
+};
+
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
+#endif
+
+inline GCNSchedStageID &operator++(GCNSchedStageID &Stage, int) {
+ assert(Stage != GCNSchedStageID::PreRARematerialize);
+ Stage = static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
+ return Stage;
+}
+
+inline GCNSchedStageID nextStage(const GCNSchedStageID Stage) {
+ return static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
+}
- enum : unsigned {
- Collect,
- InitialSchedule,
- UnclusteredReschedule,
- ClusteredLowOccupancyReschedule,
- PreRARematerialize,
- LastStage = PreRARematerialize
- };
+inline bool operator>(GCNSchedStageID &LHS, GCNSchedStageID &RHS) {
+ return static_cast<unsigned>(LHS) > static_cast<unsigned>(RHS);
+}
+
+class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
+ friend class GCNSchedStage;
+ friend class InitialScheduleStage;
+ friend class UnclusteredRescheduleStage;
+ friend class ClusteredLowOccStage;
+ friend class PreRARematStage;
const GCNSubtarget &ST;
@@ -92,12 +118,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// Minimal real occupancy recorder for the function.
unsigned MinOccupancy;
- // Scheduling stage number.
- unsigned Stage;
-
- // Current region index.
- size_t RegionIdx;
-
// Vector of regions recorder for later rescheduling
SmallVector<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>, 32> Regions;
@@ -121,6 +141,148 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// Region pressure cache.
SmallVector<GCNRegPressure, 32> Pressure;
+ // Temporary basic block live-in cache.
+ DenseMap<const MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBLiveIns;
+
+ DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
+
+ DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
+
+ // Return current region pressure.
+ GCNRegPressure getRealRegPressure(unsigned RegionIdx) const;
+
+ // Compute and cache live-ins and pressure for all regions in block.
+ void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB);
+
+ // Update region boundaries when removing MI or inserting NewMI before MI.
+ void updateRegionBoundaries(
+ SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
+ MachineBasicBlock::iterator>> &RegionBoundaries,
+ MachineBasicBlock::iterator MI, MachineInstr *NewMI,
+ bool Removing = false);
+
+ void runSchedStages();
+
+public:
+ GCNScheduleDAGMILive(MachineSchedContext *C,
+ std::unique_ptr<MachineSchedStrategy> S);
+
+ void schedule() override;
+
+ void finalizeSchedule() override;
+};
+
+// GCNSchedStrategy applies multiple scheduling stages to a function.
+class GCNSchedStage {
+protected:
+ GCNScheduleDAGMILive &DAG;
+
+ GCNMaxOccupancySchedStrategy &S;
+
+ MachineFunction &MF;
+
+ SIMachineFunctionInfo &MFI;
+
+ const GCNSubtarget &ST;
+
+ const GCNSchedStageID StageID;
+
+ // The current block being scheduled.
+ MachineBasicBlock *CurrentMBB = nullptr;
+
+ // Current region index.
+ unsigned RegionIdx = 0;
+
+ // Record the original order of instructions before scheduling.
+ std::vector<MachineInstr *> Unsched;
+
+ // RP before scheduling the current region.
+ GCNRegPressure PressureBefore;
+
+ // RP after scheduling the current region.
+ GCNRegPressure PressureAfter;
+
+ GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG);
+
+public:
+ // Initialize state for a scheduling stage. Returns false if the current stage
+ // should be skipped.
+ virtual bool initGCNSchedStage();
+
+ // Finalize state after finishing a scheduling pass on the function.
+ virtual void finalizeGCNSchedStage();
+
+ // Setup for scheduling a region. Returns false if the current region should
+ // be skipped.
+ virtual bool initGCNRegion();
+
+ // Track whether a new region is also a new MBB.
+ void setupNewBlock();
+
+  // Finalize state after scheduling a region.
+ virtual void finalizeGCNRegion();
+
+ // Check result of scheduling.
+ void checkScheduling();
+
+ // Returns true if scheduling should be reverted.
+ virtual bool shouldRevertScheduling(unsigned WavesAfter);
+
+ // Returns true if the new schedule may result in more spilling.
+ bool mayCauseSpilling(unsigned WavesAfter);
+
+ // Attempt to revert scheduling for this region.
+ void revertScheduling();
+
+ void advanceRegion() { RegionIdx++; }
+
+ virtual ~GCNSchedStage() = default;
+};
+
+class InitialScheduleStage : public GCNSchedStage {
+public:
+ void finalizeGCNRegion() override;
+
+ bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+ InitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+ : GCNSchedStage(StageID, DAG) {}
+};
+
+class UnclusteredRescheduleStage : public GCNSchedStage {
+private:
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+
+public:
+ bool initGCNSchedStage() override;
+
+ void finalizeGCNSchedStage() override;
+
+ bool initGCNRegion() override;
+
+ bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+ UnclusteredRescheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+ : GCNSchedStage(StageID, DAG) {}
+};
+
+// Retry function scheduling if we found resulting occupancy and it is
+// lower than used for other scheduling passes. This will give more freedom
+// to schedule low register pressure blocks.
+class ClusteredLowOccStage : public GCNSchedStage {
+public:
+ bool initGCNSchedStage() override;
+
+ bool initGCNRegion() override;
+
+ bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+ ClusteredLowOccStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+ : GCNSchedStage(StageID, DAG) {}
+};
+
+class PreRARematStage : public GCNSchedStage {
+private:
// Each region at MinOccupancy will have their own list of trivially
// rematerializable instructions we can remat to reduce RP. The list maps an
// instruction to the position we should remat before, usually the MI using
@@ -132,12 +294,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// that has the defined reg as a live-in.
DenseMap<MachineInstr *, SmallVector<unsigned, 4>> RematDefToLiveInRegions;
- // Temporary basic block live-in cache.
- DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
-
- DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
- DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
-
// Collect all trivially rematerializable VGPR instructions with a single def
// and single use outside the defining block into RematerializableInsts.
void collectRematerializableInstructions();
@@ -150,26 +306,15 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
bool sinkTriviallyRematInsts(const GCNSubtarget &ST,
const TargetInstrInfo *TII);
- // Return current region pressure.
- GCNRegPressure getRealRegPressure() const;
-
- // Compute and cache live-ins and pressure for all regions in block.
- void computeBlockPressure(const MachineBasicBlock *MBB);
-
- // Update region boundaries when removing MI or inserting NewMI before MI.
- void updateRegionBoundaries(
- SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
- MachineBasicBlock::iterator>> &RegionBoundaries,
- MachineBasicBlock::iterator MI, MachineInstr *NewMI,
- bool Removing = false);
-
public:
- GCNScheduleDAGMILive(MachineSchedContext *C,
- std::unique_ptr<MachineSchedStrategy> S);
+ bool initGCNSchedStage() override;
- void schedule() override;
+ bool initGCNRegion() override;
- void finalizeSchedule() override;
+ bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+ PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+ : GCNSchedStage(StageID, DAG) {}
};
} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index e093d78b2cc6..d9d7d4efa8c3 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -309,6 +309,11 @@ uint64_t SIMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const {
return OP_SEL_HI_0 | OP_SEL_HI_1 | OP_SEL_HI_2;
}
+static bool isVCMPX64(const MCInstrDesc &Desc) {
+ return (Desc.TSFlags & SIInstrFlags::VOP3) &&
+ Desc.hasImplicitDefOfPhysReg(AMDGPU::EXEC);
+}
+
void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -326,6 +331,17 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
Encoding |= getImplicitOpSelHiEncoding(Opcode);
}
+ // GFX11 v_cmpx opcodes promoted to VOP3 have implied dst=EXEC.
+ // Documentation requires dst to be encoded as EXEC (0x7E),
+ // but it looks like the actual value encoded for dst operand
+ // is ignored by HW. It was decided to define dst as "do not care"
+ // in td files to allow disassembler accept any dst value.
+ // However, dst is encoded as EXEC for compatibility with SP3.
+ if (AMDGPU::isGFX11Plus(STI) && isVCMPX64(Desc)) {
+ assert((Encoding & 0xFF) == 0);
+ Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO);
+ }
+
for (unsigned i = 0; i < bytes; i++) {
OS.write((uint8_t)Encoding.extractBitsAsZExtValue(8, 8 * i));
}
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index e7706fa0ef5c..1ed79add64c9 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -54,8 +54,8 @@ public:
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *IsFast = nullptr) const override;
- virtual bool canCombineTruncStore(EVT ValVT, EVT MemVT,
- bool LegalOperations) const override {
+ bool canCombineTruncStore(EVT ValVT, EVT MemVT,
+ bool LegalOperations) const override {
// R600 has "custom" lowering for truncating stores despite not supporting
// those instructions. If we allow that custom lowering in the DAG combiner
// then all truncates are merged into truncating stores, giving worse code
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 438e8b200ecc..f7d139adc63b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2132,7 +2132,8 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
SIMachineFunctionInfo &Info,
CallingConv::ID CallConv,
bool IsShader) const {
- if (Subtarget->hasUserSGPRInit16Bug()) {
+ if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
+ // Note: user SGPRs are handled by the front-end for graphics shaders
// Pad up the used user SGPRs with dead inputs.
unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
@@ -2195,7 +2196,8 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
}
- assert(!Subtarget->hasUserSGPRInit16Bug() || Info.getNumPreloadedSGPRs() >= 16);
+ assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
+ Info.getNumPreloadedSGPRs() >= 16);
}
static void reservePrivateMemoryRegs(const TargetMachine &TM,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d1fecc1afc7f..e0101f53880f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -487,10 +487,10 @@ public:
AtomicExpansionKind
shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
- virtual const TargetRegisterClass *
- getRegClassFor(MVT VT, bool isDivergent) const override;
- virtual bool requiresUniformRegister(MachineFunction &MF,
- const Value *V) const override;
+ const TargetRegisterClass *getRegClassFor(MVT VT,
+ bool isDivergent) const override;
+ bool requiresUniformRegister(MachineFunction &MF,
+ const Value *V) const override;
Align getPrefLoopAlignment(MachineLoop *ML) const override;
void allocateHSAUserSGPRs(CCState &CCInfo,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index ffe8dce79816..fccb08f86e6d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -349,7 +349,7 @@ def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
(add (sequence "SGPR%u_LO16", 0, 105))> {
- let AllocationPriority = 9;
+ let AllocationPriority = 0;
let Size = 16;
let GeneratePressureSet = 0;
let HasSGPR = 1;
@@ -368,7 +368,7 @@ def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add (sequence "SGPR%u", 0, 105))> {
// Give all SGPR classes higher priority than VGPR classes, because
// we want to spill SGPRs to VGPRs.
- let AllocationPriority = 9;
+ let AllocationPriority = 0;
let GeneratePressureSet = 0;
let HasSGPR = 1;
}
@@ -528,14 +528,14 @@ def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
let HasVGPR = 1 in {
def VGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (sequence "VGPR%u_LO16", 0, 255))> {
- let AllocationPriority = 1;
+ let AllocationPriority = 0;
let Size = 16;
let GeneratePressureSet = 0;
}
def VGPR_HI16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (sequence "VGPR%u_HI16", 0, 255))> {
- let AllocationPriority = 1;
+ let AllocationPriority = 0;
let Size = 16;
let GeneratePressureSet = 0;
}
@@ -544,7 +544,7 @@ def VGPR_HI16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
// i16/f16 only on VI+
def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 255))> {
- let AllocationPriority = 1;
+ let AllocationPriority = 0;
let Size = 32;
let Weight = 1;
}
@@ -588,7 +588,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
// AccVGPR 32-bit registers
def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add (sequence "AGPR%u", 0, 255))> {
- let AllocationPriority = 1;
+ let AllocationPriority = 0;
let Size = 32;
let Weight = 1;
}
@@ -653,7 +653,7 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2
SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE,
SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID,
SRC_VCCZ, SRC_EXECZ, SRC_SCC)> {
- let AllocationPriority = 10;
+ let AllocationPriority = 0;
}
def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16,
@@ -663,42 +663,42 @@ def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16,
SRC_SHARED_LIMIT_LO16, SRC_PRIVATE_BASE_LO16, SRC_PRIVATE_LIMIT_LO16,
SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16)> {
let Size = 16;
- let AllocationPriority = 10;
+ let AllocationPriority = 0;
}
def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> {
- let AllocationPriority = 10;
+ let AllocationPriority = 0;
}
def SReg_LO16_XEXEC_HI : SIRegisterClass<"AMDGPU", [i16, f16], 16,
(add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, M0_CLASS_LO16)> {
let Size = 16;
- let AllocationPriority = 10;
+ let AllocationPriority = 0;
}
def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
- let AllocationPriority = 10;
+ let AllocationPriority = 0;
}
def SReg_LO16_XM0 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
(add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, EXEC_HI_LO16)> {
let Size = 16;
- let AllocationPriority = 10;
+ let AllocationPriority = 0;
}
def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
(add SGPR_LO16, SReg_LO16_XM0, M0_CLASS_LO16, EXEC_LO_LO16, EXEC_HI_LO16, SReg_LO16_XEXEC_HI)> {
let Size = 16;
- let AllocationPriority = 10;
+ let AllocationPriority = 0;
}
} // End GeneratePressureSet = 0
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> {
- let AllocationPriority = 10;
+ let AllocationPriority = 0;
let HasSGPR = 1;
}
@@ -712,7 +712,7 @@ def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16],
def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
(add SGPR_64Regs)> {
let CopyCost = 1;
- let AllocationPriority = 11;
+ let AllocationPriority = 1;
let HasSGPR = 1;
}
@@ -725,14 +725,14 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
(add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, TTMP_64, TBA, TMA)> {
let CopyCost = 1;
- let AllocationPriority = 13;
+ let AllocationPriority = 1;
let HasSGPR = 1;
}
def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
(add SReg_64_XEXEC, EXEC)> {
let CopyCost = 1;
- let AllocationPriority = 13;
+ let AllocationPriority = 1;
let HasSGPR = 1;
}
@@ -750,7 +750,7 @@ def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32,
let HasSGPR = 1;
}
-multiclass SRegClass<int numRegs, int priority,
+multiclass SRegClass<int numRegs,
list<ValueType> regTypes,
SIRegisterTuples regList,
SIRegisterTuples ttmpList = regList,
@@ -760,7 +760,7 @@ multiclass SRegClass<int numRegs, int priority,
defvar sgprName = !strconcat("SGPR_", suffix);
defvar ttmpName = !strconcat("TTMP_", suffix);
- let AllocationPriority = priority, CopyCost = copyCost, HasSGPR = 1 in {
+ let AllocationPriority = !sub(numRegs, 1), CopyCost = copyCost, HasSGPR = 1 in {
def "" # sgprName : SIRegisterClass<"AMDGPU", regTypes, 32, (add regList)> {
}
@@ -781,14 +781,14 @@ multiclass SRegClass<int numRegs, int priority,
}
}
-defm "" : SRegClass<3, 14, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
-defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>;
-defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
-defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
-defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
-defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>;
-defm "" : SRegClass<16, 20, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
-defm "" : SRegClass<32, 21, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
+defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
+defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>;
+defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
+defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
+defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
+defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>;
+defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
+defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add VGPR_32, LDS_DIRECT_CLASS)> {
@@ -803,7 +803,7 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
// Requires n v_mov_b32 to copy
let CopyCost = numRegs;
- let AllocationPriority = numRegs;
+ let AllocationPriority = !sub(numRegs, 1);
let Weight = numRegs;
}
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index d489a089ac78..5973d32c91d6 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -718,7 +718,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
// DPP8 forbids modifiers and can inherit from VOPC_Profile
let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
- dag InsPartVOP3DPP = (ins Src0Mod:$src0_modifiers, VGPRSrc_32:$src0, VGPRSrc_32:$src1);
+ dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VGPRSrc_32:$src1);
let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel),
(ins)));
let Asm64 = "$sdst, $src0_modifiers, $src1";