11 files changed, 873 insertions, 488 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index b4a8766d682e..56a9a30bc59a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -29,6 +29,8 @@
 #include "AMDGPU.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "Utils/AMDGPUMemoryUtils.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Constants.h"
@@ -43,6 +45,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/OptimizedStructLayout.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <tuple>
 #include <vector>
 
 #define DEBUG_TYPE "amdgpu-lower-module-lds"
@@ -97,6 +100,9 @@ class AMDGPULowerModuleLDS : public ModulePass {
   static void
   removeFromUsedLists(Module &M,
                       const std::vector<GlobalVariable *> &LocalVars) {
+    // The verifier rejects used lists containing an inttoptr of a constant
+    // so remove the variables from these lists before replaceAllUsesWith
+
     SmallPtrSet<Constant *, 32> LocalVarsSet;
     for (GlobalVariable *LocalVar : LocalVars)
       if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts()))
@@ -146,12 +152,59 @@ public:
   }
 
   bool runOnModule(Module &M) override {
+    LLVMContext &Ctx = M.getContext();
     CallGraph CG = CallGraph(M);
     bool Changed = superAlignLDSGlobals(M);
+
+    // Move variables used by functions into amdgcn.module.lds
     std::vector<GlobalVariable *> ModuleScopeVariables =
         AMDGPU::findVariablesToLower(M, nullptr);
-    Changed |= processUsedLDS(CG, M, ModuleScopeVariables);
+    if (!ModuleScopeVariables.empty()) {
+      std::string VarName = "llvm.amdgcn.module.lds";
+
+      GlobalVariable *SGV;
+      DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
+      std::tie(SGV, LDSVarsToConstantGEP) =
+          createLDSVariableReplacement(M, VarName, ModuleScopeVariables);
+
+      appendToCompilerUsed(
+          M, {static_cast<GlobalValue *>(
+                 ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+                     cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+
+      removeFromUsedLists(M, ModuleScopeVariables);
+      replaceLDSVariablesWithStruct(M, ModuleScopeVariables, SGV,
+                                    LDSVarsToConstantGEP,
+                                    [](Use &) { return true; });
+
+      // This ensures the variable is allocated when called functions access it.
+      // It also lets other passes, specifically PromoteAlloca, accurately
+      // calculate how much LDS will be used by the kernel after lowering.
+
+      IRBuilder<> Builder(Ctx);
+      for (Function &Func : M.functions()) {
+        if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
+          const CallGraphNode *N = CG[&Func];
+          const bool CalleesRequireModuleLDS = N->size() > 0;
+
+          if (CalleesRequireModuleLDS) {
+            // If a function this kernel might call requires module LDS,
+            // annotate the kernel to let later passes know it will allocate
+            // this structure, even if not apparent from the IR.
+            markUsedByKernel(Builder, &Func, SGV);
+          } else {
+            // However if we are certain this kernel cannot call a function that
+            // requires module LDS, annotate the kernel so the backend can elide
+            // the allocation without repeating callgraph walks.
+            Func.addFnAttr("amdgpu-elide-module-lds");
+          }
+        }
+      }
+
+      Changed = true;
+    }
 
+    // Move variables used by kernels into per-kernel instances
     for (Function &F : M.functions()) {
       if (F.isDeclaration())
         continue;
@@ -159,9 +212,37 @@ public:
       // Only lower compute kernels' LDS.
       if (!AMDGPU::isKernel(F.getCallingConv()))
         continue;
+
       std::vector<GlobalVariable *> KernelUsedVariables =
           AMDGPU::findVariablesToLower(M, &F);
-      Changed |= processUsedLDS(CG, M, KernelUsedVariables, &F);
+
+      // Replace all constant uses with instructions if they belong to the
+      // current kernel. Not strictly needed, but removing it causes test churn.
+      for (size_t I = 0; I < KernelUsedVariables.size(); I++) {
+        GlobalVariable *GV = KernelUsedVariables[I];
+        for (User *U : make_early_inc_range(GV->users())) {
+          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
+            AMDGPU::replaceConstantUsesInFunction(C, &F);
+        }
+        GV->removeDeadConstantUsers();
+      }
+
+      if (!KernelUsedVariables.empty()) {
+        std::string VarName =
+            (Twine("llvm.amdgcn.kernel.") + F.getName() + ".lds").str();
+        GlobalVariable *SGV;
+        DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
+        std::tie(SGV, LDSVarsToConstantGEP) =
+            createLDSVariableReplacement(M, VarName, KernelUsedVariables);
+
+        removeFromUsedLists(M, KernelUsedVariables);
+        replaceLDSVariablesWithStruct(
+            M, KernelUsedVariables, SGV, LDSVarsToConstantGEP, [&F](Use &U) {
+              Instruction *I = dyn_cast<Instruction>(U.getUser());
+              return I && I->getFunction() == &F;
+            });
+        Changed = true;
+      }
     }
 
     return Changed;
@@ -212,16 +293,18 @@ private:
     return Changed;
   }
 
-  bool processUsedLDS(CallGraph const &CG, Module &M,
-                      std::vector<GlobalVariable *> const &LDSVarsToTransform,
-                      Function *F = nullptr) {
+  std::tuple<GlobalVariable *, DenseMap<GlobalVariable *, Constant *>>
+  createLDSVariableReplacement(
+      Module &M, std::string VarName,
+      std::vector<GlobalVariable *> const &LDSVarsToTransform) {
+    // Create a struct instance containing LDSVarsToTransform and a map from
+    // those variables to constant GEPs into that struct.
+    // Padding variables may be introduced to meet alignment requirements. They
+    // have no uses, so need no aliasing metadata, and are erased before return.
+
     LLVMContext &Ctx = M.getContext();
     const DataLayout &DL = M.getDataLayout();
-
-    if (LDSVarsToTransform.empty()) {
-      // No variables to rewrite, no changes made.
-      return false;
-    }
+    assert(!LDSVarsToTransform.empty());
 
     SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
     LayoutFields.reserve(LDSVarsToTransform.size());
@@ -234,9 +317,10 @@ private:
     performOptimizedStructLayout(LayoutFields);
 
     std::vector<GlobalVariable *> LocalVars;
+    BitVector IsPaddingField;
     LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
+    IsPaddingField.reserve(LDSVarsToTransform.size());
     {
-      // This usually won't need to insert any padding, perhaps avoid the alloc
       uint64_t CurrentOffset = 0;
       for (size_t I = 0; I < LayoutFields.size(); I++) {
         GlobalVariable *FGV = static_cast<GlobalVariable *>(
@@ -256,10 +340,12 @@ private:
               M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
               "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
               false));
+          IsPaddingField.push_back(true);
           CurrentOffset += Padding;
         }
 
         LocalVars.push_back(FGV);
+        IsPaddingField.push_back(false);
         CurrentOffset += LayoutFields[I].Size;
       }
     }
@@ -270,9 +356,6 @@ private:
         LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
         [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
 
-    std::string VarName(
-        F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str()
-          : "llvm.amdgcn.module.lds");
     StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
 
     Align StructAlign =
@@ -283,62 +366,65 @@ private:
         VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
         false);
     SGV->setAlignment(StructAlign);
-    if (!F) {
-      appendToCompilerUsed(
-          M, {static_cast<GlobalValue *>(
-                 ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-                     cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+
+    DenseMap<GlobalVariable *, Constant *> Map;
+    Type *I32 = Type::getInt32Ty(Ctx);
+    for (size_t I = 0; I < LocalVars.size(); I++) {
+      GlobalVariable *GV = LocalVars[I];
+      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
+      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true);
+      if (IsPaddingField[I]) {
+        assert(GV->use_empty());
+        GV->eraseFromParent();
+      } else {
+        Map[GV] = GEP;
+      }
     }
+    assert(Map.size() == LDSVarsToTransform.size());
+    return {SGV, std::move(Map)};
+  }
 
-    // The verifier rejects used lists containing an inttoptr of a constant
-    // so remove the variables from these lists before replaceAllUsesWith
-    removeFromUsedLists(M, LocalVars);
+  template <typename PredicateTy>
+  void replaceLDSVariablesWithStruct(
+      Module &M, std::vector<GlobalVariable *> const &LDSVarsToTransform,
+      GlobalVariable *SGV,
+      DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP,
+      PredicateTy Predicate) {
+    LLVMContext &Ctx = M.getContext();
+    const DataLayout &DL = M.getDataLayout();
 
     // Create alias.scope and their lists. Each field in the new structure
     // does not alias with all other fields.
     SmallVector<MDNode *> AliasScopes;
     SmallVector<Metadata *> NoAliasList;
-    if (LocalVars.size() > 1) {
+    const size_t NumberVars = LDSVarsToTransform.size();
+    if (NumberVars > 1) {
       MDBuilder MDB(Ctx);
-      AliasScopes.reserve(LocalVars.size());
+      AliasScopes.reserve(NumberVars);
       MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
-      for (size_t I = 0; I < LocalVars.size(); I++) {
+      for (size_t I = 0; I < NumberVars; I++) {
         MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
         AliasScopes.push_back(Scope);
       }
       NoAliasList.append(&AliasScopes[1], AliasScopes.end());
     }
 
-    // Replace uses of ith variable with a constantexpr to the ith field of the
-    // instance that will be allocated by AMDGPUMachineFunction
-    Type *I32 = Type::getInt32Ty(Ctx);
-    for (size_t I = 0; I < LocalVars.size(); I++) {
-      GlobalVariable *GV = LocalVars[I];
-      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
-      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx);
-      if (F) {
-        // Replace all constant uses with instructions if they belong to the
-        // current kernel.
-        for (User *U : make_early_inc_range(GV->users())) {
-          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
-            AMDGPU::replaceConstantUsesInFunction(C, F);
-        }
-
-        GV->removeDeadConstantUsers();
+    // Replace uses of ith variable with a constantexpr to the corresponding
+    // field of the instance that will be allocated by AMDGPUMachineFunction
+    for (size_t I = 0; I < NumberVars; I++) {
+      GlobalVariable *GV = LDSVarsToTransform[I];
+      Constant *GEP = LDSVarsToConstantGEP[GV];
 
-        GV->replaceUsesWithIf(GEP, [F](Use &U) {
-          Instruction *I = dyn_cast<Instruction>(U.getUser());
-          return I && I->getFunction() == F;
-        });
-      } else {
-        GV->replaceAllUsesWith(GEP);
-      }
+      GV->replaceUsesWithIf(GEP, Predicate);
       if (GV->use_empty()) {
         GV->eraseFromParent();
       }
 
-      uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
-      Align A = commonAlignment(StructAlign, Off);
+      APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+      GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff);
+      uint64_t Offset = APOff.getZExtValue();
+
+      Align A = commonAlignment(SGV->getAlign().valueOrOne(), Offset);
 
       if (I)
         NoAliasList[I - 1] = AliasScopes[I - 1];
@@ -349,32 +435,6 @@ private:
 
       refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
     }
-
-    // This ensures the variable is allocated when called functions access it.
-    // It also lets other passes, specifically PromoteAlloca, accurately
-    // calculate how much LDS will be used by the kernel after lowering.
-    if (!F) {
-      IRBuilder<> Builder(Ctx);
-      for (Function &Func : M.functions()) {
-        if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
-          const CallGraphNode *N = CG[&Func];
-          const bool CalleesRequireModuleLDS = N->size() > 0;
-
-          if (CalleesRequireModuleLDS) {
-            // If a function this kernel might call requires module LDS,
-            // annotate the kernel to let later passes know it will allocate
-            // this structure, even if not apparent from the IR.
-            markUsedByKernel(Builder, &Func, SGV);
-          } else {
-            // However if we are certain this kernel cannot call a function that
-            // requires module LDS, annotate the kernel so the backend can elide
-            // the allocation without repeating callgraph walks.
-            Func.addFnAttr("amdgpu-elide-module-lds");
-          }
-        }
-      }
-    }
-    return true;
   }
 
   void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
index 753f7edc9385..98b5031071cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
@@ -29,7 +29,7 @@ public:
   virtual ~AMDGPUMIRFormatter() = default;
 
   /// Implement target specific parsing of target custom pseudo source value.
-  virtual bool
+  bool
   parseCustomPseudoSourceValue(StringRef Src, MachineFunction &MF,
                                PerFunctionMIParsingState &PFS,
                                const PseudoSourceValue *&PSV,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index bfe2e9b66ed4..98e9907068f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -191,8 +191,8 @@ public:
       report_fatal_error("Invalid rule identifier");
   }
 
-  virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
-                       MachineIRBuilder &B) const override;
+  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
+               MachineIRBuilder &B) const override;
 };
 
 bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 04da14cc4916..859deae86f35 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -9,6 +9,18 @@
 /// \file
 /// This contains a MachineSchedStrategy implementation for maximizing wave
 /// occupancy on GCN hardware.
+///
+/// This pass will apply multiple scheduling stages to the same function.
+/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
+/// entry point for the scheduling of those regions is
+/// GCNScheduleDAGMILive::runSchedStages.
+///
+/// Generally, the reason for having multiple scheduling stages is to account
+/// for the kernel-wide effect of register usage on occupancy.  Usually, only a
+/// few scheduling regions will have register pressure high enough to limit
+/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
+/// other regions.
+///
 //===----------------------------------------------------------------------===//
 
 #include "GCNSchedStrategy.h"
@@ -20,9 +32,9 @@
 using namespace llvm;
 
 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
-    const MachineSchedContext *C) :
-    GenericScheduler(C), TargetOccupancy(0), HasClusteredNodes(false),
-    HasExcessPressure(false), MF(nullptr) { }
+    const MachineSchedContext *C)
+    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
+      HasClusteredNodes(false), HasExcessPressure(false) {}
 
 void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
   GenericScheduler::initialize(DAG);
@@ -302,210 +314,30 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
   return SU;
 }
 
-GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
-                        std::unique_ptr<MachineSchedStrategy> S) :
-  ScheduleDAGMILive(C, std::move(S)),
-  ST(MF.getSubtarget<GCNSubtarget>()),
-  MFI(*MF.getInfo<SIMachineFunctionInfo>()),
-  StartingOccupancy(MFI.getOccupancy()),
-  MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {
+GCNScheduleDAGMILive::GCNScheduleDAGMILive(
+    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
+    : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
+      MFI(*MF.getInfo<SIMachineFunctionInfo>()),
+      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {
 
   LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
 }
 
 void GCNScheduleDAGMILive::schedule() {
-  if (Stage == Collect) {
-    // Just record regions at the first pass.
-    Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
-    return;
-  }
-
-  std::vector<MachineInstr*> Unsched;
-  Unsched.reserve(NumRegionInstrs);
-  for (auto &I : *this) {
-    Unsched.push_back(&I);
-  }
-
-  GCNRegPressure PressureBefore;
-  if (LIS) {
-    PressureBefore = Pressure[RegionIdx];
-
-    LLVM_DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
-               GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
-               dbgs() << "Region live-in pressure:  ";
-               llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
-               dbgs() << "Region register pressure: ";
-               PressureBefore.print(dbgs()));
-  }
-
-  GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
-  // Set HasClusteredNodes to true for late stages where we have already
-  // collected it. That way pickNode() will not scan SDep's when not needed.
-  S.HasClusteredNodes = Stage > InitialSchedule;
-  S.HasExcessPressure = false;
-  ScheduleDAGMILive::schedule();
-  Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
-  RescheduleRegions[RegionIdx] = false;
-  if (Stage == InitialSchedule && S.HasClusteredNodes)
-    RegionsWithClusters[RegionIdx] = true;
-  if (S.HasExcessPressure)
-    RegionsWithHighRP[RegionIdx] = true;
-
-  if (!LIS)
-    return;
-
-  // Check the results of scheduling.
-  auto PressureAfter = getRealRegPressure();
-
-  LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
-             PressureAfter.print(dbgs()));
-
-  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
-      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
-    Pressure[RegionIdx] = PressureAfter;
-    RegionsWithMinOcc[RegionIdx] =
-        PressureAfter.getOccupancy(ST) == MinOccupancy;
-
-    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
-    return;
-  }
-
-  unsigned WavesAfter =
-      std::min(S.TargetOccupancy, PressureAfter.getOccupancy(ST));
-  unsigned WavesBefore =
-      std::min(S.TargetOccupancy, PressureBefore.getOccupancy(ST));
-  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
-                    << ", after " << WavesAfter << ".\n");
-
-  // We may not be able to keep the current target occupancy because of the just
-  // scheduled region. We might still be able to revert scheduling if the
-  // occupancy before was higher, or if the current schedule has register
-  // pressure higher than the excess limits which could lead to more spilling.
-  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
-
-  // Allow memory bound functions to drop to 4 waves if not limited by an
-  // attribute.
-  if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
-      WavesAfter >= MFI.getMinAllowedOccupancy()) {
-    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
-                      << MFI.getMinAllowedOccupancy() << " waves\n");
-    NewOccupancy = WavesAfter;
-  }
-
-  if (NewOccupancy < MinOccupancy) {
-    MinOccupancy = NewOccupancy;
-    MFI.limitOccupancy(MinOccupancy);
-    RegionsWithMinOcc.reset();
-    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
-                      << MinOccupancy << ".\n");
-  }
-
-  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
-  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
-  if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
-      PressureAfter.getAGPRNum() > MaxVGPRs ||
-      PressureAfter.getSGPRNum() > MaxSGPRs) {
-    RescheduleRegions[RegionIdx] = true;
-    RegionsWithHighRP[RegionIdx] = true;
-  }
-
-  // If this condition is true, then either the occupancy before and after
-  // scheduling is the same, or we are allowing the occupancy to drop because
-  // the function is memory bound. Even if we are OK with the current occupancy,
-  // we still need to verify that we will not introduce any extra chance of
-  // spilling.
-  if (WavesAfter >= MinOccupancy) {
-    if (Stage == UnclusteredReschedule &&
-        !PressureAfter.less(ST, PressureBefore)) {
-      LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
-    } else if (WavesAfter > MFI.getMinWavesPerEU() ||
-        PressureAfter.less(ST, PressureBefore) ||
-        !RescheduleRegions[RegionIdx]) {
-      Pressure[RegionIdx] = PressureAfter;
-      RegionsWithMinOcc[RegionIdx] =
-          PressureAfter.getOccupancy(ST) == MinOccupancy;
-      if (!RegionsWithClusters[RegionIdx] &&
-          (Stage + 1) == UnclusteredReschedule)
-        RescheduleRegions[RegionIdx] = false;
-      return;
-    } else {
-      LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
-    }
-  }
-
-  RegionsWithMinOcc[RegionIdx] =
-      PressureBefore.getOccupancy(ST) == MinOccupancy;
-  LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
-  RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] ||
-                                 (Stage + 1) != UnclusteredReschedule;
-  RegionEnd = RegionBegin;
-  int SkippedDebugInstr = 0;
-  for (MachineInstr *MI : Unsched) {
-    if (MI->isDebugInstr()) {
-      ++SkippedDebugInstr;
-      continue;
-    }
-
-    if (MI->getIterator() != RegionEnd) {
-      BB->remove(MI);
-      BB->insert(RegionEnd, MI);
-      if (!MI->isDebugInstr())
-        LIS->handleMove(*MI, true);
-    }
-    // Reset read-undef flags and update them later.
-    for (auto &Op : MI->operands())
-      if (Op.isReg() && Op.isDef())
-        Op.setIsUndef(false);
-    RegisterOperands RegOpers;
-    RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
-    if (!MI->isDebugInstr()) {
-      if (ShouldTrackLaneMasks) {
-        // Adjust liveness and add missing dead+read-undef flags.
-        SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
-        RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
-      } else {
-        // Adjust for missing dead-def flags.
-        RegOpers.detectDeadDefs(*MI, *LIS);
-      }
-    }
-    RegionEnd = MI->getIterator();
-    ++RegionEnd;
-    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
-  }
-
-  // After reverting schedule, debug instrs will now be at the end of the block
-  // and RegionEnd will point to the first debug instr. Increment RegionEnd
-  // pass debug instrs to the actual end of the scheduling region.
-  while (SkippedDebugInstr-- > 0)
-    ++RegionEnd;
-
-  // If Unsched.front() instruction is a debug instruction, this will actually
-  // shrink the region since we moved all debug instructions to the end of the
-  // block. Find the first instruction that is not a debug instruction.
-  RegionBegin = Unsched.front()->getIterator();
-  if (RegionBegin->isDebugInstr()) {
-    for (MachineInstr *MI : Unsched) {
-      if (MI->isDebugInstr())
-        continue;
-      RegionBegin = MI->getIterator();
-      break;
-    }
-  }
-
-  // Then move the debug instructions back into their correct place and set
-  // RegionBegin and RegionEnd if needed.
-  placeDebugValues();
-
-  Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
+  // Collect all scheduling regions. The actual scheduling is performed in
+  // GCNScheduleDAGMILive::finalizeSchedule.
+  Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
 }
 
-GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const {
+GCNRegPressure
+GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
   GCNDownwardRPTracker RPTracker(*LIS);
   RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
   return RPTracker.moveMaxPressure();
 }
 
-void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
+void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
+                                                const MachineBasicBlock *MBB) {
   GCNDownwardRPTracker RPTracker(*LIS);
 
   // If the block has the only successor then live-ins of that successor are
@@ -542,7 +374,7 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
     RPTracker.reset(*I, &LRS);
   }
 
-  for ( ; ; ) {
+  for (;;) {
     I = RPTracker.getNext();
 
     if (Regions[CurRegion].first == I || NonDbgMI == I) {
@@ -588,8 +420,9 @@ GCNScheduleDAGMILive::getBBLiveInMap() const {
 }
 
 void GCNScheduleDAGMILive::finalizeSchedule() {
-  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
-
+  // Start actual scheduling here. This function is called by the base
+  // MachineScheduler after all regions have been recorded by
+  // GCNScheduleDAGMILive::schedule().
   LiveIns.resize(Regions.size());
   Pressure.resize(Regions.size());
   RescheduleRegions.resize(Regions.size());
@@ -601,142 +434,470 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
   RegionsWithHighRP.reset();
   RegionsWithMinOcc.reset();
 
+  runSchedStages();
+}
+
+void GCNScheduleDAGMILive::runSchedStages() {
+  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
+  InitialScheduleStage S0(GCNSchedStageID::InitialSchedule, *this);
+  UnclusteredRescheduleStage S1(GCNSchedStageID::UnclusteredReschedule, *this);
+  ClusteredLowOccStage S2(GCNSchedStageID::ClusteredLowOccupancyReschedule,
+                          *this);
+  PreRARematStage S3(GCNSchedStageID::PreRARematerialize, *this);
+  GCNSchedStage *SchedStages[] = {&S0, &S1, &S2, &S3};
+
   if (!Regions.empty())
     BBLiveInMap = getBBLiveInMap();
 
-  std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+  for (auto *Stage : SchedStages) {
+    if (!Stage->initGCNSchedStage())
+      continue;
 
-  do {
-    Stage++;
-    RegionIdx = 0;
-    MachineBasicBlock *MBB = nullptr;
+    for (auto Region : Regions) {
+      RegionBegin = Region.first;
+      RegionEnd = Region.second;
+      // Setup for scheduling the region and check whether it should be skipped.
+      if (!Stage->initGCNRegion()) {
+        Stage->advanceRegion();
+        exitRegion();
+        continue;
+      }
 
-    if (Stage > InitialSchedule) {
-      if (!LIS)
-        break;
+      ScheduleDAGMILive::schedule();
+      Stage->finalizeGCNRegion();
+    }
 
-      // Retry function scheduling if we found resulting occupancy and it is
-      // lower than used for first pass scheduling. This will give more freedom
-      // to schedule low register pressure blocks.
-      // Code is partially copied from MachineSchedulerBase::scheduleRegions().
+    Stage->finalizeGCNSchedStage();
+  }
+}
 
-      if (Stage == UnclusteredReschedule) {
-        if (RescheduleRegions.none())
-          continue;
-        LLVM_DEBUG(dbgs() <<
-          "Retrying function scheduling without clustering.\n");
-      }
+#ifndef NDEBUG
+raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
+  switch (StageID) {
+  case GCNSchedStageID::InitialSchedule:
+    OS << "Initial Schedule";
+    break;
+  case GCNSchedStageID::UnclusteredReschedule:
+    OS << "Unclustered Reschedule";
+    break;
+  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
+    OS << "Clustered Low Occupancy Reschedule";
+    break;
+  case GCNSchedStageID::PreRARematerialize:
+    OS << "Pre-RA Rematerialize";
+    break;
+  }
+  return OS;
+}
+#endif
 
-      if (Stage == ClusteredLowOccupancyReschedule) {
-        if (StartingOccupancy <= MinOccupancy)
-          break;
+GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+    : DAG(DAG), S(static_cast<GCNMaxOccupancySchedStrategy &>(*DAG.SchedImpl)),
+      MF(DAG.MF), MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
 
-        LLVM_DEBUG(
-            dbgs()
-            << "Retrying function scheduling with lowest recorded occupancy "
-            << MinOccupancy << ".\n");
-      }
+bool GCNSchedStage::initGCNSchedStage() {
+  if (!DAG.LIS)
+    return false;
 
-      if (Stage == PreRARematerialize) {
-        if (RegionsWithMinOcc.none() || Regions.size() == 1)
-          break;
+  LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
+  return true;
+}
 
-        const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-        const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-        // Check maximum occupancy
-        if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
-            MinOccupancy)
-          break;
+bool UnclusteredRescheduleStage::initGCNSchedStage() {
+  if (!GCNSchedStage::initGCNSchedStage())
+    return false;
 
-        // FIXME: This pass will invalidate cached MBBLiveIns for regions
-        // inbetween the defs and region we sinked the def to. Cached pressure
-        // for regions where a def is sinked from will also be invalidated. Will
-        // need to be fixed if there is another pass after this pass.
-        static_assert(LastStage == PreRARematerialize,
-                      "Passes after PreRARematerialize are not supported");
+  if (DAG.RescheduleRegions.none())
+    return false;
 
-        collectRematerializableInstructions();
-        if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
-          break;
+  SavedMutations.swap(DAG.Mutations);
 
-        LLVM_DEBUG(
-            dbgs() << "Retrying function scheduling with improved occupancy of "
-                   << MinOccupancy << " from rematerializing\n");
-      }
-    }
+  LLVM_DEBUG(dbgs() << "Retrying function scheduling without clustering.\n");
+  return true;
+}
 
-    if (Stage == UnclusteredReschedule)
-      SavedMutations.swap(Mutations);
+bool ClusteredLowOccStage::initGCNSchedStage() {
+  if (!GCNSchedStage::initGCNSchedStage())
+    return false;
 
-    for (auto Region : Regions) {
-      if (((Stage == UnclusteredReschedule || Stage == PreRARematerialize) &&
-           !RescheduleRegions[RegionIdx]) ||
-          (Stage == ClusteredLowOccupancyReschedule &&
-           !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) {
+  // Don't bother trying to improve ILP in lower RP regions if occupancy has not
+  // been dropped. All regions will have already been scheduled with the ideal
+  // occupancy targets.
+  if (DAG.StartingOccupancy <= DAG.MinOccupancy)
+    return false;
 
-        ++RegionIdx;
-        continue;
-      }
+  LLVM_DEBUG(
+      dbgs() << "Retrying function scheduling with lowest recorded occupancy "
+             << DAG.MinOccupancy << ".\n");
+  return true;
+}
 
-      RegionBegin = Region.first;
-      RegionEnd = Region.second;
+bool PreRARematStage::initGCNSchedStage() {
+  if (!GCNSchedStage::initGCNSchedStage())
+    return false;
 
-      if (RegionBegin->getParent() != MBB) {
-        if (MBB) finishBlock();
-        MBB = RegionBegin->getParent();
-        startBlock(MBB);
-        if (Stage == InitialSchedule)
-          computeBlockPressure(MBB);
-      }
+  if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
+    return false;
 
-      unsigned NumRegionInstrs = std::distance(begin(), end());
-      enterRegion(MBB, begin(), end(), NumRegionInstrs);
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  // Check maximum occupancy
+  if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
+      DAG.MinOccupancy)
+    return false;
 
-      // Skip empty scheduling regions (0 or 1 schedulable instructions).
-      if (begin() == end() || begin() == std::prev(end())) {
-        exitRegion();
-        ++RegionIdx;
-        continue;
-      }
+  // FIXME: This pass will invalidate cached MBBLiveIns for regions
+  // in between the defs and region we sinked the def to. Cached pressure
+  // for regions where a def is sinked from will also be invalidated. Will
+  // need to be fixed if there is another pass after this pass.
 
-      LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
-      LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " "
-                        << MBB->getName() << "\n  From: " << *begin()
-                        << "    To: ";
-                 if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
-                 else dbgs() << "End";
-                 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+  collectRematerializableInstructions();
+  if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
+    return false;
 
-      schedule();
+  LLVM_DEBUG(
+      dbgs() << "Retrying function scheduling with improved occupancy of "
+             << DAG.MinOccupancy << " from rematerializing\n");
+  return true;
+}
+
+void GCNSchedStage::finalizeGCNSchedStage() {
+  DAG.finishBlock();
+  LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
+}
+
+void UnclusteredRescheduleStage::finalizeGCNSchedStage() {
+  SavedMutations.swap(DAG.Mutations);
+
+  GCNSchedStage::finalizeGCNSchedStage();
+}
+
+bool GCNSchedStage::initGCNRegion() {
+  // Check whether this new region is also a new block.
+  if (DAG.RegionBegin->getParent() != CurrentMBB)
+    setupNewBlock();
+
+  unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
+  DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
+
+  // Skip empty scheduling regions (0 or 1 schedulable instructions).
+  if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
+  LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
+                    << " " << CurrentMBB->getName()
+                    << "\n  From: " << *DAG.begin() << "    To: ";
+             if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
+             else dbgs() << "End";
+             dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+
+  // Save original instruction order before scheduling for possible revert.
+  Unsched.clear();
+  Unsched.reserve(DAG.NumRegionInstrs);
+  for (auto &I : DAG)
+    Unsched.push_back(&I);
+
+  PressureBefore = DAG.Pressure[RegionIdx];
+
+  LLVM_DEBUG(
+      dbgs() << "Pressure before scheduling:\nRegion live-ins:";
+      GCNRPTracker::printLiveRegs(dbgs(), DAG.LiveIns[RegionIdx], DAG.MRI);
+      dbgs() << "Region live-in pressure:  ";
+      llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]).print(dbgs());
+      dbgs() << "Region register pressure: "; PressureBefore.print(dbgs()));
+
+  // Set HasClusteredNodes to true for late stages where we have already
+  // collected it. That way pickNode() will not scan SDep's when not needed.
+  S.HasClusteredNodes = StageID > GCNSchedStageID::InitialSchedule;
+  S.HasExcessPressure = false;
+
+  return true;
+}
+
+bool UnclusteredRescheduleStage::initGCNRegion() {
+  if (!DAG.RescheduleRegions[RegionIdx])
+    return false;
+
+  return GCNSchedStage::initGCNRegion();
+}
+
+bool ClusteredLowOccStage::initGCNRegion() {
+  // We may need to reschedule this region if it doesn't have clusters so it
+  // wasn't rescheduled in the last stage, or if we found it was testing
+  // critical register pressure limits in the unclustered reschedule stage. The
+  // latter is because we may not have been able to raise the min occupancy in
+  // the previous stage so the region may be overly constrained even if it was
+  // already rescheduled.
+  if (!DAG.RegionsWithClusters[RegionIdx] && !DAG.RegionsWithHighRP[RegionIdx])
+    return false;
+
+  return GCNSchedStage::initGCNRegion();
+}
+
+bool PreRARematStage::initGCNRegion() {
+  if (!DAG.RescheduleRegions[RegionIdx])
+    return false;
+
+  return GCNSchedStage::initGCNRegion();
+}
+
+void GCNSchedStage::setupNewBlock() {
+  if (CurrentMBB)
+    DAG.finishBlock();
+
+  CurrentMBB = DAG.RegionBegin->getParent();
+  DAG.startBlock(CurrentMBB);
+  // Get real RP for the region if it hasn't been calculated before. After the
+  // initial schedule stage real RP will be collected after scheduling.
+  if (StageID == GCNSchedStageID::InitialSchedule)
+    DAG.computeBlockPressure(RegionIdx, CurrentMBB);
+}
+
+void GCNSchedStage::finalizeGCNRegion() {
+  DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd);
+  DAG.RescheduleRegions[RegionIdx] = false;
+  if (S.HasExcessPressure)
+    DAG.RegionsWithHighRP[RegionIdx] = true;
+
+  // Revert scheduling if we have dropped occupancy or there is some other
+  // reason that the original schedule is better.
+  checkScheduling();
+
+  DAG.exitRegion();
+  RegionIdx++;
+}
+
+void InitialScheduleStage::finalizeGCNRegion() {
+  // Record which regions have clustered nodes for the next unclustered
+  // reschedule stage.
+  assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule);
+  if (S.HasClusteredNodes)
+    DAG.RegionsWithClusters[RegionIdx] = true;
+
+  GCNSchedStage::finalizeGCNRegion();
+}
+
+void GCNSchedStage::checkScheduling() {
+  // Check the results of scheduling.
+  PressureAfter = DAG.getRealRegPressure(RegionIdx);
+  LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
+             PressureAfter.print(dbgs()));
+
+  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
+      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
+    DAG.Pressure[RegionIdx] = PressureAfter;
+    DAG.RegionsWithMinOcc[RegionIdx] =
+        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
+
+    // Early out if we have achieved the occupancy target.
+    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
+    return;
+  }
+
+  unsigned WavesAfter =
+      std::min(S.getTargetOccupancy(), PressureAfter.getOccupancy(ST));
+  unsigned WavesBefore =
+      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
+  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
+                    << ", after " << WavesAfter << ".\n");
+
+  // We may not be able to keep the current target occupancy because of the just
+  // scheduled region. We might still be able to revert scheduling if the
+  // occupancy before was higher, or if the current schedule has register
+  // pressure higher than the excess limits which could lead to more spilling.
+  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
+
+  // Allow memory bound functions to drop to 4 waves if not limited by an
+  // attribute.
+  if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
+      WavesAfter >= MFI.getMinAllowedOccupancy()) {
+    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
+                      << MFI.getMinAllowedOccupancy() << " waves\n");
+    NewOccupancy = WavesAfter;
+  }
+
+  if (NewOccupancy < DAG.MinOccupancy) {
+    DAG.MinOccupancy = NewOccupancy;
+    MFI.limitOccupancy(DAG.MinOccupancy);
+    DAG.RegionsWithMinOcc.reset();
+    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
+                      << DAG.MinOccupancy << ".\n");
+  }
 
-      exitRegion();
-      ++RegionIdx;
+  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+  if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
+      PressureAfter.getAGPRNum() > MaxVGPRs ||
+      PressureAfter.getSGPRNum() > MaxSGPRs) {
+    DAG.RescheduleRegions[RegionIdx] = true;
+    DAG.RegionsWithHighRP[RegionIdx] = true;
+  }
+
+  // Revert if this region's schedule would cause a drop in occupancy or
+  // spilling.
+  if (shouldRevertScheduling(WavesAfter)) {
+    revertScheduling();
+  } else {
+    DAG.Pressure[RegionIdx] = PressureAfter;
+    DAG.RegionsWithMinOcc[RegionIdx] =
+        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
+  }
+}
+
+bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
+  if (WavesAfter < DAG.MinOccupancy)
+    return true;
+
+  return false;
+}
+
+bool InitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
+    return true;
+
+  if (mayCauseSpilling(WavesAfter))
+    return true;
+
+  assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule);
+  // Don't reschedule the region in the next stage if it doesn't have clusters.
+  if (!DAG.RegionsWithClusters[RegionIdx])
+    DAG.RescheduleRegions[RegionIdx] = false;
+
+  return false;
+}
+
+bool UnclusteredRescheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
+    return true;
+
+  // If RP is not reduced in the unclustered reschedule stage, revert to the old
+  // schedule.
+  if (!PressureAfter.less(ST, PressureBefore)) {
+    LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
+    return true;
+  }
+
+  return false;
+}
+
+bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
+  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
+    return true;
+
+  if (mayCauseSpilling(WavesAfter))
+    return true;
+
+  return false;
+}
+
+bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
+  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
+    return true;
+
+  if (mayCauseSpilling(WavesAfter))
+    return true;
+
+  return false;
+}
+
+bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
+  if (WavesAfter <= MFI.getMinWavesPerEU() &&
+      !PressureAfter.less(ST, PressureBefore) &&
+      DAG.RescheduleRegions[RegionIdx]) {
+    LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
+    return true;
+  }
+
+  return false;
+}
+
+void GCNSchedStage::revertScheduling() {
+  DAG.RegionsWithMinOcc[RegionIdx] =
+      PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
+  LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+  DAG.RescheduleRegions[RegionIdx] =
+      DAG.RegionsWithClusters[RegionIdx] ||
+      (nextStage(StageID)) != GCNSchedStageID::UnclusteredReschedule;
+  DAG.RegionEnd = DAG.RegionBegin;
+  int SkippedDebugInstr = 0;
+  for (MachineInstr *MI : Unsched) {
+    if (MI->isDebugInstr()) {
+      ++SkippedDebugInstr;
+      continue;
+    }
+
+    if (MI->getIterator() != DAG.RegionEnd) {
+      DAG.BB->remove(MI);
+      DAG.BB->insert(DAG.RegionEnd, MI);
+      if (!MI->isDebugInstr())
+        DAG.LIS->handleMove(*MI, true);
+    }
+
+    // Reset read-undef flags and update them later.
+    for (auto &Op : MI->operands())
+      if (Op.isReg() && Op.isDef())
+        Op.setIsUndef(false);
+    RegisterOperands RegOpers;
+    RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
+    if (!MI->isDebugInstr()) {
+      if (DAG.ShouldTrackLaneMasks) {
+        // Adjust liveness and add missing dead+read-undef flags.
+        SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
+        RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
+      } else {
+        // Adjust for missing dead-def flags.
+        RegOpers.detectDeadDefs(*MI, *DAG.LIS);
+      }
     }
-    finishBlock();
+    DAG.RegionEnd = MI->getIterator();
+    ++DAG.RegionEnd;
+    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
+  }
+
+  // After reverting schedule, debug instrs will now be at the end of the block
+  // and RegionEnd will point to the first debug instr. Increment RegionEnd
+  // past debug instrs to the actual end of the scheduling region.
+  while (SkippedDebugInstr-- > 0)
+    ++DAG.RegionEnd;
+
+  // If Unsched.front() instruction is a debug instruction, this will actually
+  // shrink the region since we moved all debug instructions to the end of the
+  // block. Find the first instruction that is not a debug instruction.
+  DAG.RegionBegin = Unsched.front()->getIterator();
+  if (DAG.RegionBegin->isDebugInstr()) {
+    for (MachineInstr *MI : Unsched) {
+      if (MI->isDebugInstr())
+        continue;
+      DAG.RegionBegin = MI->getIterator();
+      break;
+    }
+  }
+
+  // Then move the debug instructions back into their correct place and set
+  // RegionBegin and RegionEnd if needed.
+  DAG.placeDebugValues();
 
-    if (Stage == UnclusteredReschedule)
-      SavedMutations.swap(Mutations);
-  } while (Stage != LastStage);
+  DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd);
 }
 
-void GCNScheduleDAGMILive::collectRematerializableInstructions() {
-  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
-  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+void PreRARematStage::collectRematerializableInstructions() {
+  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
+  for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
     Register Reg = Register::index2VirtReg(I);
-    if (!LIS->hasInterval(Reg))
+    if (!DAG.LIS->hasInterval(Reg))
       continue;
 
     // TODO: Handle AGPR and SGPR rematerialization
-    if (!SRI->isVGPRClass(MRI.getRegClass(Reg)) || !MRI.hasOneDef(Reg) ||
-        !MRI.hasOneNonDBGUse(Reg))
+    if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
+        !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
       continue;
 
-    MachineOperand *Op = MRI.getOneDef(Reg);
+    MachineOperand *Op = DAG.MRI.getOneDef(Reg);
     MachineInstr *Def = Op->getParent();
     if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
       continue;
 
-    MachineInstr *UseI = &*MRI.use_instr_nodbg_begin(Reg);
+    MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
     if (Def->getParent() == UseI->getParent())
       continue;
 
@@ -744,10 +905,10 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions() {
     // live-through or used inside regions at MinOccupancy. This means that the
     // register must be in the live-in set for the region.
     bool AddedToRematList = false;
-    for (unsigned I = 0, E = Regions.size(); I != E; ++I) {
-      auto It = LiveIns[I].find(Reg);
-      if (It != LiveIns[I].end() && !It->second.none()) {
-        if (RegionsWithMinOcc[I]) {
+    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+      auto It = DAG.LiveIns[I].find(Reg);
+      if (It != DAG.LiveIns[I].end() && !It->second.none()) {
+        if (DAG.RegionsWithMinOcc[I]) {
           RematerializableInsts[I][Def] = UseI;
           AddedToRematList = true;
         }
@@ -762,8 +923,8 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions() {
   }
 }
 
-bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
-                                                   const TargetInstrInfo *TII) {
+bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
+                                              const TargetInstrInfo *TII) {
   // Temporary copies of cached variables we will be modifying and replacing if
   // sinking succeeds.
   SmallVector<
@@ -772,9 +933,10 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
   DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
   DenseMap<unsigned, GCNRegPressure> NewPressure;
   BitVector NewRescheduleRegions;
+  LiveIntervals *LIS = DAG.LIS;
 
-  NewRegions.resize(Regions.size());
-  NewRescheduleRegions.resize(Regions.size());
+  NewRegions.resize(DAG.Regions.size());
+  NewRescheduleRegions.resize(DAG.Regions.size());
 
   // Collect only regions that has a rematerializable def as a live-in.
   SmallSet<unsigned, 16> ImpactedRegions;
@@ -784,16 +946,16 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
   // Make copies of register pressure and live-ins cache that will be updated
   // as we rematerialize.
   for (auto Idx : ImpactedRegions) {
-    NewPressure[Idx] = Pressure[Idx];
-    NewLiveIns[Idx] = LiveIns[Idx];
+    NewPressure[Idx] = DAG.Pressure[Idx];
+    NewLiveIns[Idx] = DAG.LiveIns[Idx];
   }
-  NewRegions = Regions;
+  NewRegions = DAG.Regions;
   NewRescheduleRegions.reset();
 
   DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
   bool Improved = false;
   for (auto I : ImpactedRegions) {
-    if (!RegionsWithMinOcc[I])
+    if (!DAG.RegionsWithMinOcc[I])
       continue;
 
     Improved = false;
@@ -802,12 +964,12 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
 
     // TODO: Handle occupancy drop due to AGPR and SGPR.
     // Check if cause of occupancy drop is due to VGPR usage and not SGPR.
-    if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy)
+    if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
       break;
 
     // The occupancy of this region could have been improved by a previous
     // iteration's sinking of defs.
-    if (NewPressure[I].getOccupancy(ST) > MinOccupancy) {
+    if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
       NewRescheduleRegions[I] = true;
       Improved = true;
       continue;
@@ -827,7 +989,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
     unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
     // If in the most optimistic scenario, we cannot improve occupancy, then do
     // not attempt to sink any instructions.
-    if (OptimisticOccupancy <= MinOccupancy)
+    if (OptimisticOccupancy <= DAG.MinOccupancy)
       break;
 
     unsigned ImproveOccupancy = 0;
@@ -842,7 +1004,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
       // call LiveRangeEdit::allUsesAvailableAt() and
       // LiveRangeEdit::canRematerializeAt().
       TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
-                         Def->getOperand(0).getSubReg(), *Def, *TRI);
+                         Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
       MachineInstr *NewMI = &*(--InsertPos);
       LIS->InsertMachineInstrInMaps(*NewMI);
       LIS->removeInterval(Reg);
@@ -851,11 +1013,11 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
 
       // Update region boundaries in scheduling region we sinked from since we
       // may sink an instruction that was at the beginning or end of its region
-      updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
-                             /*Removing =*/true);
+      DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
+                                 /*Removing =*/true);
 
       // Update region boundaries in region we sinked to.
-      updateRegionBoundaries(NewRegions, InsertPos, NewMI);
+      DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
 
       LaneBitmask PrevMask = NewLiveIns[I][Reg];
       // FIXME: Also update cached pressure for where the def was sinked from.
@@ -863,9 +1025,9 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
       // the reg from all regions as a live-in.
       for (auto Idx : RematDefToLiveInRegions[Def]) {
         NewLiveIns[Idx].erase(Reg);
-        if (InsertPos->getParent() != Regions[Idx].first->getParent()) {
+        if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
           // Def is live-through and not used in this block.
-          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), MRI);
+          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
         } else {
           // Def is used and rematerialized into this block.
           GCNDownwardRPTracker RPT(*LIS);
@@ -879,7 +1041,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
 
       SinkedDefs.push_back(Def);
       ImproveOccupancy = NewPressure[I].getOccupancy(ST);
-      if (ImproveOccupancy > MinOccupancy)
+      if (ImproveOccupancy > DAG.MinOccupancy)
         break;
     }
 
@@ -888,7 +1050,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
       for (auto TrackedIdx : RematDefToLiveInRegions[Def])
         RematerializableInsts[TrackedIdx].erase(Def);
 
-    if (ImproveOccupancy <= MinOccupancy)
+    if (ImproveOccupancy <= DAG.MinOccupancy)
       break;
 
     NewRescheduleRegions[I] = true;
@@ -917,7 +1079,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
     MachineInstr *OldMI = Entry.second;
 
     // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
-    BBLiveInMap.erase(OldMI);
+    DAG.BBLiveInMap.erase(OldMI);
 
     // Remove OldMI and update LIS
     Register Reg = MI->getOperand(0).getReg();
@@ -929,22 +1091,22 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
 
   // Update live-ins, register pressure, and regions caches.
   for (auto Idx : ImpactedRegions) {
-    LiveIns[Idx] = NewLiveIns[Idx];
-    Pressure[Idx] = NewPressure[Idx];
-    MBBLiveIns.erase(Regions[Idx].first->getParent());
+    DAG.LiveIns[Idx] = NewLiveIns[Idx];
+    DAG.Pressure[Idx] = NewPressure[Idx];
+    DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
   }
-  Regions = NewRegions;
-  RescheduleRegions = NewRescheduleRegions;
+  DAG.Regions = NewRegions;
+  DAG.RescheduleRegions = NewRescheduleRegions;
 
   SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-  MFI.increaseOccupancy(MF, ++MinOccupancy);
+  MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
 
   return true;
 }
 
 // Copied from MachineLICM
-bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI) {
-  if (!TII->isTriviallyReMaterializable(MI))
+bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
+  if (!DAG.TII->isTriviallyReMaterializable(MI))
     return false;
 
   for (const MachineOperand &MO : MI.operands())
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index c3db849cf81a..7aadf89e0bf7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -28,8 +28,6 @@ class GCNSubtarget;
 /// heuristics to determine excess/critical pressure sets.  Its goal is to
 /// maximize kernel occupancy (i.e. maximum number of waves per simd).
 class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
-  friend class GCNScheduleDAGMILive;
-
   SUnit *pickNodeBidirectional(bool &IsTopNode);
 
   void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
@@ -42,15 +40,18 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
                      unsigned SGPRPressure, unsigned VGPRPressure);
 
   std::vector<unsigned> Pressure;
+
   std::vector<unsigned> MaxPressure;
 
   unsigned SGPRExcessLimit;
+
   unsigned VGPRExcessLimit;
-  unsigned SGPRCriticalLimit;
-  unsigned VGPRCriticalLimit;
 
   unsigned TargetOccupancy;
 
+  MachineFunction *MF;
+
+public:
   // schedule() have seen a clustered memory operation. Set it to false
   // before a region scheduling to know if the region had such clusters.
   bool HasClusteredNodes;
@@ -59,28 +60,53 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
   // register pressure for actual scheduling heuristics.
   bool HasExcessPressure;
 
-  MachineFunction *MF;
+  unsigned SGPRCriticalLimit;
+
+  unsigned VGPRCriticalLimit;
 
-public:
   GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
 
   SUnit *pickNode(bool &IsTopNode) override;
 
   void initialize(ScheduleDAGMI *DAG) override;
 
+  unsigned getTargetOccupancy() { return TargetOccupancy; }
+
   void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
 };
 
-class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
+enum class GCNSchedStageID : unsigned {
+  InitialSchedule = 0,
+  UnclusteredReschedule = 1,
+  ClusteredLowOccupancyReschedule = 2,
+  PreRARematerialize = 3,
+  LastStage = PreRARematerialize
+};
+
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
+#endif
+
+inline GCNSchedStageID &operator++(GCNSchedStageID &Stage, int) {
+  assert(Stage != GCNSchedStageID::PreRARematerialize);
+  Stage = static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
+  return Stage;
+}
+
+inline GCNSchedStageID nextStage(const GCNSchedStageID Stage) {
+  return static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
+}
 
-  enum : unsigned {
-    Collect,
-    InitialSchedule,
-    UnclusteredReschedule,
-    ClusteredLowOccupancyReschedule,
-    PreRARematerialize,
-    LastStage = PreRARematerialize
-  };
+inline bool operator>(GCNSchedStageID &LHS, GCNSchedStageID &RHS) {
+  return static_cast<unsigned>(LHS) > static_cast<unsigned>(RHS);
+}
+
+class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
+  friend class GCNSchedStage;
+  friend class InitialScheduleStage;
+  friend class UnclusteredRescheduleStage;
+  friend class ClusteredLowOccStage;
+  friend class PreRARematStage;
 
   const GCNSubtarget &ST;
 
@@ -92,12 +118,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // Minimal real occupancy recorder for the function.
   unsigned MinOccupancy;
 
-  // Scheduling stage number.
-  unsigned Stage;
-
-  // Current region index.
-  size_t RegionIdx;
-
   // Vector of regions recorder for later rescheduling
   SmallVector<std::pair<MachineBasicBlock::iterator,
                         MachineBasicBlock::iterator>, 32> Regions;
@@ -121,6 +141,148 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // Region pressure cache.
   SmallVector<GCNRegPressure, 32> Pressure;
 
+  // Temporary basic block live-in cache.
+  DenseMap<const MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBLiveIns;
+
+  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
+
+  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
+
+  // Return current region pressure.
+  GCNRegPressure getRealRegPressure(unsigned RegionIdx) const;
+
+  // Compute and cache live-ins and pressure for all regions in block.
+  void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB);
+
+  // Update region boundaries when removing MI or inserting NewMI before MI.
+  void updateRegionBoundaries(
+      SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
+                                MachineBasicBlock::iterator>> &RegionBoundaries,
+      MachineBasicBlock::iterator MI, MachineInstr *NewMI,
+      bool Removing = false);
+
+  void runSchedStages();
+
+public:
+  GCNScheduleDAGMILive(MachineSchedContext *C,
+                       std::unique_ptr<MachineSchedStrategy> S);
+
+  void schedule() override;
+
+  void finalizeSchedule() override;
+};
+
+// GCNSchedStrategy applies multiple scheduling stages to a function.
+class GCNSchedStage {
+protected:
+  GCNScheduleDAGMILive &DAG;
+
+  GCNMaxOccupancySchedStrategy &S;
+
+  MachineFunction &MF;
+
+  SIMachineFunctionInfo &MFI;
+
+  const GCNSubtarget &ST;
+
+  const GCNSchedStageID StageID;
+
+  // The current block being scheduled.
+  MachineBasicBlock *CurrentMBB = nullptr;
+
+  // Current region index.
+  unsigned RegionIdx = 0;
+
+  // Record the original order of instructions before scheduling.
+  std::vector<MachineInstr *> Unsched;
+
+  // RP before scheduling the current region.
+  GCNRegPressure PressureBefore;
+
+  // RP after scheduling the current region.
+  GCNRegPressure PressureAfter;
+
+  GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG);
+
+public:
+  // Initialize state for a scheduling stage. Returns false if the current stage
+  // should be skipped.
+  virtual bool initGCNSchedStage();
+
+  // Finalize state after finishing a scheduling pass on the function.
+  virtual void finalizeGCNSchedStage();
+
+  // Setup for scheduling a region. Returns false if the current region should
+  // be skipped.
+  virtual bool initGCNRegion();
+
+  // Track whether a new region is also a new MBB.
+  void setupNewBlock();
+
+  // Finalize state after scheduling a region.
+  virtual void finalizeGCNRegion();
+
+  // Check result of scheduling.
+  void checkScheduling();
+
+  // Returns true if scheduling should be reverted.
+  virtual bool shouldRevertScheduling(unsigned WavesAfter);
+
+  // Returns true if the new schedule may result in more spilling.
+  bool mayCauseSpilling(unsigned WavesAfter);
+
+  // Attempt to revert scheduling for this region.
+  void revertScheduling();
+
+  void advanceRegion() { RegionIdx++; }
+
+  virtual ~GCNSchedStage() = default;
+};
+
+class InitialScheduleStage : public GCNSchedStage {
+public:
+  void finalizeGCNRegion() override;
+
+  bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+  InitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+      : GCNSchedStage(StageID, DAG) {}
+};
+
+class UnclusteredRescheduleStage : public GCNSchedStage {
+private:
+  std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+
+public:
+  bool initGCNSchedStage() override;
+
+  void finalizeGCNSchedStage() override;
+
+  bool initGCNRegion() override;
+
+  bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+  UnclusteredRescheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+      : GCNSchedStage(StageID, DAG) {}
+};
+
+// Retry function scheduling if we found resulting occupancy and it is
+// lower than used for other scheduling passes. This will give more freedom
+// to schedule low register pressure blocks.
+class ClusteredLowOccStage : public GCNSchedStage {
+public:
+  bool initGCNSchedStage() override;
+
+  bool initGCNRegion() override;
+
+  bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+  ClusteredLowOccStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+      : GCNSchedStage(StageID, DAG) {}
+};
+
+class PreRARematStage : public GCNSchedStage {
+private:
   // Each region at MinOccupancy will have their own list of trivially
   // rematerializable instructions we can remat to reduce RP. The list maps an
   // instruction to the position we should remat before, usually the MI using
@@ -132,12 +294,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // that has the defined reg as a live-in.
   DenseMap<MachineInstr *, SmallVector<unsigned, 4>> RematDefToLiveInRegions;
 
-  // Temporary basic block live-in cache.
-  DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
-
-  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
-  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
-
   // Collect all trivially rematerializable VGPR instructions with a single def
   // and single use outside the defining block into RematerializableInsts.
   void collectRematerializableInstructions();
@@ -150,26 +306,15 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   bool sinkTriviallyRematInsts(const GCNSubtarget &ST,
                                const TargetInstrInfo *TII);
 
-  // Return current region pressure.
-  GCNRegPressure getRealRegPressure() const;
-
-  // Compute and cache live-ins and pressure for all regions in block.
-  void computeBlockPressure(const MachineBasicBlock *MBB);
-
-  // Update region boundaries when removing MI or inserting NewMI before MI.
-  void updateRegionBoundaries(
-      SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
-                                MachineBasicBlock::iterator>> &RegionBoundaries,
-      MachineBasicBlock::iterator MI, MachineInstr *NewMI,
-      bool Removing = false);
-
 public:
-  GCNScheduleDAGMILive(MachineSchedContext *C,
-                       std::unique_ptr<MachineSchedStrategy> S);
+  bool initGCNSchedStage() override;
 
-  void schedule() override;
+  bool initGCNRegion() override;
 
-  void finalizeSchedule() override;
+  bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+  PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+      : GCNSchedStage(StageID, DAG) {}
 };
 
 } // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index e093d78b2cc6..d9d7d4efa8c3 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -309,6 +309,11 @@ uint64_t SIMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const {
   return OP_SEL_HI_0 | OP_SEL_HI_1 | OP_SEL_HI_2;
 }
 
+static bool isVCMPX64(const MCInstrDesc &Desc) {
+  return (Desc.TSFlags & SIInstrFlags::VOP3) &&
+         Desc.hasImplicitDefOfPhysReg(AMDGPU::EXEC);
+}
+
 void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
                                         SmallVectorImpl<MCFixup> &Fixups,
                                         const MCSubtargetInfo &STI) const {
@@ -326,6 +331,17 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
     Encoding |= getImplicitOpSelHiEncoding(Opcode);
   }
 
+  // GFX11 v_cmpx opcodes promoted to VOP3 have implied dst=EXEC.
+  // Documentation requires dst to be encoded as EXEC (0x7E),
+  // but it looks like the actual value encoded for dst operand
+  // is ignored by HW. It was decided to define dst as "do not care"
+  // in td files to allow the disassembler to accept any dst value.
+  // However, dst is encoded as EXEC for compatibility with SP3.
+  if (AMDGPU::isGFX11Plus(STI) && isVCMPX64(Desc)) {
+    assert((Encoding & 0xFF) == 0);
+    Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO);
+  }
+
   for (unsigned i = 0; i < bytes; i++) {
     OS.write((uint8_t)Encoding.extractBitsAsZExtValue(8, 8 * i));
   }
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index e7706fa0ef5c..1ed79add64c9 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -54,8 +54,8 @@ public:
       MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
       bool *IsFast = nullptr) const override;
 
-  virtual bool canCombineTruncStore(EVT ValVT, EVT MemVT,
-                                    bool LegalOperations) const override {
+  bool canCombineTruncStore(EVT ValVT, EVT MemVT,
+                            bool LegalOperations) const override {
     // R600 has "custom" lowering for truncating stores despite not supporting
     // those instructions. If we allow that custom lowering in the DAG combiner
     // then all truncates are merged into truncating stores, giving worse code
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 438e8b200ecc..f7d139adc63b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2132,7 +2132,8 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
                                            SIMachineFunctionInfo &Info,
                                            CallingConv::ID CallConv,
                                            bool IsShader) const {
-  if (Subtarget->hasUserSGPRInit16Bug()) {
+  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
+    // Note: user SGPRs are handled by the front-end for graphics shaders.
     // Pad up the used user SGPRs with dead inputs.
     unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
 
@@ -2195,7 +2196,8 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
   }
 
-  assert(!Subtarget->hasUserSGPRInit16Bug() || Info.getNumPreloadedSGPRs() >= 16);
+  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
+         Info.getNumPreloadedSGPRs() >= 16);
 }
 
 static void reservePrivateMemoryRegs(const TargetMachine &TM,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d1fecc1afc7f..e0101f53880f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -487,10 +487,10 @@ public:
   AtomicExpansionKind
   shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
 
-  virtual const TargetRegisterClass *
-  getRegClassFor(MVT VT, bool isDivergent) const override;
-  virtual bool requiresUniformRegister(MachineFunction &MF,
-                                       const Value *V) const override;
+  const TargetRegisterClass *getRegClassFor(MVT VT,
+                                            bool isDivergent) const override;
+  bool requiresUniformRegister(MachineFunction &MF,
+                               const Value *V) const override;
   Align getPrefLoopAlignment(MachineLoop *ML) const override;
 
   void allocateHSAUserSGPRs(CCState &CCInfo,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index ffe8dce79816..fccb08f86e6d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -349,7 +349,7 @@ def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
 
 def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
                               (add (sequence "SGPR%u_LO16", 0, 105))> {
-  let AllocationPriority = 9;
+  let AllocationPriority = 0;
   let Size = 16;
   let GeneratePressureSet = 0;
   let HasSGPR = 1;
@@ -368,7 +368,7 @@ def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
                             (add (sequence "SGPR%u", 0, 105))> {
   // Give all SGPR classes higher priority than VGPR classes, because
   // we want to spill SGPRs to VGPRs.
-  let AllocationPriority = 9;
+  let AllocationPriority = 0;
   let GeneratePressureSet = 0;
   let HasSGPR = 1;
 }
@@ -528,14 +528,14 @@ def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
 let HasVGPR = 1 in {
 def VGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
                               (add (sequence "VGPR%u_LO16", 0, 255))> {
-  let AllocationPriority = 1;
+  let AllocationPriority = 0;
   let Size = 16;
   let GeneratePressureSet = 0;
 }
 
 def VGPR_HI16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
                               (add (sequence "VGPR%u_HI16", 0, 255))> {
-  let AllocationPriority = 1;
+  let AllocationPriority = 0;
   let Size = 16;
   let GeneratePressureSet = 0;
 }
@@ -544,7 +544,7 @@ def VGPR_HI16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
 // i16/f16 only on VI+
 def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
                             (add (sequence "VGPR%u", 0, 255))> {
-  let AllocationPriority = 1;
+  let AllocationPriority = 0;
   let Size = 32;
   let Weight = 1;
 }
@@ -588,7 +588,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
 // AccVGPR 32-bit registers
 def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
                             (add (sequence "AGPR%u", 0, 255))> {
-  let AllocationPriority = 1;
+  let AllocationPriority = 0;
   let Size = 32;
   let Weight = 1;
 }
@@ -653,7 +653,7 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2
    SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE,
    SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID,
    SRC_VCCZ, SRC_EXECZ, SRC_SCC)> {
-  let AllocationPriority = 10;
+  let AllocationPriority = 0;
 }
 
 def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16,
@@ -663,42 +663,42 @@ def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16,
    SRC_SHARED_LIMIT_LO16, SRC_PRIVATE_BASE_LO16, SRC_PRIVATE_LIMIT_LO16,
    SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16)> {
   let Size = 16;
-  let AllocationPriority = 10;
+  let AllocationPriority = 0;
 }
 
 def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
   (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> {
-  let AllocationPriority = 10;
+  let AllocationPriority = 0;
 }
 
 def SReg_LO16_XEXEC_HI : SIRegisterClass<"AMDGPU", [i16, f16], 16,
   (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, M0_CLASS_LO16)> {
   let Size = 16;
-  let AllocationPriority = 10;
+  let AllocationPriority = 0;
 }
 
 def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
   (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
-  let AllocationPriority = 10;
+  let AllocationPriority = 0;
 }
 
 def SReg_LO16_XM0 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
   (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, EXEC_HI_LO16)> {
   let Size = 16;
-  let AllocationPriority = 10;
+  let AllocationPriority = 0;
 }
 
 def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
   (add SGPR_LO16, SReg_LO16_XM0, M0_CLASS_LO16, EXEC_LO_LO16, EXEC_HI_LO16, SReg_LO16_XEXEC_HI)> {
   let Size = 16;
-  let AllocationPriority = 10;
+  let AllocationPriority = 0;
 }
 } // End GeneratePressureSet = 0
 
 // Register class for all scalar registers (SGPRs + Special Registers)
 def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
   (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> {
-  let AllocationPriority = 10;
+  let AllocationPriority = 0;
   let HasSGPR = 1;
 }
 
@@ -712,7 +712,7 @@ def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16],
 def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
                             (add SGPR_64Regs)> {
   let CopyCost = 1;
-  let AllocationPriority = 11;
+  let AllocationPriority = 1;
   let HasSGPR = 1;
 }
 
@@ -725,14 +725,14 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
 def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
   (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, TTMP_64, TBA, TMA)> {
   let CopyCost = 1;
-  let AllocationPriority = 13;
+  let AllocationPriority = 1;
   let HasSGPR = 1;
 }
 
 def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
   (add SReg_64_XEXEC, EXEC)> {
   let CopyCost = 1;
-  let AllocationPriority = 13;
+  let AllocationPriority = 1;
   let HasSGPR = 1;
 }
 
@@ -750,7 +750,7 @@ def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32,
   let HasSGPR = 1;
 }
 
-multiclass SRegClass<int numRegs, int priority,
+multiclass SRegClass<int numRegs,
                      list<ValueType> regTypes,
                      SIRegisterTuples regList,
                      SIRegisterTuples ttmpList = regList,
@@ -760,7 +760,7 @@ multiclass SRegClass<int numRegs, int priority,
   defvar sgprName = !strconcat("SGPR_", suffix);
   defvar ttmpName = !strconcat("TTMP_", suffix);
 
-  let AllocationPriority = priority, CopyCost = copyCost, HasSGPR = 1 in {
+  let AllocationPriority = !sub(numRegs, 1), CopyCost = copyCost, HasSGPR = 1 in {
     def "" # sgprName : SIRegisterClass<"AMDGPU", regTypes, 32, (add regList)> {
     }
 
@@ -781,14 +781,14 @@ multiclass SRegClass<int numRegs, int priority,
   }
 }
 
-defm "" : SRegClass<3, 14, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
-defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>;
-defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
-defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
-defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
-defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>;
-defm "" : SRegClass<16, 20, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
-defm "" : SRegClass<32, 21, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
+defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
+defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>;
+defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
+defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
+defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
+defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>;
+defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
+defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
 
 def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
                                  (add VGPR_32, LDS_DIRECT_CLASS)> {
@@ -803,7 +803,7 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
 
   // Requires n v_mov_b32 to copy
   let CopyCost = numRegs;
-  let AllocationPriority = numRegs;
+  let AllocationPriority = !sub(numRegs, 1);
   let Weight = numRegs;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index d489a089ac78..5973d32c91d6 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -718,7 +718,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
   // DPP8 forbids modifiers and can inherit from VOPC_Profile
 
   let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
-  dag InsPartVOP3DPP = (ins Src0Mod:$src0_modifiers, VGPRSrc_32:$src0, VGPRSrc_32:$src1);
+  dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VGPRSrc_32:$src1);
   let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel),
                                                        (ins)));
   let Asm64 = "$sdst, $src0_modifiers, $src1";