diff options
| author | Dimitry Andric <dim@FreeBSD.org> | 2016-08-16 21:02:59 +0000 | 
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2016-08-16 21:02:59 +0000 | 
| commit | 3ca95b020283db6244cab92ede73c969253b6a31 (patch) | |
| tree | d16e791e58694facd8f68d3e2797a1eaa8018afc /contrib/llvm/lib/CodeGen/MachineScheduler.cpp | |
| parent | 27067774dce3388702a4cf744d7096c6fb71b688 (diff) | |
| parent | c3aee98e721333f265a88d6bf348e6e468f027d4 (diff) | |
Notes
Diffstat (limited to 'contrib/llvm/lib/CodeGen/MachineScheduler.cpp')
| -rw-r--r-- | contrib/llvm/lib/CodeGen/MachineScheduler.cpp | 736 | 
1 files changed, 438 insertions, 298 deletions
| diff --git a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp index bcee15c7c75f..d921e2977cc7 100644 --- a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp @@ -23,13 +23,13 @@  #include "llvm/CodeGen/RegisterClassInfo.h"  #include "llvm/CodeGen/ScheduleDFS.h"  #include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/TargetPassConfig.h"  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/ErrorHandling.h"  #include "llvm/Support/GraphWriter.h"  #include "llvm/Support/raw_ostream.h"  #include "llvm/Target/TargetInstrInfo.h" -#include <queue>  using namespace llvm; @@ -65,14 +65,20 @@ static cl::opt<unsigned> SchedOnlyBlock("misched-only-block", cl::Hidden,  static bool ViewMISchedDAGs = false;  #endif // NDEBUG +/// Avoid quadratic complexity in unusually large basic blocks by limiting the +/// size of the ready lists. +static cl::opt<unsigned> ReadyListLimit("misched-limit", cl::Hidden, +  cl::desc("Limit ready list to N instructions"), cl::init(256)); +  static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,    cl::desc("Enable register pressure scheduling."), cl::init(true));  static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,    cl::desc("Enable cyclic critical path analysis."), cl::init(true)); -static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden, -  cl::desc("Enable load clustering."), cl::init(true)); +static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden, +                                        cl::desc("Enable memop clustering."), +                                        cl::init(true));  // Experimental heuristics  static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden, @@ -219,6 +225,11 @@ static cl::opt<bool> EnableMachineSched(      cl::desc("Enable the machine instruction scheduling pass."), cl::init(true),      cl::Hidden); +static cl::opt<bool> EnablePostRAMachineSched( +    "enable-post-misched", +    cl::desc("Enable the post-ra machine instruction scheduling pass."), +    cl::init(true), cl::Hidden); +  /// Forward declare the standard machine scheduler. This will be used as the  /// default scheduler if the target does not set a default.  static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C); @@ -314,6 +325,9 @@ ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() {  /// design would be to split blocks at scheduling boundaries, but LLVM has a  /// general bias against block splitting purely for implementation simplicity.  bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { +  if (skipFunction(*mf.getFunction())) +    return false; +    if (EnableMachineSched.getNumOccurrences()) {      if (!EnableMachineSched)        return false; @@ -349,10 +363,13 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {  }  bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { -  if (skipOptnoneFunction(*mf.getFunction())) +  if (skipFunction(*mf.getFunction()))      return false; -  if (!mf.getSubtarget().enablePostRAScheduler()) { +  if (EnablePostRAMachineSched.getNumOccurrences()) { +    if (!EnablePostRAMachineSched) +      return false; +  } else if (!mf.getSubtarget().enablePostRAScheduler()) {      DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n");      return false;    } @@ -389,7 +406,7 @@ static bool isSchedBoundary(MachineBasicBlock::iterator MI,                              MachineBasicBlock *MBB,                              MachineFunction *MF,                              const TargetInstrInfo *TII) { -  return MI->isCall() || TII->isSchedulingBoundary(MI, MBB, *MF); +  return MI->isCall() || TII->isSchedulingBoundary(*MI, MBB, *MF);  }  /// Main driver for both MachineScheduler and PostMachineScheduler. @@ -427,7 +444,6 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,      //      // MBB::size() uses instr_iterator to count. Here we need a bundle to count      // as a single instruction. -    unsigned RemainingInstrs = std::distance(MBB->begin(), MBB->end());      for(MachineBasicBlock::iterator RegionEnd = MBB->end();          RegionEnd != MBB->begin(); RegionEnd = Scheduler.begin()) { @@ -435,15 +451,13 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,        if (RegionEnd != MBB->end() ||            isSchedBoundary(&*std::prev(RegionEnd), &*MBB, MF, TII)) {          --RegionEnd; -        // Count the boundary instruction. -        --RemainingInstrs;        }        // The next region starts above the previous region. Look backward in the        // instruction stream until we find the nearest boundary.        unsigned NumRegionInstrs = 0;        MachineBasicBlock::iterator I = RegionEnd; -      for(;I != MBB->begin(); --I, --RemainingInstrs) { +      for (;I != MBB->begin(); --I) {          if (isSchedBoundary(&*std::prev(I), &*MBB, MF, TII))            break;          if (!I->isDebugValue()) @@ -466,8 +480,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,              << "\n  From: " << *I << "    To: ";              if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;              else dbgs() << "End"; -            dbgs() << " RegionInstrs: " << NumRegionInstrs -            << " Remaining: " << RemainingInstrs << "\n"); +            dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');        if (DumpCriticalPathLength) {          errs() << MF->getName();          errs() << ":BB# " << MBB->getNumber(); @@ -485,7 +498,6 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,        // scheduler for the top of it's scheduled region.        RegionEnd = Scheduler.begin();      } -    assert(RemainingInstrs == 0 && "Instruction count mismatch!");      Scheduler.finishBlock();      // FIXME: Ideally, no further passes should rely on kill flags. However,      // thumb2 size reduction is currently an exception, so the PostMIScheduler @@ -640,7 +652,7 @@ void ScheduleDAGMI::moveInstruction(    // Update LiveIntervals    if (LIS) -    LIS->handleMove(MI, /*UpdateFlags=*/true); +    LIS->handleMove(*MI, /*UpdateFlags=*/true);    // Recede RegionBegin if an instruction moves above the first.    if (RegionBegin == InsertPos) @@ -704,8 +716,7 @@ void ScheduleDAGMI::schedule() {          CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);        else          moveInstruction(MI, CurrentTop); -    } -    else { +    } else {        assert(SU->isBottomReady() && "node still has unscheduled dependencies");        MachineBasicBlock::iterator priorII =          priorNonDebug(CurrentBottom, CurrentTop); @@ -869,13 +880,19 @@ void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb,    SUPressureDiffs.clear();    ShouldTrackPressure = SchedImpl->shouldTrackPressure(); +  ShouldTrackLaneMasks = SchedImpl->shouldTrackLaneMasks(); + +  assert((!ShouldTrackLaneMasks || ShouldTrackPressure) && +         "ShouldTrackLaneMasks requires ShouldTrackPressure");  }  // Setup the register pressure trackers for the top scheduled top and bottom  // scheduled regions.  void ScheduleDAGMILive::initRegPressure() { -  TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin); -  BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd); +  TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, +                    ShouldTrackLaneMasks, false); +  BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd, +                    ShouldTrackLaneMasks, false);    // Close the RPTracker to finalize live ins.    RPTracker.closeRegion(); @@ -905,7 +922,7 @@ void ScheduleDAGMILive::initRegPressure() {    // Account for liveness generated by the region boundary.    if (LiveRegionEnd != RegionEnd) { -    SmallVector<unsigned, 8> LiveUses; +    SmallVector<RegisterMaskPair, 8> LiveUses;      BotRPTracker.recede(&LiveUses);      updatePressureDiffs(LiveUses);    } @@ -969,47 +986,74 @@ updateScheduledPressure(const SUnit *SU,  /// Update the PressureDiff array for liveness after scheduling this  /// instruction. -void ScheduleDAGMILive::updatePressureDiffs(ArrayRef<unsigned> LiveUses) { -  for (unsigned LUIdx = 0, LUEnd = LiveUses.size(); LUIdx != LUEnd; ++LUIdx) { +void ScheduleDAGMILive::updatePressureDiffs( +    ArrayRef<RegisterMaskPair> LiveUses) { +  for (const RegisterMaskPair &P : LiveUses) { +    unsigned Reg = P.RegUnit;      /// FIXME: Currently assuming single-use physregs. -    unsigned Reg = LiveUses[LUIdx]; -    DEBUG(dbgs() << "  LiveReg: " << PrintVRegOrUnit(Reg, TRI) << "\n");      if (!TRI->isVirtualRegister(Reg))        continue; -    // This may be called before CurrentBottom has been initialized. However, -    // BotRPTracker must have a valid position. We want the value live into the -    // instruction or live out of the block, so ask for the previous -    // instruction's live-out. -    const LiveInterval &LI = LIS->getInterval(Reg); -    VNInfo *VNI; -    MachineBasicBlock::const_iterator I = -      nextIfDebug(BotRPTracker.getPos(), BB->end()); -    if (I == BB->end()) -      VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB)); -    else { -      LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(I)); -      VNI = LRQ.valueIn(); -    } -    // RegisterPressureTracker guarantees that readsReg is true for LiveUses. -    assert(VNI && "No live value at use."); -    for (const VReg2SUnit &V2SU -         : make_range(VRegUses.find(Reg), VRegUses.end())) { -      SUnit *SU = V2SU.SU; -      // If this use comes before the reaching def, it cannot be a last use, so -      // descrease its pressure change. -      if (!SU->isScheduled && SU != &ExitSU) { -        LiveQueryResult LRQ -          = LI.Query(LIS->getInstructionIndex(SU->getInstr())); -        if (LRQ.valueIn() == VNI) { -          PressureDiff &PDiff = getPressureDiff(SU); -          PDiff.addPressureChange(Reg, true, &MRI); -          DEBUG( -            dbgs() << "  UpdateRegP: SU(" << SU->NodeNum << ") " -                   << *SU->getInstr(); -            dbgs() << "              to "; -            PDiff.dump(*TRI); -          ); +    if (ShouldTrackLaneMasks) { +      // If the register has just become live then other uses won't change +      // this fact anymore => decrement pressure. +      // If the register has just become dead then other uses make it come +      // back to life => increment pressure. +      bool Decrement = P.LaneMask != 0; + +      for (const VReg2SUnit &V2SU +           : make_range(VRegUses.find(Reg), VRegUses.end())) { +        SUnit &SU = *V2SU.SU; +        if (SU.isScheduled || &SU == &ExitSU) +          continue; + +        PressureDiff &PDiff = getPressureDiff(&SU); +        PDiff.addPressureChange(Reg, Decrement, &MRI); +        DEBUG( +          dbgs() << "  UpdateRegP: SU(" << SU.NodeNum << ") " +                 << PrintReg(Reg, TRI) << ':' << PrintLaneMask(P.LaneMask) +                 << ' ' << *SU.getInstr(); +          dbgs() << "              to "; +          PDiff.dump(*TRI); +        ); +      } +    } else { +      assert(P.LaneMask != 0); +      DEBUG(dbgs() << "  LiveReg: " << PrintVRegOrUnit(Reg, TRI) << "\n"); +      // This may be called before CurrentBottom has been initialized. However, +      // BotRPTracker must have a valid position. We want the value live into the +      // instruction or live out of the block, so ask for the previous +      // instruction's live-out. +      const LiveInterval &LI = LIS->getInterval(Reg); +      VNInfo *VNI; +      MachineBasicBlock::const_iterator I = +        nextIfDebug(BotRPTracker.getPos(), BB->end()); +      if (I == BB->end()) +        VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB)); +      else { +        LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*I)); +        VNI = LRQ.valueIn(); +      } +      // RegisterPressureTracker guarantees that readsReg is true for LiveUses. +      assert(VNI && "No live value at use."); +      for (const VReg2SUnit &V2SU +           : make_range(VRegUses.find(Reg), VRegUses.end())) { +        SUnit *SU = V2SU.SU; +        // If this use comes before the reaching def, it cannot be a last use, +        // so decrease its pressure change. +        if (!SU->isScheduled && SU != &ExitSU) { +          LiveQueryResult LRQ = +              LI.Query(LIS->getInstructionIndex(*SU->getInstr())); +          if (LRQ.valueIn() == VNI) { +            PressureDiff &PDiff = getPressureDiff(SU); +            PDiff.addPressureChange(Reg, true, &MRI); +            DEBUG( +              dbgs() << "  UpdateRegP: SU(" << SU->NodeNum << ") " +                     << *SU->getInstr(); +              dbgs() << "              to "; +              PDiff.dump(*TRI); +            ); +          }          }        }      } @@ -1057,11 +1101,6 @@ void ScheduleDAGMILive::schedule() {    // Initialize ready queues now that the DAG and priority data are finalized.    initQueues(TopRoots, BotRoots); -  if (ShouldTrackPressure) { -    assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); -    TopRPTracker.setPos(CurrentTop); -  } -    bool IsTopNode = false;    while (true) {      DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n"); @@ -1111,14 +1150,14 @@ void ScheduleDAGMILive::buildDAGWithRegPressure() {    // Initialize the register pressure tracker used by buildSchedGraph.    RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd, -                 /*TrackUntiedDefs=*/true); +                 ShouldTrackLaneMasks, /*TrackUntiedDefs=*/true);    // Account for liveness generate by the region boundary.    if (LiveRegionEnd != RegionEnd)      RPTracker.recede();    // Build the DAG, and compute current register pressure. -  buildSchedGraph(AA, &RPTracker, &SUPressureDiffs); +  buildSchedGraph(AA, &RPTracker, &SUPressureDiffs, LIS, ShouldTrackLaneMasks);    // Initialize top/bottom trackers after computing region pressure.    initRegPressure(); @@ -1167,10 +1206,8 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {    unsigned MaxCyclicLatency = 0;    // Visit each live out vreg def to find def/use pairs that cross iterations. -  ArrayRef<unsigned> LiveOuts = RPTracker.getPressure().LiveOutRegs; -  for (ArrayRef<unsigned>::iterator RI = LiveOuts.begin(), RE = LiveOuts.end(); -       RI != RE; ++RI) { -    unsigned Reg = *RI; +  for (const RegisterMaskPair &P : RPTracker.getPressure().LiveOutRegs) { +    unsigned Reg = P.RegUnit;      if (!TRI->isVirtualRegister(Reg))          continue;      const LiveInterval &LI = LIS->getInterval(Reg); @@ -1193,8 +1230,7 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {          continue;        // Only consider uses of the phi. -      LiveQueryResult LRQ = -        LI.Query(LIS->getInstructionIndex(SU->getInstr())); +      LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*SU->getInstr()));        if (!LRQ.valueIn()->isPHIDef())          continue; @@ -1209,8 +1245,7 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {        if (LiveInHeight > LiveOutHeight) {          if (LiveInHeight - LiveOutHeight < CyclicLatency)            CyclicLatency = LiveInHeight - LiveOutHeight; -      } -      else +      } else          CyclicLatency = 0;        DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU(" @@ -1223,6 +1258,17 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {    return MaxCyclicLatency;  } +/// Release ExitSU predecessors and setup scheduler queues. Re-position +/// the Top RP tracker in case the region beginning has changed. +void ScheduleDAGMILive::initQueues(ArrayRef<SUnit*> TopRoots, +                                   ArrayRef<SUnit*> BotRoots) { +  ScheduleDAGMI::initQueues(TopRoots, BotRoots); +  if (ShouldTrackPressure) { +    assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); +    TopRPTracker.setPos(CurrentTop); +  } +} +  /// Move an instruction and update register pressure.  void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {    // Move the instruction to its new location in the instruction stream. @@ -1239,7 +1285,18 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {      if (ShouldTrackPressure) {        // Update top scheduled pressure. -      TopRPTracker.advance(); +      RegisterOperands RegOpers; +      RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false); +      if (ShouldTrackLaneMasks) { +        // Adjust liveness and add missing dead+read-undef flags. +        SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); +        RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI); +      } else { +        // Adjust for missing dead-def flags. +        RegOpers.detectDeadDefs(*MI, *LIS); +      } + +      TopRPTracker.advance(RegOpers);        assert(TopRPTracker.getPos() == CurrentTop && "out of sync");        DEBUG(          dbgs() << "Top Pressure:\n"; @@ -1248,8 +1305,7 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {        updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);      } -  } -  else { +  } else {      assert(SU->isBottomReady() && "node still has unscheduled dependencies");      MachineBasicBlock::iterator priorII =        priorNonDebug(CurrentBottom, CurrentTop); @@ -1264,9 +1320,20 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {        CurrentBottom = MI;      }      if (ShouldTrackPressure) { -      // Update bottom scheduled pressure. -      SmallVector<unsigned, 8> LiveUses; -      BotRPTracker.recede(&LiveUses); +      RegisterOperands RegOpers; +      RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false); +      if (ShouldTrackLaneMasks) { +        // Adjust liveness and add missing dead+read-undef flags. +        SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); +        RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI); +      } else { +        // Adjust for missing dead-def flags. +        RegOpers.detectDeadDefs(*MI, *LIS); +      } + +      BotRPTracker.recedeSkipDebugValues(); +      SmallVector<RegisterMaskPair, 8> LiveUses; +      BotRPTracker.recede(RegOpers, &LiveUses);        assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");        DEBUG(          dbgs() << "Bottom Pressure:\n"; @@ -1280,64 +1347,81 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {  }  //===----------------------------------------------------------------------===// -// LoadClusterMutation - DAG post-processing to cluster loads. +// BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores.  //===----------------------------------------------------------------------===//  namespace {  /// \brief Post-process the DAG to create cluster edges between neighboring -/// loads. -class LoadClusterMutation : public ScheduleDAGMutation { -  struct LoadInfo { +/// loads or between neighboring stores. +class BaseMemOpClusterMutation : public ScheduleDAGMutation { +  struct MemOpInfo {      SUnit *SU;      unsigned BaseReg; -    unsigned Offset; -    LoadInfo(SUnit *su, unsigned reg, unsigned ofs) -      : SU(su), BaseReg(reg), Offset(ofs) {} +    int64_t Offset; +    MemOpInfo(SUnit *su, unsigned reg, int64_t ofs) +        : SU(su), BaseReg(reg), Offset(ofs) {} -    bool operator<(const LoadInfo &RHS) const { +    bool operator<(const MemOpInfo&RHS) const {        return std::tie(BaseReg, Offset) < std::tie(RHS.BaseReg, RHS.Offset);      }    };    const TargetInstrInfo *TII;    const TargetRegisterInfo *TRI; +  bool IsLoad; +  public: -  LoadClusterMutation(const TargetInstrInfo *tii, -                      const TargetRegisterInfo *tri) -    : TII(tii), TRI(tri) {} +  BaseMemOpClusterMutation(const TargetInstrInfo *tii, +                           const TargetRegisterInfo *tri, bool IsLoad) +      : TII(tii), TRI(tri), IsLoad(IsLoad) {} + +  void apply(ScheduleDAGInstrs *DAGInstrs) override; -  void apply(ScheduleDAGMI *DAG) override;  protected: -  void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG); +  void clusterNeighboringMemOps(ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG); +}; + +class StoreClusterMutation : public BaseMemOpClusterMutation { +public: +  StoreClusterMutation(const TargetInstrInfo *tii, +                       const TargetRegisterInfo *tri) +      : BaseMemOpClusterMutation(tii, tri, false) {} +}; + +class LoadClusterMutation : public BaseMemOpClusterMutation { +public: +  LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri) +      : BaseMemOpClusterMutation(tii, tri, true) {}  };  } // anonymous -void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads, -                                                  ScheduleDAGMI *DAG) { -  SmallVector<LoadClusterMutation::LoadInfo,32> LoadRecords; -  for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) { -    SUnit *SU = Loads[Idx]; +void BaseMemOpClusterMutation::clusterNeighboringMemOps( +    ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG) { +  SmallVector<MemOpInfo, 32> MemOpRecords; +  for (unsigned Idx = 0, End = MemOps.size(); Idx != End; ++Idx) { +    SUnit *SU = MemOps[Idx];      unsigned BaseReg; -    unsigned Offset; -    if (TII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI)) -      LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset)); +    int64_t Offset; +    if (TII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseReg, Offset, TRI)) +      MemOpRecords.push_back(MemOpInfo(SU, BaseReg, Offset));    } -  if (LoadRecords.size() < 2) +  if (MemOpRecords.size() < 2)      return; -  std::sort(LoadRecords.begin(), LoadRecords.end()); + +  std::sort(MemOpRecords.begin(), MemOpRecords.end());    unsigned ClusterLength = 1; -  for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) { -    if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) { +  for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { +    if (MemOpRecords[Idx].BaseReg != MemOpRecords[Idx+1].BaseReg) {        ClusterLength = 1;        continue;      } -    SUnit *SUa = LoadRecords[Idx].SU; -    SUnit *SUb = LoadRecords[Idx+1].SU; -    if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength) -        && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) { - -      DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU(" +    SUnit *SUa = MemOpRecords[Idx].SU; +    SUnit *SUb = MemOpRecords[Idx+1].SU; +    if (TII->shouldClusterMemOps(*SUa->getInstr(), *SUb->getInstr(), +                                 ClusterLength) && +        DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) { +      DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("              << SUb->NodeNum << ")\n");        // Copy successor edges from SUa to SUb. Interleaving computation        // dependent on SUa can prevent load combining due to register reuse. @@ -1351,22 +1435,26 @@ void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,          DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial));        }        ++ClusterLength; -    } -    else +    } else        ClusterLength = 1;    }  }  /// \brief Callback from DAG postProcessing to create cluster edges for loads. -void LoadClusterMutation::apply(ScheduleDAGMI *DAG) { +void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) { + +  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); +    // Map DAG NodeNum to store chain ID.    DenseMap<unsigned, unsigned> StoreChainIDs; -  // Map each store chain to a set of dependent loads. +  // Map each store chain to a set of dependent MemOps.    SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents;    for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {      SUnit *SU = &DAG->SUnits[Idx]; -    if (!SU->getInstr()->mayLoad()) +    if ((IsLoad && !SU->getInstr()->mayLoad()) || +        (!IsLoad && !SU->getInstr()->mayStore()))        continue; +      unsigned ChainPredID = DAG->SUnits.size();      for (SUnit::const_pred_iterator             PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) { @@ -1376,7 +1464,7 @@ void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {        }      }      // Check if this chain-like pred has been seen -    // before. ChainPredID==MaxNodeID for loads at the top of the schedule. +    // before. ChainPredID==MaxNodeID at the top of the schedule.      unsigned NumChains = StoreChainDependents.size();      std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =        StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains)); @@ -1384,9 +1472,10 @@ void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {        StoreChainDependents.resize(NumChains + 1);      StoreChainDependents[Result.first->second].push_back(SU);    } +    // Iterate over the store chains.    for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx) -    clusterNeighboringLoads(StoreChainDependents[Idx], DAG); +    clusterNeighboringMemOps(StoreChainDependents[Idx], DAG);  }  //===----------------------------------------------------------------------===// @@ -1403,7 +1492,7 @@ public:    MacroFusion(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI)      : TII(TII), TRI(TRI) {} -  void apply(ScheduleDAGMI *DAG) override; +  void apply(ScheduleDAGInstrs *DAGInstrs) override;  };  } // anonymous @@ -1423,7 +1512,9 @@ static bool HasDataDep(const TargetRegisterInfo &TRI, const MachineInstr &MI,  /// \brief Callback from DAG postProcessing to create cluster edges to encourage  /// fused operations. -void MacroFusion::apply(ScheduleDAGMI *DAG) { +void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { +  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); +    // For now, assume targets can only fuse with the branch.    SUnit &ExitSU = DAG->ExitSU;    MachineInstr *Branch = ExitSU.getInstr(); @@ -1439,7 +1530,7 @@ void MacroFusion::apply(ScheduleDAGMI *DAG) {      if (!HasDataDep(TRI, *Branch, *Pred))        continue; -    if (!TII.shouldScheduleAdjacent(Pred, Branch)) +    if (!TII.shouldScheduleAdjacent(*Pred, *Branch))        continue;      // Create a single weak edge from SU to ExitSU. The only effect is to cause @@ -1474,7 +1565,7 @@ class CopyConstrain : public ScheduleDAGMutation {  public:    CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {} -  void apply(ScheduleDAGMI *DAG) override; +  void apply(ScheduleDAGInstrs *DAGInstrs) override;  protected:    void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG); @@ -1505,12 +1596,14 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {    MachineInstr *Copy = CopySU->getInstr();    // Check for pure vreg copies. -  unsigned SrcReg = Copy->getOperand(1).getReg(); -  if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) +  const MachineOperand &SrcOp = Copy->getOperand(1); +  unsigned SrcReg = SrcOp.getReg(); +  if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || !SrcOp.readsReg())      return; -  unsigned DstReg = Copy->getOperand(0).getReg(); -  if (!TargetRegisterInfo::isVirtualRegister(DstReg)) +  const MachineOperand &DstOp = Copy->getOperand(0); +  unsigned DstReg = DstOp.getReg(); +  if (!TargetRegisterInfo::isVirtualRegister(DstReg) || DstOp.isDead())      return;    // Check if either the dest or source is local. If it's live across a back @@ -1627,15 +1720,16 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {  /// \brief Callback from DAG postProcessing to create weak edges to encourage  /// copy elimination. -void CopyConstrain::apply(ScheduleDAGMI *DAG) { +void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) { +  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);    assert(DAG->hasVRegLiveness() && "Expect VRegs with LiveIntervals");    MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end());    if (FirstPos == DAG->end())      return; -  RegionBeginIdx = DAG->getLIS()->getInstructionIndex(&*FirstPos); +  RegionBeginIdx = DAG->getLIS()->getInstructionIndex(*FirstPos);    RegionEndIdx = DAG->getLIS()->getInstructionIndex( -    &*priorNonDebug(DAG->end(), DAG->begin())); +      *priorNonDebug(DAG->end(), DAG->begin()));    for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {      SUnit *SU = &DAG->SUnits[Idx]; @@ -1862,7 +1956,8 @@ void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) {    // Check for interlocks first. For the purpose of other heuristics, an    // instruction that cannot issue appears as if it's not in the ReadyQueue.    bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0; -  if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU)) +  if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU) || +      Available.size() >= ReadyListLimit)      Pending.push(SU);    else      Available.push(SU); @@ -1905,8 +2000,7 @@ void SchedBoundary::bumpCycle(unsigned NextCycle) {    if (!HazardRec->isEnabled()) {      // Bypass HazardRec virtual calls.      CurrCycle = NextCycle; -  } -  else { +  } else {      // Bypass getHazardType calls in case of long latency.      for (; CurrCycle != NextCycle; ++CurrCycle) {        if (isTop()) @@ -2074,8 +2168,7 @@ void SchedBoundary::bumpNode(SUnit *SU) {    // If we stall for any reason, bump the cycle.    if (NextCycle > CurrCycle) {      bumpCycle(NextCycle); -  } -  else { +  } else {      // After updating ZoneCritResIdx and ExpectedLatency, check if we're      // resource limited. If a stall occurred, bumpCycle does this.      unsigned LFactor = SchedModel->getLatencyFactor(); @@ -2119,11 +2212,13 @@ void SchedBoundary::releasePending() {      if (checkHazard(SU))        continue; +    if (Available.size() >= ReadyListLimit) +      break; +      Available.push(SU);      Pending.remove(Pending.begin()+i);      --i; --e;    } -  DEBUG(if (!Pending.empty()) Pending.dump());    CheckPending = false;  } @@ -2163,6 +2258,10 @@ SUnit *SchedBoundary::pickOnlyChoice() {      bumpCycle(CurrCycle + 1);      releasePending();    } + +  DEBUG(Pending.dump()); +  DEBUG(Available.dump()); +    if (Available.size() == 1)      return *Available.begin();    return nullptr; @@ -2177,8 +2276,7 @@ void SchedBoundary::dumpScheduledState() {    if (ZoneCritResIdx) {      ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx);      ResCount = getResourceCount(ZoneCritResIdx); -  } -  else { +  } else {      ResFactor = SchedModel->getMicroOpFactor();      ResCount = RetiredMOps * SchedModel->getMicroOpFactor();    } @@ -2218,8 +2316,7 @@ initResourceDelta(const ScheduleDAGMI *DAG,  /// Set the CandPolicy given a scheduling zone given the current resources and  /// latencies inside and outside the zone. -void GenericSchedulerBase::setPolicy(CandPolicy &Policy, -                                     bool IsPostRA, +void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA,                                       SchedBoundary &CurrZone,                                       SchedBoundary *OtherZone) {    // Apply preemptive heuristics based on the total latency and resources @@ -2295,7 +2392,8 @@ const char *GenericSchedulerBase::getReasonStr(    GenericSchedulerBase::CandReason Reason) {    switch (Reason) {    case NoCand:         return "NOCAND    "; -  case PhysRegCopy:    return "PREG-COPY"; +  case Only1:          return "ONLY1     "; +  case PhysRegCopy:    return "PREG-COPY ";    case RegExcess:      return "REG-EXCESS";    case RegCritical:    return "REG-CRIT  ";    case Stall:          return "STALL     "; @@ -2381,7 +2479,6 @@ static bool tryLess(int TryVal, int CandVal,        Cand.Reason = Reason;      return true;    } -  Cand.setRepeat(Reason);    return false;  } @@ -2398,7 +2495,6 @@ static bool tryGreater(int TryVal, int CandVal,        Cand.Reason = Reason;      return true;    } -  Cand.setRepeat(Reason);    return false;  } @@ -2414,8 +2510,7 @@ static bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,      if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),                     TryCand, Cand, GenericSchedulerBase::TopPathReduce))        return true; -  } -  else { +  } else {      if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {        if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),                    TryCand, Cand, GenericSchedulerBase::BotHeightReduce)) @@ -2428,10 +2523,13 @@ static bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,    return false;  } -static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand, -                      bool IsTop) { +static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) {    DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ") -        << GenericSchedulerBase::getReasonStr(Cand.Reason) << '\n'); +        << GenericSchedulerBase::getReasonStr(Reason) << '\n'); +} + +static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) { +  tracePick(Cand.Reason, Cand.AtTop);  }  void GenericScheduler::initialize(ScheduleDAGMI *dag) { @@ -2460,6 +2558,8 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) {          DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(              Itin, DAG);    } +  TopCand.SU = nullptr; +  BotCand.SU = nullptr;  }  /// Initialize the per-region scheduling policy. @@ -2487,8 +2587,7 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,    RegionPolicy.OnlyBottomUp = true;    // Allow the subtarget to override default policy. -  MF.getSubtarget().overrideSchedPolicy(RegionPolicy, Begin, End, -                                        NumRegionInstrs); +  MF.getSubtarget().overrideSchedPolicy(RegionPolicy, NumRegionInstrs);    // After subtarget overrides, apply command line options.    if (!EnableRegPressure) @@ -2582,19 +2681,25 @@ static bool tryPressure(const PressureChange &TryP,                          GenericSchedulerBase::CandReason Reason,                          const TargetRegisterInfo *TRI,                          const MachineFunction &MF) { -  unsigned TryPSet = TryP.getPSetOrMax(); -  unsigned CandPSet = CandP.getPSetOrMax(); -  // If both candidates affect the same set, go with the smallest increase. -  if (TryPSet == CandPSet) { -    return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand, -                   Reason); -  }    // If one candidate decreases and the other increases, go with it.    // Invalid candidates have UnitInc==0.    if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,                   Reason)) {      return true;    } +  // Do not compare the magnitude of pressure changes between top and bottom +  // boundary. +  if (Cand.AtTop != TryCand.AtTop) +    return false; + +  // If both candidates affect the same set in the same boundary, go with the +  // smallest increase. +  unsigned TryPSet = TryP.getPSetOrMax(); +  unsigned CandPSet = CandP.getPSetOrMax(); +  if (TryPSet == CandPSet) { +    return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand, +                   Reason); +  }    int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) :                                   std::numeric_limits<int>::max(); @@ -2640,64 +2745,64 @@ static int biasPhysRegCopy(const SUnit *SU, bool isTop) {    return 0;  } -/// Apply a set of heursitics to a new candidate. Heuristics are currently -/// hierarchical. This may be more efficient than a graduated cost model because -/// we don't need to evaluate all aspects of the model for each node in the -/// queue. But it's really done to make the heuristics easier to debug and -/// statistically analyze. -/// -/// \param Cand provides the policy and current best candidate. -/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. -/// \param Zone describes the scheduled zone that we are extending. -/// \param RPTracker describes reg pressure within the scheduled zone. -/// \param TempTracker is a scratch pressure tracker to reuse in queries. -void GenericScheduler::tryCandidate(SchedCandidate &Cand, -                                    SchedCandidate &TryCand, -                                    SchedBoundary &Zone, -                                    const RegPressureTracker &RPTracker, -                                    RegPressureTracker &TempTracker) { - +void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU, +                                     bool AtTop, +                                     const RegPressureTracker &RPTracker, +                                     RegPressureTracker &TempTracker) { +  Cand.SU = SU; +  Cand.AtTop = AtTop;    if (DAG->isTrackingPressure()) { -    // Always initialize TryCand's RPDelta. -    if (Zone.isTop()) { +    if (AtTop) {        TempTracker.getMaxDownwardPressureDelta( -        TryCand.SU->getInstr(), -        TryCand.RPDelta, +        Cand.SU->getInstr(), +        Cand.RPDelta,          DAG->getRegionCriticalPSets(),          DAG->getRegPressure().MaxSetPressure); -    } -    else { +    } else {        if (VerifyScheduling) {          TempTracker.getMaxUpwardPressureDelta( -          TryCand.SU->getInstr(), -          &DAG->getPressureDiff(TryCand.SU), -          TryCand.RPDelta, +          Cand.SU->getInstr(), +          &DAG->getPressureDiff(Cand.SU), +          Cand.RPDelta,            DAG->getRegionCriticalPSets(),            DAG->getRegPressure().MaxSetPressure); -      } -      else { +      } else {          RPTracker.getUpwardPressureDelta( -          TryCand.SU->getInstr(), -          DAG->getPressureDiff(TryCand.SU), -          TryCand.RPDelta, +          Cand.SU->getInstr(), +          DAG->getPressureDiff(Cand.SU), +          Cand.RPDelta,            DAG->getRegionCriticalPSets(),            DAG->getRegPressure().MaxSetPressure);        }      }    } -  DEBUG(if (TryCand.RPDelta.Excess.isValid()) -          dbgs() << "  Try  SU(" << TryCand.SU->NodeNum << ") " -                 << TRI->getRegPressureSetName(TryCand.RPDelta.Excess.getPSet()) -                 << ":" << TryCand.RPDelta.Excess.getUnitInc() << "\n"); +  DEBUG(if (Cand.RPDelta.Excess.isValid()) +          dbgs() << "  Try  SU(" << Cand.SU->NodeNum << ") " +                 << TRI->getRegPressureSetName(Cand.RPDelta.Excess.getPSet()) +                 << ":" << Cand.RPDelta.Excess.getUnitInc() << "\n"); +} +/// Apply a set of heursitics to a new candidate. Heuristics are currently +/// hierarchical. This may be more efficient than a graduated cost model because +/// we don't need to evaluate all aspects of the model for each node in the +/// queue. But it's really done to make the heuristics easier to debug and +/// statistically analyze. +/// +/// \param Cand provides the policy and current best candidate. +/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. +/// \param Zone describes the scheduled zone that we are extending, or nullptr +//              if Cand is from a different zone than TryCand. +void GenericScheduler::tryCandidate(SchedCandidate &Cand, +                                    SchedCandidate &TryCand, +                                    SchedBoundary *Zone) {    // Initialize the candidate if needed.    if (!Cand.isValid()) {      TryCand.Reason = NodeOrder;      return;    } -  if (tryGreater(biasPhysRegCopy(TryCand.SU, Zone.isTop()), -                 biasPhysRegCopy(Cand.SU, Zone.isTop()), +  if (tryGreater(biasPhysRegCopy(TryCand.SU, TryCand.AtTop), +                 biasPhysRegCopy(Cand.SU, Cand.AtTop),                   TryCand, Cand, PhysRegCopy))      return; @@ -2715,17 +2820,26 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,                                                 DAG->MF))      return; -  // For loops that are acyclic path limited, aggressively schedule for latency. -  // This can result in very long dependence chains scheduled in sequence, so -  // once every cycle (when CurrMOps == 0), switch to normal heuristics. -  if (Rem.IsAcyclicLatencyLimited && !Zone.getCurrMOps() -      && tryLatency(TryCand, Cand, Zone)) -    return; +  // We only compare a subset of features when comparing nodes between +  // Top and Bottom boundary. Some properties are simply incomparable, in many +  // other instances we should only override the other boundary if something +  // is a clear good pick on one boundary. Skip heuristics that are more +  // "tie-breaking" in nature. +  bool SameBoundary = Zone != nullptr; +  if (SameBoundary) { +    // For loops that are acyclic path limited, aggressively schedule for +    // latency.  This can result in very long dependence chains scheduled in +    // sequence, so once every cycle (when CurrMOps == 0), switch to normal +    // heuristics. +    if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && +        tryLatency(TryCand, Cand, *Zone)) +      return; -  // Prioritize instructions that read unbuffered resources by stall cycles. -  if (tryLess(Zone.getLatencyStallCycles(TryCand.SU), -              Zone.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) -    return; +    // Prioritize instructions that read unbuffered resources by stall cycles. +    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU), +                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) +      return; +  }    // Keep clustered nodes together to encourage downstream peephole    // optimizations which may reduce resource requirements. @@ -2733,18 +2847,23 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,    // This is a best effort to set things up for a post-RA pass. Optimizations    // like generating loads of multiple registers should ideally be done within    // the scheduler pass by combining the loads during DAG postprocessing. -  const SUnit *NextClusterSU = -    Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); -  if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU, +  const SUnit *CandNextClusterSU = +    Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); +  const SUnit *TryCandNextClusterSU = +    TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); +  if (tryGreater(TryCand.SU == TryCandNextClusterSU, +                 Cand.SU == CandNextClusterSU,                   TryCand, Cand, Cluster))      return; -  // Weak edges are for clustering and other constraints. -  if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()), -              getWeakLeft(Cand.SU, Zone.isTop()), -              TryCand, Cand, Weak)) { -    return; +  if (SameBoundary) { +    // Weak edges are for clustering and other constraints. +    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), +                getWeakLeft(Cand.SU, Cand.AtTop), +                TryCand, Cand, Weak)) +      return;    } +    // Avoid increasing the max pressure of the entire region.    if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,                                                 Cand.RPDelta.CurrentMax, @@ -2752,34 +2871,35 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,                                                 DAG->MF))      return; -  // Avoid critical resource consumption and balance the schedule. -  TryCand.initResourceDelta(DAG, SchedModel); -  if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, -              TryCand, Cand, ResourceReduce)) -    return; -  if (tryGreater(TryCand.ResDelta.DemandedResources, -                 Cand.ResDelta.DemandedResources, -                 TryCand, Cand, ResourceDemand)) -    return; +  if (SameBoundary) { +    // Avoid critical resource consumption and balance the schedule. +    TryCand.initResourceDelta(DAG, SchedModel); +    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, +                TryCand, Cand, ResourceReduce)) +      return; +    if (tryGreater(TryCand.ResDelta.DemandedResources, +                   Cand.ResDelta.DemandedResources, +                   TryCand, Cand, ResourceDemand)) +      return; -  // Avoid serializing long latency dependence chains. -  // For acyclic path limited loops, latency was already checked above. -  if (!RegionPolicy.DisableLatencyHeuristic && Cand.Policy.ReduceLatency && -      !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone)) { -    return; -  } +    // Avoid serializing long latency dependence chains. +    // For acyclic path limited loops, latency was already checked above. +    if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && +        !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone)) +      return; -  // Prefer immediate defs/users of the last scheduled instruction. This is a -  // local pressure avoidance strategy that also makes the machine code -  // readable. -  if (tryGreater(Zone.isNextSU(TryCand.SU), Zone.isNextSU(Cand.SU), -                 TryCand, Cand, NextDefUse)) -    return; +    // Prefer immediate defs/users of the last scheduled instruction. This is a +    // local pressure avoidance strategy that also makes the machine code +    // readable. +    if (tryGreater(Zone->isNextSU(TryCand.SU), Zone->isNextSU(Cand.SU), +                   TryCand, Cand, NextDefUse)) +      return; -  // Fall through to original instruction order. -  if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) -      || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { -    TryCand.Reason = NodeOrder; +    // Fall through to original instruction order. +    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) +        || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { +      TryCand.Reason = NodeOrder; +    }    }  } @@ -2789,20 +2909,20 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,  /// DAG building. To adjust for the current scheduling location we need to  /// maintain the number of vreg uses remaining to be top-scheduled.  void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone, +                                         const CandPolicy &ZonePolicy,                                           const RegPressureTracker &RPTracker,                                           SchedCandidate &Cand) { -  ReadyQueue &Q = Zone.Available; - -  DEBUG(Q.dump()); -    // getMaxPressureDelta temporarily modifies the tracker.    RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); +  ReadyQueue &Q = Zone.Available;    for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { -    SchedCandidate TryCand(Cand.Policy); -    TryCand.SU = *I; -    tryCandidate(Cand, TryCand, Zone, RPTracker, TempTracker); +    SchedCandidate TryCand(ZonePolicy); +    initCandidate(TryCand, *I, Zone.isTop(), RPTracker, TempTracker); +    // Pass SchedBoundary only when comparing nodes from the same boundary. +    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; +    tryCandidate(Cand, TryCand, ZoneArg);      if (TryCand.Reason != NoCand) {        // Initialize resource delta if needed in case future heuristics query it.        if (TryCand.ResDelta == SchedResourceDelta()) @@ -2819,57 +2939,77 @@ SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {    // efficient, but also provides the best heuristics for CriticalPSets.    if (SUnit *SU = Bot.pickOnlyChoice()) {      IsTopNode = false; -    DEBUG(dbgs() << "Pick Bot ONLY1\n"); +    tracePick(Only1, false);      return SU;    }    if (SUnit *SU = Top.pickOnlyChoice()) {      IsTopNode = true; -    DEBUG(dbgs() << "Pick Top ONLY1\n"); +    tracePick(Only1, true);      return SU;    } -  CandPolicy NoPolicy; -  SchedCandidate BotCand(NoPolicy); -  SchedCandidate TopCand(NoPolicy);    // Set the bottom-up policy based on the state of the current bottom zone and    // the instructions outside the zone, including the top zone. -  setPolicy(BotCand.Policy, /*IsPostRA=*/false, Bot, &Top); +  CandPolicy BotPolicy; +  setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);    // Set the top-down policy based on the state of the current top zone and    // the instructions outside the zone, including the bottom zone. -  setPolicy(TopCand.Policy, /*IsPostRA=*/false, Top, &Bot); - -  // Prefer bottom scheduling when heuristics are silent. -  pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); -  assert(BotCand.Reason != NoCand && "failed to find the first candidate"); - -  // If either Q has a single candidate that provides the least increase in -  // Excess pressure, we can immediately schedule from that Q. -  // -  // RegionCriticalPSets summarizes the pressure within the scheduled region and -  // affects picking from either Q. If scheduling in one direction must -  // increase pressure for one of the excess PSets, then schedule in that -  // direction first to provide more freedom in the other direction. -  if ((BotCand.Reason == RegExcess && !BotCand.isRepeat(RegExcess)) -      || (BotCand.Reason == RegCritical -          && !BotCand.isRepeat(RegCritical))) -  { -    IsTopNode = false; -    tracePick(BotCand, IsTopNode); -    return BotCand.SU; +  CandPolicy TopPolicy; +  setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); + +  // See if BotCand is still valid (because we previously scheduled from Top). +  DEBUG(dbgs() << "Picking from Bot:\n"); +  if (!BotCand.isValid() || BotCand.SU->isScheduled || +      BotCand.Policy != BotPolicy) { +    BotCand.reset(CandPolicy()); +    pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); +    assert(BotCand.Reason != NoCand && "failed to find the first candidate"); +  } else { +    DEBUG(traceCandidate(BotCand)); +#ifndef NDEBUG +    if (VerifyScheduling) { +      SchedCandidate TCand; +      TCand.reset(CandPolicy()); +      pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand); +      assert(TCand.SU == BotCand.SU && +             "Last pick result should correspond to re-picking right now"); +    } +#endif    } +    // Check if the top Q has a better candidate. -  pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); -  assert(TopCand.Reason != NoCand && "failed to find the first candidate"); +  DEBUG(dbgs() << "Picking from Top:\n"); +  if (!TopCand.isValid() || TopCand.SU->isScheduled || +      TopCand.Policy != TopPolicy) { +    TopCand.reset(CandPolicy()); +    pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); +    assert(TopCand.Reason != NoCand && "failed to find the first candidate"); +  } else { +    DEBUG(traceCandidate(TopCand)); +#ifndef NDEBUG +    if (VerifyScheduling) { +      SchedCandidate TCand; +      TCand.reset(CandPolicy()); +      pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand); +      assert(TCand.SU == TopCand.SU && +           "Last pick result should correspond to re-picking right now"); +    } +#endif +  } -  // Choose the queue with the most important (lowest enum) reason. -  if (TopCand.Reason < BotCand.Reason) { -    IsTopNode = true; -    tracePick(TopCand, IsTopNode); -    return TopCand.SU; +  // Pick best from BotCand and TopCand. +  assert(BotCand.isValid()); +  assert(TopCand.isValid()); +  SchedCandidate Cand = BotCand; +  TopCand.Reason = NoCand; +  tryCandidate(Cand, TopCand, nullptr); +  if (TopCand.Reason != NoCand) { +    Cand.setBest(TopCand); +    DEBUG(traceCandidate(Cand));    } -  // Otherwise prefer the bottom candidate, in node order if all else failed. -  IsTopNode = false; -  tracePick(BotCand, IsTopNode); -  return BotCand.SU; + +  IsTopNode = Cand.AtTop; +  tracePick(Cand); +  return Cand.SU;  }  /// Pick the best node to balance the schedule. Implements MachineSchedStrategy. @@ -2885,27 +3025,25 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {        SU = Top.pickOnlyChoice();        if (!SU) {          CandPolicy NoPolicy; -        SchedCandidate TopCand(NoPolicy); -        pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); +        TopCand.reset(NoPolicy); +        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);          assert(TopCand.Reason != NoCand && "failed to find a candidate"); -        tracePick(TopCand, true); +        tracePick(TopCand);          SU = TopCand.SU;        }        IsTopNode = true; -    } -    else if (RegionPolicy.OnlyBottomUp) { +    } else if (RegionPolicy.OnlyBottomUp) {        SU = Bot.pickOnlyChoice();        if (!SU) {          CandPolicy NoPolicy; -        SchedCandidate BotCand(NoPolicy); -        pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); +        BotCand.reset(NoPolicy); +        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);          assert(BotCand.Reason != NoCand && "failed to find a candidate"); -        tracePick(BotCand, false); +        tracePick(BotCand);          SU = BotCand.SU;        }        IsTopNode = false; -    } -    else { +    } else {        SU = pickNodeBidirectional(IsTopNode);      }    } while (SU->isScheduled); @@ -2957,8 +3095,7 @@ void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {      Top.bumpNode(SU);      if (SU->hasPhysRegUses)        reschedulePhysRegCopies(SU, true); -  } -  else { +  } else {      SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());      Bot.bumpNode(SU);      if (SU->hasPhysRegDefs) @@ -2976,8 +3113,12 @@ static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C) {    // data and pass it to later mutations. Have a single mutation that gathers    // the interesting nodes in one pass.    DAG->addMutation(make_unique<CopyConstrain>(DAG->TII, DAG->TRI)); -  if (EnableLoadCluster && DAG->TII->enableClusterLoads()) -    DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI)); +  if (EnableMemOpCluster) { +    if (DAG->TII->enableClusterLoads()) +      DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI)); +    if (DAG->TII->enableClusterStores()) +      DAG->addMutation(make_unique<StoreClusterMutation>(DAG->TII, DAG->TRI)); +  }    if (EnableMacroFusion)      DAG->addMutation(make_unique<MacroFusion>(*DAG->TII, *DAG->TRI));    return DAG; @@ -3065,12 +3206,10 @@ void PostGenericScheduler::tryCandidate(SchedCandidate &Cand,  void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {    ReadyQueue &Q = Top.Available; - -  DEBUG(Q.dump()); -    for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {      SchedCandidate TryCand(Cand.Policy);      TryCand.SU = *I; +    TryCand.AtTop = true;      TryCand.initResourceDelta(DAG, SchedModel);      tryCandidate(Cand, TryCand);      if (TryCand.Reason != NoCand) { @@ -3089,7 +3228,9 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {    SUnit *SU;    do {      SU = Top.pickOnlyChoice(); -    if (!SU) { +    if (SU) { +      tracePick(Only1, true); +    } else {        CandPolicy NoPolicy;        SchedCandidate TopCand(NoPolicy);        // Set the top-down policy based on the state of the current top zone and @@ -3097,7 +3238,7 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {        setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr);        pickNodeFromQueue(TopCand);        assert(TopCand.Reason != NoCand && "failed to find a candidate"); -      tracePick(TopCand, true); +      tracePick(TopCand);        SU = TopCand.SU;      }    } while (SU->isScheduled); @@ -3285,8 +3426,7 @@ public:          TopQ.pop();        } while (SU->isScheduled);        IsTopNode = true; -    } -    else { +    } else {        do {          if (BottomQ.empty()) return nullptr;          SU = BottomQ.top(); | 
