Diffstat (limited to 'llvm/lib/Target/AMDGPU/GCNILPSched.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/GCNILPSched.cpp | 363 | 
1 file changed, 363 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNILPSched.cpp b/llvm/lib/Target/AMDGPU/GCNILPSched.cpp
new file mode 100644
index 000000000000..39072af7d871
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNILPSched.cpp
@@ -0,0 +1,363 @@
+//===---------------------------- GCNILPSched.cpp - -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// A simple bottom-up list scheduler that orders nodes by Sethi-Ullman number
+/// and latency and returns the resulting schedule to the caller.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+namespace {
+
+class GCNILPScheduler {
+  struct Candidate : ilist_node<Candidate> {
+    SUnit *SU;
+
+    Candidate(SUnit *SU_)
+      : SU(SU_) {}
+  };
+
+  SpecificBumpPtrAllocator<Candidate> Alloc;
+  typedef simple_ilist<Candidate> Queue;
+  Queue PendingQueue;
+  Queue AvailQueue;
+  unsigned CurQueueId = 0;
+
+  std::vector<unsigned> SUNumbers;
+
+  /// CurCycle - The current scheduler state corresponds to this cycle.
+  unsigned CurCycle = 0;
+
+  unsigned getNodePriority(const SUnit *SU) const;
+
+  const SUnit *pickBest(const SUnit *left, const SUnit *right);
+  Candidate* pickCandidate();
+
+  void releasePending();
+  void advanceToCycle(unsigned NextCycle);
+  void releasePredecessors(const SUnit* SU);
+
+public:
+  std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> BotRoots,
+                                     const ScheduleDAG &DAG);
+};
+} // namespace
+
+/// CalcNodeSethiUllmanNumber - Compute the Sethi-Ullman number.
+/// The smaller the number, the higher the priority.
+static unsigned
+CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) {
+  unsigned &SethiUllmanNumber = SUNumbers[SU->NodeNum];
+  if (SethiUllmanNumber != 0)
+    return SethiUllmanNumber;
+
+  unsigned Extra = 0;
+  for (const SDep &Pred : SU->Preds) {
+    if (Pred.isCtrl()) continue;  // ignore chain preds
+    SUnit *PredSU = Pred.getSUnit();
+    unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers);
+    if (PredSethiUllman > SethiUllmanNumber) {
+      SethiUllmanNumber = PredSethiUllman;
+      Extra = 0;
+    }
+    else if (PredSethiUllman == SethiUllmanNumber)
+      ++Extra;
+  }
+
+  SethiUllmanNumber += Extra;
+
+  if (SethiUllmanNumber == 0)
+    SethiUllmanNumber = 1;
+
+  return SethiUllmanNumber;
+}
+
+// Lower priority means schedule further down. For bottom-up scheduling, lower
+// priority SUs are scheduled before higher priority SUs.
+unsigned GCNILPScheduler::getNodePriority(const SUnit *SU) const {
+  assert(SU->NodeNum < SUNumbers.size());
+  if (SU->NumSuccs == 0 && SU->NumPreds != 0)
+    // If SU does not have a register use, i.e. it doesn't produce a value
+    // that would be consumed (e.g. store), then it terminates a chain of
+    // computation.  Give it a large SethiUllman number so it will be
+    // scheduled right before its predecessors and does not lengthen their
+    // live ranges.
+    return 0xffff;
+
+  if (SU->NumPreds == 0 && SU->NumSuccs != 0)
+    // If SU does not have a register def, schedule it close to its uses
+    // because it does not lengthen any live ranges.
+    return 0;
+
+  return SUNumbers[SU->NodeNum];
+}
+
+/// closestSucc - Returns the scheduled cycle of the successor which is
+/// closest to the current cycle.
+static unsigned closestSucc(const SUnit *SU) {
+  unsigned MaxHeight = 0;
+  for (const SDep &Succ : SU->Succs) {
+    if (Succ.isCtrl()) continue;  // ignore chain succs
+    unsigned Height = Succ.getSUnit()->getHeight();
+    // If there are a bunch of CopyToRegs stacked up, they should be considered
+    // to be at the same position.
+    if (Height > MaxHeight)
+      MaxHeight = Height;
+  }
+  return MaxHeight;
+}
+
+/// calcMaxScratches - Returns a cost estimate of the worst-case requirement
+/// for scratch registers, i.e. the number of data dependencies.
+static unsigned calcMaxScratches(const SUnit *SU) {
+  unsigned Scratches = 0;
+  for (const SDep &Pred : SU->Preds) {
+    if (Pred.isCtrl()) continue;  // ignore chain preds
+    Scratches++;
+  }
+  return Scratches;
+}
+
+// Return -1 if left has higher priority, 1 if right has higher priority.
+// Return 0 if latency-based priority is equivalent.
+static int BUCompareLatency(const SUnit *left, const SUnit *right) {
+  // Scheduling an instruction that uses a VReg whose postincrement has not yet
+  // been scheduled will induce a copy. Model this as an extra cycle of latency.
+  int LHeight = (int)left->getHeight();
+  int RHeight = (int)right->getHeight();
+
+  // If either node is scheduling for latency, sort them by height/depth
+  // and latency.
+
+  // If neither instruction stalls (!LStall && !RStall) and HazardRecognizer
+  // is enabled, grouping instructions by cycle, then its height is already
+  // covered so only its depth matters. We also reach this point if both stall
+  // but have the same height.
+  if (LHeight != RHeight)
+    return LHeight > RHeight ? 1 : -1;
+
+  int LDepth = left->getDepth();
+  int RDepth = right->getDepth();
+  if (LDepth != RDepth) {
+    LLVM_DEBUG(dbgs() << "  Comparing latency of SU (" << left->NodeNum
+                      << ") depth " << LDepth << " vs SU (" << right->NodeNum
+                      << ") depth " << RDepth << "\n");
+    return LDepth < RDepth ? 1 : -1;
+  }
+  if (left->Latency != right->Latency)
+    return left->Latency > right->Latency ? 1 : -1;
+
+  return 0;
+}
+
+const SUnit *GCNILPScheduler::pickBest(const SUnit *left, const SUnit *right)
+{
+  // TODO: add register pressure lowering checks
+
+  bool const DisableSchedCriticalPath = false;
+  int MaxReorderWindow = 6;
+  if (!DisableSchedCriticalPath) {
+    int spread = (int)left->getDepth() - (int)right->getDepth();
+    if (std::abs(spread) > MaxReorderWindow) {
+      LLVM_DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
+                        << left->getDepth() << " != SU(" << right->NodeNum
+                        << "): " << right->getDepth() << "\n");
+      return left->getDepth() < right->getDepth() ? right : left;
+    }
+  }
+
+  bool const DisableSchedHeight = false;
+  if (!DisableSchedHeight && left->getHeight() != right->getHeight()) {
+    int spread = (int)left->getHeight() - (int)right->getHeight();
+    if (std::abs(spread) > MaxReorderWindow)
+      return left->getHeight() > right->getHeight() ? right : left;
+  }
+
+  // Prioritize by Sethi-Ullman number and push CopyToReg nodes down.
+  unsigned LPriority = getNodePriority(left);
+  unsigned RPriority = getNodePriority(right);
+
+  if (LPriority != RPriority)
+    return LPriority > RPriority ? right : left;
+
+  // Try to schedule def + use closer together when Sethi-Ullman numbers are
+  // the same.
+  // e.g.
+  // t1 = op t2, c1
+  // t3 = op t4, c2
+  //
+  // and the following instructions are both ready.
+  // t2 = op c3
+  // t4 = op c4
+  //
+  // Then schedule t2 = op first.
+  // i.e.
+  // t4 = op c4
+  // t2 = op c3
+  // t1 = op t2, c1
+  // t3 = op t4, c2
+  //
+  // This creates more short live intervals.
+  unsigned LDist = closestSucc(left);
+  unsigned RDist = closestSucc(right);
+  if (LDist != RDist)
+    return LDist < RDist ? right : left;
+
+  // How many registers become live when the node is scheduled.
+  unsigned LScratch = calcMaxScratches(left);
+  unsigned RScratch = calcMaxScratches(right);
+  if (LScratch != RScratch)
+    return LScratch > RScratch ? right : left;
+
+  bool const DisableSchedCycles = false;
+  if (!DisableSchedCycles) {
+    int result = BUCompareLatency(left, right);
+    if (result != 0)
+      return result > 0 ? right : left;
+    return left;
+  }
+  else {
+    if (left->getHeight() != right->getHeight())
+      return (left->getHeight() > right->getHeight()) ? right : left;
+
+    if (left->getDepth() != right->getDepth())
+      return (left->getDepth() < right->getDepth()) ? right : left;
+  }
+
+  assert(left->NodeQueueId && right->NodeQueueId &&
+         "NodeQueueId cannot be zero");
+  return (left->NodeQueueId > right->NodeQueueId) ? right : left;
+}
+
+GCNILPScheduler::Candidate* GCNILPScheduler::pickCandidate() {
+  if (AvailQueue.empty())
+    return nullptr;
+  auto Best = AvailQueue.begin();
+  for (auto I = std::next(AvailQueue.begin()), E = AvailQueue.end(); I != E; ++I) {
+    auto NewBestSU = pickBest(Best->SU, I->SU);
+    if (NewBestSU != Best->SU) {
+      assert(NewBestSU == I->SU);
+      Best = I;
+    }
+  }
+  return &*Best;
+}
+
+void GCNILPScheduler::releasePending() {
+  // Check to see if any of the pending instructions are ready to issue.  If
+  // so, add them to the available queue.
+  for (auto I = PendingQueue.begin(), E = PendingQueue.end(); I != E;) {
+    auto &C = *I++;
+    if (C.SU->getHeight() <= CurCycle) {
+      PendingQueue.remove(C);
+      AvailQueue.push_back(C);
+      C.SU->NodeQueueId = CurQueueId++;
+    }
+  }
+}
+
+/// Move the scheduler state forward by the specified number of cycles.
+void GCNILPScheduler::advanceToCycle(unsigned NextCycle) {
+  if (NextCycle <= CurCycle)
+    return;
+  CurCycle = NextCycle;
+  releasePending();
+}
+
+void GCNILPScheduler::releasePredecessors(const SUnit* SU) {
+  for (const auto &PredEdge : SU->Preds) {
+    auto PredSU = PredEdge.getSUnit();
+    if (PredEdge.isWeak())
+      continue;
+    assert(PredSU->isBoundaryNode() || PredSU->NumSuccsLeft > 0);
+
+    PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge.getLatency());
+
+    if (!PredSU->isBoundaryNode() && --PredSU->NumSuccsLeft == 0)
+      PendingQueue.push_front(*new (Alloc.Allocate()) Candidate(PredSU));
+  }
+}
+
+std::vector<const SUnit*>
+GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots,
+                          const ScheduleDAG &DAG) {
+  auto &SUnits = const_cast<ScheduleDAG&>(DAG).SUnits;
+
+  std::vector<SUnit> SUSavedCopy;
+  SUSavedCopy.resize(SUnits.size());
+
+  // We cannot save only the fields we touch: some of them are private, so save
+  // the units verbatim. This assumes SUnit has value semantics.
+  for (const SUnit &SU : SUnits)
+    SUSavedCopy[SU.NodeNum] = SU;
+
+  SUNumbers.assign(SUnits.size(), 0);
+  for (const SUnit &SU : SUnits)
+    CalcNodeSethiUllmanNumber(&SU, SUNumbers);
+
+  for (auto SU : BotRoots) {
+    AvailQueue.push_back(
+      *new (Alloc.Allocate()) Candidate(const_cast<SUnit*>(SU)));
+  }
+  releasePredecessors(&DAG.ExitSU);
+
+  std::vector<const SUnit*> Schedule;
+  Schedule.reserve(SUnits.size());
+  while (true) {
+    if (AvailQueue.empty() && !PendingQueue.empty()) {
+      auto EarliestSU = std::min_element(
+        PendingQueue.begin(), PendingQueue.end(),
+        [=](const Candidate& C1, const Candidate& C2) {
+          return C1.SU->getHeight() < C2.SU->getHeight();
+        })->SU;
+      advanceToCycle(std::max(CurCycle + 1, EarliestSU->getHeight()));
+    }
+    if (AvailQueue.empty())
+      break;
+
+    LLVM_DEBUG(dbgs() << "\n=== Picking candidate\n"
+                         "Ready queue:";
+               for (auto &C : AvailQueue)
+                 dbgs() << ' ' << C.SU->NodeNum;
+               dbgs() << '\n';);
+
+    auto C = pickCandidate();
+    assert(C);
+    AvailQueue.remove(*C);
+    auto SU = C->SU;
+    LLVM_DEBUG(dbgs() << "Selected "; DAG.dumpNode(*SU));
+
+    advanceToCycle(SU->getHeight());
+
+    releasePredecessors(SU);
+    Schedule.push_back(SU);
+    SU->isScheduled = true;
+  }
+  assert(SUnits.size() == Schedule.size());
+
+  std::reverse(Schedule.begin(), Schedule.end());
+
+  // Restore the units to their pre-scheduling state.
+  for (auto &SU : SUnits)
+    SU = SUSavedCopy[SU.NodeNum];
+
+  return Schedule;
+}
+
+namespace llvm {
+std::vector<const SUnit*> makeGCNILPScheduler(ArrayRef<const SUnit*> BotRoots,
+                                              const ScheduleDAG &DAG) {
+  GCNILPScheduler S;
+  return S.schedule(BotRoots, DAG);
+}
+}
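
The priority metric driving pickBest above is the number computed by CalcNodeSethiUllmanNumber: a node's number is the largest number among its non-chain predecessors, plus one for each additional predecessor that ties with that maximum, and never less than 1. Below is a minimal standalone sketch of that recurrence applied to a made-up toy DAG; the computeSethiUllman helper, the node indices, and the expected values are illustrative assumptions, not code from this patch.

// Standalone illustration (not part of the commit) of the Sethi-Ullman
// recurrence used by CalcNodeSethiUllmanNumber, on a toy predecessor list.
#include <cstdio>
#include <vector>

static unsigned computeSethiUllman(unsigned Node,
                                   const std::vector<std::vector<unsigned>> &Preds,
                                   std::vector<unsigned> &Numbers) {
  unsigned &N = Numbers[Node];
  if (N != 0)
    return N;                  // memoized, like SUNumbers in the scheduler

  unsigned Extra = 0;
  for (unsigned Pred : Preds[Node]) {
    unsigned P = computeSethiUllman(Pred, Preds, Numbers);
    if (P > N) {               // a new maximum among the predecessors
      N = P;
      Extra = 0;
    } else if (P == N)         // another predecessor ties the maximum
      ++Extra;
  }
  N += Extra;
  if (N == 0)                  // nodes with no predecessors get at least 1
    N = 1;
  return N;
}

int main() {
  // Toy DAG: nodes 0 and 1 are leaves, node 2 depends on both, node 3 on 2.
  std::vector<std::vector<unsigned>> Preds = {{}, {}, {0, 1}, {2}};
  std::vector<unsigned> Numbers(Preds.size(), 0);
  for (unsigned I = 0; I < Preds.size(); ++I)
    std::printf("node %u -> Sethi-Ullman %u\n", I,
                computeSethiUllman(I, Preds, Numbers));
  return 0;
}

Running it prints 1 for the two leaves, 2 for the node whose two predecessors tie at the maximum, and 2 for the node above it, which mirrors the values CalcNodeSethiUllmanNumber would memoize into SUNumbers for a DAG of the same shape.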
