Diffstat (limited to 'llvm/lib/CodeGen/MachineScheduler.cpp')
-rw-r--r--  llvm/lib/CodeGen/MachineScheduler.cpp | 142
1 file changed, 94 insertions(+), 48 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 8d51bb26103a..4f42a2c8aeff 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -297,7 +297,7 @@ priorNonDebug(MachineBasicBlock::const_iterator I,
               MachineBasicBlock::const_iterator Beg) {
   assert(I != Beg && "reached the top of the region, cannot decrement");
   while (--I != Beg) {
-    if (!I->isDebugInstr())
+    if (!I->isDebugOrPseudoInstr())
       break;
   }
   return I;
@@ -317,7 +317,7 @@ static MachineBasicBlock::const_iterator
 nextIfDebug(MachineBasicBlock::const_iterator I,
             MachineBasicBlock::const_iterator End) {
   for(; I != End; ++I) {
-    if (!I->isDebugInstr())
+    if (!I->isDebugOrPseudoInstr())
       break;
   }
   return I;
@@ -508,7 +508,7 @@ getSchedRegions(MachineBasicBlock *MBB,
       MachineInstr &MI = *std::prev(I);
       if (isSchedBoundary(&MI, &*MBB, MF, TII))
         break;
-      if (!MI.isDebugInstr()) {
+      if (!MI.isDebugOrPseudoInstr()) {
         // MBB::size() uses instr_iterator to count. Here we need a bundle to
         // count as a single instruction.
         ++NumRegionInstrs;
@@ -927,8 +927,8 @@ void ScheduleDAGMI::placeDebugValues() {
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const {
-  for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) {
-    if (SUnit *SU = getSUnit(&(*MI)))
+  for (MachineInstr &MI : *this) {
+    if (SUnit *SU = getSUnit(&MI))
       dumpNode(*SU);
     else
       dbgs() << "Missing SUnit\n";
@@ -1927,17 +1927,15 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
   }
   LLVM_DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
   // Add the weak edges.
-  for (SmallVectorImpl<SUnit*>::const_iterator
-         I = LocalUses.begin(), E = LocalUses.end(); I != E; ++I) {
-    LLVM_DEBUG(dbgs() << "  Local use SU(" << (*I)->NodeNum << ") -> SU("
+  for (SUnit *LU : LocalUses) {
+    LLVM_DEBUG(dbgs() << "  Local use SU(" << LU->NodeNum << ") -> SU("
                       << GlobalSU->NodeNum << ")\n");
-    DAG->addEdge(GlobalSU, SDep(*I, SDep::Weak));
+    DAG->addEdge(GlobalSU, SDep(LU, SDep::Weak));
   }
-  for (SmallVectorImpl<SUnit*>::const_iterator
-         I = GlobalUses.begin(), E = GlobalUses.end(); I != E; ++I) {
-    LLVM_DEBUG(dbgs() << "  Global use SU(" << (*I)->NodeNum << ") -> SU("
+  for (SUnit *GU : GlobalUses) {
+    LLVM_DEBUG(dbgs() << "  Global use SU(" << GU->NodeNum << ") -> SU("
                       << FirstLocalSU->NodeNum << ")\n");
-    DAG->addEdge(FirstLocalSU, SDep(*I, SDep::Weak));
+    DAG->addEdge(FirstLocalSU, SDep(GU, SDep::Weak));
   }
 }
 
@@ -2006,6 +2004,7 @@ void SchedBoundary::reset() {
   IsResourceLimited = false;
   ReservedCycles.clear();
   ReservedCyclesIndex.clear();
+  ResourceGroupSubUnitMasks.clear();
 #ifndef NDEBUG
   // Track the maximum number of stall cycles that could arise either from the
   // latency of a DAG edge or the number of cycles that a processor resource is
@@ -2047,11 +2046,18 @@ init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
     unsigned ResourceCount = SchedModel->getNumProcResourceKinds();
     ReservedCyclesIndex.resize(ResourceCount);
     ExecutedResCounts.resize(ResourceCount);
+    ResourceGroupSubUnitMasks.resize(ResourceCount, APInt(ResourceCount, 0));
     unsigned NumUnits = 0;
 
     for (unsigned i = 0; i < ResourceCount; ++i) {
       ReservedCyclesIndex[i] = NumUnits;
       NumUnits += SchedModel->getProcResource(i)->NumUnits;
+      if (isUnbufferedGroup(i)) {
+        auto SubUnits = SchedModel->getProcResource(i)->SubUnitsIdxBegin;
+        for (unsigned U = 0, UE = SchedModel->getProcResource(i)->NumUnits;
+             U != UE; ++U)
+          ResourceGroupSubUnitMasks[i].setBit(SubUnits[U]);
+      }
     }
 
     ReservedCycles.resize(NumUnits, InvalidCycle);
@@ -2093,7 +2099,9 @@ unsigned SchedBoundary::getNextResourceCycleByInstance(unsigned InstanceIdx,
 /// scheduled.  Returns the next cycle and the index of the processor resource
 /// instance in the reserved cycles vector.
 std::pair<unsigned, unsigned>
-SchedBoundary::getNextResourceCycle(unsigned PIdx, unsigned Cycles) {
+SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx,
+                                    unsigned Cycles) {
+
   unsigned MinNextUnreserved = InvalidCycle;
   unsigned InstanceIdx = 0;
   unsigned StartIndex = ReservedCyclesIndex[PIdx];
@@ -2101,6 +2109,35 @@ SchedBoundary::getNextResourceCycle(unsigned PIdx, unsigned Cycles) {
   assert(NumberOfInstances > 0 &&
          "Cannot have zero instances of a ProcResource");
 
+  if (isUnbufferedGroup(PIdx)) {
+    // If any subunits are used by the instruction, report that the resource
+    // group is available at 0, effectively removing the group record from
+    // hazarding and basing the hazarding decisions on the subunit records.
+    // Otherwise, choose the first available instance from among the subunits.
+    // Specifications which assign cycles to both the subunits and the group or
+    // which use an unbuffered group with buffered subunits will appear to
+    // schedule strangely. In the first case, the additional cycles for the
+    // group will be ignored.  In the second, the group will be ignored
+    // entirely.
+    for (const MCWriteProcResEntry &PE :
+         make_range(SchedModel->getWriteProcResBegin(SC),
+                    SchedModel->getWriteProcResEnd(SC)))
+      if (ResourceGroupSubUnitMasks[PIdx][PE.ProcResourceIdx])
+        return std::make_pair(0u, StartIndex);
+
+    auto SubUnits = SchedModel->getProcResource(PIdx)->SubUnitsIdxBegin;
+    for (unsigned I = 0, End = NumberOfInstances; I < End; ++I) {
+      unsigned NextUnreserved, NextInstanceIdx;
+      std::tie(NextUnreserved, NextInstanceIdx) =
+          getNextResourceCycle(SC, SubUnits[I], Cycles);
+      if (MinNextUnreserved > NextUnreserved) {
+        InstanceIdx = NextInstanceIdx;
+        MinNextUnreserved = NextUnreserved;
+      }
+    }
+    return std::make_pair(MinNextUnreserved, InstanceIdx);
+  }
+
   for (unsigned I = StartIndex, End = StartIndex + NumberOfInstances; I < End;
        ++I) {
     unsigned NextUnreserved = getNextResourceCycleByInstance(I, Cycles);
@@ -2154,7 +2191,7 @@ bool SchedBoundary::checkHazard(SUnit *SU) {
       unsigned ResIdx = PE.ProcResourceIdx;
       unsigned Cycles = PE.Cycles;
       unsigned NRCycle, InstanceIdx;
-      std::tie(NRCycle, InstanceIdx) = getNextResourceCycle(ResIdx, Cycles);
+      std::tie(NRCycle, InstanceIdx) = getNextResourceCycle(SC, ResIdx, Cycles);
       if (NRCycle > CurrCycle) {
 #ifndef NDEBUG
         MaxObservedStall = std::max(Cycles, MaxObservedStall);
@@ -2304,8 +2341,8 @@ void SchedBoundary::incExecutedResources(unsigned PIdx, unsigned Count) {
 ///
 /// \return the next cycle at which the instruction may execute without
 /// oversubscribing resources.
-unsigned SchedBoundary::
-countResource(unsigned PIdx, unsigned Cycles, unsigned NextCycle) {
+unsigned SchedBoundary::countResource(const MCSchedClassDesc *SC, unsigned PIdx,
+                                      unsigned Cycles, unsigned NextCycle) {
   unsigned Factor = SchedModel->getResourceFactor(PIdx);
   unsigned Count = Factor * Cycles;
   LLVM_DEBUG(dbgs() << "  " << SchedModel->getResourceName(PIdx) << " +"
@@ -2327,7 +2364,7 @@ countResource(unsigned PIdx, unsigned Cycles, unsigned NextCycle) {
   }
   // For reserved resources, record the highest cycle using the resource.
   unsigned NextAvailable, InstanceIdx;
-  std::tie(NextAvailable, InstanceIdx) = getNextResourceCycle(PIdx, Cycles);
+  std::tie(NextAvailable, InstanceIdx) = getNextResourceCycle(SC, PIdx, Cycles);
   if (NextAvailable > CurrCycle) {
     LLVM_DEBUG(dbgs() << "  Resource conflict: "
                       << SchedModel->getResourceName(PIdx)
@@ -2407,7 +2444,7 @@ void SchedBoundary::bumpNode(SUnit *SU) {
            PI = SchedModel->getWriteProcResBegin(SC),
            PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
       unsigned RCycle =
-        countResource(PI->ProcResourceIdx, PI->Cycles, NextCycle);
+        countResource(SC, PI->ProcResourceIdx, PI->Cycles, NextCycle);
       if (RCycle > NextCycle)
         NextCycle = RCycle;
     }
@@ -2422,7 +2459,8 @@ void SchedBoundary::bumpNode(SUnit *SU) {
         unsigned PIdx = PI->ProcResourceIdx;
         if (SchedModel->getProcResource(PIdx)->BufferSize == 0) {
           unsigned ReservedUntil, InstanceIdx;
-          std::tie(ReservedUntil, InstanceIdx) = getNextResourceCycle(PIdx, 0);
+          std::tie(ReservedUntil, InstanceIdx) =
+              getNextResourceCycle(SC, PIdx, 0);
           if (isTop()) {
             ReservedCycles[InstanceIdx] =
                 std::max(ReservedUntil, NextCycle + PI->Cycles);
@@ -2780,6 +2818,8 @@ void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
 namespace llvm {
 
 /// Return true if this heuristic determines order.
+/// TODO: Consider refactor return type of these functions as integer or enum,
+/// as we may need to differentiate whether TryCand is better than Cand.
 bool tryLess(int TryVal, int CandVal,
              GenericSchedulerBase::SchedCandidate &TryCand,
              GenericSchedulerBase::SchedCandidate &Cand,
@@ -3138,34 +3178,35 @@ void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
 /// \param Cand provides the policy and current best candidate.
 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
 /// \param Zone describes the scheduled zone that we are extending, or nullptr
-//              if Cand is from a different zone than TryCand.
-void GenericScheduler::tryCandidate(SchedCandidate &Cand,
+///             if Cand is from a different zone than TryCand.
+/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
+bool GenericScheduler::tryCandidate(SchedCandidate &Cand,
                                     SchedCandidate &TryCand,
                                     SchedBoundary *Zone) const {
   // Initialize the candidate if needed.
   if (!Cand.isValid()) {
     TryCand.Reason = NodeOrder;
-    return;
+    return true;
   }
 
   // Bias PhysReg Defs and copies to their uses and defined respectively.
   if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                  biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
-    return;
+    return TryCand.Reason != NoCand;
 
   // Avoid exceeding the target's limit.
   if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
                                                Cand.RPDelta.Excess,
                                                TryCand, Cand, RegExcess, TRI,
                                                DAG->MF))
-    return;
+    return TryCand.Reason != NoCand;
 
   // Avoid increasing the max critical pressure in the scheduled region.
   if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax,
                                                Cand.RPDelta.CriticalMax,
                                                TryCand, Cand, RegCritical, TRI,
                                                DAG->MF))
-    return;
+    return TryCand.Reason != NoCand;
 
   // We only compare a subset of features when comparing nodes between
   // Top and Bottom boundary. Some properties are simply incomparable, in many
@@ -3179,12 +3220,12 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
     // heuristics to take precedence.
     if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
         tryLatency(TryCand, Cand, *Zone))
-      return;
+      return TryCand.Reason != NoCand;
 
     // Prioritize instructions that read unbuffered resources by stall cycles.
     if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
                 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
-      return;
+      return TryCand.Reason != NoCand;
   }
 
   // Keep clustered nodes together to encourage downstream peephole
@@ -3200,14 +3241,14 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
   if (tryGreater(TryCand.SU == TryCandNextClusterSU,
                  Cand.SU == CandNextClusterSU,
                  TryCand, Cand, Cluster))
-    return;
+    return TryCand.Reason != NoCand;
 
   if (SameBoundary) {
     // Weak edges are for clustering and other constraints.
     if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
                 getWeakLeft(Cand.SU, Cand.AtTop),
                 TryCand, Cand, Weak))
-      return;
+      return TryCand.Reason != NoCand;
   }
 
   // Avoid increasing the max pressure of the entire region.
@@ -3215,31 +3256,34 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
                                                Cand.RPDelta.CurrentMax,
                                                TryCand, Cand, RegMax, TRI,
                                                DAG->MF))
-    return;
+    return TryCand.Reason != NoCand;
 
   if (SameBoundary) {
     // Avoid critical resource consumption and balance the schedule.
     TryCand.initResourceDelta(DAG, SchedModel);
     if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
                 TryCand, Cand, ResourceReduce))
-      return;
+      return TryCand.Reason != NoCand;
    if (tryGreater(TryCand.ResDelta.DemandedResources,
                    Cand.ResDelta.DemandedResources,
                    TryCand, Cand, ResourceDemand))
-      return;
+      return TryCand.Reason != NoCand;
 
     // Avoid serializing long latency dependence chains.
    // For acyclic path limited loops, latency was already checked above.
     if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
         !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
-      return;
+      return TryCand.Reason != NoCand;
 
     // Fall through to original instruction order.
     if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
         || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
       TryCand.Reason = NodeOrder;
+      return true;
     }
   }
+
+  return false;
 }
 
 /// Pick the best candidate from the queue.
@@ -3261,8 +3305,7 @@ void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
     initCandidate(TryCand, SU, Zone.isTop(), RPTracker, TempTracker);
     // Pass SchedBoundary only when comparing nodes from the same boundary.
     SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
-    tryCandidate(Cand, TryCand, ZoneArg);
-    if (TryCand.Reason != NoCand) {
+    if (tryCandidate(Cand, TryCand, ZoneArg)) {
       // Initialize resource delta if needed in case future heuristics query it.
       if (TryCand.ResDelta == SchedResourceDelta())
         TryCand.initResourceDelta(DAG, SchedModel);
@@ -3340,8 +3383,7 @@ SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
   assert(TopCand.isValid());
   SchedCandidate Cand = BotCand;
   TopCand.Reason = NoCand;
-  tryCandidate(Cand, TopCand, nullptr);
-  if (TopCand.Reason != NoCand) {
+  if (tryCandidate(Cand, TopCand, nullptr)) {
     Cand.setBest(TopCand);
     LLVM_DEBUG(traceCandidate(Cand));
   }
@@ -3505,42 +3547,47 @@ void PostGenericScheduler::registerRoots() {
 ///
 /// \param Cand provides the policy and current best candidate.
 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
-void PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
+/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
+bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
                                         SchedCandidate &TryCand) {
   // Initialize the candidate if needed.
   if (!Cand.isValid()) {
     TryCand.Reason = NodeOrder;
-    return;
+    return true;
   }
 
   // Prioritize instructions that read unbuffered resources by stall cycles.
   if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
               Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
-    return;
+    return TryCand.Reason != NoCand;
 
   // Keep clustered nodes together.
   if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(),
                  Cand.SU == DAG->getNextClusterSucc(),
                  TryCand, Cand, Cluster))
-    return;
+    return TryCand.Reason != NoCand;
 
   // Avoid critical resource consumption and balance the schedule.
   if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
               TryCand, Cand, ResourceReduce))
-    return;
+    return TryCand.Reason != NoCand;
   if (tryGreater(TryCand.ResDelta.DemandedResources,
                  Cand.ResDelta.DemandedResources,
                  TryCand, Cand, ResourceDemand))
-    return;
+    return TryCand.Reason != NoCand;
 
   // Avoid serializing long latency dependence chains.
   if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) {
-    return;
+    return TryCand.Reason != NoCand;
  }
 
   // Fall through to original instruction order.
-  if (TryCand.SU->NodeNum < Cand.SU->NodeNum)
+  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
     TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  return false;
 }
 
 void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
@@ -3550,8 +3597,7 @@ void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
     TryCand.SU = SU;
     TryCand.AtTop = true;
     TryCand.initResourceDelta(DAG, SchedModel);
-    tryCandidate(Cand, TryCand);
-    if (TryCand.Reason != NoCand) {
+    if (tryCandidate(Cand, TryCand)) {
       Cand.setBest(TryCand);
       LLVM_DEBUG(traceCandidate(Cand));
     }
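
Note: the core of the unbuffered-group change above is ResourceGroupSubUnitMasks, one APInt per processor resource with a bit set for every subunit of a group, so getNextResourceCycle can cheaply test whether an instruction's writes already touch one of the group's subunits. Below is a minimal standalone sketch of that lookup; the four-resource layout and the subunit list {1, 2} are invented for illustration, whereas the real masks are built from the target's resource table in SchedBoundary::init.

// Standalone sketch, not part of the patch. Resource indices are invented:
// 0 is an unbuffered group whose subunits are resources 1 and 2.
#include "llvm/ADT/APInt.h"
#include <cstdio>
#include <vector>

int main() {
  const unsigned ResourceCount = 4;
  std::vector<llvm::APInt> SubUnitMasks(ResourceCount,
                                        llvm::APInt(ResourceCount, 0));

  // SchedBoundary::init does this per unbuffered group by walking
  // SubUnitsIdxBegin; here the subunit list is hardcoded.
  SubUnitMasks[0].setBit(1);
  SubUnitMasks[0].setBit(2);

  // getNextResourceCycle's fast path: a write to resource 2 is covered by
  // group 0's mask, so the group record reports "available at cycle 0" and
  // hazarding is decided by the subunit records alone.
  unsigned WrittenRes = 2;
  if (SubUnitMasks[0][WrittenRes])
    std::printf("resource %u is a subunit of group 0; group record ignored\n",
                WrittenRes);
  return 0;
}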

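The tryCandidate refactor is mechanical but worth spelling out: instead of returning void and making every caller re-read TryCand.Reason, the function now reports directly whether TryCand won. A toy model of the call-site change, using stand-in types rather than LLVM's real SchedCandidate:

// Toy model, not LLVM's real types: just enough to show the old and new
// call-site shape around tryCandidate.
#include <cstdio>

enum CandReason { NoCand, NodeOrder };

struct SchedCandidate {
  CandReason Reason = NoCand;
  int NodeNum = -1;
  bool isValid() const { return NodeNum >= 0; }
};

// After the patch: returns true exactly when TryCand beats Cand, i.e. when
// some heuristic set Reason to anything other than NoCand.
bool tryCandidate(const SchedCandidate &Cand, SchedCandidate &TryCand) {
  if (!Cand.isValid() || TryCand.NodeNum < Cand.NodeNum) {
    TryCand.Reason = NodeOrder; // fell through to source order
    return true;
  }
  return false; // not better; TryCand.Reason stays NoCand
}

int main() {
  SchedCandidate Cand, TryCand;
  TryCand.NodeNum = 3;
  // Old shape: tryCandidate(Cand, TryCand); if (TryCand.Reason != NoCand) ...
  if (tryCandidate(Cand, TryCand)) // new shape
    Cand = TryCand;                // stands in for Cand.setBest(TryCand)
  std::printf("best node: %d\n", Cand.NodeNum);
  return 0;
}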