diff options
Diffstat (limited to 'contrib/llvm-project/llvm/lib/CodeGen/WindowScheduler.cpp')
| -rw-r--r-- | contrib/llvm-project/llvm/lib/CodeGen/WindowScheduler.cpp | 702 |
1 files changed, 702 insertions, 0 deletions
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/WindowScheduler.cpp b/contrib/llvm-project/llvm/lib/CodeGen/WindowScheduler.cpp new file mode 100644 index 000000000000..0777480499e5 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/CodeGen/WindowScheduler.cpp @@ -0,0 +1,702 @@ +//======----------- WindowScheduler.cpp - window scheduler -------------======// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// An implementation of the Window Scheduling software pipelining algorithm. +// +// The fundamental concept of the window scheduling algorithm involves folding +// the original MBB at a specific position, followed by list scheduling on the +// folded MIs. The optimal scheduling result is then chosen from various folding +// positions as the final scheduling outcome. +// +// The primary challenge in this algorithm lies in generating the folded MIs and +// establishing their dependencies. We have innovatively employed a new MBB, +// created by copying the original MBB three times, known as TripleMBB. This +// TripleMBB enables the convenient implementation of MI folding and dependency +// establishment. To facilitate the algorithm's implementation, we have also +// devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle. +// +// Another challenge in the algorithm is the scheduling of phis. Semantically, +// it is difficult to place the phis in the window and perform list scheduling. +// Therefore, we schedule these phis separately after each list scheduling. +// +// The provided implementation is designed for use before the Register Allocator +// (RA). If the target requires implementation after RA, it is recommended to +// reimplement analyseII(), schedulePhi(), and expand(). Additionally, +// target-specific logic can be added in initialize(), preProcess(), and +// postProcess(). +// +// Lastly, it is worth mentioning that getSearchIndexes() is an important +// function. We have experimented with more complex heuristics on downstream +// target and achieved favorable results. +// +//===----------------------------------------------------------------------===// +#include "llvm/CodeGen/WindowScheduler.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePipeliner.h" +#include "llvm/CodeGen/ModuloSchedule.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/TimeProfiler.h" + +using namespace llvm; + +#define DEBUG_TYPE "pipeliner" + +namespace { +STATISTIC(NumTryWindowSchedule, + "Number of loops that we attempt to use window scheduling"); +STATISTIC(NumTryWindowSearch, + "Number of times that we run list schedule in the window scheduling"); +STATISTIC(NumWindowSchedule, + "Number of loops that we successfully use window scheduling"); +STATISTIC(NumFailAnalyseII, + "Window scheduling abort due to the failure of the II analysis"); + +cl::opt<unsigned> + WindowSearchNum("window-search-num", + cl::desc("The number of searches per loop in the window " + "algorithm. 0 means no search number limit."), + cl::Hidden, cl::init(6)); + +cl::opt<unsigned> WindowSearchRatio( + "window-search-ratio", + cl::desc("The ratio of searches per loop in the window algorithm. 100 " + "means search all positions in the loop, while 0 means not " + "performing any search."), + cl::Hidden, cl::init(40)); + +cl::opt<unsigned> WindowIICoeff( + "window-ii-coeff", + cl::desc( + "The coefficient used when initializing II in the window algorithm."), + cl::Hidden, cl::init(5)); + +cl::opt<unsigned> WindowRegionLimit( + "window-region-limit", + cl::desc( + "The lower limit of the scheduling region in the window algorithm."), + cl::Hidden, cl::init(3)); + +cl::opt<unsigned> WindowDiffLimit( + "window-diff-limit", + cl::desc("The lower limit of the difference between best II and base II in " + "the window algorithm. If the difference is smaller than " + "this lower limit, window scheduling will not be performed."), + cl::Hidden, cl::init(2)); +} // namespace + +// WindowIILimit serves as an indicator of abnormal scheduling results and could +// potentially be referenced by the derived target window scheduler. +cl::opt<unsigned> + WindowIILimit("window-ii-limit", + cl::desc("The upper limit of II in the window algorithm."), + cl::Hidden, cl::init(1000)); + +WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML) + : Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML), + Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()), + TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) { + TripleDAG = std::unique_ptr<ScheduleDAGInstrs>( + createMachineScheduler(/*OnlyBuildGraph=*/true)); +} + +bool WindowScheduler::run() { + if (!initialize()) { + LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n"); + return false; + } + // The window algorithm is time-consuming, and its compilation time should be + // taken into consideration. + TimeTraceScope Scope("WindowSearch"); + ++NumTryWindowSchedule; + // Performing the relevant processing before window scheduling. + preProcess(); + // The main window scheduling begins. + std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler()); + auto SearchIndexes = getSearchIndexes(WindowSearchNum, WindowSearchRatio); + for (unsigned Idx : SearchIndexes) { + OriToCycle.clear(); + ++NumTryWindowSearch; + // The scheduling starts with non-phi instruction, so SchedPhiNum needs to + // be added to Idx. + unsigned Offset = Idx + SchedPhiNum; + auto Range = getScheduleRange(Offset, SchedInstrNum); + SchedDAG->startBlock(MBB); + SchedDAG->enterRegion(MBB, Range.begin(), Range.end(), SchedInstrNum); + SchedDAG->schedule(); + LLVM_DEBUG(SchedDAG->dump()); + unsigned II = analyseII(*SchedDAG, Offset); + if (II == WindowIILimit) { + restoreTripleMBB(); + LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n"); + ++NumFailAnalyseII; + continue; + } + schedulePhi(Offset, II); + updateScheduleResult(Offset, II); + restoreTripleMBB(); + LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is " + << II << ".\n"); + } + // Performing the relevant processing after window scheduling. + postProcess(); + // Check whether the scheduling result is valid. + if (!isScheduleValid()) { + LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n"); + return false; + } + LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset + << " and Best II is " << BestII << ".\n"); + // Expand the scheduling result to prologue, kernel, and epilogue. + expand(); + ++NumWindowSchedule; + return true; +} + +ScheduleDAGInstrs * +WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) { + return OnlyBuildGraph + ? new ScheduleDAGMI( + Context, std::make_unique<PostGenericScheduler>(Context), + true) + : Context->PassConfig->createMachineScheduler(Context); +} + +bool WindowScheduler::initialize() { + if (!Subtarget->enableWindowScheduler()) { + LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n"); + return false; + } + // Initialized the member variables used by window algorithm. + OriMIs.clear(); + TriMIs.clear(); + TriToOri.clear(); + OriToCycle.clear(); + SchedResult.clear(); + SchedPhiNum = 0; + SchedInstrNum = 0; + BestII = UINT_MAX; + BestOffset = 0; + BaseII = 0; + // List scheduling used in the window algorithm depends on LiveIntervals. + if (!Context->LIS) { + LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n"); + return false; + } + // Check each MI in MBB. + SmallSet<Register, 8> PrevDefs; + SmallSet<Register, 8> PrevUses; + auto IsLoopCarried = [&](MachineInstr &Phi) { + // Two cases are checked here: (1)The virtual register defined by the + // preceding phi is used by the succeeding phi;(2)The preceding phi uses the + // virtual register defined by the succeeding phi. + if (PrevUses.count(Phi.getOperand(0).getReg())) + return true; + PrevDefs.insert(Phi.getOperand(0).getReg()); + for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) { + if (PrevDefs.count(Phi.getOperand(I).getReg())) + return true; + PrevUses.insert(Phi.getOperand(I).getReg()); + } + return false; + }; + auto PLI = TII->analyzeLoopForPipelining(MBB); + for (auto &MI : *MBB) { + if (MI.isMetaInstruction() || MI.isTerminator()) + continue; + if (MI.isPHI()) { + if (IsLoopCarried(MI)) { + LLVM_DEBUG(dbgs() << "Loop carried phis are not supported yet!\n"); + return false; + } + ++SchedPhiNum; + ++BestOffset; + } else + ++SchedInstrNum; + if (TII->isSchedulingBoundary(MI, MBB, *MF)) { + LLVM_DEBUG( + dbgs() << "Boundary MI is not allowed in window scheduling!\n"); + return false; + } + if (PLI->shouldIgnoreForPipelining(&MI)) { + LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in " + "window scheduling!\n"); + return false; + } + for (auto &Def : MI.all_defs()) + if (Def.isReg() && Def.getReg().isPhysical()) + return false; + } + if (SchedInstrNum <= WindowRegionLimit) { + LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n"); + return false; + } + return true; +} + +void WindowScheduler::preProcess() { + // Prior to window scheduling, it's necessary to backup the original MBB, + // generate a new TripleMBB, and build a TripleDAG based on the TripleMBB. + backupMBB(); + generateTripleMBB(); + TripleDAG->startBlock(MBB); + TripleDAG->enterRegion( + MBB, MBB->begin(), MBB->getFirstTerminator(), + std::distance(MBB->begin(), MBB->getFirstTerminator())); + TripleDAG->buildSchedGraph(Context->AA); +} + +void WindowScheduler::postProcess() { + // After window scheduling, it's necessary to clear the TripleDAG and restore + // to the original MBB. + TripleDAG->exitRegion(); + TripleDAG->finishBlock(); + restoreMBB(); +} + +void WindowScheduler::backupMBB() { + for (auto &MI : MBB->instrs()) + OriMIs.push_back(&MI); + // Remove MIs and the corresponding live intervals. + for (auto &MI : make_early_inc_range(*MBB)) { + Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true); + MBB->remove(&MI); + } +} + +void WindowScheduler::restoreMBB() { + // Erase MIs and the corresponding live intervals. + for (auto &MI : make_early_inc_range(*MBB)) { + Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true); + MI.eraseFromParent(); + } + // Restore MBB to the state before window scheduling. + for (auto *MI : OriMIs) + MBB->push_back(MI); + updateLiveIntervals(); +} + +void WindowScheduler::generateTripleMBB() { + const unsigned DuplicateNum = 3; + TriMIs.clear(); + TriToOri.clear(); + assert(OriMIs.size() > 0 && "The Original MIs were not backed up!"); + // Step 1: Performing the first copy of MBB instructions, excluding + // terminators. At the same time, we back up the anti-register of phis. + // DefPairs hold the old and new define register pairs. + DenseMap<Register, Register> DefPairs; + for (auto *MI : OriMIs) { + if (MI->isMetaInstruction() || MI->isTerminator()) + continue; + if (MI->isPHI()) + if (Register AntiReg = getAntiRegister(MI)) + DefPairs[MI->getOperand(0).getReg()] = AntiReg; + auto *NewMI = MF->CloneMachineInstr(MI); + MBB->push_back(NewMI); + TriMIs.push_back(NewMI); + TriToOri[NewMI] = MI; + } + // Step 2: Performing the remaining two copies of MBB instructions excluding + // phis, and the last one contains terminators. At the same time, registers + // are updated accordingly. + for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) { + for (auto *MI : OriMIs) { + if (MI->isPHI() || MI->isMetaInstruction() || + (MI->isTerminator() && Cnt < DuplicateNum - 1)) + continue; + auto *NewMI = MF->CloneMachineInstr(MI); + DenseMap<Register, Register> NewDefs; + // New defines are updated. + for (auto MO : NewMI->all_defs()) + if (MO.isReg() && MO.getReg().isVirtual()) { + Register NewDef = + MRI->createVirtualRegister(MRI->getRegClass(MO.getReg())); + NewMI->substituteRegister(MO.getReg(), NewDef, 0, *TRI); + NewDefs[MO.getReg()] = NewDef; + } + // New uses are updated. + for (auto DefRegPair : DefPairs) + if (NewMI->readsRegister(DefRegPair.first, TRI)) { + Register NewUse = DefRegPair.second; + // Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3': + // + // BB.3: DefPairs + // ================================== + // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] (%1,%7) + // ... + // ================================== + // ... + // %4 = sub i32 %1, %3 + // ... + // %7 = add i32 %5, %6 + // ... + // ---------------------------------- + // ... + // %8 = sub i32 %7, %3 (%1,%7),(%4,%8) + // ... + // %9 = add i32 %5, %6 (%1,%7),(%4,%8),(%7,%9) + // ... + // ---------------------------------- + // ... + // %10 = sub i32 %9, %3 (%1,%7),(%4,%10),(%7,%9) + // ... ^ + // %11 = add i32 %5, %6 (%1,%7),(%4,%10),(%7,%11) + // ... + // ================================== + // < Terminators > + // ================================== + if (DefPairs.count(NewUse)) + NewUse = DefPairs[NewUse]; + NewMI->substituteRegister(DefRegPair.first, NewUse, 0, *TRI); + } + // DefPairs is updated at last. + for (auto &NewDef : NewDefs) + DefPairs[NewDef.first] = NewDef.second; + MBB->push_back(NewMI); + TriMIs.push_back(NewMI); + TriToOri[NewMI] = MI; + } + } + // Step 3: The registers used by phis are updated, and they are generated in + // the third copy of MBB. + // In the privious example, the old phi is: + // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] + // The new phi is: + // %1 = phi i32 [%2, %BB.1], [%11, %BB.3] + for (auto &Phi : MBB->phis()) { + for (auto DefRegPair : DefPairs) + if (Phi.readsRegister(DefRegPair.first, TRI)) + Phi.substituteRegister(DefRegPair.first, DefRegPair.second, 0, *TRI); + } + updateLiveIntervals(); +} + +void WindowScheduler::restoreTripleMBB() { + // After list scheduling, the MBB is restored in one traversal. + for (size_t I = 0; I < TriMIs.size(); ++I) { + auto *MI = TriMIs[I]; + auto OldPos = MBB->begin(); + std::advance(OldPos, I); + auto CurPos = MI->getIterator(); + if (CurPos != OldPos) { + MBB->splice(OldPos, MBB, CurPos); + Context->LIS->handleMove(*MI, /*UpdateFlags=*/false); + } + } +} + +SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum, + unsigned SearchRatio) { + // We use SearchRatio to get the index range, and then evenly get the indexes + // according to the SearchNum. This is a simple huristic. Depending on the + // characteristics of the target, more complex algorithms can be used for both + // performance and compilation time. + assert(SearchRatio <= 100 && "SearchRatio should be equal or less than 100!"); + unsigned MaxIdx = SchedInstrNum * SearchRatio / 100; + unsigned Step = SearchNum > 0 && SearchNum <= MaxIdx ? MaxIdx / SearchNum : 1; + SmallVector<unsigned> SearchIndexes; + for (unsigned Idx = 0; Idx < MaxIdx; Idx += Step) + SearchIndexes.push_back(Idx); + return SearchIndexes; +} + +int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) { + // Sometimes MaxDepth is 0, so it should be limited to the minimum of 1. + unsigned MaxDepth = 1; + for (auto &SU : DAG.SUnits) + MaxDepth = std::max(SU.getDepth() + SU.Latency, MaxDepth); + return MaxDepth * WindowIICoeff; +} + +int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG, + unsigned Offset) { + int InitII = getEstimatedII(DAG); + ResourceManager RM(Subtarget, &DAG); + RM.init(InitII); + // ResourceManager and DAG are used to calculate the maximum cycle for the + // scheduled MIs. Since MIs in the Region have already been scheduled, the + // emit cycles can be estimated in order here. + int CurCycle = 0; + auto Range = getScheduleRange(Offset, SchedInstrNum); + for (auto &MI : Range) { + auto *SU = DAG.getSUnit(&MI); + int ExpectCycle = CurCycle; + // The predecessors of current MI determine its earliest issue cycle. + for (auto &Pred : SU->Preds) { + if (Pred.isWeak()) + continue; + auto *PredMI = Pred.getSUnit()->getInstr(); + int PredCycle = getOriCycle(PredMI); + ExpectCycle = std::max(ExpectCycle, PredCycle + (int)Pred.getLatency()); + } + // ResourceManager can be used to detect resource conflicts between the + // current MI and the previously inserted MIs. + while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) { + ++CurCycle; + if (CurCycle == (int)WindowIILimit) + return CurCycle; + } + RM.reserveResources(*SU, CurCycle); + OriToCycle[getOriMI(&MI)] = CurCycle; + LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S." + << getOriStage(getOriMI(&MI), Offset) << "]: " << MI); + } + LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n"); + return CurCycle; +} + +// By utilizing TripleDAG, we can easily establish dependencies between A and B. +// Based on the MaxCycle and the issue cycle of A and B, we can determine +// whether it is necessary to add a stall cycle. This is because, without +// inserting the stall cycle, the latency constraint between A and B cannot be +// satisfied. The details are as follows: +// +// New MBB: +// ======================================== +// < Phis > +// ======================================== (sliding direction) +// MBB copy 1 | +// V +// +// ~~~~~~~~~~~~~~~~~~~|~~~~~~~~~~~~~~~~~~~~ ----schedule window----- +// | | +// ===================V==================== | +// MBB copy 2 < MI B > | +// | +// < MI A > V +// ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~ ------------------------ +// : +// ===================V==================== +// MBB copy 3 < MI B'> +// +// +// +// +// ======================================== +// < Terminators > +// ======================================== +int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) { + int MaxStallCycle = 0; + auto Range = getScheduleRange(Offset, SchedInstrNum); + for (auto &MI : Range) { + auto *SU = TripleDAG->getSUnit(&MI); + int DefCycle = getOriCycle(&MI); + for (auto &Succ : SU->Succs) { + if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU) + continue; + // If the expected cycle does not exceed MaxCycle, no check is needed. + if (DefCycle + (int)Succ.getLatency() <= MaxCycle) + continue; + // If the cycle of the scheduled MI A is less than that of the scheduled + // MI B, the scheduling will fail because the lifetime of the + // corresponding register exceeds II. + auto *SuccMI = Succ.getSUnit()->getInstr(); + int UseCycle = getOriCycle(SuccMI); + if (DefCycle < UseCycle) + return WindowIILimit; + // Get the stall cycle introduced by the register between two trips. + int StallCycle = DefCycle + (int)Succ.getLatency() - MaxCycle - UseCycle; + MaxStallCycle = std::max(MaxStallCycle, StallCycle); + } + } + LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n"); + return MaxStallCycle; +} + +unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) { + LLVM_DEBUG(dbgs() << "Start analyzing II:\n"); + int MaxCycle = calculateMaxCycle(DAG, Offset); + if (MaxCycle == (int)WindowIILimit) + return MaxCycle; + int StallCycle = calculateStallCycle(Offset, MaxCycle); + if (StallCycle == (int)WindowIILimit) + return StallCycle; + // The value of II is equal to the maximum execution cycle plus 1. + return MaxCycle + StallCycle + 1; +} + +void WindowScheduler::schedulePhi(int Offset, unsigned &II) { + LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n"); + for (auto &Phi : MBB->phis()) { + int LateCycle = INT_MAX; + auto *SU = TripleDAG->getSUnit(&Phi); + for (auto &Succ : SU->Succs) { + // Phi doesn't have any Anti successors. + if (Succ.getKind() != SDep::Data) + continue; + // Phi is scheduled before the successor of stage 0. The issue cycle of + // phi is the latest cycle in this interval. + auto *SuccMI = Succ.getSUnit()->getInstr(); + int Cycle = getOriCycle(SuccMI); + if (getOriStage(getOriMI(SuccMI), Offset) == 0) + LateCycle = std::min(LateCycle, Cycle); + } + // The anti-dependency of phi need to be handled separately in the same way. + if (Register AntiReg = getAntiRegister(&Phi)) { + auto *AntiMI = MRI->getVRegDef(AntiReg); + // AntiReg may be defined outside the kernel MBB. + if (AntiMI->getParent() == MBB) { + auto AntiCycle = getOriCycle(AntiMI); + if (getOriStage(getOriMI(AntiMI), Offset) == 0) + LateCycle = std::min(LateCycle, AntiCycle); + } + } + // If there is no limit to the late cycle, a default value is given. + if (LateCycle == INT_MAX) + LateCycle = (int)(II - 1); + LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi); + // The issue cycle of phi is set to the latest cycle in the interval. + auto *OriPhi = getOriMI(&Phi); + OriToCycle[OriPhi] = LateCycle; + } +} + +DenseMap<MachineInstr *, int> WindowScheduler::getIssueOrder(unsigned Offset, + unsigned II) { + // At each issue cycle, phi is placed before MIs in stage 0. So the simplest + // way is to put phi at the beginning of the current cycle. + DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs; + auto Range = getScheduleRange(Offset, SchedInstrNum); + for (auto &Phi : MBB->phis()) + CycleToMIs[getOriCycle(&Phi)].push_back(getOriMI(&Phi)); + for (auto &MI : Range) + CycleToMIs[getOriCycle(&MI)].push_back(getOriMI(&MI)); + // Each MI is assigned a separate ordered Id, which is used as a sort marker + // in the following expand process. + DenseMap<MachineInstr *, int> IssueOrder; + int Id = 0; + for (int Cycle = 0; Cycle < (int)II; ++Cycle) { + if (!CycleToMIs.count(Cycle)) + continue; + for (auto *MI : CycleToMIs[Cycle]) + IssueOrder[MI] = Id++; + } + return IssueOrder; +} + +void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) { + // At the first update, Offset is equal to SchedPhiNum. At this time, only + // BestII, BestOffset, and BaseII need to be updated. + if (Offset == SchedPhiNum) { + BestII = II; + BestOffset = SchedPhiNum; + BaseII = II; + return; + } + // The update will only continue if the II is smaller than BestII and the II + // is sufficiently small. + if ((II >= BestII) || (II + WindowDiffLimit > BaseII)) + return; + BestII = II; + BestOffset = Offset; + // Record the result of the current list scheduling, noting that each MI is + // stored unordered in SchedResult. + SchedResult.clear(); + auto IssueOrder = getIssueOrder(Offset, II); + for (auto &Pair : OriToCycle) { + assert(IssueOrder.count(Pair.first) && "Cannot find original MI!"); + SchedResult.push_back(std::make_tuple(Pair.first, Pair.second, + getOriStage(Pair.first, Offset), + IssueOrder[Pair.first])); + } +} + +void WindowScheduler::expand() { + // The MIs in the SchedResult are sorted by the issue order ID. + llvm::stable_sort(SchedResult, + [](const std::tuple<MachineInstr *, int, int, int> &A, + const std::tuple<MachineInstr *, int, int, int> &B) { + return std::get<3>(A) < std::get<3>(B); + }); + // Use the scheduling infrastructure for expansion, noting that InstrChanges + // is not supported here. + DenseMap<MachineInstr *, int> Cycles, Stages; + std::vector<MachineInstr *> OrderedInsts; + for (auto &Info : SchedResult) { + auto *MI = std::get<0>(Info); + OrderedInsts.push_back(MI); + Cycles[MI] = std::get<1>(Info); + Stages[MI] = std::get<2>(Info); + LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI] + << "]: " << *MI); + } + ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles), + std::move(Stages)); + ModuloScheduleExpander MSE(*MF, MS, *Context->LIS, + ModuloScheduleExpander::InstrChangesTy()); + MSE.expand(); + MSE.cleanup(); +} + +void WindowScheduler::updateLiveIntervals() { + SmallVector<Register, 128> UsedRegs; + for (MachineInstr &MI : *MBB) + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || MO.getReg() == 0) + continue; + Register Reg = MO.getReg(); + if (!is_contained(UsedRegs, Reg)) + UsedRegs.push_back(Reg); + } + Context->LIS->repairIntervalsInRange(MBB, MBB->begin(), MBB->end(), UsedRegs); +} + +iterator_range<MachineBasicBlock::iterator> +WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) { + auto RegionBegin = MBB->begin(); + std::advance(RegionBegin, Offset); + auto RegionEnd = RegionBegin; + std::advance(RegionEnd, Num); + return make_range(RegionBegin, RegionEnd); +} + +int WindowScheduler::getOriCycle(MachineInstr *NewMI) { + assert(TriToOri.count(NewMI) && "Cannot find original MI!"); + auto *OriMI = TriToOri[NewMI]; + assert(OriToCycle.count(OriMI) && "Cannot find schedule cycle!"); + return OriToCycle[OriMI]; +} + +MachineInstr *WindowScheduler::getOriMI(MachineInstr *NewMI) { + assert(TriToOri.count(NewMI) && "Cannot find original MI!"); + return TriToOri[NewMI]; +} + +unsigned WindowScheduler::getOriStage(MachineInstr *OriMI, unsigned Offset) { + assert(llvm::find(OriMIs, OriMI) != OriMIs.end() && + "Cannot find OriMI in OriMIs!"); + // If there is no instruction fold, all MI stages are 0. + if (Offset == SchedPhiNum) + return 0; + // For those MIs with an ID less than the Offset, their stages are set to 0, + // while the rest are set to 1. + unsigned Id = 0; + for (auto *MI : OriMIs) { + if (MI->isMetaInstruction()) + continue; + if (MI == OriMI) + break; + ++Id; + } + return Id >= (size_t)Offset ? 1 : 0; +} + +Register WindowScheduler::getAntiRegister(MachineInstr *Phi) { + assert(Phi->isPHI() && "Expecting PHI!"); + Register AntiReg; + for (auto MO : Phi->uses()) { + if (MO.isReg()) + AntiReg = MO.getReg(); + else if (MO.isMBB() && MO.getMBB() == MBB) + return AntiReg; + } + return 0; +} |
