diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2019-01-19 10:01:25 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2019-01-19 10:01:25 +0000 |
commit | d8e91e46262bc44006913e6796843909f1ac7bcd (patch) | |
tree | 7d0c143d9b38190e0fa0180805389da22cd834c5 /lib/CodeGen/MachinePipeliner.cpp | |
parent | b7eb8e35e481a74962664b63dfb09483b200209a (diff) |
Notes
Diffstat (limited to 'lib/CodeGen/MachinePipeliner.cpp')
-rw-r--r-- | lib/CodeGen/MachinePipeliner.cpp | 818 |
1 files changed, 174 insertions, 644 deletions
diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp index 9bb00aaef86d..4d451bdd7f69 100644 --- a/lib/CodeGen/MachinePipeliner.cpp +++ b/lib/CodeGen/MachinePipeliner.cpp @@ -9,34 +9,6 @@ // // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner. // -// Software pipelining (SWP) is an instruction scheduling technique for loops -// that overlap loop iterations and exploits ILP via a compiler transformation. -// -// Swing Modulo Scheduling is an implementation of software pipelining -// that generates schedules that are near optimal in terms of initiation -// interval, register requirements, and stage count. See the papers: -// -// "Swing Modulo Scheduling: A Lifetime-Sensitive Approach", by J. Llosa, -// A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Proceedings of the 1996 -// Conference on Parallel Architectures and Compilation Techiniques. -// -// "Lifetime-Sensitive Modulo Scheduling in a Production Environment", by J. -// Llosa, E. Ayguade, A. Gonzalez, M. Valero, and J. Eckhardt. In IEEE -// Transactions on Computers, Vol. 50, No. 3, 2001. -// -// "An Implementation of Swing Modulo Scheduling With Extensions for -// Superblocks", by T. Lattner, Master's Thesis, University of Illinois at -// Urbana-Chambpain, 2005. -// -// -// The SMS algorithm consists of three main steps after computing the minimal -// initiation interval (MII). -// 1) Analyze the dependence graph and compute information about each -// instruction in the graph. -// 2) Order the nodes (instructions) by priority based upon the heuristics -// described in the algorithm. -// 3) Attempt to schedule the nodes in the specified order using the MII. -// // This SMS implementation is a target-independent back-end pass. When enabled, // the pass runs just prior to the register allocation pass, while the machine // IR is in SSA form. If software pipelining is successful, then the original @@ -83,13 +55,11 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePipeliner.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/ScheduleDAGMutation.h" -#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -171,552 +141,15 @@ static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::init(false), cl::ZeroOrMore, cl::desc("Ignore RecMII")); -namespace { - -class NodeSet; -class SMSchedule; - -/// The main class in the implementation of the target independent -/// software pipeliner pass. -class MachinePipeliner : public MachineFunctionPass { -public: - MachineFunction *MF = nullptr; - const MachineLoopInfo *MLI = nullptr; - const MachineDominatorTree *MDT = nullptr; - const InstrItineraryData *InstrItins; - const TargetInstrInfo *TII = nullptr; - RegisterClassInfo RegClassInfo; - -#ifndef NDEBUG - static int NumTries; -#endif - - /// Cache the target analysis information about the loop. - struct LoopInfo { - MachineBasicBlock *TBB = nullptr; - MachineBasicBlock *FBB = nullptr; - SmallVector<MachineOperand, 4> BrCond; - MachineInstr *LoopInductionVar = nullptr; - MachineInstr *LoopCompare = nullptr; - }; - LoopInfo LI; - - static char ID; - - MachinePipeliner() : MachineFunctionPass(ID) { - initializeMachinePipelinerPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AAResultsWrapperPass>(); - AU.addPreserved<AAResultsWrapperPass>(); - AU.addRequired<MachineLoopInfo>(); - AU.addRequired<MachineDominatorTree>(); - AU.addRequired<LiveIntervals>(); - MachineFunctionPass::getAnalysisUsage(AU); - } - -private: - void preprocessPhiNodes(MachineBasicBlock &B); - bool canPipelineLoop(MachineLoop &L); - bool scheduleLoop(MachineLoop &L); - bool swingModuloScheduler(MachineLoop &L); -}; - -/// This class builds the dependence graph for the instructions in a loop, -/// and attempts to schedule the instructions using the SMS algorithm. -class SwingSchedulerDAG : public ScheduleDAGInstrs { - MachinePipeliner &Pass; - /// The minimum initiation interval between iterations for this schedule. - unsigned MII = 0; - /// Set to true if a valid pipelined schedule is found for the loop. - bool Scheduled = false; - MachineLoop &Loop; - LiveIntervals &LIS; - const RegisterClassInfo &RegClassInfo; - - /// A toplogical ordering of the SUnits, which is needed for changing - /// dependences and iterating over the SUnits. - ScheduleDAGTopologicalSort Topo; - - struct NodeInfo { - int ASAP = 0; - int ALAP = 0; - int ZeroLatencyDepth = 0; - int ZeroLatencyHeight = 0; - - NodeInfo() = default; - }; - /// Computed properties for each node in the graph. - std::vector<NodeInfo> ScheduleInfo; - - enum OrderKind { BottomUp = 0, TopDown = 1 }; - /// Computed node ordering for scheduling. - SetVector<SUnit *> NodeOrder; - - using NodeSetType = SmallVector<NodeSet, 8>; - using ValueMapTy = DenseMap<unsigned, unsigned>; - using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>; - using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>; - - /// Instructions to change when emitting the final schedule. - DenseMap<SUnit *, std::pair<unsigned, int64_t>> InstrChanges; - - /// We may create a new instruction, so remember it because it - /// must be deleted when the pass is finished. - SmallPtrSet<MachineInstr *, 4> NewMIs; - - /// Ordered list of DAG postprocessing steps. - std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations; - - /// Helper class to implement Johnson's circuit finding algorithm. - class Circuits { - std::vector<SUnit> &SUnits; - SetVector<SUnit *> Stack; - BitVector Blocked; - SmallVector<SmallPtrSet<SUnit *, 4>, 10> B; - SmallVector<SmallVector<int, 4>, 16> AdjK; - unsigned NumPaths; - static unsigned MaxPaths; - - public: - Circuits(std::vector<SUnit> &SUs) - : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {} - - /// Reset the data structures used in the circuit algorithm. - void reset() { - Stack.clear(); - Blocked.reset(); - B.assign(SUnits.size(), SmallPtrSet<SUnit *, 4>()); - NumPaths = 0; - } - - void createAdjacencyStructure(SwingSchedulerDAG *DAG); - bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false); - void unblock(int U); - }; - -public: - SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis, - const RegisterClassInfo &rci) - : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis), - RegClassInfo(rci), Topo(SUnits, &ExitSU) { - P.MF->getSubtarget().getSMSMutations(Mutations); - } - - void schedule() override; - void finishBlock() override; - - /// Return true if the loop kernel has been scheduled. - bool hasNewSchedule() { return Scheduled; } - - /// Return the earliest time an instruction may be scheduled. - int getASAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ASAP; } - - /// Return the latest time an instruction my be scheduled. - int getALAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ALAP; } - - /// The mobility function, which the number of slots in which - /// an instruction may be scheduled. - int getMOV(SUnit *Node) { return getALAP(Node) - getASAP(Node); } - - /// The depth, in the dependence graph, for a node. - unsigned getDepth(SUnit *Node) { return Node->getDepth(); } - - /// The maximum unweighted length of a path from an arbitrary node to the - /// given node in which each edge has latency 0 - int getZeroLatencyDepth(SUnit *Node) { - return ScheduleInfo[Node->NodeNum].ZeroLatencyDepth; - } - - /// The height, in the dependence graph, for a node. - unsigned getHeight(SUnit *Node) { return Node->getHeight(); } - - /// The maximum unweighted length of a path from the given node to an - /// arbitrary node in which each edge has latency 0 - int getZeroLatencyHeight(SUnit *Node) { - return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight; - } - - /// Return true if the dependence is a back-edge in the data dependence graph. - /// Since the DAG doesn't contain cycles, we represent a cycle in the graph - /// using an anti dependence from a Phi to an instruction. - bool isBackedge(SUnit *Source, const SDep &Dep) { - if (Dep.getKind() != SDep::Anti) - return false; - return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI(); - } - - bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true); - - /// The distance function, which indicates that operation V of iteration I - /// depends on operations U of iteration I-distance. - unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) { - // Instructions that feed a Phi have a distance of 1. Computing larger - // values for arrays requires data dependence information. - if (V->getInstr()->isPHI() && Dep.getKind() == SDep::Anti) - return 1; - return 0; - } - - /// Set the Minimum Initiation Interval for this schedule attempt. - void setMII(unsigned mii) { MII = mii; } - - void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule); - - void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs); - - /// Return the new base register that was stored away for the changed - /// instruction. - unsigned getInstrBaseReg(SUnit *SU) { - DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It = - InstrChanges.find(SU); - if (It != InstrChanges.end()) - return It->second.first; - return 0; - } - - void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) { - Mutations.push_back(std::move(Mutation)); - } - -private: - void addLoopCarriedDependences(AliasAnalysis *AA); - void updatePhiDependences(); - void changeDependences(); - unsigned calculateResMII(); - unsigned calculateRecMII(NodeSetType &RecNodeSets); - void findCircuits(NodeSetType &NodeSets); - void fuseRecs(NodeSetType &NodeSets); - void removeDuplicateNodes(NodeSetType &NodeSets); - void computeNodeFunctions(NodeSetType &NodeSets); - void registerPressureFilter(NodeSetType &NodeSets); - void colocateNodeSets(NodeSetType &NodeSets); - void checkNodeSets(NodeSetType &NodeSets); - void groupRemainingNodes(NodeSetType &NodeSets); - void addConnectedNodes(SUnit *SU, NodeSet &NewSet, - SetVector<SUnit *> &NodesAdded); - void computeNodeOrder(NodeSetType &NodeSets); - void checkValidNodeOrder(const NodeSetType &Circuits) const; - bool schedulePipeline(SMSchedule &Schedule); - void generatePipelinedLoop(SMSchedule &Schedule); - void generateProlog(SMSchedule &Schedule, unsigned LastStage, - MachineBasicBlock *KernelBB, ValueMapTy *VRMap, - MBBVectorTy &PrologBBs); - void generateEpilog(SMSchedule &Schedule, unsigned LastStage, - MachineBasicBlock *KernelBB, ValueMapTy *VRMap, - MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs); - void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1, - MachineBasicBlock *BB2, MachineBasicBlock *KernelBB, - SMSchedule &Schedule, ValueMapTy *VRMap, - InstrMapTy &InstrMap, unsigned LastStageNum, - unsigned CurStageNum, bool IsLast); - void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1, - MachineBasicBlock *BB2, MachineBasicBlock *KernelBB, - SMSchedule &Schedule, ValueMapTy *VRMap, - InstrMapTy &InstrMap, unsigned LastStageNum, - unsigned CurStageNum, bool IsLast); - void removeDeadInstructions(MachineBasicBlock *KernelBB, - MBBVectorTy &EpilogBBs); - void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, - SMSchedule &Schedule); - void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB, - MBBVectorTy &EpilogBBs, SMSchedule &Schedule, - ValueMapTy *VRMap); - bool computeDelta(MachineInstr &MI, unsigned &Delta); - void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI, - unsigned Num); - MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum, - unsigned InstStageNum); - MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum, - unsigned InstStageNum, - SMSchedule &Schedule); - void updateInstruction(MachineInstr *NewMI, bool LastDef, - unsigned CurStageNum, unsigned InstrStageNum, - SMSchedule &Schedule, ValueMapTy *VRMap); - MachineInstr *findDefInLoop(unsigned Reg); - unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal, - unsigned LoopStage, ValueMapTy *VRMap, - MachineBasicBlock *BB); - void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum, - SMSchedule &Schedule, ValueMapTy *VRMap, - InstrMapTy &InstrMap); - void rewriteScheduledInstr(MachineBasicBlock *BB, SMSchedule &Schedule, - InstrMapTy &InstrMap, unsigned CurStageNum, - unsigned PhiNum, MachineInstr *Phi, - unsigned OldReg, unsigned NewReg, - unsigned PrevReg = 0); - bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos, - unsigned &OffsetPos, unsigned &NewBase, - int64_t &NewOffset); - void postprocessDAG(); -}; - -/// A NodeSet contains a set of SUnit DAG nodes with additional information -/// that assigns a priority to the set. -class NodeSet { - SetVector<SUnit *> Nodes; - bool HasRecurrence = false; - unsigned RecMII = 0; - int MaxMOV = 0; - unsigned MaxDepth = 0; - unsigned Colocate = 0; - SUnit *ExceedPressure = nullptr; - unsigned Latency = 0; - -public: - using iterator = SetVector<SUnit *>::const_iterator; - - NodeSet() = default; - NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) { - Latency = 0; - for (unsigned i = 0, e = Nodes.size(); i < e; ++i) - for (const SDep &Succ : Nodes[i]->Succs) - if (Nodes.count(Succ.getSUnit())) - Latency += Succ.getLatency(); - } - - bool insert(SUnit *SU) { return Nodes.insert(SU); } - - void insert(iterator S, iterator E) { Nodes.insert(S, E); } - - template <typename UnaryPredicate> bool remove_if(UnaryPredicate P) { - return Nodes.remove_if(P); - } - - unsigned count(SUnit *SU) const { return Nodes.count(SU); } - - bool hasRecurrence() { return HasRecurrence; }; - - unsigned size() const { return Nodes.size(); } - - bool empty() const { return Nodes.empty(); } - - SUnit *getNode(unsigned i) const { return Nodes[i]; }; - - void setRecMII(unsigned mii) { RecMII = mii; }; - - void setColocate(unsigned c) { Colocate = c; }; - - void setExceedPressure(SUnit *SU) { ExceedPressure = SU; } - - bool isExceedSU(SUnit *SU) { return ExceedPressure == SU; } - - int compareRecMII(NodeSet &RHS) { return RecMII - RHS.RecMII; } - - int getRecMII() { return RecMII; } - - /// Summarize node functions for the entire node set. - void computeNodeSetInfo(SwingSchedulerDAG *SSD) { - for (SUnit *SU : *this) { - MaxMOV = std::max(MaxMOV, SSD->getMOV(SU)); - MaxDepth = std::max(MaxDepth, SSD->getDepth(SU)); - } - } - - unsigned getLatency() { return Latency; } - - unsigned getMaxDepth() { return MaxDepth; } - - void clear() { - Nodes.clear(); - RecMII = 0; - HasRecurrence = false; - MaxMOV = 0; - MaxDepth = 0; - Colocate = 0; - ExceedPressure = nullptr; - } - - operator SetVector<SUnit *> &() { return Nodes; } - - /// Sort the node sets by importance. First, rank them by recurrence MII, - /// then by mobility (least mobile done first), and finally by depth. - /// Each node set may contain a colocate value which is used as the first - /// tie breaker, if it's set. - bool operator>(const NodeSet &RHS) const { - if (RecMII == RHS.RecMII) { - if (Colocate != 0 && RHS.Colocate != 0 && Colocate != RHS.Colocate) - return Colocate < RHS.Colocate; - if (MaxMOV == RHS.MaxMOV) - return MaxDepth > RHS.MaxDepth; - return MaxMOV < RHS.MaxMOV; - } - return RecMII > RHS.RecMII; - } - - bool operator==(const NodeSet &RHS) const { - return RecMII == RHS.RecMII && MaxMOV == RHS.MaxMOV && - MaxDepth == RHS.MaxDepth; - } - - bool operator!=(const NodeSet &RHS) const { return !operator==(RHS); } - - iterator begin() { return Nodes.begin(); } - iterator end() { return Nodes.end(); } - - void print(raw_ostream &os) const { - os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV - << " depth " << MaxDepth << " col " << Colocate << "\n"; - for (const auto &I : Nodes) - os << " SU(" << I->NodeNum << ") " << *(I->getInstr()); - os << "\n"; - } - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - LLVM_DUMP_METHOD void dump() const { print(dbgs()); } -#endif -}; - -/// This class represents the scheduled code. The main data structure is a -/// map from scheduled cycle to instructions. During scheduling, the -/// data structure explicitly represents all stages/iterations. When -/// the algorithm finshes, the schedule is collapsed into a single stage, -/// which represents instructions from different loop iterations. -/// -/// The SMS algorithm allows negative values for cycles, so the first cycle -/// in the schedule is the smallest cycle value. -class SMSchedule { -private: - /// Map from execution cycle to instructions. - DenseMap<int, std::deque<SUnit *>> ScheduledInstrs; - - /// Map from instruction to execution cycle. - std::map<SUnit *, int> InstrToCycle; - - /// Map for each register and the max difference between its uses and def. - /// The first element in the pair is the max difference in stages. The - /// second is true if the register defines a Phi value and loop value is - /// scheduled before the Phi. - std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff; - - /// Keep track of the first cycle value in the schedule. It starts - /// as zero, but the algorithm allows negative values. - int FirstCycle = 0; - - /// Keep track of the last cycle value in the schedule. - int LastCycle = 0; - - /// The initiation interval (II) for the schedule. - int InitiationInterval = 0; - - /// Target machine information. - const TargetSubtargetInfo &ST; - - /// Virtual register information. - MachineRegisterInfo &MRI; - - std::unique_ptr<DFAPacketizer> Resources; - -public: - SMSchedule(MachineFunction *mf) - : ST(mf->getSubtarget()), MRI(mf->getRegInfo()), - Resources(ST.getInstrInfo()->CreateTargetScheduleState(ST)) {} - - void reset() { - ScheduledInstrs.clear(); - InstrToCycle.clear(); - RegToStageDiff.clear(); - FirstCycle = 0; - LastCycle = 0; - InitiationInterval = 0; - } - - /// Set the initiation interval for this schedule. - void setInitiationInterval(int ii) { InitiationInterval = ii; } - - /// Return the first cycle in the completed schedule. This - /// can be a negative value. - int getFirstCycle() const { return FirstCycle; } - - /// Return the last cycle in the finalized schedule. - int getFinalCycle() const { return FirstCycle + InitiationInterval - 1; } - - /// Return the cycle of the earliest scheduled instruction in the dependence - /// chain. - int earliestCycleInChain(const SDep &Dep); - - /// Return the cycle of the latest scheduled instruction in the dependence - /// chain. - int latestCycleInChain(const SDep &Dep); +namespace llvm { - void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, - int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG); - bool insert(SUnit *SU, int StartCycle, int EndCycle, int II); +// A command line option to enable the CopyToPhi DAG mutation. +cl::opt<bool> + SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden, + cl::init(true), cl::ZeroOrMore, + cl::desc("Enable CopyToPhi DAG Mutation")); - /// Iterators for the cycle to instruction map. - using sched_iterator = DenseMap<int, std::deque<SUnit *>>::iterator; - using const_sched_iterator = - DenseMap<int, std::deque<SUnit *>>::const_iterator; - - /// Return true if the instruction is scheduled at the specified stage. - bool isScheduledAtStage(SUnit *SU, unsigned StageNum) { - return (stageScheduled(SU) == (int)StageNum); - } - - /// Return the stage for a scheduled instruction. Return -1 if - /// the instruction has not been scheduled. - int stageScheduled(SUnit *SU) const { - std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU); - if (it == InstrToCycle.end()) - return -1; - return (it->second - FirstCycle) / InitiationInterval; - } - - /// Return the cycle for a scheduled instruction. This function normalizes - /// the first cycle to be 0. - unsigned cycleScheduled(SUnit *SU) const { - std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU); - assert(it != InstrToCycle.end() && "Instruction hasn't been scheduled."); - return (it->second - FirstCycle) % InitiationInterval; - } - - /// Return the maximum stage count needed for this schedule. - unsigned getMaxStageCount() { - return (LastCycle - FirstCycle) / InitiationInterval; - } - - /// Return the max. number of stages/iterations that can occur between a - /// register definition and its uses. - unsigned getStagesForReg(int Reg, unsigned CurStage) { - std::pair<unsigned, bool> Stages = RegToStageDiff[Reg]; - if (CurStage > getMaxStageCount() && Stages.first == 0 && Stages.second) - return 1; - return Stages.first; - } - - /// The number of stages for a Phi is a little different than other - /// instructions. The minimum value computed in RegToStageDiff is 1 - /// because we assume the Phi is needed for at least 1 iteration. - /// This is not the case if the loop value is scheduled prior to the - /// Phi in the same stage. This function returns the number of stages - /// or iterations needed between the Phi definition and any uses. - unsigned getStagesForPhi(int Reg) { - std::pair<unsigned, bool> Stages = RegToStageDiff[Reg]; - if (Stages.second) - return Stages.first; - return Stages.first - 1; - } - - /// Return the instructions that are scheduled at the specified cycle. - std::deque<SUnit *> &getInstructions(int cycle) { - return ScheduledInstrs[cycle]; - } - - bool isValidSchedule(SwingSchedulerDAG *SSD); - void finalizeSchedule(SwingSchedulerDAG *SSD); - void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU, - std::deque<SUnit *> &Insts); - bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi); - bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def, - MachineOperand &MO); - void print(raw_ostream &os) const; - void dump() const; -}; - -} // end anonymous namespace +} // end namespace llvm unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5; char MachinePipeliner::ID = 0; @@ -884,12 +317,9 @@ void SwingSchedulerDAG::schedule() { addLoopCarriedDependences(AA); updatePhiDependences(); Topo.InitDAGTopologicalSorting(); - postprocessDAG(); changeDependences(); - LLVM_DEBUG({ - for (unsigned su = 0, e = SUnits.size(); su != e; ++su) - SUnits[su].dumpAll(this); - }); + postprocessDAG(); + LLVM_DEBUG(dump()); NodeSetType NodeSets; findCircuits(NodeSets); @@ -1101,11 +531,12 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) { // First, perform the cheaper check that compares the base register. // If they are the same and the load offset is less than the store // offset, then mark the dependence as loop carried potentially. - unsigned BaseReg1, BaseReg2; + MachineOperand *BaseOp1, *BaseOp2; int64_t Offset1, Offset2; - if (TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) && - TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)) { - if (BaseReg1 == BaseReg2 && (int)Offset1 < (int)Offset2) { + if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1, TRI) && + TII->getMemOperandWithOffset(MI, BaseOp2, Offset2, TRI)) { + if (BaseOp1->isIdenticalTo(*BaseOp2) && + (int)Offset1 < (int)Offset2) { assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) && "What happened to the chain edge?"); SDep Dep(Load, SDep::Barrier); @@ -1139,9 +570,9 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) { continue; } AliasResult AAResult = AA->alias( - MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize, + MemoryLocation(MMO1->getValue(), LocationSize::unknown(), MMO1->getAAInfo()), - MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize, + MemoryLocation(MMO2->getValue(), LocationSize::unknown(), MMO2->getAAInfo())); if (AAResult != NoAlias) { @@ -1298,6 +729,7 @@ void SwingSchedulerDAG::changeDependences() { // Add a dependence between the new instruction and the instruction // that defines the new base. SDep Dep(&I, SDep::Anti, NewBase); + Topo.AddPred(LastSU, &I); LastSU->addPred(Dep); // Remember the base and offset information so that we can update the @@ -1509,9 +941,9 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure( } OutputDeps[N] = BackEdge; } - // Do not process a boundary node and a back-edge is processed only - // if it goes to a Phi. - if (SI.getSUnit()->isBoundaryNode() || + // Do not process a boundary node, an artificial node. + // A back-edge is processed only if it goes to a Phi. + if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() || (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI())) continue; int N = SI.getSUnit()->NodeNum; @@ -1535,7 +967,7 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure( } } } - // Add back-eges in the adjacency matrix for the output dependences. + // Add back-edges in the adjacency matrix for the output dependences. for (auto &OD : OutputDeps) if (!Added.test(OD.second)) { AdjK[OD.first].push_back(OD.second); @@ -1564,7 +996,8 @@ bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets, ++NumPaths; break; } else if (!Blocked.test(W)) { - if (circuit(W, S, NodeSets, W < V ? true : HasBackedge)) + if (circuit(W, S, NodeSets, + Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge)) F = true; } } @@ -1604,7 +1037,7 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) { // but we do this to find the circuits, and then change them back. swapAntiDependences(SUnits); - Circuits Cir(SUnits); + Circuits Cir(SUnits, Topo); // Create the adjacency structure. Cir.createAdjacencyStructure(this); for (int i = 0, e = SUnits.size(); i != e; ++i) { @@ -1616,6 +1049,85 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) { swapAntiDependences(SUnits); } +// Create artificial dependencies between the source of COPY/REG_SEQUENCE that +// is loop-carried to the USE in next iteration. This will help pipeliner avoid +// additional copies that are needed across iterations. An artificial dependence +// edge is added from USE to SOURCE of COPY/REG_SEQUENCE. + +// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried) +// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE +// PHI-------True-Dep------> USEOfPhi + +// The mutation creates +// USEOfPHI -------Artificial-Dep---> SRCOfCopy + +// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy +// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled +// late to avoid additional copies across iterations. The possible scheduling +// order would be +// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE. + +void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) { + for (SUnit &SU : DAG->SUnits) { + // Find the COPY/REG_SEQUENCE instruction. + if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence()) + continue; + + // Record the loop carried PHIs. + SmallVector<SUnit *, 4> PHISUs; + // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions. + SmallVector<SUnit *, 4> SrcSUs; + + for (auto &Dep : SU.Preds) { + SUnit *TmpSU = Dep.getSUnit(); + MachineInstr *TmpMI = TmpSU->getInstr(); + SDep::Kind DepKind = Dep.getKind(); + // Save the loop carried PHI. + if (DepKind == SDep::Anti && TmpMI->isPHI()) + PHISUs.push_back(TmpSU); + // Save the source of COPY/REG_SEQUENCE. + // If the source has no pre-decessors, we will end up creating cycles. + else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0) + SrcSUs.push_back(TmpSU); + } + + if (PHISUs.size() == 0 || SrcSUs.size() == 0) + continue; + + // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this + // SUnit to the container. + SmallVector<SUnit *, 8> UseSUs; + for (auto I = PHISUs.begin(); I != PHISUs.end(); ++I) { + for (auto &Dep : (*I)->Succs) { + if (Dep.getKind() != SDep::Data) + continue; + + SUnit *TmpSU = Dep.getSUnit(); + MachineInstr *TmpMI = TmpSU->getInstr(); + if (TmpMI->isPHI() || TmpMI->isRegSequence()) { + PHISUs.push_back(TmpSU); + continue; + } + UseSUs.push_back(TmpSU); + } + } + + if (UseSUs.size() == 0) + continue; + + SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG); + // Add the artificial dependencies if it does not form a cycle. + for (auto I : UseSUs) { + for (auto Src : SrcSUs) { + if (!SDAG->Topo.IsReachable(I, Src) && Src != I) { + Src->addPred(SDep(I, SDep::Artificial)); + SDAG->Topo.AddPred(Src, I); + } + } + } + } +} + /// Return true for DAG nodes that we ignore when computing the cost functions. /// We ignore the back-edge recurrence in order to avoid unbounded recursion /// in the calculation of the ASAP, ALAP, etc functions. @@ -1638,8 +1150,8 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) { for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(), E = Topo.end(); I != E; ++I) { - SUnit *SU = &SUnits[*I]; - SU->dump(this); + const SUnit &SU = SUnits[*I]; + dumpNode(SU); } }); @@ -1864,8 +1376,7 @@ void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) { RecRPTracker.closeBottom(); std::vector<SUnit *> SUnits(NS.begin(), NS.end()); - llvm::sort(SUnits.begin(), SUnits.end(), - [](const SUnit *A, const SUnit *B) { + llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; }); @@ -2672,7 +2183,7 @@ void SwingSchedulerDAG::generateExistingPhis( else if (PrologStage >= AccessStage + StageDiff + np && VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0) PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal]; - // Check if the Phi has already been scheduled, but the loop intruction + // Check if the Phi has already been scheduled, but the loop instruction // is either another Phi, or doesn't occur in the loop. else if (PrologStage >= AccessStage + StageDiff + np) { // If the Phi references another Phi, we need to examine the other @@ -2725,7 +2236,7 @@ void SwingSchedulerDAG::generateExistingPhis( VRMap[PrevStage - np + 1].count(Def)) PhiOp2 = VRMap[PrevStage - np + 1][Def]; // Use the loop value defined in the kernel. - else if ((unsigned)LoopValStage + StageDiffAdj > PrologStage + 1 && + else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 && VRMap[PrevStage - StageDiffAdj - np].count(LoopVal)) PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal]; // Use the value defined by the Phi, unless we're generating the first @@ -2739,35 +2250,38 @@ void SwingSchedulerDAG::generateExistingPhis( // references another Phi, and the other Phi is scheduled in an // earlier stage. We can try to reuse an existing Phi up until the last // stage of the current Phi. - if (LoopDefIsPhi && (int)(PrologStage - np) >= StageScheduled) { - int LVNumStages = Schedule.getStagesForPhi(LoopVal); - int StageDiff = (StageScheduled - LoopValStage); - LVNumStages -= StageDiff; - // Make sure the loop value Phi has been processed already. - if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) { - NewReg = PhiOp2; - unsigned ReuseStage = CurStageNum; - if (Schedule.isLoopCarried(this, *PhiInst)) - ReuseStage -= LVNumStages; - // Check if the Phi to reuse has been generated yet. If not, then - // there is nothing to reuse. - if (VRMap[ReuseStage - np].count(LoopVal)) { - NewReg = VRMap[ReuseStage - np][LoopVal]; - - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, - &*BBI, Def, NewReg); - // Update the map with the new Phi name. - VRMap[CurStageNum - np][Def] = NewReg; - PhiOp2 = NewReg; - if (VRMap[LastStageNum - np - 1].count(LoopVal)) - PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal]; - - if (IsLast && np == NumPhis - 1) - replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS); - continue; + if (LoopDefIsPhi) { + if (static_cast<int>(PrologStage - np) >= StageScheduled) { + int LVNumStages = Schedule.getStagesForPhi(LoopVal); + int StageDiff = (StageScheduled - LoopValStage); + LVNumStages -= StageDiff; + // Make sure the loop value Phi has been processed already. + if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) { + NewReg = PhiOp2; + unsigned ReuseStage = CurStageNum; + if (Schedule.isLoopCarried(this, *PhiInst)) + ReuseStage -= LVNumStages; + // Check if the Phi to reuse has been generated yet. If not, then + // there is nothing to reuse. + if (VRMap[ReuseStage - np].count(LoopVal)) { + NewReg = VRMap[ReuseStage - np][LoopVal]; + + rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, + &*BBI, Def, NewReg); + // Update the map with the new Phi name. + VRMap[CurStageNum - np][Def] = NewReg; + PhiOp2 = NewReg; + if (VRMap[LastStageNum - np - 1].count(LoopVal)) + PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal]; + + if (IsLast && np == NumPhis - 1) + replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS); + continue; + } } - } else if (InKernel && StageDiff > 0 && - VRMap[CurStageNum - StageDiff - np].count(LoopVal)) + } + if (InKernel && StageDiff > 0 && + VRMap[CurStageNum - StageDiff - np].count(LoopVal)) PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal]; } @@ -3143,11 +2657,16 @@ void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs, /// during each iteration. Set Delta to the amount of the change. bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - unsigned BaseReg; + MachineOperand *BaseOp; int64_t Offset; - if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) + if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI)) return false; + if (!BaseOp->isReg()) + return false; + + unsigned BaseReg = BaseOp->getReg(); + MachineRegisterInfo &MRI = MF.getRegInfo(); // Check if there is a Phi. If so, get the definition in the loop. MachineInstr *BaseDef = MRI.getVRegDef(BaseReg); @@ -3175,28 +2694,26 @@ void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI, return; // If the instruction has memory operands, then adjust the offset // when the instruction appears in different stages. - unsigned NumRefs = NewMI.memoperands_end() - NewMI.memoperands_begin(); - if (NumRefs == 0) + if (NewMI.memoperands_empty()) return; - MachineInstr::mmo_iterator NewMemRefs = MF.allocateMemRefsArray(NumRefs); - unsigned Refs = 0; + SmallVector<MachineMemOperand *, 2> NewMMOs; for (MachineMemOperand *MMO : NewMI.memoperands()) { if (MMO->isVolatile() || (MMO->isInvariant() && MMO->isDereferenceable()) || (!MMO->getValue())) { - NewMemRefs[Refs++] = MMO; + NewMMOs.push_back(MMO); continue; } unsigned Delta; if (Num != UINT_MAX && computeDelta(OldMI, Delta)) { int64_t AdjOffset = Delta * Num; - NewMemRefs[Refs++] = - MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()); + NewMMOs.push_back( + MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize())); } else { - NewMI.dropMemRefs(); - return; + NewMMOs.push_back( + MF.getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize)); } } - NewMI.setMemRefs(NewMemRefs, NewMemRefs + NumRefs); + NewMI.setMemRefs(MF, NewMMOs); } /// Clone the instruction for the new pipelined loop and update the @@ -3552,19 +3069,19 @@ bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep, if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD)) return true; - unsigned BaseRegS, BaseRegD; + MachineOperand *BaseOpS, *BaseOpD; int64_t OffsetS, OffsetD; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - if (!TII->getMemOpBaseRegImmOfs(*SI, BaseRegS, OffsetS, TRI) || - !TII->getMemOpBaseRegImmOfs(*DI, BaseRegD, OffsetD, TRI)) + if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, TRI) || + !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, TRI)) return true; - if (BaseRegS != BaseRegD) + if (!BaseOpS->isIdenticalTo(*BaseOpD)) return true; // Check that the base register is incremented by a constant value for each // iteration. - MachineInstr *Def = MRI.getVRegDef(BaseRegS); + MachineInstr *Def = MRI.getVRegDef(BaseOpS->getReg()); if (!Def || !Def->isPHI()) return true; unsigned InitVal = 0; @@ -3983,7 +3500,7 @@ void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const { }; // sort, so that we can perform a binary search - llvm::sort(Indices.begin(), Indices.end(), CompareKey); + llvm::sort(Indices, CompareKey); bool Valid = true; (void)Valid; @@ -4193,6 +3710,14 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) { LLVM_DEBUG(dump();); } +void NodeSet::print(raw_ostream &os) const { + os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV + << " depth " << MaxDepth << " col " << Colocate << "\n"; + for (const auto &I : Nodes) + os << " SU(" << I->NodeNum << ") " << *(I->getInstr()); + os << "\n"; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the schedule information to the given output. void SMSchedule::print(raw_ostream &os) const { @@ -4211,4 +3736,9 @@ void SMSchedule::print(raw_ostream &os) const { /// Utility function used for debugging to print the schedule. LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); } + #endif + + + |