diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2019-08-20 20:50:12 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2019-08-20 20:50:12 +0000 |
commit | e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (patch) | |
tree | 599ab169a01f1c86eda9adc774edaedde2f2db5b /lib/CodeGen/MachinePipeliner.cpp | |
parent | 1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (diff) |
Diffstat (limited to 'lib/CodeGen/MachinePipeliner.cpp')
-rw-r--r-- | lib/CodeGen/MachinePipeliner.cpp | 534 |
1 files changed, 439 insertions, 95 deletions
diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp index 4d451bdd7f69..54df522d371a 100644 --- a/lib/CodeGen/MachinePipeliner.cpp +++ b/lib/CodeGen/MachinePipeliner.cpp @@ -1,9 +1,8 @@ //===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -97,6 +96,14 @@ using namespace llvm; STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline"); STATISTIC(NumPipelined, "Number of loops software pipelined"); STATISTIC(NumNodeOrderIssues, "Number of node order issues found"); +STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch"); +STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop"); +STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader"); +STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large"); +STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII"); +STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found"); +STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage"); +STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages"); /// A command line option to turn software pipelining on or off. static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), @@ -141,6 +148,11 @@ static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::init(false), cl::ZeroOrMore, cl::desc("Ignore RecMII")); +static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden, + cl::init(false)); +static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden, + cl::init(false)); + namespace llvm { // A command line option to enable the CopyToPhi DAG mutation. @@ -180,6 +192,16 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) { !EnableSWPOptSize.getPosition()) return false; + if (!mf.getSubtarget().enableMachinePipeliner()) + return false; + + // Cannot pipeline loops without instruction itineraries if we are using + // DFA for the pipeliner. + if (mf.getSubtarget().useDFAforSMS() && + (!mf.getSubtarget().getInstrItineraryData() || + mf.getSubtarget().getInstrItineraryData()->isEmpty())) + return false; + MF = &mf; MLI = &getAnalysis<MachineLoopInfo>(); MDT = &getAnalysis<MachineDominatorTree>(); @@ -211,8 +233,11 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) { } #endif - if (!canPipelineLoop(L)) + setPragmaPipelineOptions(L); + if (!canPipelineLoop(L)) { + LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n"); return Changed; + } ++NumTrytoPipeline; @@ -221,6 +246,50 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) { return Changed; } +void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) { + MachineBasicBlock *LBLK = L.getTopBlock(); + + if (LBLK == nullptr) + return; + + const BasicBlock *BBLK = LBLK->getBasicBlock(); + if (BBLK == nullptr) + return; + + const Instruction *TI = BBLK->getTerminator(); + if (TI == nullptr) + return; + + MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop); + if (LoopID == nullptr) + return; + + assert(LoopID->getNumOperands() > 0 && "requires atleast one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop"); + + for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) { + MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + + if (MD == nullptr) + continue; + + MDString *S = dyn_cast<MDString>(MD->getOperand(0)); + + if (S == nullptr) + continue; + + if (S->getString() == "llvm.loop.pipeline.initiationinterval") { + assert(MD->getNumOperands() == 2 && + "Pipeline initiation interval hint metadata should have two operands."); + II_setByPragma = + mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue(); + assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive."); + } else if (S->getString() == "llvm.loop.pipeline.disable") { + disabledByPragma = true; + } + } +} + /// Return true if the loop can be software pipelined. The algorithm is /// restricted to loops with a single basic block. Make sure that the /// branch in the loop can be analyzed. @@ -228,21 +297,36 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) { if (L.getNumBlocks() != 1) return false; + if (disabledByPragma) + return false; + // Check if the branch can't be understood because we can't do pipelining // if that's the case. LI.TBB = nullptr; LI.FBB = nullptr; LI.BrCond.clear(); - if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) + if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) { + LLVM_DEBUG( + dbgs() << "Unable to analyzeBranch, can NOT pipeline current Loop\n"); + NumFailBranch++; return false; + } LI.LoopInductionVar = nullptr; LI.LoopCompare = nullptr; - if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare)) + if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare)) { + LLVM_DEBUG( + dbgs() << "Unable to analyzeLoop, can NOT pipeline current Loop\n"); + NumFailLoop++; return false; + } - if (!L.getLoopPreheader()) + if (!L.getLoopPreheader()) { + LLVM_DEBUG( + dbgs() << "Preheader not found, can NOT pipeline current Loop\n"); + NumFailPreheader++; return false; + } // Remove any subregisters from inputs to phi nodes. preprocessPhiNodes(*L.getHeader()); @@ -286,7 +370,8 @@ void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) { bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) { assert(L.getBlocks().size() == 1 && "SMS works on single blocks only."); - SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo); + SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo, + II_setByPragma); MachineBasicBlock *MBB = L.getHeader(); // The kernel should not include any terminator instructions. These @@ -309,6 +394,20 @@ bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) { return SMS.hasNewSchedule(); } +void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) { + if (II_setByPragma > 0) + MII = II_setByPragma; + else + MII = std::max(ResMII, RecMII); +} + +void SwingSchedulerDAG::setMAX_II() { + if (II_setByPragma > 0) + MAX_II = II_setByPragma; + else + MAX_II = MII + 10; +} + /// We override the schedule function in ScheduleDAGInstrs to implement the /// scheduling part of the Swing Modulo Scheduling algorithm. void SwingSchedulerDAG::schedule() { @@ -335,17 +434,28 @@ void SwingSchedulerDAG::schedule() { if (SwpIgnoreRecMII) RecMII = 0; - MII = std::max(ResMII, RecMII); - LLVM_DEBUG(dbgs() << "MII = " << MII << " (rec=" << RecMII - << ", res=" << ResMII << ")\n"); + setMII(ResMII, RecMII); + setMAX_II(); + + LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II + << " (rec=" << RecMII << ", res=" << ResMII << ")\n"); // Can't schedule a loop without a valid MII. - if (MII == 0) + if (MII == 0) { + LLVM_DEBUG( + dbgs() + << "0 is not a valid Minimal Initiation Interval, can NOT schedule\n"); + NumFailZeroMII++; return; + } // Don't pipeline large loops. - if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) + if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) { + LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii + << ", we don't pipleline large loops\n"); + NumFailLargeMaxMII++; return; + } computeNodeFunctions(NodeSets); @@ -362,7 +472,7 @@ void SwingSchedulerDAG::schedule() { } }); - std::stable_sort(NodeSets.begin(), NodeSets.end(), std::greater<NodeSet>()); + llvm::stable_sort(NodeSets, std::greater<NodeSet>()); groupRemainingNodes(NodeSets); @@ -383,17 +493,27 @@ void SwingSchedulerDAG::schedule() { SMSchedule Schedule(Pass.MF); Scheduled = schedulePipeline(Schedule); - if (!Scheduled) + if (!Scheduled){ + LLVM_DEBUG(dbgs() << "No schedule found, return\n"); + NumFailNoSchedule++; return; + } unsigned numStages = Schedule.getMaxStageCount(); // No need to generate pipeline if there are no overlapped iterations. - if (numStages == 0) + if (numStages == 0) { + LLVM_DEBUG( + dbgs() << "No overlapped iterations, no need to generate pipeline\n"); + NumFailZeroStage++; return; - + } // Check that the maximum stage count is less than user-defined limit. - if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) + if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) { + LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages + << " : too many stages, abort\n"); + NumFailLargeMaxStage++; return; + } generatePipelinedLoop(Schedule); ++NumPipelined; @@ -467,7 +587,8 @@ static bool isSuccOrder(SUnit *SUa, SUnit *SUb) { /// Return true if the instruction causes a chain between memory /// references before and after it. static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) { - return MI.isCall() || MI.hasUnmodeledSideEffects() || + return MI.isCall() || MI.mayRaiseFPException() || + MI.hasUnmodeledSideEffects() || (MI.hasOrderedMemoryRef() && (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA))); } @@ -475,16 +596,16 @@ static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) { /// Return the underlying objects for the memory references of an instruction. /// This function calls the code in ValueTracking, but first checks that the /// instruction has a memory operand. -static void getUnderlyingObjects(MachineInstr *MI, - SmallVectorImpl<Value *> &Objs, +static void getUnderlyingObjects(const MachineInstr *MI, + SmallVectorImpl<const Value *> &Objs, const DataLayout &DL) { if (!MI->hasOneMemOperand()) return; MachineMemOperand *MM = *MI->memoperands_begin(); if (!MM->getValue()) return; - GetUnderlyingObjects(const_cast<Value *>(MM->getValue()), Objs, DL); - for (Value *V : Objs) { + GetUnderlyingObjects(MM->getValue(), Objs, DL); + for (const Value *V : Objs) { if (!isIdentifiedObject(V)) { Objs.clear(); return; @@ -498,7 +619,7 @@ static void getUnderlyingObjects(MachineInstr *MI, /// dependence. This code is very similar to the code in ScheduleDAGInstrs /// but that code doesn't create loop carried dependences. void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) { - MapVector<Value *, SmallVector<SUnit *, 4>> PendingLoads; + MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads; Value *UnknownValue = UndefValue::get(Type::getVoidTy(MF.getFunction().getContext())); for (auto &SU : SUnits) { @@ -506,7 +627,7 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) { if (isDependenceBarrier(MI, AA)) PendingLoads.clear(); else if (MI.mayLoad()) { - SmallVector<Value *, 4> Objs; + SmallVector<const Value *, 4> Objs; getUnderlyingObjects(&MI, Objs, MF.getDataLayout()); if (Objs.empty()) Objs.push_back(UnknownValue); @@ -515,12 +636,12 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) { SUs.push_back(&SU); } } else if (MI.mayStore()) { - SmallVector<Value *, 4> Objs; + SmallVector<const Value *, 4> Objs; getUnderlyingObjects(&MI, Objs, MF.getDataLayout()); if (Objs.empty()) Objs.push_back(UnknownValue); for (auto V : Objs) { - MapVector<Value *, SmallVector<SUnit *, 4>>::iterator I = + MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I = PendingLoads.find(V); if (I == PendingLoads.end()) continue; @@ -531,7 +652,7 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) { // First, perform the cheaper check that compares the base register. // If they are the same and the load offset is less than the store // offset, then mark the dependence as loop carried potentially. - MachineOperand *BaseOp1, *BaseOp2; + const MachineOperand *BaseOp1, *BaseOp2; int64_t Offset1, Offset2; if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1, TRI) && TII->getMemOperandWithOffset(MI, BaseOp2, Offset2, TRI)) { @@ -744,27 +865,55 @@ namespace { // the number of functional unit choices. struct FuncUnitSorter { const InstrItineraryData *InstrItins; + const MCSubtargetInfo *STI; DenseMap<unsigned, unsigned> Resources; - FuncUnitSorter(const InstrItineraryData *IID) : InstrItins(IID) {} + FuncUnitSorter(const TargetSubtargetInfo &TSI) + : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {} // Compute the number of functional unit alternatives needed // at each stage, and take the minimum value. We prioritize the // instructions by the least number of choices first. unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const { - unsigned schedClass = Inst->getDesc().getSchedClass(); + unsigned SchedClass = Inst->getDesc().getSchedClass(); unsigned min = UINT_MAX; - for (const InstrStage *IS = InstrItins->beginStage(schedClass), - *IE = InstrItins->endStage(schedClass); - IS != IE; ++IS) { - unsigned funcUnits = IS->getUnits(); - unsigned numAlternatives = countPopulation(funcUnits); - if (numAlternatives < min) { - min = numAlternatives; - F = funcUnits; + if (InstrItins && !InstrItins->isEmpty()) { + for (const InstrStage &IS : + make_range(InstrItins->beginStage(SchedClass), + InstrItins->endStage(SchedClass))) { + unsigned funcUnits = IS.getUnits(); + unsigned numAlternatives = countPopulation(funcUnits); + if (numAlternatives < min) { + min = numAlternatives; + F = funcUnits; + } } + return min; + } + if (STI && STI->getSchedModel().hasInstrSchedModel()) { + const MCSchedClassDesc *SCDesc = + STI->getSchedModel().getSchedClassDesc(SchedClass); + if (!SCDesc->isValid()) + // No valid Schedule Class Desc for schedClass, should be + // Pseudo/PostRAPseudo + return min; + + for (const MCWriteProcResEntry &PRE : + make_range(STI->getWriteProcResBegin(SCDesc), + STI->getWriteProcResEnd(SCDesc))) { + if (!PRE.Cycles) + continue; + const MCProcResourceDesc *ProcResource = + STI->getSchedModel().getProcResource(PRE.ProcResourceIdx); + unsigned NumUnits = ProcResource->NumUnits; + if (NumUnits < min) { + min = NumUnits; + F = PRE.ProcResourceIdx; + } + } + return min; } - return min; + llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!"); } // Compute the critical resources needed by the instruction. This @@ -774,13 +923,34 @@ struct FuncUnitSorter { // the same, highly used, functional unit have high priority. void calcCriticalResources(MachineInstr &MI) { unsigned SchedClass = MI.getDesc().getSchedClass(); - for (const InstrStage *IS = InstrItins->beginStage(SchedClass), - *IE = InstrItins->endStage(SchedClass); - IS != IE; ++IS) { - unsigned FuncUnits = IS->getUnits(); - if (countPopulation(FuncUnits) == 1) - Resources[FuncUnits]++; + if (InstrItins && !InstrItins->isEmpty()) { + for (const InstrStage &IS : + make_range(InstrItins->beginStage(SchedClass), + InstrItins->endStage(SchedClass))) { + unsigned FuncUnits = IS.getUnits(); + if (countPopulation(FuncUnits) == 1) + Resources[FuncUnits]++; + } + return; + } + if (STI && STI->getSchedModel().hasInstrSchedModel()) { + const MCSchedClassDesc *SCDesc = + STI->getSchedModel().getSchedClassDesc(SchedClass); + if (!SCDesc->isValid()) + // No valid Schedule Class Desc for schedClass, should be + // Pseudo/PostRAPseudo + return; + + for (const MCWriteProcResEntry &PRE : + make_range(STI->getWriteProcResBegin(SCDesc), + STI->getWriteProcResEnd(SCDesc))) { + if (!PRE.Cycles) + continue; + Resources[PRE.ProcResourceIdx]++; + } + return; } + llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!"); } /// Return true if IS1 has less priority than IS2. @@ -803,14 +973,15 @@ struct FuncUnitSorter { /// to add it to each existing DFA, until a legal space is found. If the /// instruction cannot be reserved in an existing DFA, we create a new one. unsigned SwingSchedulerDAG::calculateResMII() { - SmallVector<DFAPacketizer *, 8> Resources; + + LLVM_DEBUG(dbgs() << "calculateResMII:\n"); + SmallVector<ResourceManager*, 8> Resources; MachineBasicBlock *MBB = Loop.getHeader(); - Resources.push_back(TII->CreateTargetScheduleState(MF.getSubtarget())); + Resources.push_back(new ResourceManager(&MF.getSubtarget())); // Sort the instructions by the number of available choices for scheduling, // least to most. Use the number of critical resources as the tie breaker. - FuncUnitSorter FUS = - FuncUnitSorter(MF.getSubtarget().getInstrItineraryData()); + FuncUnitSorter FUS = FuncUnitSorter(MF.getSubtarget()); for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(), E = MBB->getFirstTerminator(); I != E; ++I) @@ -832,33 +1003,40 @@ unsigned SwingSchedulerDAG::calculateResMII() { // DFA is needed for each cycle. unsigned NumCycles = getSUnit(MI)->Latency; unsigned ReservedCycles = 0; - SmallVectorImpl<DFAPacketizer *>::iterator RI = Resources.begin(); - SmallVectorImpl<DFAPacketizer *>::iterator RE = Resources.end(); + SmallVectorImpl<ResourceManager *>::iterator RI = Resources.begin(); + SmallVectorImpl<ResourceManager *>::iterator RE = Resources.end(); + LLVM_DEBUG({ + dbgs() << "Trying to reserve resource for " << NumCycles + << " cycles for \n"; + MI->dump(); + }); for (unsigned C = 0; C < NumCycles; ++C) while (RI != RE) { - if ((*RI++)->canReserveResources(*MI)) { + if ((*RI)->canReserveResources(*MI)) { + (*RI)->reserveResources(*MI); ++ReservedCycles; break; } + RI++; } - // Start reserving resources using existing DFAs. - for (unsigned C = 0; C < ReservedCycles; ++C) { - --RI; - (*RI)->reserveResources(*MI); - } + LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles + << ", NumCycles:" << NumCycles << "\n"); // Add new DFAs, if needed, to reserve resources. for (unsigned C = ReservedCycles; C < NumCycles; ++C) { - DFAPacketizer *NewResource = - TII->CreateTargetScheduleState(MF.getSubtarget()); + LLVM_DEBUG(if (SwpDebugResource) dbgs() + << "NewResource created to reserve resources" + << "\n"); + ResourceManager *NewResource = new ResourceManager(&MF.getSubtarget()); assert(NewResource->canReserveResources(*MI) && "Reserve error."); NewResource->reserveResources(*MI); Resources.push_back(NewResource); } } int Resmii = Resources.size(); + LLVM_DEBUG(dbgs() << "Retrun Res MII:" << Resmii << "\n"); // Delete the memory for each of the DFAs that were created earlier. - for (DFAPacketizer *RI : Resources) { - DFAPacketizer *D = RI; + for (ResourceManager *RI : Resources) { + ResourceManager *D = RI; delete D; } Resources.clear(); @@ -1517,7 +1695,7 @@ void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) { } } -/// Add the node to the set, and add all is its connected nodes to the set. +/// Add the node to the set, and add all of its connected nodes to the set. void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet, SetVector<SUnit *> &NodesAdded) { NewSet.insert(SU); @@ -1741,12 +1919,16 @@ void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) { /// Process the nodes in the computed order and create the pipelined schedule /// of the instructions, if possible. Return true if a schedule is found. bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) { - if (NodeOrder.empty()) + + if (NodeOrder.empty()){ + LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" ); return false; + } bool scheduleFound = false; + unsigned II = 0; // Keep increasing II until a valid schedule is found. - for (unsigned II = MII; II < MII + 10 && !scheduleFound; ++II) { + for (II = MII; II <= MAX_II && !scheduleFound; ++II) { Schedule.reset(); Schedule.setInitiationInterval(II); LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n"); @@ -1767,13 +1949,14 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) { Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart, II, this); LLVM_DEBUG({ + dbgs() << "\n"; dbgs() << "Inst (" << SU->NodeNum << ") "; SU->getInstr()->dump(); dbgs() << "\n"; }); LLVM_DEBUG({ - dbgs() << "\tes: " << EarlyStart << " ls: " << LateStart - << " me: " << SchedEnd << " ms: " << SchedStart << "\n"; + dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart, + LateStart, SchedEnd, SchedStart); }); if (EarlyStart > LateStart || SchedEnd < EarlyStart || @@ -1818,7 +2001,8 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) { scheduleFound = Schedule.isValidSchedule(this); } - LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << "\n"); + LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << " (II=" << II + << ")\n"); if (scheduleFound) Schedule.finalizeSchedule(this); @@ -1847,6 +2031,10 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) { InstrMapTy InstrMap; SmallVector<MachineBasicBlock *, 4> PrologBBs; + + MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader(); + assert(PreheaderBB != nullptr && + "Need to add code to handle loops w/o preheader"); // Generate the prolog instructions that set up the pipeline. generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs); MF.insert(BB->getIterator(), KernelBB); @@ -1903,7 +2091,7 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) { removeDeadInstructions(KernelBB, EpilogBBs); // Add branches between prolog and epilog blocks. - addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap); + addBranches(*PreheaderBB, PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap); // Remove the original loop since it's no longer referenced. for (auto &I : *BB) @@ -2242,7 +2430,7 @@ void SwingSchedulerDAG::generateExistingPhis( // Use the value defined by the Phi, unless we're generating the first // epilog and the Phi refers to a Phi in a different stage. else if (VRMap[PrevStage - np].count(Def) && - (!LoopDefIsPhi || PrevStage != LastStageNum)) + (!LoopDefIsPhi || (PrevStage != LastStageNum) || (LoopValStage == StageScheduled))) PhiOp2 = VRMap[PrevStage - np][Def]; } @@ -2588,7 +2776,8 @@ static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) { /// Create branches from each prolog basic block to the appropriate epilog /// block. These edges are needed if the loop ends before reaching the /// kernel. -void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs, +void SwingSchedulerDAG::addBranches(MachineBasicBlock &PreheaderBB, + MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, SMSchedule &Schedule, ValueMapTy *VRMap) { @@ -2615,8 +2804,8 @@ void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs, // Check if the LOOP0 has already been removed. If so, then there is no need // to reduce the trip count. if (LC != 0) - LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j, - MaxIter); + LC = TII->reduceLoopCount(*Prolog, PreheaderBB, IndVar, *Cmp, Cond, + PrevInsts, j, MaxIter); // Record the value of the first trip count, which is used to determine if // branches and blocks can be removed for constant trip counts. @@ -2657,7 +2846,7 @@ void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs, /// during each iteration. Set Delta to the amount of the change. bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - MachineOperand *BaseOp; + const MachineOperand *BaseOp; int64_t Offset; if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI)) return false; @@ -2698,7 +2887,9 @@ void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI, return; SmallVector<MachineMemOperand *, 2> NewMMOs; for (MachineMemOperand *MMO : NewMI.memoperands()) { - if (MMO->isVolatile() || (MMO->isInvariant() && MMO->isDereferenceable()) || + // TODO: Figure out whether isAtomic is really necessary (see D57601). + if (MMO->isVolatile() || MMO->isAtomic() || + (MMO->isInvariant() && MMO->isDereferenceable()) || (!MMO->getValue())) { NewMMOs.push_back(MMO); continue; @@ -3058,6 +3249,7 @@ bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep, // Assume ordered loads and stores may have a loop carried dependence. if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() || + SI->mayRaiseFPException() || DI->mayRaiseFPException() || SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef()) return true; @@ -3069,7 +3261,7 @@ bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep, if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD)) return true; - MachineOperand *BaseOpS, *BaseOpD; + const MachineOperand *BaseOpS, *BaseOpD; int64_t OffsetS, OffsetD; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, TRI) || @@ -3097,12 +3289,14 @@ bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep, // This is the main test, which checks the offset values and the loop // increment value to determine if the accesses may be loop carried. - if (OffsetS >= OffsetD) - return OffsetS + AccessSizeS > DeltaS; - else - return OffsetD + AccessSizeD > DeltaD; + if (AccessSizeS == MemoryLocation::UnknownSize || + AccessSizeD == MemoryLocation::UnknownSize) + return true; - return true; + if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD) + return true; + + return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD); } void SwingSchedulerDAG::postprocessDAG() { @@ -3117,6 +3311,10 @@ void SwingSchedulerDAG::postprocessDAG() { /// the relative values of StartCycle and EndCycle. bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) { bool forward = true; + LLVM_DEBUG({ + dbgs() << "Trying to insert node between " << StartCycle << " and " + << EndCycle << " II: " << II << "\n"; + }); if (StartCycle > EndCycle) forward = false; @@ -3125,8 +3323,9 @@ bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) { for (int curCycle = StartCycle; curCycle != termCycle; forward ? ++curCycle : --curCycle) { - // Add the already scheduled instructions at the specified cycle to the DFA. - Resources->clearResources(); + // Add the already scheduled instructions at the specified cycle to the + // DFA. + ProcItinResources.clearResources(); for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II); checkCycle <= LastCycle; checkCycle += II) { std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle]; @@ -3136,13 +3335,13 @@ bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) { I != E; ++I) { if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode())) continue; - assert(Resources->canReserveResources(*(*I)->getInstr()) && + assert(ProcItinResources.canReserveResources(*(*I)->getInstr()) && "These instructions have already been scheduled."); - Resources->reserveResources(*(*I)->getInstr()); + ProcItinResources.reserveResources(*(*I)->getInstr()); } } if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) || - Resources->canReserveResources(*SU->getInstr())) { + ProcItinResources.canReserveResources(*SU->getInstr())) { LLVM_DEBUG({ dbgs() << "\tinsert at cycle " << curCycle << " "; SU->getInstr()->dump(); @@ -3360,6 +3559,14 @@ void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU, if (Pos < MoveUse) MoveUse = Pos; } + // We did not handle HW dependences in previous for loop, + // and we normally set Latency = 0 for Anti deps, + // so may have nodes in same cycle with Anti denpendent on HW regs. + else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) { + OrderBeforeUse = true; + if ((MoveUse == 0) || (Pos < MoveUse)) + MoveUse = Pos; + } } for (auto &P : SU->Preds) { if (P.getSUnit() != *I) @@ -3523,9 +3730,8 @@ void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const { for (SDep &PredEdge : SU->Preds) { SUnit *PredSU = PredEdge.getSUnit(); - unsigned PredIndex = - std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(), - std::make_pair(PredSU, 0), CompareKey)); + unsigned PredIndex = std::get<1>( + *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey)); if (!PredSU->getInstr()->isPHI() && PredIndex < Index) { PredBefore = true; Pred = PredSU; @@ -3535,9 +3741,13 @@ void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const { for (SDep &SuccEdge : SU->Succs) { SUnit *SuccSU = SuccEdge.getSUnit(); - unsigned SuccIndex = - std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(), - std::make_pair(SuccSU, 0), CompareKey)); + // Do not process a boundary node, it was not included in NodeOrder, + // hence not in Indices either, call to std::lower_bound() below will + // return Indices.end(). + if (SuccSU->isBoundaryNode()) + continue; + unsigned SuccIndex = std::get<1>( + *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey)); if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) { SuccBefore = true; Succ = SuccSU; @@ -3548,9 +3758,8 @@ void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const { if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) { // instructions in circuits are allowed to be scheduled // after both a successor and predecessor. - bool InCircuit = std::any_of( - Circuits.begin(), Circuits.end(), - [SU](const NodeSet &Circuit) { return Circuit.count(SU); }); + bool InCircuit = llvm::any_of( + Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); }); if (InCircuit) LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";); else { @@ -3740,5 +3949,140 @@ LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); } #endif +void ResourceManager::initProcResourceVectors( + const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) { + unsigned ProcResourceID = 0; + + // We currently limit the resource kinds to 64 and below so that we can use + // uint64_t for Masks + assert(SM.getNumProcResourceKinds() < 64 && + "Too many kinds of resources, unsupported"); + // Create a unique bitmask for every processor resource unit. + // Skip resource at index 0, since it always references 'InvalidUnit'. + Masks.resize(SM.getNumProcResourceKinds()); + for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) { + const MCProcResourceDesc &Desc = *SM.getProcResource(I); + if (Desc.SubUnitsIdxBegin) + continue; + Masks[I] = 1ULL << ProcResourceID; + ProcResourceID++; + } + // Create a unique bitmask for every processor resource group. + for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) { + const MCProcResourceDesc &Desc = *SM.getProcResource(I); + if (!Desc.SubUnitsIdxBegin) + continue; + Masks[I] = 1ULL << ProcResourceID; + for (unsigned U = 0; U < Desc.NumUnits; ++U) + Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]]; + ProcResourceID++; + } + LLVM_DEBUG({ + if (SwpShowResMask) { + dbgs() << "ProcResourceDesc:\n"; + for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) { + const MCProcResourceDesc *ProcResource = SM.getProcResource(I); + dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n", + ProcResource->Name, I, Masks[I], + ProcResource->NumUnits); + } + dbgs() << " -----------------\n"; + } + }); +} + +bool ResourceManager::canReserveResources(const MCInstrDesc *MID) const { + + LLVM_DEBUG({ + if (SwpDebugResource) + dbgs() << "canReserveResources:\n"; + }); + if (UseDFA) + return DFAResources->canReserveResources(MID); + + unsigned InsnClass = MID->getSchedClass(); + const MCSchedClassDesc *SCDesc = SM.getSchedClassDesc(InsnClass); + if (!SCDesc->isValid()) { + LLVM_DEBUG({ + dbgs() << "No valid Schedule Class Desc for schedClass!\n"; + dbgs() << "isPseduo:" << MID->isPseudo() << "\n"; + }); + return true; + } + + const MCWriteProcResEntry *I = STI->getWriteProcResBegin(SCDesc); + const MCWriteProcResEntry *E = STI->getWriteProcResEnd(SCDesc); + for (; I != E; ++I) { + if (!I->Cycles) + continue; + const MCProcResourceDesc *ProcResource = + SM.getProcResource(I->ProcResourceIdx); + unsigned NumUnits = ProcResource->NumUnits; + LLVM_DEBUG({ + if (SwpDebugResource) + dbgs() << format(" %16s(%2d): Count: %2d, NumUnits:%2d, Cycles:%2d\n", + ProcResource->Name, I->ProcResourceIdx, + ProcResourceCount[I->ProcResourceIdx], NumUnits, + I->Cycles); + }); + if (ProcResourceCount[I->ProcResourceIdx] >= NumUnits) + return false; + } + LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return true\n\n";); + return true; +} + +void ResourceManager::reserveResources(const MCInstrDesc *MID) { + LLVM_DEBUG({ + if (SwpDebugResource) + dbgs() << "reserveResources:\n"; + }); + if (UseDFA) + return DFAResources->reserveResources(MID); + unsigned InsnClass = MID->getSchedClass(); + const MCSchedClassDesc *SCDesc = SM.getSchedClassDesc(InsnClass); + if (!SCDesc->isValid()) { + LLVM_DEBUG({ + dbgs() << "No valid Schedule Class Desc for schedClass!\n"; + dbgs() << "isPseduo:" << MID->isPseudo() << "\n"; + }); + return; + } + for (const MCWriteProcResEntry &PRE : + make_range(STI->getWriteProcResBegin(SCDesc), + STI->getWriteProcResEnd(SCDesc))) { + if (!PRE.Cycles) + continue; + ++ProcResourceCount[PRE.ProcResourceIdx]; + LLVM_DEBUG({ + if (SwpDebugResource) { + const MCProcResourceDesc *ProcResource = + SM.getProcResource(PRE.ProcResourceIdx); + dbgs() << format(" %16s(%2d): Count: %2d, NumUnits:%2d, Cycles:%2d\n", + ProcResource->Name, PRE.ProcResourceIdx, + ProcResourceCount[PRE.ProcResourceIdx], + ProcResource->NumUnits, PRE.Cycles); + } + }); + } + LLVM_DEBUG({ + if (SwpDebugResource) + dbgs() << "reserveResources: done!\n\n"; + }); +} + +bool ResourceManager::canReserveResources(const MachineInstr &MI) const { + return canReserveResources(&MI.getDesc()); +} + +void ResourceManager::reserveResources(const MachineInstr &MI) { + return reserveResources(&MI.getDesc()); +} + +void ResourceManager::clearResources() { + if (UseDFA) + return DFAResources->clearResources(); + std::fill(ProcResourceCount.begin(), ProcResourceCount.end(), 0); +} |