src - FreeBSD source tree

diff options


context:
space:
mode:

author	Dimitry Andric <dim@FreeBSD.org>	2019-08-20 20:50:12 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2019-08-20 20:50:12 +0000
commit	e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (patch)
tree	599ab169a01f1c86eda9adc774edaedde2f2db5b /lib/CodeGen/MachinePipeliner.cpp
parent	1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (diff)

vendor/llvm/llvm-trunk-r366426

Diffstat (limited to 'lib/CodeGen/MachinePipeliner.cpp')

-rw-r--r--

lib/CodeGen/MachinePipeliner.cpp

534

1 files changed, 439 insertions, 95 deletions

diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index 4d451bdd7f69..54df522d371a 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp

@@ -1,9 +1,8 @@

//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//

-// The LLVM Compiler Infrastructure

-//

-// This file is distributed under the University of Illinois Open Source

-// License. See LICENSE.TXT for details.

+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

+// See https://llvm.org/LICENSE.txt for license information.

+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//===----------------------------------------------------------------------===//

@@ -97,6 +96,14 @@ using namespace llvm;

STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");

STATISTIC(NumPipelined, "Number of loops software pipelined");

STATISTIC(NumNodeOrderIssues, "Number of node order issues found");

+STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");

+STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");

+STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");

+STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");

+STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");

+STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");

+STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");

+STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");

/// A command line option to turn software pipelining on or off.

static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),

@@ -141,6 +148,11 @@ static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",

cl::ReallyHidden, cl::init(false),

cl::ZeroOrMore, cl::desc("Ignore RecMII"));

+static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,

+ cl::init(false));

+static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,

+ cl::init(false));

namespace llvm {

// A command line option to enable the CopyToPhi DAG mutation.

@@ -180,6 +192,16 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {

!EnableSWPOptSize.getPosition())

return false;

+ if (!mf.getSubtarget().enableMachinePipeliner())

+ return false;

+ // Cannot pipeline loops without instruction itineraries if we are using

+ // DFA for the pipeliner.

+ if (mf.getSubtarget().useDFAforSMS() &&

+ (!mf.getSubtarget().getInstrItineraryData() ||

+ mf.getSubtarget().getInstrItineraryData()->isEmpty()))

+ return false;

MF = &mf;

MLI = &getAnalysis<MachineLoopInfo>();

MDT = &getAnalysis<MachineDominatorTree>();

@@ -211,8 +233,11 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) {

}

#endif

- if (!canPipelineLoop(L))

+ setPragmaPipelineOptions(L);

+ if (!canPipelineLoop(L)) {

+ LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");

return Changed;

+ }

++NumTrytoPipeline;

@@ -221,6 +246,50 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) {

return Changed;

}

+void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {

+ MachineBasicBlock *LBLK = L.getTopBlock();

+ if (LBLK == nullptr)

+ return;

+ const BasicBlock *BBLK = LBLK->getBasicBlock();

+ if (BBLK == nullptr)

+ return;

+ const Instruction *TI = BBLK->getTerminator();

+ if (TI == nullptr)

+ return;

+ MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);

+ if (LoopID == nullptr)

+ return;

+ assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");

+ assert(LoopID->getOperand(0) == LoopID && "invalid loop");

+ for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {

+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));

+ if (MD == nullptr)

+ continue;

+ MDString *S = dyn_cast<MDString>(MD->getOperand(0));

+ if (S == nullptr)

+ continue;

+ if (S->getString() == "llvm.loop.pipeline.initiationinterval") {

+ assert(MD->getNumOperands() == 2 &&

+ "Pipeline initiation interval hint metadata should have two operands.");

+ II_setByPragma =

+ mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();

+ assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");

+ } else if (S->getString() == "llvm.loop.pipeline.disable") {

+ disabledByPragma = true;

+ }

/// Return true if the loop can be software pipelined. The algorithm is

/// restricted to loops with a single basic block. Make sure that the

/// branch in the loop can be analyzed.

@@ -228,21 +297,36 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {

if (L.getNumBlocks() != 1)

return false;

+ if (disabledByPragma)

+ return false;

// Check if the branch can't be understood because we can't do pipelining

// if that's the case.

LI.TBB = nullptr;

LI.FBB = nullptr;

LI.BrCond.clear();

- if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond))

+ if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {

+ LLVM_DEBUG(

+ dbgs() << "Unable to analyzeBranch, can NOT pipeline current Loop\n");

+ NumFailBranch++;

return false;

+ }

LI.LoopInductionVar = nullptr;

LI.LoopCompare = nullptr;

- if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare))

+ if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare)) {

+ LLVM_DEBUG(

+ dbgs() << "Unable to analyzeLoop, can NOT pipeline current Loop\n");

+ NumFailLoop++;

return false;

+ }

- if (!L.getLoopPreheader())

+ if (!L.getLoopPreheader()) {

+ LLVM_DEBUG(

+ dbgs() << "Preheader not found, can NOT pipeline current Loop\n");

+ NumFailPreheader++;

return false;

+ }

// Remove any subregisters from inputs to phi nodes.

preprocessPhiNodes(*L.getHeader());

@@ -286,7 +370,8 @@ void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {

bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {

assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");

- SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo);

+ SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,

+ II_setByPragma);

MachineBasicBlock *MBB = L.getHeader();

// The kernel should not include any terminator instructions. These

@@ -309,6 +394,20 @@ bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {

return SMS.hasNewSchedule();

}

+void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {

+ if (II_setByPragma > 0)

+ MII = II_setByPragma;

+ else

+ MII = std::max(ResMII, RecMII);

+void SwingSchedulerDAG::setMAX_II() {

+ if (II_setByPragma > 0)

+ MAX_II = II_setByPragma;

+ else

+ MAX_II = MII + 10;

/// We override the schedule function in ScheduleDAGInstrs to implement the

/// scheduling part of the Swing Modulo Scheduling algorithm.

void SwingSchedulerDAG::schedule() {

@@ -335,17 +434,28 @@ void SwingSchedulerDAG::schedule() {

if (SwpIgnoreRecMII)

RecMII = 0;

- MII = std::max(ResMII, RecMII);

- LLVM_DEBUG(dbgs() << "MII = " << MII << " (rec=" << RecMII

- << ", res=" << ResMII << ")\n");

+ setMII(ResMII, RecMII);

+ setMAX_II();

+ LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II

+ << " (rec=" << RecMII << ", res=" << ResMII << ")\n");

// Can't schedule a loop without a valid MII.

- if (MII == 0)

+ if (MII == 0) {

+ LLVM_DEBUG(

+ dbgs()

+ << "0 is not a valid Minimal Initiation Interval, can NOT schedule\n");

+ NumFailZeroMII++;

return;

+ }

// Don't pipeline large loops.

- if (SwpMaxMii != -1 && (int)MII > SwpMaxMii)

+ if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {

+ LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii

+ << ", we don't pipleline large loops\n");

+ NumFailLargeMaxMII++;

return;

+ }

computeNodeFunctions(NodeSets);

@@ -362,7 +472,7 @@ void SwingSchedulerDAG::schedule() {

}

});

- std::stable_sort(NodeSets.begin(), NodeSets.end(), std::greater<NodeSet>());

+ llvm::stable_sort(NodeSets, std::greater<NodeSet>());

groupRemainingNodes(NodeSets);

@@ -383,17 +493,27 @@ void SwingSchedulerDAG::schedule() {

SMSchedule Schedule(Pass.MF);

Scheduled = schedulePipeline(Schedule);

- if (!Scheduled)

+ if (!Scheduled){

+ LLVM_DEBUG(dbgs() << "No schedule found, return\n");

+ NumFailNoSchedule++;

return;

+ }

unsigned numStages = Schedule.getMaxStageCount();

// No need to generate pipeline if there are no overlapped iterations.

- if (numStages == 0)

+ if (numStages == 0) {

+ LLVM_DEBUG(

+ dbgs() << "No overlapped iterations, no need to generate pipeline\n");

+ NumFailZeroStage++;

return;

+ }

// Check that the maximum stage count is less than user-defined limit.

- if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages)

+ if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {

+ LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages

+ << " : too many stages, abort\n");

+ NumFailLargeMaxStage++;

return;

+ }

generatePipelinedLoop(Schedule);

++NumPipelined;

@@ -467,7 +587,8 @@ static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {

/// Return true if the instruction causes a chain between memory

/// references before and after it.

static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {

- return MI.isCall() || MI.hasUnmodeledSideEffects() ||

+ return MI.isCall() || MI.mayRaiseFPException() ||

+ MI.hasUnmodeledSideEffects() ||

(MI.hasOrderedMemoryRef() &&

(!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA)));

}

@@ -475,16 +596,16 @@ static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {

/// Return the underlying objects for the memory references of an instruction.

/// This function calls the code in ValueTracking, but first checks that the

/// instruction has a memory operand.

-static void getUnderlyingObjects(MachineInstr *MI,

- SmallVectorImpl<Value *> &Objs,

+static void getUnderlyingObjects(const MachineInstr *MI,

+ SmallVectorImpl<const Value *> &Objs,

const DataLayout &DL) {

if (!MI->hasOneMemOperand())

return;

MachineMemOperand *MM = *MI->memoperands_begin();

if (!MM->getValue())

return;

- GetUnderlyingObjects(const_cast<Value *>(MM->getValue()), Objs, DL);

- for (Value *V : Objs) {

+ GetUnderlyingObjects(MM->getValue(), Objs, DL);

+ for (const Value *V : Objs) {

if (!isIdentifiedObject(V)) {

Objs.clear();

return;

@@ -498,7 +619,7 @@ static void getUnderlyingObjects(MachineInstr *MI,

/// dependence. This code is very similar to the code in ScheduleDAGInstrs

/// but that code doesn't create loop carried dependences.

void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {

- MapVector<Value *, SmallVector<SUnit *, 4>> PendingLoads;

+ MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads;

Value *UnknownValue =

UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));

for (auto &SU : SUnits) {

@@ -506,7 +627,7 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {

if (isDependenceBarrier(MI, AA))

PendingLoads.clear();

else if (MI.mayLoad()) {

- SmallVector<Value *, 4> Objs;

+ SmallVector<const Value *, 4> Objs;

getUnderlyingObjects(&MI, Objs, MF.getDataLayout());

if (Objs.empty())

Objs.push_back(UnknownValue);

@@ -515,12 +636,12 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {

SUs.push_back(&SU);

}

} else if (MI.mayStore()) {

- SmallVector<Value *, 4> Objs;

+ SmallVector<const Value *, 4> Objs;

getUnderlyingObjects(&MI, Objs, MF.getDataLayout());

if (Objs.empty())

Objs.push_back(UnknownValue);

for (auto V : Objs) {

- MapVector<Value *, SmallVector<SUnit *, 4>>::iterator I =

+ MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =

PendingLoads.find(V);

if (I == PendingLoads.end())

continue;

@@ -531,7 +652,7 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {

// First, perform the cheaper check that compares the base register.

// If they are the same and the load offset is less than the store

// offset, then mark the dependence as loop carried potentially.

- MachineOperand *BaseOp1, *BaseOp2;

+ const MachineOperand *BaseOp1, *BaseOp2;

int64_t Offset1, Offset2;

if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1, TRI) &&

TII->getMemOperandWithOffset(MI, BaseOp2, Offset2, TRI)) {

@@ -744,27 +865,55 @@ namespace {

// the number of functional unit choices.

struct FuncUnitSorter {

const InstrItineraryData *InstrItins;

+ const MCSubtargetInfo *STI;

DenseMap<unsigned, unsigned> Resources;

- FuncUnitSorter(const InstrItineraryData *IID) : InstrItins(IID) {}

+ FuncUnitSorter(const TargetSubtargetInfo &TSI)

+ : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

// Compute the number of functional unit alternatives needed

// at each stage, and take the minimum value. We prioritize the

// instructions by the least number of choices first.

unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const {

- unsigned schedClass = Inst->getDesc().getSchedClass();

+ unsigned SchedClass = Inst->getDesc().getSchedClass();

unsigned min = UINT_MAX;

- for (const InstrStage *IS = InstrItins->beginStage(schedClass),

- *IE = InstrItins->endStage(schedClass);

- IS != IE; ++IS) {

- unsigned funcUnits = IS->getUnits();

- unsigned numAlternatives = countPopulation(funcUnits);

- if (numAlternatives < min) {

- min = numAlternatives;

- F = funcUnits;

+ if (InstrItins && !InstrItins->isEmpty()) {

+ for (const InstrStage &IS :

+ make_range(InstrItins->beginStage(SchedClass),

+ InstrItins->endStage(SchedClass))) {

+ unsigned funcUnits = IS.getUnits();

+ unsigned numAlternatives = countPopulation(funcUnits);

+ if (numAlternatives < min) {

+ min = numAlternatives;

+ F = funcUnits;

+ }

}

+ return min;

+ }

+ if (STI && STI->getSchedModel().hasInstrSchedModel()) {

+ const MCSchedClassDesc *SCDesc =

+ STI->getSchedModel().getSchedClassDesc(SchedClass);

+ if (!SCDesc->isValid())

+ // No valid Schedule Class Desc for schedClass, should be

+ // Pseudo/PostRAPseudo

+ return min;

+ for (const MCWriteProcResEntry &PRE :

+ make_range(STI->getWriteProcResBegin(SCDesc),

+ STI->getWriteProcResEnd(SCDesc))) {

+ if (!PRE.Cycles)

+ continue;

+ const MCProcResourceDesc *ProcResource =

+ STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);

+ unsigned NumUnits = ProcResource->NumUnits;

+ if (NumUnits < min) {

+ min = NumUnits;

+ F = PRE.ProcResourceIdx;

+ }

+ return min;

}

- return min;

+ llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");

}

// Compute the critical resources needed by the instruction. This

@@ -774,13 +923,34 @@ struct FuncUnitSorter {

// the same, highly used, functional unit have high priority.

void calcCriticalResources(MachineInstr &MI) {

unsigned SchedClass = MI.getDesc().getSchedClass();

- for (const InstrStage *IS = InstrItins->beginStage(SchedClass),

- *IE = InstrItins->endStage(SchedClass);

- IS != IE; ++IS) {

- unsigned FuncUnits = IS->getUnits();

- if (countPopulation(FuncUnits) == 1)

- Resources[FuncUnits]++;

+ if (InstrItins && !InstrItins->isEmpty()) {

+ for (const InstrStage &IS :

+ make_range(InstrItins->beginStage(SchedClass),

+ InstrItins->endStage(SchedClass))) {

+ unsigned FuncUnits = IS.getUnits();

+ if (countPopulation(FuncUnits) == 1)

+ Resources[FuncUnits]++;

+ }

+ return;

+ }

+ if (STI && STI->getSchedModel().hasInstrSchedModel()) {

+ const MCSchedClassDesc *SCDesc =

+ STI->getSchedModel().getSchedClassDesc(SchedClass);

+ if (!SCDesc->isValid())

+ // No valid Schedule Class Desc for schedClass, should be

+ // Pseudo/PostRAPseudo

+ return;

+ for (const MCWriteProcResEntry &PRE :

+ make_range(STI->getWriteProcResBegin(SCDesc),

+ STI->getWriteProcResEnd(SCDesc))) {

+ if (!PRE.Cycles)

+ continue;

+ Resources[PRE.ProcResourceIdx]++;

+ }

+ return;

}

+ llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");

}

/// Return true if IS1 has less priority than IS2.

@@ -803,14 +973,15 @@ struct FuncUnitSorter {

/// to add it to each existing DFA, until a legal space is found. If the

/// instruction cannot be reserved in an existing DFA, we create a new one.

unsigned SwingSchedulerDAG::calculateResMII() {

- SmallVector<DFAPacketizer *, 8> Resources;

+ LLVM_DEBUG(dbgs() << "calculateResMII:\n");

+ SmallVector<ResourceManager*, 8> Resources;

MachineBasicBlock *MBB = Loop.getHeader();

- Resources.push_back(TII->CreateTargetScheduleState(MF.getSubtarget()));

+ Resources.push_back(new ResourceManager(&MF.getSubtarget()));

// Sort the instructions by the number of available choices for scheduling,

// least to most. Use the number of critical resources as the tie breaker.

- FuncUnitSorter FUS =

- FuncUnitSorter(MF.getSubtarget().getInstrItineraryData());

+ FuncUnitSorter FUS = FuncUnitSorter(MF.getSubtarget());

for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),

E = MBB->getFirstTerminator();

I != E; ++I)

@@ -832,33 +1003,40 @@ unsigned SwingSchedulerDAG::calculateResMII() {

// DFA is needed for each cycle.

unsigned NumCycles = getSUnit(MI)->Latency;

unsigned ReservedCycles = 0;

- SmallVectorImpl<DFAPacketizer *>::iterator RI = Resources.begin();

- SmallVectorImpl<DFAPacketizer *>::iterator RE = Resources.end();

+ SmallVectorImpl<ResourceManager *>::iterator RI = Resources.begin();

+ SmallVectorImpl<ResourceManager *>::iterator RE = Resources.end();

+ LLVM_DEBUG({

+ dbgs() << "Trying to reserve resource for " << NumCycles

+ << " cycles for \n";

+ MI->dump();

+ });

for (unsigned C = 0; C < NumCycles; ++C)

while (RI != RE) {

- if ((*RI++)->canReserveResources(*MI)) {

+ if ((*RI)->canReserveResources(*MI)) {

+ (*RI)->reserveResources(*MI);

++ReservedCycles;

break;

}

+ RI++;

}

- // Start reserving resources using existing DFAs.

- for (unsigned C = 0; C < ReservedCycles; ++C) {

- --RI;

- (*RI)->reserveResources(*MI);

- }

+ LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles

+ << ", NumCycles:" << NumCycles << "\n");

// Add new DFAs, if needed, to reserve resources.

for (unsigned C = ReservedCycles; C < NumCycles; ++C) {

- DFAPacketizer *NewResource =

- TII->CreateTargetScheduleState(MF.getSubtarget());

+ LLVM_DEBUG(if (SwpDebugResource) dbgs()

+ << "NewResource created to reserve resources"

+ << "\n");

+ ResourceManager *NewResource = new ResourceManager(&MF.getSubtarget());

assert(NewResource->canReserveResources(*MI) && "Reserve error.");

NewResource->reserveResources(*MI);

Resources.push_back(NewResource);

}

int Resmii = Resources.size();

+ LLVM_DEBUG(dbgs() << "Retrun Res MII:" << Resmii << "\n");

// Delete the memory for each of the DFAs that were created earlier.

- for (DFAPacketizer *RI : Resources) {

- DFAPacketizer *D = RI;

+ for (ResourceManager *RI : Resources) {

+ ResourceManager *D = RI;

delete D;

}

Resources.clear();

@@ -1517,7 +1695,7 @@ void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {

}

-/// Add the node to the set, and add all is its connected nodes to the set.

+/// Add the node to the set, and add all of its connected nodes to the set.

void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,

SetVector<SUnit *> &NodesAdded) {

NewSet.insert(SU);

@@ -1741,12 +1919,16 @@ void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {

/// Process the nodes in the computed order and create the pipelined schedule

/// of the instructions, if possible. Return true if a schedule is found.

bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {

- if (NodeOrder.empty())

+ if (NodeOrder.empty()){

+ LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );

return false;

+ }

bool scheduleFound = false;

+ unsigned II = 0;

// Keep increasing II until a valid schedule is found.

- for (unsigned II = MII; II < MII + 10 && !scheduleFound; ++II) {

+ for (II = MII; II <= MAX_II && !scheduleFound; ++II) {

Schedule.reset();

Schedule.setInitiationInterval(II);

LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");

@@ -1767,13 +1949,14 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {

Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,

II, this);

LLVM_DEBUG({

+ dbgs() << "\n";

dbgs() << "Inst (" << SU->NodeNum << ") ";

SU->getInstr()->dump();

dbgs() << "\n";

});

LLVM_DEBUG({

- dbgs() << "\tes: " << EarlyStart << " ls: " << LateStart

- << " me: " << SchedEnd << " ms: " << SchedStart << "\n";

+ dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart,

+ LateStart, SchedEnd, SchedStart);

});

if (EarlyStart > LateStart || SchedEnd < EarlyStart ||

@@ -1818,7 +2001,8 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {

scheduleFound = Schedule.isValidSchedule(this);

}

- LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << "\n");

+ LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << " (II=" << II

+ << ")\n");

if (scheduleFound)

Schedule.finalizeSchedule(this);

@@ -1847,6 +2031,10 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {

InstrMapTy InstrMap;

SmallVector<MachineBasicBlock *, 4> PrologBBs;

+ MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();

+ assert(PreheaderBB != nullptr &&

+ "Need to add code to handle loops w/o preheader");

// Generate the prolog instructions that set up the pipeline.

generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);

MF.insert(BB->getIterator(), KernelBB);

@@ -1903,7 +2091,7 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {

removeDeadInstructions(KernelBB, EpilogBBs);

// Add branches between prolog and epilog blocks.

- addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);

+ addBranches(*PreheaderBB, PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);

// Remove the original loop since it's no longer referenced.

for (auto &I : *BB)

@@ -2242,7 +2430,7 @@ void SwingSchedulerDAG::generateExistingPhis(

// Use the value defined by the Phi, unless we're generating the first

// epilog and the Phi refers to a Phi in a different stage.

else if (VRMap[PrevStage - np].count(Def) &&

- (!LoopDefIsPhi || PrevStage != LastStageNum))

+ (!LoopDefIsPhi || (PrevStage != LastStageNum) || (LoopValStage == StageScheduled)))

PhiOp2 = VRMap[PrevStage - np][Def];

}

@@ -2588,7 +2776,8 @@ static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {

/// Create branches from each prolog basic block to the appropriate epilog

/// block. These edges are needed if the loop ends before reaching the

/// kernel.

-void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,

+void SwingSchedulerDAG::addBranches(MachineBasicBlock &PreheaderBB,

+ MBBVectorTy &PrologBBs,

MachineBasicBlock *KernelBB,

MBBVectorTy &EpilogBBs,

SMSchedule &Schedule, ValueMapTy *VRMap) {

@@ -2615,8 +2804,8 @@ void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,

// Check if the LOOP0 has already been removed. If so, then there is no need

// to reduce the trip count.

if (LC != 0)

- LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j,

- MaxIter);

+ LC = TII->reduceLoopCount(*Prolog, PreheaderBB, IndVar, *Cmp, Cond,

+ PrevInsts, j, MaxIter);

// Record the value of the first trip count, which is used to determine if

// branches and blocks can be removed for constant trip counts.

@@ -2657,7 +2846,7 @@ void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,

/// during each iteration. Set Delta to the amount of the change.

bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {

const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();

- MachineOperand *BaseOp;

+ const MachineOperand *BaseOp;

int64_t Offset;

if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI))

return false;

@@ -2698,7 +2887,9 @@ void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI,

return;

SmallVector<MachineMemOperand *, 2> NewMMOs;

for (MachineMemOperand *MMO : NewMI.memoperands()) {

- if (MMO->isVolatile() || (MMO->isInvariant() && MMO->isDereferenceable()) ||

+ // TODO: Figure out whether isAtomic is really necessary (see D57601).

+ if (MMO->isVolatile() || MMO->isAtomic() ||

+ (MMO->isInvariant() && MMO->isDereferenceable()) ||

(!MMO->getValue())) {

NewMMOs.push_back(MMO);

continue;

@@ -3058,6 +3249,7 @@ bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,

// Assume ordered loads and stores may have a loop carried dependence.

if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||

+ SI->mayRaiseFPException() || DI->mayRaiseFPException() ||

SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())

return true;

@@ -3069,7 +3261,7 @@ bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,

if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))

return true;

- MachineOperand *BaseOpS, *BaseOpD;

+ const MachineOperand *BaseOpS, *BaseOpD;

int64_t OffsetS, OffsetD;

const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();

if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, TRI) ||

@@ -3097,12 +3289,14 @@ bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,

// This is the main test, which checks the offset values and the loop

// increment value to determine if the accesses may be loop carried.

- if (OffsetS >= OffsetD)

- return OffsetS + AccessSizeS > DeltaS;

- else

- return OffsetD + AccessSizeD > DeltaD;

+ if (AccessSizeS == MemoryLocation::UnknownSize ||

+ AccessSizeD == MemoryLocation::UnknownSize)

+ return true;

- return true;

+ if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)

+ return true;

+ return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);

}

void SwingSchedulerDAG::postprocessDAG() {

@@ -3117,6 +3311,10 @@ void SwingSchedulerDAG::postprocessDAG() {

/// the relative values of StartCycle and EndCycle.

bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {

bool forward = true;

+ LLVM_DEBUG({

+ dbgs() << "Trying to insert node between " << StartCycle << " and "

+ << EndCycle << " II: " << II << "\n";

+ });

if (StartCycle > EndCycle)

forward = false;

@@ -3125,8 +3323,9 @@ bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {

for (int curCycle = StartCycle; curCycle != termCycle;

forward ? ++curCycle : --curCycle) {

- // Add the already scheduled instructions at the specified cycle to the DFA.

- Resources->clearResources();

+ // Add the already scheduled instructions at the specified cycle to the

+ // DFA.

+ ProcItinResources.clearResources();

for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);

checkCycle <= LastCycle; checkCycle += II) {

std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];

@@ -3136,13 +3335,13 @@ bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {

I != E; ++I) {

if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))

continue;

- assert(Resources->canReserveResources(*(*I)->getInstr()) &&

+ assert(ProcItinResources.canReserveResources(*(*I)->getInstr()) &&

"These instructions have already been scheduled.");

- Resources->reserveResources(*(*I)->getInstr());

+ ProcItinResources.reserveResources(*(*I)->getInstr());

}

if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||

- Resources->canReserveResources(*SU->getInstr())) {

+ ProcItinResources.canReserveResources(*SU->getInstr())) {

LLVM_DEBUG({

dbgs() << "\tinsert at cycle " << curCycle << " ";

SU->getInstr()->dump();

@@ -3360,6 +3559,14 @@ void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,

if (Pos < MoveUse)

MoveUse = Pos;

}

+ // We did not handle HW dependences in previous for loop,

+ // and we normally set Latency = 0 for Anti deps,

+ // so may have nodes in same cycle with Anti denpendent on HW regs.

+ else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) {

+ OrderBeforeUse = true;

+ if ((MoveUse == 0) || (Pos < MoveUse))

+ MoveUse = Pos;

+ }

}

for (auto &P : SU->Preds) {

if (P.getSUnit() != *I)

@@ -3523,9 +3730,8 @@ void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {

for (SDep &PredEdge : SU->Preds) {

SUnit *PredSU = PredEdge.getSUnit();

- unsigned PredIndex =

- std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),

- std::make_pair(PredSU, 0), CompareKey));

+ unsigned PredIndex = std::get<1>(

+ *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));

if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {

PredBefore = true;

Pred = PredSU;

@@ -3535,9 +3741,13 @@ void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {

for (SDep &SuccEdge : SU->Succs) {

SUnit *SuccSU = SuccEdge.getSUnit();

- unsigned SuccIndex =

- std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),

- std::make_pair(SuccSU, 0), CompareKey));

+ // Do not process a boundary node, it was not included in NodeOrder,

+ // hence not in Indices either, call to std::lower_bound() below will

+ // return Indices.end().

+ if (SuccSU->isBoundaryNode())

+ continue;

+ unsigned SuccIndex = std::get<1>(

+ *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));

if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {

SuccBefore = true;

Succ = SuccSU;

@@ -3548,9 +3758,8 @@ void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {

if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {

// instructions in circuits are allowed to be scheduled

// after both a successor and predecessor.

- bool InCircuit = std::any_of(

- Circuits.begin(), Circuits.end(),

- [SU](const NodeSet &Circuit) { return Circuit.count(SU); });

+ bool InCircuit = llvm::any_of(

+ Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });

if (InCircuit)

LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);

else {

@@ -3740,5 +3949,140 @@ LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); }

#endif

+void ResourceManager::initProcResourceVectors(

+ const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {

+ unsigned ProcResourceID = 0;

+ // We currently limit the resource kinds to 64 and below so that we can use

+ // uint64_t for Masks

+ assert(SM.getNumProcResourceKinds() < 64 &&

+ "Too many kinds of resources, unsupported");

+ // Create a unique bitmask for every processor resource unit.

+ // Skip resource at index 0, since it always references 'InvalidUnit'.

+ Masks.resize(SM.getNumProcResourceKinds());

+ for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {

+ const MCProcResourceDesc &Desc = *SM.getProcResource(I);

+ if (Desc.SubUnitsIdxBegin)

+ continue;

+ Masks[I] = 1ULL << ProcResourceID;

+ ProcResourceID++;

+ }

+ // Create a unique bitmask for every processor resource group.

+ for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {

+ const MCProcResourceDesc &Desc = *SM.getProcResource(I);

+ if (!Desc.SubUnitsIdxBegin)

+ continue;

+ Masks[I] = 1ULL << ProcResourceID;

+ for (unsigned U = 0; U < Desc.NumUnits; ++U)

+ Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];

+ ProcResourceID++;

+ }

+ LLVM_DEBUG({

+ if (SwpShowResMask) {

+ dbgs() << "ProcResourceDesc:\n";

+ for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {

+ const MCProcResourceDesc *ProcResource = SM.getProcResource(I);

+ dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",

+ ProcResource->Name, I, Masks[I],

+ ProcResource->NumUnits);

+ }

+ dbgs() << " -----------------\n";

+ }

+ });

+bool ResourceManager::canReserveResources(const MCInstrDesc *MID) const {

+ LLVM_DEBUG({

+ if (SwpDebugResource)

+ dbgs() << "canReserveResources:\n";

+ });

+ if (UseDFA)

+ return DFAResources->canReserveResources(MID);

+ unsigned InsnClass = MID->getSchedClass();

+ const MCSchedClassDesc *SCDesc = SM.getSchedClassDesc(InsnClass);

+ if (!SCDesc->isValid()) {

+ LLVM_DEBUG({

+ dbgs() << "No valid Schedule Class Desc for schedClass!\n";

+ dbgs() << "isPseduo:" << MID->isPseudo() << "\n";

+ });

+ return true;

+ }

+ const MCWriteProcResEntry *I = STI->getWriteProcResBegin(SCDesc);

+ const MCWriteProcResEntry *E = STI->getWriteProcResEnd(SCDesc);

+ for (; I != E; ++I) {

+ if (!I->Cycles)

+ continue;

+ const MCProcResourceDesc *ProcResource =

+ SM.getProcResource(I->ProcResourceIdx);

+ unsigned NumUnits = ProcResource->NumUnits;

+ LLVM_DEBUG({

+ if (SwpDebugResource)

+ dbgs() << format(" %16s(%2d): Count: %2d, NumUnits:%2d, Cycles:%2d\n",

+ ProcResource->Name, I->ProcResourceIdx,

+ ProcResourceCount[I->ProcResourceIdx], NumUnits,

+ I->Cycles);

+ });

+ if (ProcResourceCount[I->ProcResourceIdx] >= NumUnits)

+ return false;

+ }

+ LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return true\n\n";);

+ return true;

+void ResourceManager::reserveResources(const MCInstrDesc *MID) {

+ LLVM_DEBUG({

+ if (SwpDebugResource)

+ dbgs() << "reserveResources:\n";

+ });

+ if (UseDFA)

+ return DFAResources->reserveResources(MID);

+ unsigned InsnClass = MID->getSchedClass();

+ const MCSchedClassDesc *SCDesc = SM.getSchedClassDesc(InsnClass);

+ if (!SCDesc->isValid()) {

+ LLVM_DEBUG({

+ dbgs() << "No valid Schedule Class Desc for schedClass!\n";

+ dbgs() << "isPseduo:" << MID->isPseudo() << "\n";

+ });

+ return;

+ }

+ for (const MCWriteProcResEntry &PRE :

+ make_range(STI->getWriteProcResBegin(SCDesc),

+ STI->getWriteProcResEnd(SCDesc))) {

+ if (!PRE.Cycles)

+ continue;

+ ++ProcResourceCount[PRE.ProcResourceIdx];

+ LLVM_DEBUG({

+ if (SwpDebugResource) {

+ const MCProcResourceDesc *ProcResource =

+ SM.getProcResource(PRE.ProcResourceIdx);

+ dbgs() << format(" %16s(%2d): Count: %2d, NumUnits:%2d, Cycles:%2d\n",

+ ProcResource->Name, PRE.ProcResourceIdx,

+ ProcResourceCount[PRE.ProcResourceIdx],

+ ProcResource->NumUnits, PRE.Cycles);

+ }

+ });

+ }

+ LLVM_DEBUG({

+ if (SwpDebugResource)

+ dbgs() << "reserveResources: done!\n\n";

+ });

+bool ResourceManager::canReserveResources(const MachineInstr &MI) const {

+ return canReserveResources(&MI.getDesc());

+void ResourceManager::reserveResources(const MachineInstr &MI) {

+ return reserveResources(&MI.getDesc());

+void ResourceManager::clearResources() {

+ if (UseDFA)

+ return DFAResources->clearResources();

+ std::fill(ProcResourceCount.begin(), ProcResourceCount.end(), 0);