Diffstat (limited to 'contrib/llvm-project/llvm/lib/CodeGen/MachineCombiner.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/CodeGen/MachineCombiner.cpp | 659 |
1 file changed, 659 insertions, 0 deletions
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineCombiner.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineCombiner.cpp
new file mode 100644
index 000000000000..0584ec0bd2b3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -0,0 +1,659 @@
+//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The machine combiner pass uses machine trace metrics to ensure the combined
+// instructions do not lengthen the critical path or the resource depth.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-combiner"
+
+STATISTIC(NumInstCombined, "Number of machine instructions combined");
+
+static cl::opt<unsigned>
+inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
+              cl::desc("Incremental depth computation will be used for basic "
+                       "blocks with more instructions than this threshold."),
+              cl::init(500));
+
+static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
+                                cl::desc("Dump all substituted instructions"),
+                                cl::init(false));
+
+#ifdef EXPENSIVE_CHECKS
+static cl::opt<bool> VerifyPatternOrder(
+    "machine-combiner-verify-pattern-order", cl::Hidden,
+    cl::desc(
+        "Verify that the generated patterns are ordered by increasing latency"),
+    cl::init(true));
+#else
+static cl::opt<bool> VerifyPatternOrder(
+    "machine-combiner-verify-pattern-order", cl::Hidden,
+    cl::desc(
+        "Verify that the generated patterns are ordered by increasing latency"),
+    cl::init(false));
+#endif
+
+namespace {
+class MachineCombiner : public MachineFunctionPass {
+  const TargetSubtargetInfo *STI;
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  MCSchedModel SchedModel;
+  MachineRegisterInfo *MRI;
+  MachineLoopInfo *MLI; // Current MachineLoopInfo
+  MachineTraceMetrics *Traces;
+  MachineTraceMetrics::Ensemble *MinInstr;
+
+  TargetSchedModel TSchedModel;
+
+  /// True if optimizing for code size.
+  bool OptSize;
+
+public:
+  static char ID;
+  MachineCombiner() : MachineFunctionPass(ID) {
+    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  StringRef getPassName() const override { return "Machine InstCombiner"; }
+
+private:
+  bool doSubstitute(unsigned NewSize, unsigned OldSize);
+  bool combineInstructions(MachineBasicBlock *);
+  MachineInstr *getOperandDef(const MachineOperand &MO);
+  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
+                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                    MachineTraceMetrics::Trace BlockTrace);
+  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
+                      MachineTraceMetrics::Trace BlockTrace);
+  bool
+  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
+                          MachineTraceMetrics::Trace BlockTrace,
+                          SmallVectorImpl<MachineInstr *> &InsInstrs,
+                          SmallVectorImpl<MachineInstr *> &DelInstrs,
+                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                          MachineCombinerPattern Pattern, bool SlackIsAccurate);
+  bool preservesResourceLen(MachineBasicBlock *MBB,
+                            MachineTraceMetrics::Trace BlockTrace,
+                            SmallVectorImpl<MachineInstr *> &InsInstrs,
+                            SmallVectorImpl<MachineInstr *> &DelInstrs);
+  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
+                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
+  std::pair<unsigned, unsigned>
+  getLatenciesForInstrSequences(MachineInstr &MI,
+                                SmallVectorImpl<MachineInstr *> &InsInstrs,
+                                SmallVectorImpl<MachineInstr *> &DelInstrs,
+                                MachineTraceMetrics::Trace BlockTrace);
+
+  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
+                          SmallVector<MachineCombinerPattern, 16> &Patterns);
+};
+}
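+
+// Overall flow: runOnMachineFunction() walks every basic block and calls
+// combineInstructions(), which asks the target (TII) for combiner patterns
+// rooted at each instruction, generates the alternative code sequence for
+// each pattern, and accepts it only when the trace metrics show no loss in
+// critical path length or resource length (see improvesCriticalPathLen()
+// and preservesResourceLen() below).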
+
+char MachineCombiner::ID = 0;
+char &llvm::MachineCombinerID = MachineCombiner::ID;
+
+INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
+                      "Machine InstCombiner", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
+                    false, false)
+
+void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<MachineLoopInfo>();
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addRequired<MachineTraceMetrics>();
+  AU.addPreserved<MachineTraceMetrics>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
+  MachineInstr *DefInstr = nullptr;
+  // We need a virtual register definition.
+  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
+  // PHIs have no depth etc.
+  if (DefInstr && DefInstr->isPHI())
+    DefInstr = nullptr;
+  return DefInstr;
+}
+
+/// Computes depth of instructions in vector \p InsInstrs.
+///
+/// \param InsInstrs is a vector of machine instructions
+/// \param InstrIdxForVirtReg is a dense map of virtual register to index
+/// of defining machine instruction in \p InsInstrs
+/// \param BlockTrace is a trace of machine instructions
+///
+/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
+unsigned
+MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
+                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                          MachineTraceMetrics::Trace BlockTrace) {
+  SmallVector<unsigned, 16> InstrDepth;
+  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
+         "Missing machine model\n");
+
+  // For each instruction in the new sequence compute the depth based on the
+  // operands. Use the trace information when possible. For new operands which
+  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
+  // InstrDepth.
+  for (auto *InstrPtr : InsInstrs) { // for each Use
+    unsigned IDepth = 0;
+    for (const MachineOperand &MO : InstrPtr->operands()) {
+      // Check for virtual register operand.
+      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
+        continue;
+      if (!MO.isUse())
+        continue;
+      unsigned DepthOp = 0;
+      unsigned LatencyOp = 0;
+      DenseMap<unsigned, unsigned>::iterator II =
+          InstrIdxForVirtReg.find(MO.getReg());
+      if (II != InstrIdxForVirtReg.end()) {
+        // Operand is a new virtual register not in the trace.
+        assert(II->second < InstrDepth.size() && "Bad Index");
+        MachineInstr *DefInstr = InsInstrs[II->second];
+        assert(DefInstr &&
+               "There must be a definition for a new virtual register");
+        DepthOp = InstrDepth[II->second];
+        int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
+        int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
+        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
+                                                      InstrPtr, UseIdx);
+      } else {
+        MachineInstr *DefInstr = getOperandDef(MO);
+        if (DefInstr) {
+          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
+          LatencyOp = TSchedModel.computeOperandLatency(
+              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
+              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
+        }
+      }
+      IDepth = std::max(IDepth, DepthOp + LatencyOp);
+    }
+    InstrDepth.push_back(IDepth);
+  }
+  unsigned NewRootIdx = InsInstrs.size() - 1;
+  return InstrDepth[NewRootIdx];
+}
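+
+// Illustrative example: if NewRoot uses %a, whose defining instruction sits
+// in the trace at depth 3 with an operand latency of 2 to NewRoot, and %b,
+// a new virtual register defined by an earlier InsInstrs entry with
+// InstrDepth 4 and operand latency 1, then
+//   IDepth = max(3 + 2, 4 + 1) = 5.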
+
+/// Computes instruction latency as max of latency of defined operands.
+///
+/// \param Root is a machine instruction that could be replaced by NewRoot.
+/// It is used to compute a more accurate latency information for NewRoot in
+/// case there is a dependent instruction in the same trace (\p BlockTrace)
+/// \param NewRoot is the instruction for which the latency is computed
+/// \param BlockTrace is a trace of machine instructions
+///
+/// \returns Latency of \p NewRoot
+unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
+                                     MachineTraceMetrics::Trace BlockTrace) {
+  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
+         "Missing machine model\n");
+
+  // Check each definition in NewRoot and compute the latency.
+  unsigned NewRootLatency = 0;
+
+  for (const MachineOperand &MO : NewRoot->operands()) {
+    // Check for virtual register operand.
+    if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
+      continue;
+    if (!MO.isDef())
+      continue;
+    // Get the first instruction that uses MO.
+    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
+    RI++;
+    if (RI == MRI->reg_end())
+      continue;
+    MachineInstr *UseMO = RI->getParent();
+    unsigned LatencyOp = 0;
+    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
+      LatencyOp = TSchedModel.computeOperandLatency(
+          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
+          UseMO->findRegisterUseOperandIdx(MO.getReg()));
+    } else {
+      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
+    }
+    NewRootLatency = std::max(NewRootLatency, LatencyOp);
+  }
+  return NewRootLatency;
+}
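+
+// Illustrative example: if NewRoot defines %c and the first user of %c is a
+// dependence inside the same trace as Root, the def-to-use operand latency
+// (say 3 cycles) is used; if the user falls outside the trace, the
+// whole-instruction latency of NewRoot is used as a conservative stand-in.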
+
+/// The combiner's goal may differ based on which pattern it is attempting
+/// to optimize.
+enum class CombinerObjective {
+  MustReduceDepth, // The data dependency chain must be improved.
+  Default          // The critical path must not be lengthened.
+};
+
+static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
+  // TODO: If C++ ever gets a real enum class, make this part of the
+  // MachineCombinerPattern class.
+  switch (P) {
+  case MachineCombinerPattern::REASSOC_AX_BY:
+  case MachineCombinerPattern::REASSOC_AX_YB:
+  case MachineCombinerPattern::REASSOC_XA_BY:
+  case MachineCombinerPattern::REASSOC_XA_YB:
+    return CombinerObjective::MustReduceDepth;
+  default:
+    return CombinerObjective::Default;
+  }
+}
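+
+// E.g. reassociating ((a + b) + c) + d into (a + b) + (c + d) keeps the
+// instruction count unchanged, so it only pays off when it strictly shortens
+// the dependency chain (three dependent adds become two). That is why the
+// reassociation patterns above use MustReduceDepth instead of the slack-based
+// default comparison.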
+
+/// Estimate the latency of the new and original instruction sequence by
+/// summing up the latencies of the inserted and deleted instructions. This
+/// assumes that the inserted and deleted instructions are dependent
+/// instruction chains, which might not hold in all cases.
+std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
+    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    MachineTraceMetrics::Trace BlockTrace) {
+  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
+  unsigned NewRootLatency = 0;
+  // NewRoot is the last instruction in the \p InsInstrs vector.
+  MachineInstr *NewRoot = InsInstrs.back();
+  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
+    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
+  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);
+
+  unsigned RootLatency = 0;
+  for (auto I : DelInstrs)
+    RootLatency += TSchedModel.computeInstrLatency(I);
+
+  return {NewRootLatency, RootLatency};
+}
+
+/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
+/// The new code sequence ends in MI NewRoot. A necessary condition for the new
+/// sequence to replace the old sequence is that it cannot lengthen the critical
+/// path. The definition of "improve" may be restricted by specifying that the
+/// new path improves the data dependency chain (MustReduceDepth).
+bool MachineCombiner::improvesCriticalPathLen(
+    MachineBasicBlock *MBB, MachineInstr *Root,
+    MachineTraceMetrics::Trace BlockTrace,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+    MachineCombinerPattern Pattern,
+    bool SlackIsAccurate) {
+  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
+         "Missing machine model\n");
+  // Get depth and latency of NewRoot and Root.
+  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
+  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
+
+  LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
+                    << NewRootDepth << "\tRootDepth: " << RootDepth);
+
+  // For a transform such as reassociation, the cost equation is
+  // conservatively calculated so that we must improve the depth (data
+  // dependency cycles) in the critical path to proceed with the transform.
+  // Being conservative also protects against inaccuracies in the underlying
+  // machine trace metrics and CPU models.
+  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
+    LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
+    LLVM_DEBUG(NewRootDepth < RootDepth
+                   ? dbgs() << "\t  and it does it\n"
+                   : dbgs() << "\t  but it does NOT do it\n");
+    return NewRootDepth < RootDepth;
+  }
+
+  // A more flexible cost calculation for the critical path includes the slack
+  // of the original code sequence. This may allow the transform to proceed
+  // even if the instruction depths (data dependency cycles) become worse.
+
+  // Account for the latency of the inserted and deleted instructions.
+  unsigned NewRootLatency, RootLatency;
+  std::tie(NewRootLatency, RootLatency) =
+      getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
+
+  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
+  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
+  unsigned OldCycleCount =
+      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
+  LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
+                    << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
+                    << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
+                    << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
+                    << "\n\tRootDepth + RootLatency + RootSlack = "
+                    << OldCycleCount;);
+  LLVM_DEBUG(NewCycleCount <= OldCycleCount
+                 ? dbgs() << "\n\t  It IMPROVES PathLen because"
+                 : dbgs() << "\n\t  It DOES NOT improve PathLen because");
+  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
+                    << ", OldCycleCount = " << OldCycleCount << "\n");
+
+  return NewCycleCount <= OldCycleCount;
+}
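+
+// Worked example (illustrative numbers): with NewRootDepth = 6,
+// NewRootLatency = 4, RootDepth = 5, RootLatency = 3 and RootSlack = 2,
+// NewCycleCount = 10 and OldCycleCount = 10, so the transform is accepted
+// even though the depth itself got worse: the slack absorbed the difference.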
+
+/// Helper routine to convert instructions to their scheduling classes (SC).
+void MachineCombiner::instr2instrSC(
+    SmallVectorImpl<MachineInstr *> &Instrs,
+    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
+  for (auto *InstrPtr : Instrs) {
+    unsigned Opc = InstrPtr->getOpcode();
+    unsigned Idx = TII->get(Opc).getSchedClass();
+    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
+    InstrsSC.push_back(SC);
+  }
+}
+
+/// True when the new instructions do not increase resource length
+bool MachineCombiner::preservesResourceLen(
+    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs) {
+  if (!TSchedModel.hasInstrSchedModel())
+    return true;
+
+  // Compute current resource length.
+
+  //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
+  SmallVector<const MachineBasicBlock *, 1> MBBarr;
+  MBBarr.push_back(MBB);
+  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);
+
+  // Deal with SC rather than Instructions.
+  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
+  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;
+
+  instr2instrSC(InsInstrs, InsInstrsSC);
+  instr2instrSC(DelInstrs, DelInstrsSC);
+
+  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
+  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);
+
+  // Compute new resource length.
+  unsigned ResLenAfterCombine =
+      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);
+
+  LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
+                    << ResLenBeforeCombine
+                    << " and after: " << ResLenAfterCombine << "\n";);
+  LLVM_DEBUG(
+      ResLenAfterCombine <= ResLenBeforeCombine
+          ? dbgs() << "\t\t  As result it IMPROVES/PRESERVES Resource Length\n"
+          : dbgs() << "\t\t  As result it DOES NOT improve/preserve Resource "
+                      "Length\n");
+
+  return ResLenAfterCombine <= ResLenBeforeCombine;
+}
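+
+// E.g. replacing one MUL and one ADD with a single MADD removes two
+// scheduling classes and adds one, so on a typical machine the resource
+// length stays the same or shrinks and the check above passes; a pattern
+// that trades one instruction for three may instead fail it on a machine
+// with few matching functional units.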
+
+/// \returns true when the new instruction sequence should be generated
+/// regardless of whether it lengthens the critical path or not
+bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
+  if (OptSize && (NewSize < OldSize))
+    return true;
+  if (!TSchedModel.hasInstrSchedModelOrItineraries())
+    return true;
+  return false;
+}
+
+/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
+/// depths if requested.
+///
+/// \param MBB basic block to insert instructions in
+/// \param MI current machine instruction
+/// \param InsInstrs new instructions to insert in \p MBB
+/// \param DelInstrs instructions to delete from \p MBB
+/// \param MinInstr is a pointer to the machine trace information
+/// \param RegUnits set of live registers, needed to compute instruction depths
+/// \param IncrementalUpdate if true, compute instruction depths incrementally,
+/// otherwise invalidate the trace
+static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
+                                     SmallVector<MachineInstr *, 16> InsInstrs,
+                                     SmallVector<MachineInstr *, 16> DelInstrs,
+                                     MachineTraceMetrics::Ensemble *MinInstr,
+                                     SparseSet<LiveRegUnit> &RegUnits,
+                                     bool IncrementalUpdate) {
+  for (auto *InstrPtr : InsInstrs)
+    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
+
+  for (auto *InstrPtr : DelInstrs) {
+    InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
+    // Erase all LiveRegs defined by the removed instruction.
+    for (auto I = RegUnits.begin(); I != RegUnits.end(); ) {
+      if (I->MI == InstrPtr)
+        I = RegUnits.erase(I);
+      else
+        I++;
+    }
+  }
+
+  if (IncrementalUpdate)
+    for (auto *InstrPtr : InsInstrs)
+      MinInstr->updateDepth(MBB, *InstrPtr, RegUnits);
+  else
+    MinInstr->invalidate(MBB);
+
+  NumInstCombined++;
+}
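+
+// Note: with IncrementalUpdate the trace is not recomputed after each
+// substitution; only the depths of the newly inserted instructions are
+// refreshed via updateDepth(), and combineInstructions() later calls
+// updateDepths() to catch up from LastUpdate before the next evaluation.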
+
+// Check that the difference between original and new latency is decreasing for
+// later patterns. This helps to discover sub-optimal pattern orderings.
+void MachineCombiner::verifyPatternOrder(
+    MachineBasicBlock *MBB, MachineInstr &Root,
+    SmallVector<MachineCombinerPattern, 16> &Patterns) {
+  long PrevLatencyDiff = std::numeric_limits<long>::max();
+  (void)PrevLatencyDiff; // Variable is used in assert only.
+  for (auto P : Patterns) {
+    SmallVector<MachineInstr *, 16> InsInstrs;
+    SmallVector<MachineInstr *, 16> DelInstrs;
+    DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
+    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
+                                    InstrIdxForVirtReg);
+    // Found pattern, but did not generate alternative sequence.
+    // This can happen e.g. when an immediate could not be materialized
+    // in a single instruction.
+    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
+      continue;
+
+    unsigned NewRootLatency, RootLatency;
+    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
+        Root, InsInstrs, DelInstrs, MinInstr->getTrace(MBB));
+    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
+    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
+           "Current pattern is better than previous pattern.");
+    PrevLatencyDiff = CurrentLatencyDiff;
+  }
+}
+
+/// Substitute a slow code sequence with a faster one by
+/// evaluating instruction combining pattern.
+/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
+/// combining based on machine trace metrics. Only combine a sequence of
+/// instructions when this neither lengthens the critical path nor increases
+/// resource pressure. When optimizing for code size always combine when the
+/// new sequence is shorter.
+bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
+  bool Changed = false;
+  LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
+
+  bool IncrementalUpdate = false;
+  auto BlockIter = MBB->begin();
+  decltype(BlockIter) LastUpdate;
+  // Check if the block is in a loop.
+  const MachineLoop *ML = MLI->getLoopFor(MBB);
+  if (!MinInstr)
+    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+  SparseSet<LiveRegUnit> RegUnits;
+  RegUnits.setUniverse(TRI->getNumRegUnits());
+
+  while (BlockIter != MBB->end()) {
+    auto &MI = *BlockIter++;
+    SmallVector<MachineCombinerPattern, 16> Patterns;
+    // The motivating example is:
+    //
+    //     MUL  Other        MUL_op1 MUL_op2  Other
+    //      \    /               \      |    /
+    //      ADD/SUB      =>        MADD/MSUB
+    //      (=Root)                (=NewRoot)
+
+    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
+    // usually beneficial for code size it unfortunately can hurt performance
+    // when the ADD is on the critical path, but the MUL is not. With the
+    // substitution the MUL becomes part of the critical path (in form of the
+    // MADD) and can lengthen it on architectures where the MADD latency is
+    // longer than the ADD latency.
+    //
+    // For each instruction we check if it can be the root of a combiner
+    // pattern. Then for each pattern the new code sequence in form of MI is
+    // generated and evaluated. When the efficiency criteria (don't lengthen
+    // critical path, don't use more resources) are met the new sequence gets
+    // hooked up into the basic block before the old sequence is removed.
+    //
+    // The algorithm does not try to evaluate all patterns and pick the best.
+    // This is only an artificial restriction though. In practice there is
+    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
+    // based on an internal cost heuristic. If
+    // machine-combiner-verify-pattern-order is enabled, all patterns are
+    // checked to ensure later patterns do not provide better latency savings.
+
+    if (!TII->getMachineCombinerPatterns(MI, Patterns))
+      continue;
+
+    if (VerifyPatternOrder)
+      verifyPatternOrder(MBB, MI, Patterns);
+
+    for (auto P : Patterns) {
+      SmallVector<MachineInstr *, 16> InsInstrs;
+      SmallVector<MachineInstr *, 16> DelInstrs;
+      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
+      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
+                                      InstrIdxForVirtReg);
+      unsigned NewInstCount = InsInstrs.size();
+      unsigned OldInstCount = DelInstrs.size();
+      // Found pattern, but did not generate alternative sequence.
+      // This can happen e.g. when an immediate could not be materialized
+      // in a single instruction.
+      if (!NewInstCount)
+        continue;
+
+      LLVM_DEBUG(if (dump_intrs) {
+        dbgs() << "\tFor the Pattern (" << (int)P
+               << ") these instructions could be removed\n";
+        for (auto const *InstrPtr : DelInstrs)
+          InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
+                          /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
+        dbgs() << "\tThese instructions could replace the removed ones\n";
+        for (auto const *InstrPtr : InsInstrs)
+          InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
+                          /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
+      });
+
+      bool SubstituteAlways = false;
+      if (ML && TII->isThroughputPattern(P))
+        SubstituteAlways = true;
+
+      if (IncrementalUpdate) {
+        // Update depths since the last incremental update.
+        MinInstr->updateDepths(LastUpdate, BlockIter, RegUnits);
+        LastUpdate = BlockIter;
+      }
+
+      // Substitute when we optimize for code size and the new sequence has
+      // fewer instructions OR
+      // the new sequence neither lengthens the critical path nor increases
+      // resource pressure.
+      if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) {
+        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
+                                 RegUnits, IncrementalUpdate);
+        // Eagerly stop after the first pattern fires.
+        Changed = true;
+        break;
+      } else {
+        // For big basic blocks, we only compute the full trace the first time
+        // we hit this. We do not invalidate the trace, but instead update the
+        // instruction depths incrementally.
+        // NOTE: Only the instruction depths up to MI are accurate. All other
+        // trace information is not updated.
+        MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
+        Traces->verifyAnalysis();
+        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
+                                    InstrIdxForVirtReg, P,
+                                    !IncrementalUpdate) &&
+            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
+          if (MBB->size() > inc_threshold) {
+            // Use incremental depth updates for basic blocks above threshold.
+            IncrementalUpdate = true;
+            LastUpdate = BlockIter;
+          }
+
+          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
+                                   RegUnits, IncrementalUpdate);
+
+          // Eagerly stop after the first pattern fires.
+          Changed = true;
+          break;
+        }
+        // Cleanup instructions of the alternative code sequence. There is no
+        // use for them.
+        MachineFunction *MF = MBB->getParent();
+        for (auto *InstrPtr : InsInstrs)
+          MF->DeleteMachineInstr(InstrPtr);
+      }
+      InstrIdxForVirtReg.clear();
+    }
+  }
+
+  if (Changed && IncrementalUpdate)
+    Traces->invalidate(MBB);
+  return Changed;
+}
+
+bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
+  STI = &MF.getSubtarget();
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+  SchedModel = STI->getSchedModel();
+  TSchedModel.init(STI);
+  MRI = &MF.getRegInfo();
+  MLI = &getAnalysis<MachineLoopInfo>();
+  Traces = &getAnalysis<MachineTraceMetrics>();
+  MinInstr = nullptr;
+  OptSize = MF.getFunction().hasOptSize();
+
+  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
+  if (!TII->useMachineCombiner()) {
+    LLVM_DEBUG(
+        dbgs()
+        << "  Skipping pass: Target does not support machine combiner\n");
+    return false;
+  }
+
+  bool Changed = false;
+
+  // Try to combine instructions.
+  for (auto &MBB : MF)
+    Changed |= combineInstructions(&MBB);
+
+  return Changed;
+}