diff options
Diffstat (limited to 'llvm/tools/llvm-mca/Views/BottleneckAnalysis.h')
| -rw-r--r-- | llvm/tools/llvm-mca/Views/BottleneckAnalysis.h | 343 |
1 files changed, 343 insertions, 0 deletions
diff --git a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h new file mode 100644 index 000000000000..9e3bd5978f09 --- /dev/null +++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h @@ -0,0 +1,343 @@ +//===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements the bottleneck analysis view. +/// +/// This view internally observes backend pressure increase events in order to +/// identify problematic data dependencies and processor resource interferences. +/// +/// Example of bottleneck analysis report for a dot-product on X86 btver2: +/// +/// Cycles with backend pressure increase [ 40.76% ] +/// Throughput Bottlenecks: +/// Resource Pressure [ 39.34% ] +/// - JFPA [ 39.34% ] +/// - JFPU0 [ 39.34% ] +/// Data Dependencies: [ 1.42% ] +/// - Register Dependencies [ 1.42% ] +/// - Memory Dependencies [ 0.00% ] +/// +/// According to the example, backend pressure increased during the 40.76% of +/// the simulated cycles. In particular, the major cause of backend pressure +/// increases was the contention on floating point adder JFPA accessible from +/// pipeline resource JFPU0. +/// +/// At the end of each cycle, if pressure on the simulated out-of-order buffers +/// has increased, a backend pressure event is reported. +/// In particular, this occurs when there is a delta between the number of uOps +/// dispatched and the number of uOps issued to the underlying pipelines. +/// +/// The bottleneck analysis view is also responsible for identifying and printing +/// the most "critical" sequence of dependent instructions according to the +/// simulated run. +/// +/// Below is the critical sequence computed for the dot-product example on +/// btver2: +/// +/// Instruction Dependency Information +/// +----< 2. vhaddps %xmm3, %xmm3, %xmm4 +/// | +/// | < loop carried > +/// | +/// | 0. vmulps %xmm0, %xmm0, %xmm2 +/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] +/// +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3 +/// | +/// | < loop carried > +/// | +/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] +/// +/// +/// The algorithm that computes the critical sequence is very similar to a +/// critical path analysis. +/// +/// A dependency graph is used internally to track dependencies between nodes. +/// Nodes of the graph represent instructions from the input assembly sequence, +/// and edges of the graph represent data dependencies or processor resource +/// interferences. +/// +/// Edges are dynamically 'discovered' by observing instruction state transitions +/// and backend pressure increase events. Edges are internally ranked based on +/// their "criticality". A dependency is considered to be critical if it takes a +/// long time to execute, and if it contributes to backend pressure increases. +/// Criticality is internally measured in terms of cycles; it is computed for +/// every edge in the graph as a function of the edge latency and the number of +/// backend pressure increase cycles contributed by that edge. +/// +/// At the end of simulation, costs are propagated to nodes through the edges of +/// the graph, and the most expensive path connecting the root-set (a +/// set of nodes with no predecessors) to a leaf node is reported as critical +/// sequence. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H +#define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H + +#include "Views/View.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +namespace mca { + +class PressureTracker { + const MCSchedModel &SM; + + // Resource pressure distribution. There is an element for every processor + // resource declared by the scheduling model. Quantities are number of cycles. + SmallVector<unsigned, 4> ResourcePressureDistribution; + + // Each processor resource is associated with a so-called processor resource + // mask. This vector allows to correlate processor resource IDs with processor + // resource masks. There is exactly one element per each processor resource + // declared by the scheduling model. + SmallVector<uint64_t, 4> ProcResID2Mask; + + // Maps processor resource state indices (returned by calls to + // `getResourceStateIndex(Mask)` to processor resource identifiers. + SmallVector<unsigned, 4> ResIdx2ProcResID; + + // Maps Processor Resource identifiers to ResourceUsers indices. + SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex; + + // Identifies the last user of a processor resource unit. + // This vector is updated on every instruction issued event. + // There is one entry for every processor resource unit declared by the + // processor model. An all_ones value is treated like an invalid instruction + // identifier. + using User = std::pair<unsigned, unsigned>; + SmallVector<User, 4> ResourceUsers; + + struct InstructionPressureInfo { + unsigned RegisterPressureCycles; + unsigned MemoryPressureCycles; + unsigned ResourcePressureCycles; + }; + DenseMap<unsigned, InstructionPressureInfo> IPI; + + void updateResourcePressureDistribution(uint64_t CumulativeMask); + + User getResourceUser(unsigned ProcResID, unsigned UnitID) const { + unsigned Index = ProcResID2ResourceUsersIndex[ProcResID]; + return ResourceUsers[Index + UnitID]; + } + +public: + PressureTracker(const MCSchedModel &Model); + + ArrayRef<unsigned> getResourcePressureDistribution() const { + return ResourcePressureDistribution; + } + + void getResourceUsers(uint64_t ResourceMask, + SmallVectorImpl<User> &Users) const; + + unsigned getRegisterPressureCycles(unsigned IID) const { + assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!"); + const InstructionPressureInfo &Info = IPI.find(IID)->second; + return Info.RegisterPressureCycles; + } + + unsigned getMemoryPressureCycles(unsigned IID) const { + assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!"); + const InstructionPressureInfo &Info = IPI.find(IID)->second; + return Info.MemoryPressureCycles; + } + + unsigned getResourcePressureCycles(unsigned IID) const { + assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!"); + const InstructionPressureInfo &Info = IPI.find(IID)->second; + return Info.ResourcePressureCycles; + } + + const char *resolveResourceName(uint64_t ResourceMask) const { + unsigned Index = getResourceStateIndex(ResourceMask); + unsigned ProcResID = ResIdx2ProcResID[Index]; + const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID); + return PRDesc.Name; + } + + void onInstructionDispatched(unsigned IID); + void onInstructionExecuted(unsigned IID); + + void handlePressureEvent(const HWPressureEvent &Event); + void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event); +}; + +// A dependency edge. +struct DependencyEdge { + enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE }; + + // Dependency edge descriptor. + // + // It specifies the dependency type, as well as the edge cost in cycles. + struct Dependency { + DependencyType Type; + uint64_t ResourceOrRegID; + uint64_t Cost; + }; + Dependency Dep; + + unsigned FromIID; + unsigned ToIID; + + // Used by the bottleneck analysis to compute the interference + // probability for processor resources. + unsigned Frequency; +}; + +// A dependency graph used by the bottleneck analysis to describe data +// dependencies and processor resource interferences between instructions. +// +// There is a node (an instance of struct DGNode) for every instruction in the +// input assembly sequence. Edges of the graph represent dependencies between +// instructions. +// +// Each edge of the graph is associated with a cost value which is used +// internally to rank dependency based on their impact on the runtime +// performance (see field DependencyEdge::Dependency::Cost). In general, the +// higher the cost of an edge, the higher the impact on performance. +// +// The cost of a dependency is a function of both the latency and the number of +// cycles where the dependency has been seen as critical (i.e. contributing to +// back-pressure increases). +// +// Loop carried dependencies are carefully expanded by the bottleneck analysis +// to guarantee that the graph stays acyclic. To this end, extra nodes are +// pre-allocated at construction time to describe instructions from "past and +// future" iterations. The graph is kept acyclic mainly because it simplifies the +// complexity of the algorithm that computes the critical sequence. +class DependencyGraph { + struct DGNode { + unsigned NumPredecessors; + unsigned NumVisitedPredecessors; + uint64_t Cost; + unsigned Depth; + + DependencyEdge CriticalPredecessor; + SmallVector<DependencyEdge, 8> OutgoingEdges; + }; + SmallVector<DGNode, 16> Nodes; + + DependencyGraph(const DependencyGraph &) = delete; + DependencyGraph &operator=(const DependencyGraph &) = delete; + + void addDependency(unsigned From, unsigned To, + DependencyEdge::Dependency &&DE); + + void pruneEdges(unsigned Iterations); + void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const; + void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations); + +#ifndef NDEBUG + void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE, + MCInstPrinter &MCIP) const; +#endif + +public: + DependencyGraph(unsigned Size) : Nodes(Size) {} + + void addRegisterDep(unsigned From, unsigned To, unsigned RegID, + unsigned Cost) { + addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost}); + } + + void addMemoryDep(unsigned From, unsigned To, unsigned Cost) { + addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost}); + } + + void addResourceDep(unsigned From, unsigned To, uint64_t Mask, + unsigned Cost) { + addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost}); + } + + // Called by the bottleneck analysis at the end of simulation to propagate + // costs through the edges of the graph, and compute a critical path. + void finalizeGraph(unsigned Iterations) { + SmallVector<unsigned, 16> RootSet; + pruneEdges(Iterations); + initializeRootSet(RootSet); + propagateThroughEdges(RootSet, Iterations); + } + + // Returns a sequence of edges representing the critical sequence based on the + // simulated run. It assumes that the graph has already been finalized (i.e. + // method `finalizeGraph()` has already been called on this graph). + void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const; + +#ifndef NDEBUG + void dump(raw_ostream &OS, MCInstPrinter &MCIP) const; +#endif +}; + +/// A view that collects and prints a few performance numbers. +class BottleneckAnalysis : public View { + const MCSubtargetInfo &STI; + MCInstPrinter &MCIP; + PressureTracker Tracker; + DependencyGraph DG; + + ArrayRef<MCInst> Source; + unsigned Iterations; + unsigned TotalCycles; + + bool PressureIncreasedBecauseOfResources; + bool PressureIncreasedBecauseOfRegisterDependencies; + bool PressureIncreasedBecauseOfMemoryDependencies; + // True if throughput was affected by dispatch stalls. + bool SeenStallCycles; + + struct BackPressureInfo { + // Cycles where backpressure increased. + unsigned PressureIncreaseCycles; + // Cycles where backpressure increased because of pipeline pressure. + unsigned ResourcePressureCycles; + // Cycles where backpressure increased because of data dependencies. + unsigned DataDependencyCycles; + // Cycles where backpressure increased because of register dependencies. + unsigned RegisterDependencyCycles; + // Cycles where backpressure increased because of memory dependencies. + unsigned MemoryDependencyCycles; + }; + BackPressureInfo BPI; + + // Used to populate the dependency graph DG. + void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy); + void addMemoryDep(unsigned From, unsigned To, unsigned Cy); + void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy); + + // Prints a bottleneck message to OS. + void printBottleneckHints(raw_ostream &OS) const; + void printCriticalSequence(raw_ostream &OS) const; + +public: + BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP, + ArrayRef<MCInst> Sequence, unsigned Iterations); + + void onCycleEnd() override; + void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; } + void onEvent(const HWPressureEvent &Event) override; + void onEvent(const HWInstructionEvent &Event) override; + + void printView(raw_ostream &OS) const override; + +#ifndef NDEBUG + void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); } +#endif +}; + +} // namespace mca +} // namespace llvm + +#endif |
