1 files changed, 343 insertions, 0 deletions
diff --git a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
new file mode 100644
index 000000000000..9e3bd5978f09
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
@@ -0,0 +1,343 @@
+//===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the bottleneck analysis view.
+///
+/// This view internally observes backend pressure increase events in order to
+/// identify problematic data dependencies and processor resource interferences.
+///
+/// Example of bottleneck analysis report for a dot-product on X86 btver2:
+///
+/// Cycles with backend pressure increase [ 40.76% ]
+/// Throughput Bottlenecks: 
+///   Resource Pressure       [ 39.34% ]
+///   - JFPA  [ 39.34% ]
+///   - JFPU0  [ 39.34% ]
+///   Data Dependencies:      [ 1.42% ]
+///   - Register Dependencies [ 1.42% ]
+///   - Memory Dependencies   [ 0.00% ]
+///
+/// According to the example, backend pressure increased during the 40.76% of
+/// the simulated cycles.  In particular, the major cause of backend pressure
+/// increases was the contention on floating point adder JFPA accessible from
+/// pipeline resource JFPU0.
+///
+/// At the end of each cycle, if pressure on the simulated out-of-order buffers
+/// has increased, a backend pressure event is reported.
+/// In particular, this occurs when there is a delta between the number of uOps
+/// dispatched and the number of uOps issued to the underlying pipelines.
+///
+/// The bottleneck analysis view is also responsible for identifying and printing
+/// the most "critical" sequence of dependent instructions according to the
+/// simulated run.
+///
+/// Below is the critical sequence computed for the dot-product example on
+/// btver2:
+///
+///              Instruction                     Dependency Information
+/// +----< 2.    vhaddps %xmm3, %xmm3, %xmm4
+/// |
+/// |    < loop carried > 
+/// |
+/// |      0.    vmulps	 %xmm0, %xmm0, %xmm2
+/// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
+/// +----> 2.    vhaddps %xmm3, %xmm3, %xmm4     ## REGISTER dependency:  %xmm3
+/// |
+/// |    < loop carried > 
+/// |
+/// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
+///
+///
+/// The algorithm that computes the critical sequence is very similar to a
+/// critical path analysis.
+/// 
+/// A dependency graph is used internally to track dependencies between nodes.
+/// Nodes of the graph represent instructions from the input assembly sequence,
+/// and edges of the graph represent data dependencies or processor resource
+/// interferences.
+///
+/// Edges are dynamically 'discovered' by observing instruction state transitions
+/// and backend pressure increase events. Edges are internally ranked based on
+/// their "criticality". A dependency is considered to be critical if it takes a
+/// long time to execute, and if it contributes to backend pressure increases.
+/// Criticality is internally measured in terms of cycles; it is computed for
+/// every edge in the graph as a function of the edge latency and the number of
+/// backend pressure increase cycles contributed by that edge.
+///
+/// At the end of simulation, costs are propagated to nodes through the edges of
+/// the graph, and the most expensive path connecting the root-set (a
+/// set of nodes with no predecessors) to a leaf node is reported as critical
+/// sequence.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
+#define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
+
+#include "Views/View.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+namespace mca {
+
+class PressureTracker {
+  const MCSchedModel &SM;
+
+  // Resource pressure distribution. There is an element for every processor
+  // resource declared by the scheduling model. Quantities are number of cycles.
+  SmallVector<unsigned, 4> ResourcePressureDistribution;
+
+  // Each processor resource is associated with a so-called processor resource
+  // mask. This vector allows to correlate processor resource IDs with processor
+  // resource masks. There is exactly one element per each processor resource
+  // declared by the scheduling model.
+  SmallVector<uint64_t, 4> ProcResID2Mask;
+
+  // Maps processor resource state indices (returned by calls to
+  // `getResourceStateIndex(Mask)` to processor resource identifiers.
+  SmallVector<unsigned, 4> ResIdx2ProcResID;
+
+  // Maps Processor Resource identifiers to ResourceUsers indices.
+  SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex;
+
+  // Identifies the last user of a processor resource unit.
+  // This vector is updated on every instruction issued event.
+  // There is one entry for every processor resource unit declared by the
+  // processor model. An all_ones value is treated like an invalid instruction
+  // identifier.
+  using User = std::pair<unsigned, unsigned>;
+  SmallVector<User, 4> ResourceUsers;
+
+  struct InstructionPressureInfo {
+    unsigned RegisterPressureCycles;
+    unsigned MemoryPressureCycles;
+    unsigned ResourcePressureCycles;
+  };
+  DenseMap<unsigned, InstructionPressureInfo> IPI;
+
+  void updateResourcePressureDistribution(uint64_t CumulativeMask);
+
+  User getResourceUser(unsigned ProcResID, unsigned UnitID) const {
+    unsigned Index = ProcResID2ResourceUsersIndex[ProcResID];
+    return ResourceUsers[Index + UnitID];
+  }
+
+public:
+  PressureTracker(const MCSchedModel &Model);
+
+  ArrayRef<unsigned> getResourcePressureDistribution() const {
+    return ResourcePressureDistribution;
+  }
+
+  void getResourceUsers(uint64_t ResourceMask,
+                        SmallVectorImpl<User> &Users) const;
+
+  unsigned getRegisterPressureCycles(unsigned IID) const {
+    assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
+    const InstructionPressureInfo &Info = IPI.find(IID)->second;
+    return Info.RegisterPressureCycles;
+  }
+
+  unsigned getMemoryPressureCycles(unsigned IID) const {
+    assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
+    const InstructionPressureInfo &Info = IPI.find(IID)->second;
+    return Info.MemoryPressureCycles;
+  }
+
+  unsigned getResourcePressureCycles(unsigned IID) const {
+    assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
+    const InstructionPressureInfo &Info = IPI.find(IID)->second;
+    return Info.ResourcePressureCycles;
+  }
+
+  const char *resolveResourceName(uint64_t ResourceMask) const {
+    unsigned Index = getResourceStateIndex(ResourceMask);
+    unsigned ProcResID = ResIdx2ProcResID[Index];
+    const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
+    return PRDesc.Name;
+  }
+
+  void onInstructionDispatched(unsigned IID);
+  void onInstructionExecuted(unsigned IID);
+
+  void handlePressureEvent(const HWPressureEvent &Event);
+  void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);
+};
+
+// A dependency edge.
+struct DependencyEdge {
+  enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };
+
+  // Dependency edge descriptor.
+  //
+  // It specifies the dependency type, as well as the edge cost in cycles.
+  struct Dependency {
+    DependencyType Type;
+    uint64_t ResourceOrRegID;
+    uint64_t Cost;
+  };
+  Dependency Dep;
+
+  unsigned FromIID;
+  unsigned ToIID;
+
+  // Used by the bottleneck analysis to compute the interference
+  // probability for processor resources.
+  unsigned Frequency;
+};
+
+// A dependency graph used by the bottleneck analysis to describe data
+// dependencies and processor resource interferences between instructions.
+//
+// There is a node (an instance of struct DGNode) for every instruction in the
+// input assembly sequence. Edges of the graph represent dependencies between
+// instructions.
+//
+// Each edge of the graph is associated with a cost value which is used
+// internally to rank dependency based on their impact on the runtime
+// performance (see field DependencyEdge::Dependency::Cost). In general, the
+// higher the cost of an edge, the higher the impact on performance.
+//
+// The cost of a dependency is a function of both the latency and the number of
+// cycles where the dependency has been seen as critical (i.e. contributing to
+// back-pressure increases).
+//
+// Loop carried dependencies are carefully expanded by the bottleneck analysis
+// to guarantee that the graph stays acyclic. To this end, extra nodes are
+// pre-allocated at construction time to describe instructions from "past and
+// future" iterations. The graph is kept acyclic mainly because it simplifies the
+// complexity of the algorithm that computes the critical sequence.
+class DependencyGraph {
+  struct DGNode {
+    unsigned NumPredecessors;
+    unsigned NumVisitedPredecessors;
+    uint64_t Cost;
+    unsigned Depth;
+
+    DependencyEdge CriticalPredecessor;
+    SmallVector<DependencyEdge, 8> OutgoingEdges;
+  };
+  SmallVector<DGNode, 16> Nodes;
+
+  DependencyGraph(const DependencyGraph &) = delete;
+  DependencyGraph &operator=(const DependencyGraph &) = delete;
+
+  void addDependency(unsigned From, unsigned To,
+                     DependencyEdge::Dependency &&DE);
+
+  void pruneEdges(unsigned Iterations);
+  void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
+  void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations);
+
+#ifndef NDEBUG
+  void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
+                          MCInstPrinter &MCIP) const;
+#endif
+
+public:
+  DependencyGraph(unsigned Size) : Nodes(Size) {}
+
+  void addRegisterDep(unsigned From, unsigned To, unsigned RegID,
+                      unsigned Cost) {
+    addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost});
+  }
+
+  void addMemoryDep(unsigned From, unsigned To, unsigned Cost) {
+    addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost});
+  }
+
+  void addResourceDep(unsigned From, unsigned To, uint64_t Mask,
+                      unsigned Cost) {
+    addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost});
+  }
+
+  // Called by the bottleneck analysis at the end of simulation to propagate
+  // costs through the edges of the graph, and compute a critical path.
+  void finalizeGraph(unsigned Iterations) {
+    SmallVector<unsigned, 16> RootSet;
+    pruneEdges(Iterations);
+    initializeRootSet(RootSet);
+    propagateThroughEdges(RootSet, Iterations);
+  }
+
+  // Returns a sequence of edges representing the critical sequence based on the
+  // simulated run. It assumes that the graph has already been finalized (i.e.
+  // method `finalizeGraph()` has already been called on this graph).
+  void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;
+
+#ifndef NDEBUG
+  void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;
+#endif
+};
+
+/// A view that collects and prints a few performance numbers.
+class BottleneckAnalysis : public View {
+  const MCSubtargetInfo &STI;
+  MCInstPrinter &MCIP;
+  PressureTracker Tracker;
+  DependencyGraph DG;
+
+  ArrayRef<MCInst> Source;
+  unsigned Iterations;
+  unsigned TotalCycles;
+
+  bool PressureIncreasedBecauseOfResources;
+  bool PressureIncreasedBecauseOfRegisterDependencies;
+  bool PressureIncreasedBecauseOfMemoryDependencies;
+  // True if throughput was affected by dispatch stalls.
+  bool SeenStallCycles;
+
+  struct BackPressureInfo {
+    // Cycles where backpressure increased.
+    unsigned PressureIncreaseCycles;
+    // Cycles where backpressure increased because of pipeline pressure.
+    unsigned ResourcePressureCycles;
+    // Cycles where backpressure increased because of data dependencies.
+    unsigned DataDependencyCycles;
+    // Cycles where backpressure increased because of register dependencies.
+    unsigned RegisterDependencyCycles;
+    // Cycles where backpressure increased because of memory dependencies.
+    unsigned MemoryDependencyCycles;
+  };
+  BackPressureInfo BPI;
+
+  // Used to populate the dependency graph DG.
+  void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy);
+  void addMemoryDep(unsigned From, unsigned To, unsigned Cy);
+  void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy);
+
+  // Prints a bottleneck message to OS.
+  void printBottleneckHints(raw_ostream &OS) const;
+  void printCriticalSequence(raw_ostream &OS) const;
+
+public:
+  BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
+                     ArrayRef<MCInst> Sequence, unsigned Iterations);
+
+  void onCycleEnd() override;
+  void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; }
+  void onEvent(const HWPressureEvent &Event) override;
+  void onEvent(const HWInstructionEvent &Event) override;
+
+  void printView(raw_ostream &OS) const override;
+
+#ifndef NDEBUG
+  void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); }
+#endif
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif