20 files changed, 3049 insertions, 0 deletions
diff --git a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp
new file mode 100644
index 000000000000..feff0cd6d524
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp
@@ -0,0 +1,652 @@
+//===--------------------- BottleneckAnalysis.cpp ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the functionalities used by the BottleneckAnalysis
+/// to report bottleneck info.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Views/BottleneckAnalysis.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MCA/Support.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+
+namespace llvm {
+namespace mca {
+
+#define DEBUG_TYPE "llvm-mca"
+
+/// Builds the tables used to translate between processor resource
+/// identifiers, processor resource masks, resource state indices, and the
+/// per-unit slots of `ResourceUsers`.
+PressureTracker::PressureTracker(const MCSchedModel &Model)
+    : SM(Model),
+      ResourcePressureDistribution(Model.getNumProcResourceKinds(), 0),
+      ProcResID2Mask(Model.getNumProcResourceKinds(), 0),
+      ResIdx2ProcResID(Model.getNumProcResourceKinds(), 0),
+      ProcResID2ResourceUsersIndex(Model.getNumProcResourceKinds(), 0) {
+  computeProcResourceMasks(SM, ProcResID2Mask);
+
+  // Resource zero is the invalid resource, so the scan starts from ID 1.
+  // Every resource owns a contiguous range of `NumUnits` slots in
+  // `ResourceUsers`; `FirstFreeSlot` is where the next range begins.
+  const unsigned NumResourceKinds = Model.getNumProcResourceKinds();
+  unsigned FirstFreeSlot = 0;
+  for (unsigned ProcResID = 1; ProcResID < NumResourceKinds; ++ProcResID) {
+    ProcResID2ResourceUsersIndex[ProcResID] = FirstFreeSlot;
+    FirstFreeSlot += SM.getProcResource(ProcResID)->NumUnits;
+
+    const unsigned StateIndex =
+        getResourceStateIndex(ProcResID2Mask[ProcResID]);
+    ResIdx2ProcResID[StateIndex] = ProcResID;
+  }
+
+  // Mark every slot as "no user yet": an all-ones instruction identifier
+  // paired with a zero cycle count.
+  ResourceUsers.assign(FirstFreeSlot, User(~0U, 0U));
+}
+
+/// Appends to `Users` the recorded last user of every unit of the resource
+/// identified by `ResourceMask`.  Units whose recorded user has a zero cycle
+/// count, or whose user is no longer present in `IPI`, are skipped.
+void PressureTracker::getResourceUsers(uint64_t ResourceMask,
+                                       SmallVectorImpl<User> &Users) const {
+  const unsigned ProcResID =
+      ResIdx2ProcResID[getResourceStateIndex(ResourceMask)];
+  const unsigned NumUnits = SM.getProcResource(ProcResID)->NumUnits;
+
+  for (unsigned UnitID = 0; UnitID != NumUnits; ++UnitID) {
+    const User LastUser = getResourceUser(ProcResID, UnitID);
+    if (!LastUser.second)
+      continue;
+    if (IPI.find(LastUser.first) == IPI.end())
+      continue;
+    Users.emplace_back(LastUser);
+  }
+}
+
+/// Starts tracking instruction `IID` with zeroed pressure counters.  Nothing
+/// changes if `IID` is already tracked.
+void PressureTracker::onInstructionDispatched(unsigned IID) {
+  const InstructionPressureInfo NoPressureYet = InstructionPressureInfo();
+  IPI.insert(std::make_pair(IID, NoPressureYet));
+}
+
+/// Stops tracking instruction `IID` by dropping its entry from `IPI`.
+void PressureTracker::onInstructionExecuted(unsigned IID) {
+  IPI.erase(IID);
+}
+
+/// Records the instruction carried by `Event` as the latest user of every
+/// processor resource unit listed in `Event.UsedResources`.
+void PressureTracker::handleInstructionIssuedEvent(
+    const HWInstructionIssuedEvent &Event) {
+  using ResourceRef = HWInstructionIssuedEvent::ResourceRef;
+  using ResourceUse = std::pair<ResourceRef, ResourceCycles>;
+
+  const unsigned IID = Event.IR.getSourceIndex();
+  for (const ResourceUse &Use : Event.UsedResources) {
+    const ResourceRef &Ref = Use.first;
+    // `Ref.first` selects the resource.  The position of the lowest set bit
+    // of `Ref.second` selects the slot within that resource's range.
+    const unsigned FirstSlot = ProcResID2ResourceUsersIndex[Ref.first];
+    const unsigned Slot = FirstSlot + countTrailingZeros(Ref.second);
+    ResourceUsers[Slot] = std::make_pair(IID, Use.second.getNumerator());
+  }
+}
+
+/// Bumps the per-resource pressure counters for the resources named by
+/// `CumulativeMask`.
+///
+/// Each set bit is resolved to a processor resource.  If the mask of that
+/// resource is exactly that bit, the resource's own counter is incremented.
+/// Otherwise the remaining bits of the resource's mask are resolved one by one
+/// and the counters of those resources are incremented instead.
+void PressureTracker::updateResourcePressureDistribution(
+    uint64_t CumulativeMask) {
+  // `X &= X - 1` clears the lowest set bit, so both loops walk the set bits
+  // from the least significant to the most significant one.
+  for (; CumulativeMask; CumulativeMask &= CumulativeMask - 1) {
+    const uint64_t LowestBit = CumulativeMask & (-CumulativeMask);
+    const unsigned ProcResID =
+        ResIdx2ProcResID[getResourceStateIndex(LowestBit)];
+    const uint64_t FullMask = ProcResID2Mask[ProcResID];
+
+    if (FullMask == LowestBit) {
+      ++ResourcePressureDistribution[ProcResID];
+      continue;
+    }
+
+    for (uint64_t Rest = FullMask ^ LowestBit; Rest; Rest &= Rest - 1) {
+      const uint64_t SubUnit = Rest & (-Rest);
+      const unsigned SubUnitID =
+          ResIdx2ProcResID[getResourceStateIndex(SubUnit)];
+      ++ResourcePressureDistribution[SubUnitID];
+    }
+  }
+}
+
+/// Updates the pressure statistics in response to a backend pressure event.
+///
+/// For RESOURCES events, the resource pressure distribution is updated from
+/// `Event.ResourceMask`, and the resource pressure cycle counter is bumped for
+/// every affected instruction whose critical resource mask overlaps that
+/// mask.  For REGISTER_DEPS and MEMORY_DEPS events, the matching counter is
+/// bumped for every affected instruction.  Any other reason is ignored.
+void PressureTracker::handlePressureEvent(const HWPressureEvent &Event) {
+  assert(Event.Reason != HWPressureEvent::INVALID &&
+         "Unexpected invalid event!");
+
+  switch (Event.Reason) {
+  default:
+    break;
+
+  case HWPressureEvent::RESOURCES: {
+    const uint64_t ResourceMask = Event.ResourceMask;
+    updateResourcePressureDistribution(Event.ResourceMask);
+
+    for (const InstRef &IR : Event.AffectedInstructions) {
+      const Instruction &IS = *IR.getInstruction();
+      // Keep the intersection as a 64-bit value.  Narrowing it to `unsigned`
+      // would discard overlaps that only involve bits 32..63.
+      const uint64_t BusyResources =
+          IS.getCriticalResourceMask() & ResourceMask;
+      if (!BusyResources)
+        continue;
+
+      unsigned IID = IR.getSourceIndex();
+      IPI[IID].ResourcePressureCycles++;
+    }
+    break;
+  }
+
+  case HWPressureEvent::REGISTER_DEPS:
+    for (const InstRef &IR : Event.AffectedInstructions) {
+      unsigned IID = IR.getSourceIndex();
+      IPI[IID].RegisterPressureCycles++;
+    }
+    break;
+
+  case HWPressureEvent::MEMORY_DEPS:
+    for (const InstRef &IR : Event.AffectedInstructions) {
+      unsigned IID = IR.getSourceIndex();
+      IPI[IID].MemoryPressureCycles++;
+    }
+    break;
+  }
+}
+
+#ifndef NDEBUG
+/// Writes a one-line description of `DepEdge` to `OS`: its endpoints, the kind
+/// of dependency (with the register name or resource mask where applicable),
+/// and its cost.
+void DependencyGraph::dumpDependencyEdge(raw_ostream &OS,
+                                         const DependencyEdge &DepEdge,
+                                         MCInstPrinter &MCIP) const {
+  assert(DepEdge.FromIID < DepEdge.ToIID && "Graph should be acyclic!");
+
+  const DependencyEdge::Dependency &Info = DepEdge.Dep;
+  assert(Info.Type != DependencyEdge::DT_INVALID && "Unexpected invalid edge!");
+
+  OS << " FROM: " << DepEdge.FromIID << " TO: " << DepEdge.ToIID
+     << "             ";
+
+  switch (Info.Type) {
+  case DependencyEdge::DT_REGISTER:
+    OS << " - REGISTER: ";
+    MCIP.printRegName(OS, Info.ResourceOrRegID);
+    break;
+  case DependencyEdge::DT_MEMORY:
+    OS << " - MEMORY";
+    break;
+  default:
+    assert(Info.Type == DependencyEdge::DT_RESOURCE &&
+           "Unsupported dependency type!");
+    OS << " - RESOURCE MASK: " << Info.ResourceOrRegID;
+    break;
+  }
+
+  OS << " - COST: " << Info.Cost << '\n';
+}
+#endif // NDEBUG
+
+/// Removes low-frequency outgoing edges from every node of the graph.
+///
+/// An edge is kept if it was seen in every iteration, or if the ratio between
+/// its frequency and `Iterations` is above 10%.  Every removed edge also
+/// decrements the predecessor count of its destination node.
+void DependencyGraph::pruneEdges(unsigned Iterations) {
+  for (DGNode &N : Nodes) {
+    // Edges in [0, I) are kept; edges in [I, E) still have to be examined;
+    // edges in [E, Size) have been selected for removal.
+    const unsigned Size = N.OutgoingEdges.size();
+    unsigned E = Size;
+    for (unsigned I = 0; I < E;) {
+      DependencyEdge &Edge = N.OutgoingEdges[I];
+      // Use a cut-off threshold to prune edges with a low frequency.
+      if (Edge.Frequency == Iterations ||
+          0.10 < (double)Edge.Frequency / Iterations) {
+        ++I;
+        continue;
+      }
+
+      Nodes[Edge.ToIID].NumPredecessors--;
+      std::swap(Edge, N.OutgoingEdges[E - 1]);
+      --E;
+      // Do not advance `I`: the edge that was just swapped into position `I`
+      // has not been examined yet.
+    }
+
+    if (E != Size)
+      N.OutgoingEdges.resize(E);
+  }
+}
+
+/// Appends to `RootSet` the index of every node that has no predecessors and
+/// at least one outgoing edge.
+void DependencyGraph::initializeRootSet(
+    SmallVectorImpl<unsigned> &RootSet) const {
+  unsigned IID = 0;
+  for (const DGNode &Node : Nodes) {
+    const bool IsRoot = !Node.NumPredecessors && !Node.OutgoingEdges.empty();
+    if (IsRoot)
+      RootSet.emplace_back(IID);
+    ++IID;
+  }
+}
+
+/// Propagates node costs through the edges of the graph, starting from the
+/// nodes in `RootSet`.  On return `RootSet` is empty.
+void DependencyGraph::propagateThroughEdges(
+    SmallVectorImpl<unsigned> &RootSet, unsigned Iterations) {
+  // A critical sequence is the longest (i.e. most expensive) path from a node
+  // of the RootSet to a leaf node, where a leaf is a node with no successors.
+  // The RootSet is made of nodes with at least one successor and no
+  // predecessors.
+  //
+  // The cost of a node measures its criticality: the higher the cost, the
+  // bigger the performance impact.
+  // For register and memory dependencies, the cost is a function of the write
+  // latency as well as the actual delay (in cycles) caused to users.
+  // For processor resource dependencies, the cost is a function of the
+  // resource pressure. Resource interferences with low frequency values are
+  // ignored.
+  //
+  // The traversal proceeds in waves.  Every node of the current wave pushes
+  // its cost along each outgoing edge; a destination node adopts the incoming
+  // cost only when it is strictly higher than the cost it already has, and in
+  // that case it also records the edge as its `CriticalPredecessor`.
+  //
+  // Every node counts the incoming edges processed so far in field
+  // `NumVisitedPredecessors`.  When that counter reaches `NumPredecessors`,
+  // all the predecessors of the node have been visited, and the node is
+  // queued for the next wave.
+  //
+  // The traversal ends when a wave queues no node.  This works under the
+  // assumption that the graph is acyclic.
+  SmallVector<unsigned, 8> NextWave;
+
+  while (!RootSet.empty()) {
+    for (unsigned FromIID : RootSet) {
+      const DGNode &From = Nodes[FromIID];
+      for (const DependencyEdge &Edge : From.OutgoingEdges) {
+        DGNode &To = Nodes[Edge.ToIID];
+
+        // Is this the most expensive incoming edge seen so far?
+        const uint64_t CostThroughEdge = From.Cost + Edge.Dep.Cost;
+        if (To.Cost < CostThroughEdge) {
+          To.CriticalPredecessor = Edge;
+          To.Cost = CostThroughEdge;
+          To.Depth = From.Depth + 1;
+        }
+
+        if (++To.NumVisitedPredecessors == To.NumPredecessors)
+          NextWave.emplace_back(Edge.ToIID);
+      }
+    }
+
+    // The nodes queued during this wave become the next wave.
+    std::swap(RootSet, NextWave);
+    NextWave.clear();
+  }
+}
+
+/// Fills `Seq` with the edges of the critical sequence, ordered from the first
+/// edge of the sequence to the last one.
+///
+/// Costs must have been propagated already (see `propagateThroughEdges()`).
+void DependencyGraph::getCriticalSequence(
+    SmallVectorImpl<const DependencyEdge *> &Seq) const {
+  // The node with the highest cost is the last instruction of the critical
+  // sequence, and its `Depth` is the number of edges in the sequence.
+  const auto MostExpensive = std::max_element(
+      Nodes.begin(), Nodes.end(),
+      [](const DGNode &Lhs, const DGNode &Rhs) { return Lhs.Cost < Rhs.Cost; });
+  unsigned IID = std::distance(Nodes.begin(), MostExpensive);
+
+  // Walk the chain of critical predecessors backwards from that node, filling
+  // `Seq` from its last slot to its first one.
+  Seq.resize(Nodes[IID].Depth);
+  for (auto Slot = Seq.rbegin(), End = Seq.rend(); Slot != End; ++Slot) {
+    const DependencyEdge &Pred = Nodes[IID].CriticalPredecessor;
+    *Slot = &Pred;
+    IID = Pred.FromIID;
+  }
+}
+
+/// Prints `MCI` to `FOS` starting at column 14, with leading whitespace
+/// removed.  If `UseDifferentColor` is set, the text is printed in bold cyan
+/// and the color is reset afterwards.
+static void printInstruction(formatted_raw_ostream &FOS,
+                             const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
+                             const MCInst &MCI,
+                             bool UseDifferentColor = false) {
+  // Render the instruction into a temporary string first, so that the leading
+  // whitespace can be trimmed before the text reaches `FOS`.
+  std::string Text;
+  raw_string_ostream TextStream(Text);
+  MCIP.printInst(&MCI, TextStream, "", STI);
+  TextStream.flush();
+
+  FOS.PadToColumn(14);
+  if (!UseDifferentColor) {
+    FOS << StringRef(Text).ltrim();
+    return;
+  }
+
+  FOS.changeColor(raw_ostream::CYAN, true, false);
+  FOS << StringRef(Text).ltrim();
+  FOS.resetColor();
+}
+
+/// Prints the critical sequence computed by the dependency graph to `OS`.
+///
+/// Every printed line shows the index of an instruction in `Source` followed
+/// by the instruction itself.  Instructions at the endpoints of a critical
+/// edge are marked with an arrow, and the description of the dependency is
+/// printed starting at column 58.  Nothing is printed if no bottleneck was
+/// observed, or if the critical sequence is empty.
+void BottleneckAnalysis::printCriticalSequence(raw_ostream &OS) const {
+  // Early exit if no bottlenecks were found during the simulation.
+  if (!SeenStallCycles || !BPI.PressureIncreaseCycles)
+    return;
+
+  SmallVector<const DependencyEdge *, 16> Seq;
+  DG.getCriticalSequence(Seq);
+  if (Seq.empty())
+    return;
+
+  OS << "\nCritical sequence based on the simulation:\n\n";
+
+  // Graph node identifiers are reduced modulo the number of instructions to
+  // obtain indices into `Source`.
+  const DependencyEdge &FirstEdge = *Seq[0];
+  unsigned FromIID = FirstEdge.FromIID % Source.size();
+  unsigned ToIID = FirstEdge.ToIID % Source.size();
+  // The first edge is loop carried if its source does not come before its
+  // destination in `Source`.
+  bool IsLoopCarried = FromIID >= ToIID;
+
+  formatted_raw_ostream FOS(OS);
+  FOS.PadToColumn(14);
+  FOS << "Instruction";
+  FOS.PadToColumn(58);
+  FOS << "Dependency Information";
+
+  bool HasColors = FOS.has_colors();
+
+  // Index in `Source` of the next instruction to print.
+  unsigned CurrentIID = 0;
+  if (IsLoopCarried) {
+    // Print the source of the first edge on its own, followed by a marker.
+    // `CurrentIID` stays at zero.
+    FOS << "\n +----< " << FromIID << ".";
+    printInstruction(FOS, STI, MCIP, Source[FromIID], HasColors);
+    FOS << "\n |\n |    < loop carried > \n |";
+  } else {
+    // Print the instructions that precede the first edge, then the source of
+    // the first edge.
+    while (CurrentIID < FromIID) {
+      FOS << "\n        " << CurrentIID << ".";
+      printInstruction(FOS, STI, MCIP, Source[CurrentIID]);
+      CurrentIID++;
+    }
+
+    FOS << "\n +----< " << CurrentIID << ".";
+    printInstruction(FOS, STI, MCIP, Source[CurrentIID], HasColors);
+    CurrentIID++;
+  }
+
+  for (const DependencyEdge *&DE : Seq) {
+    ToIID = DE->ToIID % Source.size();
+    // If the destination comes before the current position, print up to the
+    // end of `Source`; the edge is then reported as loop carried below.
+    unsigned LastIID = CurrentIID > ToIID ? Source.size() : ToIID;
+
+    while (CurrentIID < LastIID) {
+      FOS << "\n |      " << CurrentIID << ".";
+      printInstruction(FOS, STI, MCIP, Source[CurrentIID]);
+      CurrentIID++;
+    }
+
+    if (CurrentIID == ToIID) {
+      FOS << "\n +----> " << ToIID << ".";
+      printInstruction(FOS, STI, MCIP, Source[CurrentIID], HasColors);
+    } else {
+      FOS << "\n |\n |    < loop carried > \n |"
+          << "\n +----> " << ToIID << ".";
+      printInstruction(FOS, STI, MCIP, Source[ToIID], HasColors);
+    }
+    FOS.PadToColumn(58);
+
+    // Describe the dependency carried by this edge.
+    const DependencyEdge::Dependency &Dep = DE->Dep;
+    if (HasColors)
+      FOS.changeColor(raw_ostream::SAVEDCOLOR, true, false);
+
+    if (Dep.Type == DependencyEdge::DT_REGISTER) {
+      FOS << "## REGISTER dependency:  ";
+      if (HasColors)
+        FOS.changeColor(raw_ostream::MAGENTA, true, false);
+      MCIP.printRegName(FOS, Dep.ResourceOrRegID);
+    } else if (Dep.Type == DependencyEdge::DT_MEMORY) {
+      FOS << "## MEMORY dependency.";
+    } else {
+      assert(Dep.Type == DependencyEdge::DT_RESOURCE &&
+             "Unsupported dependency type!");
+      FOS << "## RESOURCE interference:  ";
+      if (HasColors)
+        FOS.changeColor(raw_ostream::MAGENTA, true, false);
+      FOS << Tracker.resolveResourceName(Dep.ResourceOrRegID);
+      if (HasColors) {
+        FOS.resetColor();
+        FOS.changeColor(raw_ostream::SAVEDCOLOR, true, false);
+      }
+      // The probability is the edge frequency as a percentage of the number
+      // of iterations, computed with integer arithmetic.
+      FOS << " [ probability: " << ((DE->Frequency * 100) / Iterations)
+          << "% ]";
+    }
+    if (HasColors)
+      FOS.resetColor();
+    ++CurrentIID;
+  }
+
+  // Print the instructions that follow the last edge of the sequence.
+  while (CurrentIID < Source.size()) {
+    FOS << "\n        " << CurrentIID << ".";
+    printInstruction(FOS, STI, MCIP, Source[CurrentIID]);
+    CurrentIID++;
+  }
+
+  FOS << '\n';
+  FOS.flush();
+}
+
+#ifndef NDEBUG
+/// Dumps every edge of the graph to `OS`, grouped by dependency type:
+/// register dependencies first, then memory dependencies, then resource
+/// dependencies.
+void DependencyGraph::dump(raw_ostream &OS, MCInstPrinter &MCIP) const {
+  // Prints all the edges of the given type, in node order.
+  auto DumpEdgesOfType = [&](DependencyEdge::DependencyType Type) {
+    for (const DGNode &Node : Nodes) {
+      for (const DependencyEdge &Edge : Node.OutgoingEdges) {
+        if (Edge.Dep.Type != Type)
+          continue;
+        dumpDependencyEdge(OS, Edge, MCIP);
+      }
+    }
+  };
+
+  OS << "\nREG DEPS\n";
+  DumpEdgesOfType(DependencyEdge::DT_REGISTER);
+
+  OS << "\nMEM DEPS\n";
+  DumpEdgesOfType(DependencyEdge::DT_MEMORY);
+
+  OS << "\nRESOURCE DEPS\n";
+  DumpEdgesOfType(DependencyEdge::DT_RESOURCE);
+}
+#endif // NDEBUG
+
+/// Adds dependency `Dep` from node `From` to node `To`.
+///
+/// If `From` already has an edge to `To` with the same dependency type and
+/// the same register/resource identifier, the cost of `Dep` is accumulated
+/// into that edge and its frequency is incremented.  Otherwise a new edge with
+/// frequency one is created, and the predecessor count of `To` is incremented.
+void DependencyGraph::addDependency(unsigned From, unsigned To,
+                                    DependencyEdge::Dependency &&Dep) {
+  DGNode &NodeFrom = Nodes[From];
+  DGNode &NodeTo = Nodes[To];
+  SmallVectorImpl<DependencyEdge> &Vec = NodeFrom.OutgoingEdges;
+
+  // The dependency type is part of the edge identity: a register identifier
+  // and a resource mask may share the same numeric value, and such edges must
+  // not be merged.
+  auto It = find_if(Vec, [To, &Dep](const DependencyEdge &DE) {
+    return DE.ToIID == To && DE.Dep.Type == Dep.Type &&
+           DE.Dep.ResourceOrRegID == Dep.ResourceOrRegID;
+  });
+
+  if (It != Vec.end()) {
+    It->Dep.Cost += Dep.Cost;
+    It->Frequency++;
+    return;
+  }
+
+  DependencyEdge DE = {Dep, From, To, 1};
+  Vec.emplace_back(DE);
+  NodeTo.NumPredecessors++;
+}
+
+/// Constructs the view for the instruction sequence `S`, simulated for
+/// `NumIter` iterations.
+///
+/// The dependency graph is created with three nodes for every instruction of
+/// `S`.  All counters and pressure flags start cleared.
+BottleneckAnalysis::BottleneckAnalysis(const MCSubtargetInfo &sti,
+                                       MCInstPrinter &Printer,
+                                       ArrayRef<MCInst> S, unsigned NumIter)
+    : STI(sti), MCIP(Printer), Tracker(STI.getSchedModel()), DG(S.size() * 3),
+      Source(S), Iterations(NumIter), TotalCycles(0),
+      PressureIncreasedBecauseOfResources(false),
+      PressureIncreasedBecauseOfRegisterDependencies(false),
+      PressureIncreasedBecauseOfMemoryDependencies(false),
+      SeenStallCycles(false), BPI() {}
+
+/// Adds a register dependency on `RegID` between source instructions `From`
+/// and `To` to the dependency graph.
+///
+/// The graph holds three consecutive copies of the instruction sequence.  A
+/// dependency within one iteration (`From < To`) is added inside the middle
+/// copy.  A loop-carried dependency (`From >= To`) is added twice: from the
+/// first copy to the middle one, and from the middle copy to the last one.
+void BottleneckAnalysis::addRegisterDep(unsigned From, unsigned To,
+                                        unsigned RegID, unsigned Cost) {
+  const unsigned SourceSize = Source.size();
+  if (From < To) {
+    DG.addRegisterDep(From + SourceSize, To + SourceSize, RegID, Cost);
+    return;
+  }
+
+  DG.addRegisterDep(From, To + SourceSize, RegID, Cost);
+  DG.addRegisterDep(From + SourceSize, To + (SourceSize * 2), RegID, Cost);
+}
+
+/// Adds a memory dependency between source instructions `From` and `To` to
+/// the dependency graph.
+///
+/// The graph holds three consecutive copies of the instruction sequence.  A
+/// dependency within one iteration (`From < To`) is added inside the middle
+/// copy.  A loop-carried dependency (`From >= To`) is added twice: from the
+/// first copy to the middle one, and from the middle copy to the last one.
+void BottleneckAnalysis::addMemoryDep(unsigned From, unsigned To,
+                                      unsigned Cost) {
+  const unsigned SourceSize = Source.size();
+  if (From < To) {
+    DG.addMemoryDep(From + SourceSize, To + SourceSize, Cost);
+    return;
+  }
+
+  DG.addMemoryDep(From, To + SourceSize, Cost);
+  DG.addMemoryDep(From + SourceSize, To + (SourceSize * 2), Cost);
+}
+
+/// Adds a processor resource interference on the resource identified by
+/// `Mask` between source instructions `From` and `To` to the dependency graph.
+///
+/// The graph holds three consecutive copies of the instruction sequence.  An
+/// interference within one iteration (`From < To`) is added inside the middle
+/// copy.  A loop-carried one (`From >= To`) is added twice: from the first
+/// copy to the middle one, and from the middle copy to the last one.
+void BottleneckAnalysis::addResourceDep(unsigned From, unsigned To,
+                                        uint64_t Mask, unsigned Cost) {
+  const unsigned SourceSize = Source.size();
+  if (From < To) {
+    DG.addResourceDep(From + SourceSize, To + SourceSize, Mask, Cost);
+    return;
+  }
+
+  DG.addResourceDep(From, To + SourceSize, Mask, Cost);
+  DG.addResourceDep(From + SourceSize, To + (SourceSize * 2), Mask, Cost);
+}
+
+/// Handles instruction state transitions.
+///
+/// Dispatched and Executed events start and stop the tracking of the
+/// instruction in the pressure tracker.  Issued events add to the dependency
+/// graph the resource interferences and the critical register/memory
+/// dependencies of the instruction, and then record the instruction as the
+/// latest user of the resources it consumes.  Other events are ignored.
+void BottleneckAnalysis::onEvent(const HWInstructionEvent &Event) {
+  const unsigned IID = Event.IR.getSourceIndex();
+  if (Event.Type == HWInstructionEvent::Dispatched) {
+    Tracker.onInstructionDispatched(IID);
+    return;
+  }
+  if (Event.Type == HWInstructionEvent::Executed) {
+    Tracker.onInstructionExecuted(IID);
+    return;
+  }
+  if (Event.Type != HWInstructionEvent::Issued)
+    return;
+
+  const Instruction &IS = *Event.IR.getInstruction();
+  const unsigned NumInstructions = Source.size();
+  const unsigned To = IID % NumInstructions;
+
+  // Resource interferences: visit the critical resource mask one bit at a
+  // time (`X &= X - 1` clears the lowest set bit), and add an edge from every
+  // current user of that resource.
+  const unsigned ResourceCycles = 2 * Tracker.getResourcePressureCycles(IID);
+  SmallVector<std::pair<unsigned, unsigned>, 4> Users;
+  for (uint64_t Mask = IS.getCriticalResourceMask(); Mask; Mask &= Mask - 1) {
+    const uint64_t Current = Mask & (-Mask);
+    Tracker.getResourceUsers(Current, Users);
+    for (const std::pair<unsigned, unsigned> &U : Users)
+      addResourceDep(U.first % NumInstructions, To, Current,
+                     U.second + ResourceCycles);
+    Users.clear();
+  }
+
+  // Critical register dependency, if any.
+  const CriticalDependency &RegDep = IS.getCriticalRegDep();
+  if (RegDep.Cycles) {
+    const unsigned RegCycles =
+        RegDep.Cycles + 2 * Tracker.getRegisterPressureCycles(IID);
+    addRegisterDep(RegDep.IID % NumInstructions, To, RegDep.RegID, RegCycles);
+  }
+
+  // Critical memory dependency, if any.
+  const CriticalDependency &MemDep = IS.getCriticalMemDep();
+  if (MemDep.Cycles) {
+    const unsigned MemCycles =
+        MemDep.Cycles + 2 * Tracker.getMemoryPressureCycles(IID);
+    addMemoryDep(MemDep.IID % NumInstructions, To, MemCycles);
+  }
+
+  // The resource users are updated only after the interferences above have
+  // been computed against the previous users.
+  Tracker.handleInstructionIssuedEvent(
+      static_cast<const HWInstructionIssuedEvent &>(Event));
+
+  // Check if this is the last simulated instruction.
+  if (IID == ((Iterations * Source.size()) - 1))
+    DG.finalizeGraph(Iterations);
+}
+
+/// Forwards the pressure event to the pressure tracker, and remembers which
+/// kind of pressure increase was observed during the current cycle.
+void BottleneckAnalysis::onEvent(const HWPressureEvent &Event) {
+  assert(Event.Reason != HWPressureEvent::INVALID &&
+         "Unexpected invalid event!");
+
+  Tracker.handlePressureEvent(Event);
+
+  // Reasons other than the three below leave the flags untouched.
+  const auto Reason = Event.Reason;
+  if (Reason == HWPressureEvent::RESOURCES)
+    PressureIncreasedBecauseOfResources = true;
+  else if (Reason == HWPressureEvent::REGISTER_DEPS)
+    PressureIncreasedBecauseOfRegisterDependencies = true;
+  else if (Reason == HWPressureEvent::MEMORY_DEPS)
+    PressureIncreasedBecauseOfMemoryDependencies = true;
+}
+
+/// Accounts for the cycle that just ended: bumps the cycle counter, updates
+/// the pressure statistics according to the flags raised during the cycle,
+/// and clears those flags.
+void BottleneckAnalysis::onCycleEnd() {
+  ++TotalCycles;
+
+  const bool SawResources = PressureIncreasedBecauseOfResources;
+  const bool SawRegisterDeps = PressureIncreasedBecauseOfRegisterDependencies;
+  const bool SawMemoryDeps = PressureIncreasedBecauseOfMemoryDependencies;
+  const bool SawDataDeps = SawRegisterDeps || SawMemoryDeps;
+
+  // No flag is set, so there is nothing to count and nothing to clear.
+  if (!(SawResources || SawDataDeps))
+    return;
+
+  PressureIncreasedBecauseOfResources = false;
+  PressureIncreasedBecauseOfRegisterDependencies = false;
+  PressureIncreasedBecauseOfMemoryDependencies = false;
+
+  ++BPI.PressureIncreaseCycles;
+  if (SawRegisterDeps)
+    ++BPI.RegisterDependencyCycles;
+  if (SawMemoryDeps)
+    ++BPI.MemoryDependencyCycles;
+  if (SawDataDeps)
+    ++BPI.DataDependencyCycles;
+  if (SawResources)
+    ++BPI.ResourcePressureCycles;
+}
+
+/// Prints the summary of backend pressure increases to `OS`.
+///
+/// Every figure is a percentage of the total number of simulated cycles,
+/// rounded to two decimal places.  Resources that never contributed to a
+/// pressure increase are not listed.
+void BottleneckAnalysis::printBottleneckHints(raw_ostream &OS) const {
+  if (!SeenStallCycles || !BPI.PressureIncreaseCycles) {
+    OS << "\n\nNo resource or data dependency bottlenecks discovered.\n";
+    return;
+  }
+
+  // Rounds a percentage to the nearest hundredth and formats it.
+  auto Rounded = [](double Percent) {
+    return format("%.2f", floor((Percent * 100) + 0.5) / 100);
+  };
+
+  const double PressurePerCycle =
+      (double)BPI.PressureIncreaseCycles * 100 / TotalCycles;
+  OS << "\n\nCycles with backend pressure increase [ "
+     << Rounded(PressurePerCycle) << "% ]";
+
+  const double ResourcePressurePerCycle =
+      (double)BPI.ResourcePressureCycles * 100 / TotalCycles;
+  OS << "\nThroughput Bottlenecks: "
+     << "\n  Resource Pressure       [ " << Rounded(ResourcePressurePerCycle)
+     << "% ]";
+
+  // One line for every processor resource with a non-zero pressure count.
+  // `BPI.PressureIncreaseCycles` is known to be non-zero at this point.
+  const MCSchedModel &SM = STI.getSchedModel();
+  ArrayRef<unsigned> Distribution = Tracker.getResourcePressureDistribution();
+  for (unsigned ProcResID = 0, E = Distribution.size(); ProcResID < E;
+       ++ProcResID) {
+    const unsigned ResourceCycles = Distribution[ProcResID];
+    if (!ResourceCycles)
+      continue;
+
+    const double Frequency = (double)ResourceCycles * 100 / TotalCycles;
+    const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
+    OS << "\n  - " << PRDesc.Name << "  [ " << Rounded(Frequency) << "% ]";
+  }
+
+  const double DDPerCycle =
+      (double)BPI.DataDependencyCycles * 100 / TotalCycles;
+  OS << "\n  Data Dependencies:      [ " << Rounded(DDPerCycle) << "% ]";
+
+  const double RegDepPressurePerCycle =
+      (double)BPI.RegisterDependencyCycles * 100 / TotalCycles;
+  OS << "\n  - Register Dependencies [ " << Rounded(RegDepPressurePerCycle)
+     << "% ]";
+
+  const double MemDepPressurePerCycle =
+      (double)BPI.MemoryDependencyCycles * 100 / TotalCycles;
+  OS << "\n  - Memory Dependencies   [ " << Rounded(MemDepPressurePerCycle)
+     << "% ]\n";
+}
+
+/// Prints the whole view to `OS`: the bottleneck hints first, then the
+/// critical sequence.
+void BottleneckAnalysis::printView(raw_ostream &OS) const {
+  // The hints are rendered into a temporary string, and then written to `OS`
+  // in one go.
+  std::string Hints;
+  raw_string_ostream HintsStream(Hints);
+  printBottleneckHints(HintsStream);
+  OS << HintsStream.str();
+
+  printCriticalSequence(OS);
+}
+
+} // namespace mca.
+} // namespace llvm
diff --git a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
new file mode 100644
index 000000000000..9e3bd5978f09
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
@@ -0,0 +1,343 @@
+//===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the bottleneck analysis view.
+///
+/// This view internally observes backend pressure increase events in order to
+/// identify problematic data dependencies and processor resource interferences.
+///
+/// Example of bottleneck analysis report for a dot-product on X86 btver2:
+///
+/// Cycles with backend pressure increase [ 40.76% ]
+/// Throughput Bottlenecks: 
+///   Resource Pressure       [ 39.34% ]
+///   - JFPA  [ 39.34% ]
+///   - JFPU0  [ 39.34% ]
+///   Data Dependencies:      [ 1.42% ]
+///   - Register Dependencies [ 1.42% ]
+///   - Memory Dependencies   [ 0.00% ]
+///
+/// According to the example, backend pressure increased during 40.76% of the
+/// simulated cycles.  In particular, the major cause of backend pressure
+/// increases was the contention on floating point adder JFPA accessible from
+/// pipeline resource JFPU0.
+///
+/// At the end of each cycle, if pressure on the simulated out-of-order buffers
+/// has increased, a backend pressure event is reported.
+/// In particular, this occurs when there is a delta between the number of uOps
+/// dispatched and the number of uOps issued to the underlying pipelines.
+///
+/// The bottleneck analysis view is also responsible for identifying and printing
+/// the most "critical" sequence of dependent instructions according to the
+/// simulated run.
+///
+/// Below is the critical sequence computed for the dot-product example on
+/// btver2:
+///
+///              Instruction                     Dependency Information
+/// +----< 2.    vhaddps %xmm3, %xmm3, %xmm4
+/// |
+/// |    < loop carried > 
+/// |
+/// |      0.    vmulps	 %xmm0, %xmm0, %xmm2
+/// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
+/// +----> 2.    vhaddps %xmm3, %xmm3, %xmm4     ## REGISTER dependency:  %xmm3
+/// |
+/// |    < loop carried > 
+/// |
+/// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
+///
+///
+/// The algorithm that computes the critical sequence is very similar to a
+/// critical path analysis.
+/// 
+/// A dependency graph is used internally to track dependencies between nodes.
+/// Nodes of the graph represent instructions from the input assembly sequence,
+/// and edges of the graph represent data dependencies or processor resource
+/// interferences.
+///
+/// Edges are dynamically 'discovered' by observing instruction state transitions
+/// and backend pressure increase events. Edges are internally ranked based on
+/// their "criticality". A dependency is considered to be critical if it takes a
+/// long time to execute, and if it contributes to backend pressure increases.
+/// Criticality is internally measured in terms of cycles; it is computed for
+/// every edge in the graph as a function of the edge latency and the number of
+/// backend pressure increase cycles contributed by that edge.
+///
+/// At the end of simulation, costs are propagated to nodes through the edges of
+/// the graph, and the most expensive path connecting the root-set (a
+/// set of nodes with no predecessors) to a leaf node is reported as critical
+/// sequence.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
+#define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
+
+#include "Views/View.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+namespace mca {
+
+/// Tracks, for every in-flight instruction, the number of cycles spent under
+/// register, memory and resource pressure, and remembers the last user of
+/// every processor resource unit.
+class PressureTracker {
+  const MCSchedModel &SM;
+
+  // Resource pressure distribution, in cycles. There is one element for every
+  // processor resource declared by the scheduling model.
+  SmallVector<unsigned, 4> ResourcePressureDistribution;
+
+  // Maps processor resource identifiers to processor resource masks. There is
+  // exactly one element for every processor resource declared by the
+  // scheduling model.
+  SmallVector<uint64_t, 4> ProcResID2Mask;
+
+  // Maps processor resource state indices (as returned by calls to
+  // `getResourceStateIndex(Mask)`) to processor resource identifiers.
+  SmallVector<unsigned, 4> ResIdx2ProcResID;
+
+  // Maps processor resource identifiers to the index of their first slot in
+  // `ResourceUsers`.
+  SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex;
+
+  // The last user of every processor resource unit declared by the processor
+  // model, updated on every instruction issued event. The first element of a
+  // `User` is an instruction identifier (an all_ones value is treated like an
+  // invalid identifier); the second element is a cycle count.
+  using User = std::pair<unsigned, unsigned>;
+  SmallVector<User, 4> ResourceUsers;
+
+  // Pressure cycles accumulated by a single tracked instruction.
+  struct InstructionPressureInfo {
+    unsigned RegisterPressureCycles;
+    unsigned MemoryPressureCycles;
+    unsigned ResourcePressureCycles;
+  };
+  DenseMap<unsigned, InstructionPressureInfo> IPI;
+
+  void updateResourcePressureDistribution(uint64_t CumulativeMask);
+
+  // Returns the last user of unit `UnitID` of processor resource `ProcResID`.
+  User getResourceUser(unsigned ProcResID, unsigned UnitID) const {
+    const unsigned FirstSlot = ProcResID2ResourceUsersIndex[ProcResID];
+    return ResourceUsers[FirstSlot + UnitID];
+  }
+
+  // Returns the pressure counters of instruction `IID`, which must be
+  // tracked.
+  const InstructionPressureInfo &getPressureInfo(unsigned IID) const {
+    auto It = IPI.find(IID);
+    assert(It != IPI.end() && "Instruction is not tracked!");
+    return It->second;
+  }
+
+public:
+  PressureTracker(const MCSchedModel &Model);
+
+  ArrayRef<unsigned> getResourcePressureDistribution() const {
+    return ResourcePressureDistribution;
+  }
+
+  void getResourceUsers(uint64_t ResourceMask,
+                        SmallVectorImpl<User> &Users) const;
+
+  unsigned getRegisterPressureCycles(unsigned IID) const {
+    return getPressureInfo(IID).RegisterPressureCycles;
+  }
+
+  unsigned getMemoryPressureCycles(unsigned IID) const {
+    return getPressureInfo(IID).MemoryPressureCycles;
+  }
+
+  unsigned getResourcePressureCycles(unsigned IID) const {
+    return getPressureInfo(IID).ResourcePressureCycles;
+  }
+
+  // Returns the name of the processor resource identified by `ResourceMask`.
+  const char *resolveResourceName(uint64_t ResourceMask) const {
+    const unsigned ProcResID =
+        ResIdx2ProcResID[getResourceStateIndex(ResourceMask)];
+    return SM.getProcResource(ProcResID)->Name;
+  }
+
+  void onInstructionDispatched(unsigned IID);
+  void onInstructionExecuted(unsigned IID);
+
+  void handlePressureEvent(const HWPressureEvent &Event);
+  void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);
+};
+
+// A dependency edge.
+struct DependencyEdge {
+  enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };
+
+  // Dependency edge descriptor.
+  //
+  // It specifies the dependency type, as well as the edge cost in cycles.
+  struct Dependency {
+    DependencyType Type;
+    uint64_t ResourceOrRegID;
+    uint64_t Cost;
+  };
+  Dependency Dep;
+
+  unsigned FromIID;
+  unsigned ToIID;
+
+  // Used by the bottleneck analysis to compute the interference
+  // probability for processor resources.
+  unsigned Frequency;
+};
+
+// A dependency graph used by the bottleneck analysis to describe data
+// dependencies and processor resource interferences between instructions.
+//
+// There is a node (an instance of struct DGNode) for every instruction in the
+// input assembly sequence. Edges of the graph represent dependencies between
+// instructions.
+//
+// Each edge of the graph is associated with a cost value which is used
+// internally to rank dependencies based on their impact on runtime
+// performance (see field DependencyEdge::Dependency::Cost). In general, the
+// higher the cost of an edge, the higher the impact on performance.
+//
+// The cost of a dependency is a function of both the latency and the number of
+// cycles where the dependency has been seen as critical (i.e. contributing to
+// back-pressure increases).
+//
+// Loop-carried dependencies are carefully expanded by the bottleneck analysis
+// to guarantee that the graph stays acyclic. To this end, extra nodes are
+// pre-allocated at construction time to describe instructions from "past and
+// future" iterations. The graph is kept acyclic mainly because that simplifies
+// the algorithm that computes the critical sequence.
+class DependencyGraph {
+  // Graph node; one is allocated for every slot requested at construction.
+  struct DGNode {
+    unsigned NumPredecessors;
+    unsigned NumVisitedPredecessors;
+    uint64_t Cost;
+    unsigned Depth;
+
+    DependencyEdge CriticalPredecessor;
+    SmallVector<DependencyEdge, 8> OutgoingEdges;
+  };
+  SmallVector<DGNode, 16> Nodes;
+
+  // A dependency graph cannot be copied.
+  DependencyGraph(const DependencyGraph &) = delete;
+  DependencyGraph &operator=(const DependencyGraph &) = delete;
+
+  // Common entry point used by the typed add*Dep() helpers below.
+  void addDependency(unsigned From, unsigned To,
+                     DependencyEdge::Dependency &&DE);
+
+  // Steps executed, in this order, by finalizeGraph().
+  void pruneEdges(unsigned Iterations);
+  void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
+  void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet,
+                             unsigned Iterations);
+
+#ifndef NDEBUG
+  void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
+                          MCInstPrinter &MCIP) const;
+#endif
+
+public:
+  // Builds a graph with Size pre-allocated nodes.
+  DependencyGraph(unsigned Size) : Nodes(Size) {}
+
+  // Adds a register dependency edge on RegID with the given cost.
+  void addRegisterDep(unsigned From, unsigned To, unsigned RegID,
+                      unsigned Cost) {
+    addDependency(
+        From, To,
+        DependencyEdge::Dependency{DependencyEdge::DT_REGISTER, RegID, Cost});
+  }
+
+  // Adds a memory dependency edge; the resource/register field is unused.
+  void addMemoryDep(unsigned From, unsigned To, unsigned Cost) {
+    addDependency(From, To,
+                  DependencyEdge::Dependency{DependencyEdge::DT_MEMORY,
+                                             /* unused */ 0, Cost});
+  }
+
+  // Adds a processor resource dependency edge for resource mask Mask.
+  void addResourceDep(unsigned From, unsigned To, uint64_t Mask,
+                      unsigned Cost) {
+    addDependency(
+        From, To,
+        DependencyEdge::Dependency{DependencyEdge::DT_RESOURCE, Mask, Cost});
+  }
+
+  // Invoked by the bottleneck analysis once the simulation is over. It prunes
+  // edges, collects the root set, and then propagates costs through the edges
+  // of the graph so that a critical path can be computed.
+  void finalizeGraph(unsigned Iterations) {
+    pruneEdges(Iterations);
+
+    SmallVector<unsigned, 16> Roots;
+    initializeRootSet(Roots);
+    propagateThroughEdges(Roots, Iterations);
+  }
+
+  // Fills Seq with the edges of the critical sequence observed during the
+  // simulated run. The graph must already be finalized (i.e. `finalizeGraph()`
+  // has been called on it).
+  void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;
+
+#ifndef NDEBUG
+  void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;
+#endif
+};
+
+/// A view that collects and prints a few performance numbers.
+///
+/// It owns a PressureTracker and a DependencyGraph, reacts to stall, pressure
+/// and instruction events, and prints its report from printView().
+class BottleneckAnalysis : public View {
+  const MCSubtargetInfo &STI;
+  MCInstPrinter &MCIP;
+  PressureTracker Tracker;
+  DependencyGraph DG;
+
+  // Input instruction sequence and number of simulated iterations, as passed
+  // to the constructor.
+  ArrayRef<MCInst> Source;
+  unsigned Iterations;
+  unsigned TotalCycles;
+
+  // NOTE(review): presumably per-cycle flags recording which kind of event
+  // caused a pressure increase; they are maintained in the implementation
+  // file, which is not visible here.
+  bool PressureIncreasedBecauseOfResources;
+  bool PressureIncreasedBecauseOfRegisterDependencies;
+  bool PressureIncreasedBecauseOfMemoryDependencies;
+  // True if throughput was affected by dispatch stalls.
+  bool SeenStallCycles;
+
+  // Cycle counters describing when, and why, backpressure increased.
+  struct BackPressureInfo {
+    // Cycles where backpressure increased.
+    unsigned PressureIncreaseCycles;
+    // Cycles where backpressure increased because of pipeline pressure.
+    unsigned ResourcePressureCycles;
+    // Cycles where backpressure increased because of data dependencies.
+    unsigned DataDependencyCycles;
+    // Cycles where backpressure increased because of register dependencies.
+    unsigned RegisterDependencyCycles;
+    // Cycles where backpressure increased because of memory dependencies.
+    unsigned MemoryDependencyCycles;
+  };
+  BackPressureInfo BPI;
+
+  // Used to populate the dependency graph DG.
+  void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy);
+  void addMemoryDep(unsigned From, unsigned To, unsigned Cy);
+  void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy);
+
+  // Prints a bottleneck message to OS.
+  void printBottleneckHints(raw_ostream &OS) const;
+  void printCriticalSequence(raw_ostream &OS) const;
+
+public:
+  BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
+                     ArrayRef<MCInst> Sequence, unsigned Iterations);
+
+  void onCycleEnd() override;
+  // Any stall event, regardless of its kind, marks the run as stalled.
+  void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; }
+  void onEvent(const HWPressureEvent &Event) override;
+  void onEvent(const HWInstructionEvent &Event) override;
+
+  void printView(raw_ostream &OS) const override;
+
+#ifndef NDEBUG
+  // Debug helper: forwards to DependencyGraph::dump().
+  void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); }
+#endif
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif
diff --git a/llvm/tools/llvm-mca/Views/DispatchStatistics.cpp b/llvm/tools/llvm-mca/Views/DispatchStatistics.cpp
new file mode 100644
index 000000000000..557b8ba17b17
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/DispatchStatistics.cpp
@@ -0,0 +1,85 @@
+//===--------------------- DispatchStatistics.cpp ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the DispatchStatistics interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Views/DispatchStatistics.h"
+#include "llvm/Support/Format.h"
+
+namespace llvm {
+namespace mca {
+
+// Counts a dispatch stall. Only stall kinds below LastGenericEvent have a
+// counter in HWStalls; any other kind is ignored.
+void DispatchStatistics::onEvent(const HWStallEvent &Event) {
+  if (Event.Type >= HWStallEvent::LastGenericEvent)
+    return;
+  ++HWStalls[Event.Type];
+}
+
+// Accumulates the micro opcodes of every dispatched instruction. Events of
+// any other type are ignored.
+void DispatchStatistics::onEvent(const HWInstructionEvent &Event) {
+  if (Event.Type == HWInstructionEvent::Dispatched) {
+    const auto &Dispatched =
+        static_cast<const HWInstructionDispatchedEvent &>(Event);
+    NumDispatched += Dispatched.MicroOpcodes;
+  }
+}
+
+// Prints the dispatch histogram: for every dispatch group size N seen during
+// the simulation, the number of cycles (and the percentage of all cycles,
+// rounded to one decimal digit) in which N micro opcodes were dispatched.
+void DispatchStatistics::printDispatchHistogram(raw_ostream &OS) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nDispatch Logic - "
+             << "number of cycles where we saw N micro opcodes dispatched:\n";
+  TempStream << "[# dispatched], [# cycles]\n";
+  // Bind to the map's own value_type (pair<const unsigned, unsigned>). Naming
+  // std::pair<unsigned, unsigned> here would silently copy every entry into a
+  // temporary on each iteration.
+  for (const Histogram::value_type &Entry : DispatchGroupSizePerCycle) {
+    const double Percentage =
+        (static_cast<double>(Entry.second) / NumCycles) * 100.0;
+    TempStream << " " << Entry.first << ",              " << Entry.second
+               << "  (" << format("%.1f", floor((Percentage * 10) + 0.5) / 10)
+               << "%)\n";
+  }
+
+  TempStream.flush();
+  OS << Buffer;
+}
+
+// Prints NumStalls and, when it is not zero, the percentage of NumCycles it
+// represents (rounded to one decimal digit) between parentheses.
+static void printStalls(raw_ostream &OS, unsigned NumStalls,
+                        unsigned NumCycles) {
+  OS << NumStalls;
+  if (NumStalls == 0)
+    return;
+
+  const double Percentage = ((double)NumStalls / NumCycles) * 100.0;
+  const double Rounded = floor((Percentage * 10) + 0.5) / 10;
+  OS << "  (" << format("%.1f", Rounded) << "%)";
+}
+
+// Prints one line per generic dispatch stall kind, each with its stall count
+// formatted by printStalls().
+void DispatchStatistics::printDispatchStalls(raw_ostream &OS) const {
+  // Rows in print order. Every label but the first starts with the newline
+  // that terminates the previous row.
+  static const struct {
+    const char *Label;
+    unsigned Kind;
+  } Rows[] = {
+      {"RAT     - Register unavailable:                      ",
+       HWStallEvent::RegisterFileStall},
+      {"\nRCU     - Retire tokens unavailable:                 ",
+       HWStallEvent::RetireControlUnitStall},
+      {"\nSCHEDQ  - Scheduler full:                            ",
+       HWStallEvent::SchedulerQueueFull},
+      {"\nLQ      - Load queue full:                           ",
+       HWStallEvent::LoadQueueFull},
+      {"\nSQ      - Store queue full:                          ",
+       HWStallEvent::StoreQueueFull},
+      {"\nGROUP   - Static restrictions on the dispatch group: ",
+       HWStallEvent::DispatchGroupStall},
+  };
+
+  std::string Text;
+  raw_string_ostream Stream(Text);
+  Stream << "\n\nDynamic Dispatch Stall Cycles:\n";
+  for (const auto &Row : Rows) {
+    Stream << Row.Label;
+    printStalls(Stream, HWStalls[Row.Kind], NumCycles);
+  }
+  Stream << '\n';
+  Stream.flush();
+  OS << Text;
+}
+
+} // namespace mca
+} // namespace llvm
diff --git a/llvm/tools/llvm-mca/Views/DispatchStatistics.h b/llvm/tools/llvm-mca/Views/DispatchStatistics.h
new file mode 100644
index 000000000000..07c0f5a4c68f
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/DispatchStatistics.h
@@ -0,0 +1,85 @@
+//===--------------------- DispatchStatistics.h -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements a view that prints a few statistics related to the
+/// dispatch logic. It collects and analyzes instruction dispatch events as
+/// well as static/dynamic dispatch stall events.
+///
+/// Example:
+/// ========
+///
+/// Dynamic Dispatch Stall Cycles:
+/// RAT     - Register unavailable:                      0
+/// RCU     - Retire tokens unavailable:                 0
+/// SCHEDQ  - Scheduler full:                            42
+/// LQ      - Load queue full:                           0
+/// SQ      - Store queue full:                          0
+/// GROUP   - Static restrictions on the dispatch group: 0
+///
+///
+/// Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+/// [# dispatched], [# cycles]
+///  0,              15  (11.5%)
+///  2,              4  (3.1%)
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_DISPATCHVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_DISPATCHVIEW_H
+
+#include "Views/View.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include <map>
+
+namespace llvm {
+namespace mca {
+
+/// View that counts dispatch stalls and builds a histogram of the number of
+/// micro opcodes dispatched per cycle.
+class DispatchStatistics : public View {
+  // Micro opcodes dispatched so far in the current cycle.
+  unsigned NumDispatched = 0;
+  // Number of cycles started so far.
+  unsigned NumCycles = 0;
+
+  // One counter per generic stall kind (see class HWStallEvent). Each counter
+  // tracks the dispatch stall events caused by unavailability of resources.
+  llvm::SmallVector<unsigned, 8> HWStalls;
+
+  // Maps a dispatch group size to the number of cycles it was observed in.
+  using Histogram = std::map<unsigned, unsigned>;
+  Histogram DispatchGroupSizePerCycle;
+
+  // Records the group size of the cycle that just ended, then resets the
+  // per-cycle counter.
+  void updateHistograms() {
+    ++DispatchGroupSizePerCycle[NumDispatched];
+    NumDispatched = 0;
+  }
+
+  void printDispatchHistogram(llvm::raw_ostream &OS) const;
+
+  void printDispatchStalls(llvm::raw_ostream &OS) const;
+
+public:
+  DispatchStatistics() : HWStalls(HWStallEvent::LastGenericEvent) {}
+
+  void onEvent(const HWStallEvent &Event) override;
+
+  void onEvent(const HWInstructionEvent &Event) override;
+
+  void onCycleBegin() override { ++NumCycles; }
+
+  void onCycleEnd() override { updateHistograms(); }
+
+  // Prints the stall table first, then the dispatch histogram.
+  void printView(llvm::raw_ostream &OS) const override {
+    printDispatchStalls(OS);
+    printDispatchHistogram(OS);
+  }
+};
+} // namespace mca
+} // namespace llvm
+
+#endif
diff --git a/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp b/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp
new file mode 100644
index 000000000000..a6f9153b4945
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp
@@ -0,0 +1,112 @@
+//===--------------------- InstructionInfoView.cpp --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the InstructionInfoView API.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Views/InstructionInfoView.h"
+#include "llvm/Support/FormattedStream.h"
+
+namespace llvm {
+namespace mca {
+
+/// Prints the "Instruction Info" table to OS: one row per instruction in
+/// Source with its number of micro opcodes, latency, reciprocal throughput,
+/// MayLoad/MayStore/HasSideEffects markers and, when PrintEncodings is set,
+/// the encoding size and the encoding bytes in hexadecimal.
+///
+/// The whole table is first assembled in a local string buffer and only then
+/// written to OS.
+void InstructionInfoView::printView(raw_ostream &OS) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  const MCSchedModel &SM = STI.getSchedModel();
+
+  // Scratch string reused for the textual form of every instruction.
+  std::string Instruction;
+  raw_string_ostream InstrStream(Instruction);
+
+  // Legend and column headers.
+  TempStream << "\n\nInstruction Info:\n";
+  TempStream << "[1]: #uOps\n[2]: Latency\n[3]: RThroughput\n"
+             << "[4]: MayLoad\n[5]: MayStore\n[6]: HasSideEffects (U)\n";
+  if (PrintEncodings) {
+    TempStream << "[7]: Encoding Size\n";
+    TempStream << "\n[1]    [2]    [3]    [4]    [5]    [6]    [7]    "
+               << "Encodings:                    Instructions:\n";
+  } else {
+    TempStream << "\n[1]    [2]    [3]    [4]    [5]    [6]    Instructions:\n";
+  }
+
+  for (unsigned I = 0, E = Source.size(); I < E; ++I) {
+    const MCInst &Inst = Source[I];
+    const MCInstrDesc &MCDesc = MCII.get(Inst.getOpcode());
+
+    // Obtain the scheduling class information from the instruction.
+    unsigned SchedClassID = MCDesc.getSchedClass();
+    unsigned CPUID = SM.getProcessorID();
+
+    // Try to solve variant scheduling classes.
+    // The loop stops when the class is no longer a variant, or when the
+    // resolved ID is zero.
+    while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant())
+      SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &Inst, CPUID);
+
+    const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
+    unsigned NumMicroOpcodes = SCDesc.NumMicroOps;
+    unsigned Latency = MCSchedModel::computeInstrLatency(STI, SCDesc);
+    // Add extra latency due to delays in the forwarding data paths.
+    Latency += MCSchedModel::getForwardingDelayCycles(
+        STI.getReadAdvanceEntries(SCDesc));
+    Optional<double> RThroughput =
+        MCSchedModel::getReciprocalThroughput(STI, SCDesc);
+
+    // Column [1]: extra padding keeps the next column aligned for values of
+    // up to three digits.
+    TempStream << ' ' << NumMicroOpcodes << "    ";
+    if (NumMicroOpcodes < 10)
+      TempStream << "  ";
+    else if (NumMicroOpcodes < 100)
+      TempStream << ' ';
+    // Column [2]: same padding scheme as column [1].
+    TempStream << Latency << "   ";
+    if (Latency < 10)
+      TempStream << "  ";
+    else if (Latency < 100)
+      TempStream << ' ';
+
+    // Column [3]: reciprocal throughput with two decimals, or a dash when no
+    // value is available.
+    if (RThroughput.hasValue()) {
+      double RT = RThroughput.getValue();
+      TempStream << format("%.2f", RT) << ' ';
+      if (RT < 10.0)
+        TempStream << "  ";
+      else if (RT < 100.0)
+        TempStream << ' ';
+    } else {
+      TempStream << " -     ";
+    }
+    // Columns [4]-[6]: a marker when the property holds, blanks otherwise.
+    TempStream << (MCDesc.mayLoad() ? " *     " : "       ");
+    TempStream << (MCDesc.mayStore() ? " *     " : "       ");
+    TempStream << (MCDesc.hasUnmodeledSideEffects() ? " U     " : "       ");
+
+    if (PrintEncodings) {
+      // Column [7]: encoding size, followed by the encoding bytes in hex.
+      StringRef Encoding(CE.getEncoding(I));
+      unsigned EncodingSize = Encoding.size();
+      TempStream << " " << EncodingSize
+                 << (EncodingSize < 10 ? "     " : "    ");
+      // Flush pending output before layering a formatted stream on top of
+      // TempStream; the formatted stream is only used to pad the encoding
+      // bytes up to column 30.
+      TempStream.flush();
+      formatted_raw_ostream FOS(TempStream);
+      for (unsigned i = 0, e = Encoding.size(); i != e; ++i)
+        FOS << format("%02x ", (uint8_t)Encoding[i]);
+      FOS.PadToColumn(30);
+      FOS.flush();
+    }
+
+    MCIP.printInst(&Inst, InstrStream, "", STI);
+    InstrStream.flush();
+
+    // Consume any tabs or spaces at the beginning of the string.
+    StringRef Str(Instruction);
+    Str = Str.ltrim();
+    TempStream << Str << '\n';
+    // Reset the scratch string for the next instruction.
+    Instruction = "";
+  }
+
+  TempStream.flush();
+  OS << Buffer;
+}
+} // namespace mca.
+} // namespace llvm
diff --git a/llvm/tools/llvm-mca/Views/InstructionInfoView.h b/llvm/tools/llvm-mca/Views/InstructionInfoView.h
new file mode 100644
index 000000000000..0e948304119f
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/InstructionInfoView.h
@@ -0,0 +1,73 @@
+//===--------------------- InstructionInfoView.h ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the instruction info view.
+///
+/// The goal of the instruction info view is to print the latency and reciprocal
+/// throughput information for every instruction in the input sequence.
+/// This section also reports extra information related to the number of micro
+/// opcodes, and opcode properties (i.e. 'MayLoad', 'MayStore', 'HasSideEffects').
+///
+/// Example:
+///
+/// Instruction Info:
+/// [1]: #uOps
+/// [2]: Latency
+/// [3]: RThroughput
+/// [4]: MayLoad
+/// [5]: MayStore
+/// [6]: HasSideEffects
+///
+/// [1]    [2]    [3]    [4]    [5]    [6]	Instructions:
+///  1      2     1.00                    	vmulps	%xmm0, %xmm1, %xmm2
+///  1      3     1.00                    	vhaddps	%xmm2, %xmm2, %xmm3
+///  1      3     1.00                    	vhaddps	%xmm3, %xmm3, %xmm4
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTIONINFOVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_INSTRUCTIONINFOVIEW_H
+
+#include "Views/View.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MCA/CodeEmitter.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace llvm {
+namespace mca {
+
+/// A view that prints out generic instruction information.
+///
+/// All the state is captured at construction time; the only overridden hook
+/// is printView().
+class InstructionInfoView : public View {
+  const llvm::MCSubtargetInfo &STI;
+  const llvm::MCInstrInfo &MCII;
+  CodeEmitter &CE;
+  // When true, printView() also emits the encoding size and encoding bytes.
+  bool PrintEncodings;
+  // The instruction sequence to describe.
+  llvm::ArrayRef<llvm::MCInst> Source;
+  llvm::MCInstPrinter &MCIP;
+
+public:
+  /// Stores every argument in the corresponding member. References are kept
+  /// as references, so the referenced objects must outlive this view.
+  InstructionInfoView(const llvm::MCSubtargetInfo &ST,
+                      const llvm::MCInstrInfo &II, CodeEmitter &C,
+                      bool ShouldPrintEncodings, llvm::ArrayRef<llvm::MCInst> S,
+                      llvm::MCInstPrinter &IP)
+      : STI(ST), MCII(II), CE(C), PrintEncodings(ShouldPrintEncodings),
+        Source(S), MCIP(IP) {}
+
+  void printView(llvm::raw_ostream &OS) const override;
+};
+} // namespace mca
+} // namespace llvm
+
+#endif
diff --git a/llvm/tools/llvm-mca/Views/RegisterFileStatistics.cpp b/llvm/tools/llvm-mca/Views/RegisterFileStatistics.cpp
new file mode 100644
index 000000000000..58736ee0d18c
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/RegisterFileStatistics.cpp
@@ -0,0 +1,167 @@
+//===--------------------- RegisterFileStatistics.cpp -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the RegisterFileStatistics interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Views/RegisterFileStatistics.h"
+#include "llvm/Support/Format.h"
+
+namespace llvm {
+namespace mca {
+
+// Allocates one zero-initialized RegisterFileUsage and one zero-initialized
+// MoveEliminationInfo entry for every register file of the processor.
+RegisterFileStatistics::RegisterFileStatistics(const MCSubtargetInfo &sti)
+    : STI(sti) {
+  const RegisterFileUsage RFUEmpty = {0, 0, 0};
+  const MoveEliminationInfo MEIEmpty = {0, 0, 0, 0, 0};
+
+  // Without extra processor info, assume a single register file.
+  unsigned NumRegFiles = 1;
+
+  const MCSchedModel &SM = STI.getSchedModel();
+  if (SM.hasExtraProcessorInfo()) {
+    // There is one entry for every user defined register file, plus the
+    // default register file which is always at index #0. Tablegen always
+    // emits an "InvalidRegisterFile" entry, which can be skipped. If there are
+    // no user defined register files, a single entry is still reserved for
+    // the default register file.
+    NumRegFiles = std::max(SM.getExtraProcessorInfo().NumRegisterFiles, 1U);
+  }
+
+  PRFUsage.assign(NumRegFiles, RFUEmpty);
+  MoveElimInfo.assign(NumRegFiles, MEIEmpty);
+}
+
+// Adds the physical registers reported in UsedPhysRegs (one entry per
+// register file) to the usage counters and refreshes the per-file maximum.
+void RegisterFileStatistics::updateRegisterFileUsage(
+    ArrayRef<unsigned> UsedPhysRegs) {
+  unsigned FileIdx = 0;
+  for (RegisterFileUsage &Usage : PRFUsage) {
+    const unsigned NewMappings = UsedPhysRegs[FileIdx++];
+    Usage.TotalMappings += NewMappings;
+    Usage.CurrentlyUsedMappings += NewMappings;
+    if (Usage.MaxUsedMappings < Usage.CurrentlyUsedMappings)
+      Usage.MaxUsedMappings = Usage.CurrentlyUsedMappings;
+  }
+}
+
+// Updates the move elimination counters of the register file written by
+// Inst. Instructions that are not optimizable moves are ignored.
+void RegisterFileStatistics::updateMoveElimInfo(const Instruction &Inst) {
+  if (Inst.isOptimizableMove()) {
+    assert(Inst.getDefs().size() == 1 && "Expected a single definition!");
+    assert(Inst.getUses().size() == 1 && "Expected a single register use!");
+    const WriteState &Def = Inst.getDefs()[0];
+    const ReadState &Use = Inst.getUses()[0];
+
+    // Statistics are bucketed by the register file of the definition.
+    MoveEliminationInfo &Stats = MoveElimInfo[Def.getRegisterFileID()];
+    ++Stats.TotalMoveEliminationCandidates;
+    if (Def.isEliminated())
+      ++Stats.CurrentMovesEliminated;
+    if (Def.isWriteZero() && Use.isReadZero())
+      ++Stats.TotalMovesThatPropagateZero;
+  }
+}
+
+// Reacts to Retired and Dispatched events; any other event type is ignored.
+void RegisterFileStatistics::onEvent(const HWInstructionEvent &Event) {
+  if (Event.Type == HWInstructionEvent::Retired) {
+    // Release the physical registers freed by the retired instruction.
+    const auto &Retired = static_cast<const HWInstructionRetiredEvent &>(Event);
+    unsigned FileIdx = 0;
+    for (RegisterFileUsage &Usage : PRFUsage)
+      Usage.CurrentlyUsedMappings -= Retired.FreedPhysRegs[FileIdx++];
+    return;
+  }
+
+  if (Event.Type == HWInstructionEvent::Dispatched) {
+    // Account for the new register mappings and for move elimination.
+    const auto &Dispatched =
+        static_cast<const HWInstructionDispatchedEvent &>(Event);
+    updateRegisterFileUsage(Dispatched.UsedPhysRegs);
+    updateMoveElimInfo(*Dispatched.IR.getInstruction());
+  }
+}
+
+// Folds the moves eliminated during the cycle that just ended into the
+// per-file totals and maxima, then resets the per-cycle counters.
+void RegisterFileStatistics::onCycleEnd() {
+  for (MoveEliminationInfo &Stats : MoveElimInfo) {
+    const unsigned EliminatedThisCycle = Stats.CurrentMovesEliminated;
+    if (Stats.MaxMovesEliminatedPerCycle < EliminatedThisCycle)
+      Stats.MaxMovesEliminatedPerCycle = EliminatedThisCycle;
+    Stats.TotalMovesEliminated += EliminatedThisCycle;
+    Stats.CurrentMovesEliminated = 0;
+  }
+}
+
+// Prints the register file statistics to OS.
+//
+// The report starts with the counters of the register file at index #0,
+// followed by one section for every other register file that declares a
+// non-zero number of physical registers. Move elimination statistics are only
+// printed for register files that saw at least one optimizable move.
+void RegisterFileStatistics::printView(raw_ostream &OS) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+
+  TempStream << "\n\nRegister File statistics:";
+  const RegisterFileUsage &GlobalUsage = PRFUsage[0];
+  TempStream << "\nTotal number of mappings created:    "
+             << GlobalUsage.TotalMappings;
+  TempStream << "\nMax number of mappings used:         "
+             << GlobalUsage.MaxUsedMappings << '\n';
+
+  for (unsigned I = 1, E = PRFUsage.size(); I < E; ++I) {
+    const RegisterFileUsage &RFU = PRFUsage[I];
+    // Obtain the register file descriptor from the scheduling model.
+    assert(STI.getSchedModel().hasExtraProcessorInfo() &&
+           "Unable to find register file info!");
+    const MCExtraProcessorInfo &PI =
+        STI.getSchedModel().getExtraProcessorInfo();
+    // I is used as an index into PI.RegisterFiles, so it must be strictly
+    // smaller than the number of register files.
+    assert(I < PI.NumRegisterFiles && "Unexpected register file index!");
+    const MCRegisterFileDesc &RFDesc = PI.RegisterFiles[I];
+    // Skip invalid register files.
+    if (!RFDesc.NumPhysRegs)
+      continue;
+
+    // NumPhysRegs is known to be non-zero from here on.
+    TempStream << "\n*  Register File #" << I;
+    TempStream << " -- " << StringRef(RFDesc.Name) << ':';
+    TempStream << "\n   Number of physical registers:     "
+               << RFDesc.NumPhysRegs;
+    TempStream << "\n   Total number of mappings created: "
+               << RFU.TotalMappings;
+    TempStream << "\n   Max number of mappings used:      "
+               << RFU.MaxUsedMappings << '\n';
+    const MoveEliminationInfo &MEI = MoveElimInfo[I];
+
+    if (MEI.TotalMoveEliminationCandidates) {
+      TempStream << "   Number of optimizable moves:      "
+                 << MEI.TotalMoveEliminationCandidates;
+      // Percentages are relative to the number of optimizable moves and are
+      // rounded to one decimal digit.
+      double EliminatedMovProportion =
+          static_cast<double>(MEI.TotalMovesEliminated) /
+          MEI.TotalMoveEliminationCandidates * 100.0;
+      double ZeroMovProportion =
+          static_cast<double>(MEI.TotalMovesThatPropagateZero) /
+          MEI.TotalMoveEliminationCandidates * 100.0;
+      TempStream << "\n   Number of moves eliminated:       "
+                 << MEI.TotalMovesEliminated << "  "
+                 << format("(%.1f%%)",
+                           floor((EliminatedMovProportion * 10) + 0.5) / 10);
+      TempStream << "\n   Number of zero moves:             "
+                 << MEI.TotalMovesThatPropagateZero << "  "
+                 << format("(%.1f%%)",
+                           floor((ZeroMovProportion * 10) + 0.5) / 10);
+      TempStream << "\n   Max moves eliminated per cycle:   "
+                 << MEI.MaxMovesEliminatedPerCycle << '\n';
+    }
+  }
+
+  TempStream.flush();
+  OS << Buffer;
+}
+
+} // namespace mca
+} // namespace llvm
diff --git a/llvm/tools/llvm-mca/Views/RegisterFileStatistics.h b/llvm/tools/llvm-mca/Views/RegisterFileStatistics.h
new file mode 100644
index 000000000000..a2273dd48b22
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/RegisterFileStatistics.h
@@ -0,0 +1,80 @@
+//===--------------------- RegisterFileStatistics.h -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This view collects and prints register file usage statistics.
+///
+/// Example  (-mcpu=btver2):
+/// ========================
+///
+/// Register File statistics:
+/// Total number of mappings created:    6
+/// Max number of mappings used:         3
+///
+/// *  Register File #1 -- FpuPRF:
+///    Number of physical registers:     72
+///    Total number of mappings created: 0
+///    Max number of mappings used:      0
+///    Number of optimizable moves:      200
+///    Number of moves eliminated:       200 (100.0%)
+///    Number of zero moves:             200 (100.0%)
+///    Max moves eliminated per cycle:   2
+///
+/// *  Register File #2 -- IntegerPRF:
+///    Number of physical registers:     64
+///    Total number of mappings created: 6
+///    Max number of mappings used:      3
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_REGISTERFILESTATISTICS_H
+#define LLVM_TOOLS_LLVM_MCA_REGISTERFILESTATISTICS_H
+
+#include "Views/View.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+namespace mca {
+
+/// View that tracks, per register file, the number of register mappings in
+/// use and the outcome of move elimination, and prints them in printView().
+class RegisterFileStatistics : public View {
+  const llvm::MCSubtargetInfo &STI;
+
+  // Used to track the number of physical registers used in a register file.
+  struct RegisterFileUsage {
+    // Sum of all the mappings reported by dispatch events.
+    unsigned TotalMappings;
+    // Highest value reached by CurrentlyUsedMappings.
+    unsigned MaxUsedMappings;
+    // Incremented on dispatch events, decremented on retire events.
+    unsigned CurrentlyUsedMappings;
+  };
+
+  // Move elimination counters for a register file.
+  struct MoveEliminationInfo {
+    // Number of optimizable moves seen.
+    unsigned TotalMoveEliminationCandidates;
+    unsigned TotalMovesEliminated;
+    unsigned TotalMovesThatPropagateZero;
+    unsigned MaxMovesEliminatedPerCycle;
+    // Moves eliminated in the current cycle; reset at the end of every cycle.
+    unsigned CurrentMovesEliminated;
+  };
+
+  // There is one entry for each register file implemented by the processor.
+  llvm::SmallVector<RegisterFileUsage, 4> PRFUsage;
+  llvm::SmallVector<MoveEliminationInfo, 4> MoveElimInfo;
+
+  // Helpers invoked when an instruction is dispatched.
+  void updateRegisterFileUsage(ArrayRef<unsigned> UsedPhysRegs);
+  void updateMoveElimInfo(const Instruction &Inst);
+
+public:
+  RegisterFileStatistics(const llvm::MCSubtargetInfo &sti);
+
+  void onCycleEnd() override;
+  void onEvent(const HWInstructionEvent &Event) override;
+  void printView(llvm::raw_ostream &OS) const override;
+};
+} // namespace mca
+} // namespace llvm
+
+#endif
diff --git a/llvm/tools/llvm-mca/Views/ResourcePressureView.cpp b/llvm/tools/llvm-mca/Views/ResourcePressureView.cpp
new file mode 100644
index 000000000000..38a2478cf4fe
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/ResourcePressureView.cpp
@@ -0,0 +1,184 @@
+//===--------------------- ResourcePressureView.cpp -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods in the ResourcePressureView interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Views/ResourcePressureView.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+namespace mca {
+
+ResourcePressureView::ResourcePressureView(const llvm::MCSubtargetInfo &sti,
+                                           MCInstPrinter &Printer,
+                                           ArrayRef<MCInst> S)
+    : STI(sti), MCIP(Printer), Source(S), LastInstructionIdx(0) {
+  // Assign a contiguous range of ResourceUsage columns (one per unit) to each
+  // processor resource that is neither a group nor a zero-unit resource.
+  const MCSchedModel &Model = STI.getSchedModel();
+  const unsigned NumKinds = Model.getNumProcResourceKinds();
+  unsigned NextColumn = 0;
+  for (unsigned ResID = 0; ResID != NumKinds; ++ResID) {
+    const MCProcResourceDesc &Desc = *Model.getProcResource(ResID);
+    if (!Desc.SubUnitsIdxBegin && Desc.NumUnits) {
+      Resource2VecIndex.insert(std::make_pair(ResID, NextColumn));
+      NextColumn += Desc.NumUnits;
+    }
+  }
+
+  // One row per source instruction, plus a trailing row of per-unit totals.
+  NumResourceUnits = NextColumn;
+  ResourceUsage.resize(NumResourceUnits * (Source.size() + 1));
+  std::fill(ResourceUsage.begin(), ResourceUsage.end(), 0.0);
+}
+
+void ResourcePressureView::onEvent(const HWInstructionEvent &Event) {
+  // Dispatch events only record the most recently dispatched source index.
+  if (Event.Type == HWInstructionEvent::Dispatched)
+    LastInstructionIdx = Event.IR.getSourceIndex();
+
+  // Resource cycles are accumulated on Issue events; ignore everything else.
+  if (Event.Type != HWInstructionEvent::Issued)
+    return;
+
+  const unsigned Row = Event.IR.getSourceIndex() % Source.size();
+  const unsigned RowBase = NumResourceUnits * Row;
+  const auto TotalsBase = NumResourceUnits * Source.size(); // Totals row.
+  const auto &Issued = static_cast<const HWInstructionIssuedEvent &>(Event);
+  for (const auto &Use : Issued.UsedResources) {
+    const ResourceRef &Ref = Use.first;
+    assert(Resource2VecIndex.find(Ref.first) != Resource2VecIndex.end());
+    const unsigned Column =
+        Resource2VecIndex[Ref.first] + countTrailingZeros(Ref.second);
+    ResourceUsage[Column + RowBase] += Use.second;
+    ResourceUsage[Column + TotalsBase] += Use.second;
+  }
+}
+
+static void printColumnNames(formatted_raw_ostream &OS,
+                             const MCSchedModel &SM) {
+  // Emit one 7-column wide "[N]" (or "[N.U]" for multi-unit resources) header
+  // cell per resource unit, starting at the stream's current column.
+  unsigned NextStop = OS.getColumn();
+  unsigned Label = 0;
+  const unsigned NumKinds = SM.getNumProcResourceKinds();
+  for (unsigned ResID = 1; ResID < NumKinds; ++ResID) {
+    const MCProcResourceDesc &Desc = *SM.getProcResource(ResID);
+    // Groups and zero-unit resources get neither a cell nor a label number.
+    if (Desc.SubUnitsIdxBegin || Desc.NumUnits == 0)
+      continue;
+    for (unsigned Unit = 0; Unit != Desc.NumUnits; ++Unit) {
+      OS << "[" << Label;
+      if (Desc.NumUnits > 1)
+        OS << '.' << Unit;
+      OS << ']';
+      NextStop += 7;
+      OS.PadToColumn(NextStop);
+    }
+    ++Label;
+  }
+}
+
+static void printResourcePressure(formatted_raw_ostream &OS, double Pressure,
+                                  unsigned Col) {
+  // Pressures below 0.005 are shown as " - "; anything else is rounded to the
+  // nearest hundredth. The cell is then padded up to column Col.
+  if (Pressure == 0.0 || Pressure < 0.005)
+    OS << " - ";
+  else
+    OS << format("%.2f", floor((Pressure * 100) + 0.5) / 100);
+  OS.PadToColumn(Col);
+}
+
+void ResourcePressureView::printResourcePressurePerIter(raw_ostream &OS) const {
+  // Render into a local string, then write it to OS in one go.
+  std::string Text;
+  raw_string_ostream StringOS(Text);
+  formatted_raw_ostream FOS(StringOS);
+
+  // Legend: one "[N] - Name" (or "[N.U] - Name") line per resource unit.
+  FOS << "\n\nResources:\n";
+  const MCSchedModel &SM = STI.getSchedModel();
+  unsigned Label = 0;
+  const unsigned NumKinds = SM.getNumProcResourceKinds();
+  for (unsigned ResID = 1; ResID < NumKinds; ++ResID) {
+    const MCProcResourceDesc &Desc = *SM.getProcResource(ResID);
+    if (Desc.SubUnitsIdxBegin || Desc.NumUnits == 0)
+      continue;
+    for (unsigned Unit = 0; Unit != Desc.NumUnits; ++Unit) {
+      FOS << '[' << Label;
+      if (Desc.NumUnits > 1)
+        FOS << '.' << Unit;
+      FOS << ']';
+      FOS.PadToColumn(6);
+      FOS << "- " << Desc.Name << '\n';
+    }
+    ++Label;
+  }
+
+  FOS << "\n\nResource pressure per iteration:\n";
+  FOS.flush();
+  printColumnNames(FOS, SM);
+  FOS << '\n';
+  FOS.flush();
+
+  // The row past the last instruction holds the cumulative cycles per unit.
+  const unsigned Iterations = LastInstructionIdx / Source.size() + 1;
+  for (unsigned Unit = 0; Unit != NumResourceUnits; ++Unit) {
+    double Cycles = ResourceUsage[Unit + Source.size() * NumResourceUnits];
+    printResourcePressure(FOS, Cycles / Iterations, (Unit + 1) * 7);
+  }
+
+  FOS.flush();
+  OS << Text;
+}
+
+void ResourcePressureView::printResourcePressurePerInst(raw_ostream &OS) const {
+  // Table text is staged in Staging and forwarded to OS after every row, so
+  // the header is emitted together with the first instruction's row.
+  std::string Staging;
+  raw_string_ostream StagingOS(Staging);
+  formatted_raw_ostream FOS(StagingOS);
+
+  FOS << "\n\nResource pressure by instruction:\n";
+  printColumnNames(FOS, STI.getSchedModel());
+  FOS << "Instructions:\n";
+
+  // Scratch string reused to print each instruction's assembly text.
+  std::string Asm;
+  raw_string_ostream AsmOS(Asm);
+
+  const unsigned Iterations = LastInstructionIdx / Source.size() + 1;
+  unsigned RowBase = 0;
+  for (const MCInst &Inst : Source) {
+    // Average resource cycles per iteration, one cell per resource unit.
+    for (unsigned Unit = 0; Unit != NumResourceUnits; ++Unit) {
+      double Cycles = ResourceUsage[Unit + RowBase];
+      printResourcePressure(FOS, Cycles / Iterations, (Unit + 1) * 7);
+    }
+
+    // Append the instruction without its leading whitespace.
+    MCIP.printInst(&Inst, AsmOS, "", STI);
+    AsmOS.flush();
+    FOS << StringRef(Asm).ltrim() << '\n';
+    Asm = "";
+
+    // Hand the finished row over to OS and reset the staging buffer.
+    FOS.flush();
+    OS << Staging;
+    Staging = "";
+
+    RowBase += NumResourceUnits;
+  }
+}
+} // namespace mca
+} // namespace llvm
diff --git a/llvm/tools/llvm-mca/Views/ResourcePressureView.h b/llvm/tools/llvm-mca/Views/ResourcePressureView.h
new file mode 100644
index 000000000000..0fa0b9a36aa3
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/ResourcePressureView.h
@@ -0,0 +1,103 @@
+//===--------------------- ResourcePressureView.h ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines class ResourcePressureView.
+/// Class ResourcePressureView observes hardware events generated by
+/// the Pipeline object and collects statistics related to resource usage at
+/// instruction granularity.
+/// Resource pressure information is then printed out to a stream in the
+/// form of a table like the one from the example below:
+///
+/// Resources:
+/// [0] - JALU0
+/// [1] - JALU1
+/// [2] - JDiv
+/// [3] - JFPM
+/// [4] - JFPU0
+/// [5] - JFPU1
+/// [6] - JLAGU
+/// [7] - JSAGU
+/// [8] - JSTC
+/// [9] - JVIMUL
+///
+/// Resource pressure per iteration:
+/// [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+/// 0.00   0.00   0.00   0.00   2.00   2.00   0.00   0.00   0.00   0.00
+///
+/// Resource pressure by instruction:
+/// [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]  Instructions:
+///  -    -    -    -    -   1.00  -    -    -    -   vpermilpd  $1,    %xmm0,
+///  %xmm1
+///  -    -    -    -   1.00  -    -    -    -    -   vaddps     %xmm0, %xmm1,
+///  %xmm2
+///  -    -    -    -    -   1.00  -    -    -    -   vmovshdup  %xmm2, %xmm3
+///  -    -    -    -   1.00  -    -    -    -    -   vaddss     %xmm2, %xmm3,
+///  %xmm4
+///
+/// In this example, we have AVX code executed on AMD Jaguar (btver2).
+/// Both shuffles and vector floating point add operations on XMM registers have
+/// a reciprocal throughput of 1cy.
+/// Each add is issued to pipeline JFPU0, while each shuffle is issued to
+/// pipeline JFPU1. The overall pressure per iteration is reported by two
+/// tables: the first smaller table is the resource pressure per iteration;
+/// the second table reports resource pressure per instruction. Values are the
+/// average resource cycles consumed by an instruction.
+/// Every vector add from the example uses resource JFPU0 for an average of 1cy
+/// per iteration. Consequently, the resource pressure on JFPU0 is 2cy per
+/// iteration.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H
+
+#include "Views/View.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+namespace mca {
+
+/// This class collects resource pressure statistics and it is able to print
+/// out all the collected information as a table to an output stream.
+class ResourcePressureView : public View {
+  const llvm::MCSubtargetInfo &STI;
+  llvm::MCInstPrinter &MCIP;
+  llvm::ArrayRef<llvm::MCInst> Source;
+  unsigned LastInstructionIdx; // Source index from the last Dispatched event.
+
+  // Map to quickly obtain the ResourceUsage column index from a processor
+  // resource ID.
+  llvm::DenseMap<unsigned, unsigned> Resource2VecIndex;
+
+  // Resource cycles: a row per Source instruction, then a row of unit totals.
+  std::vector<ResourceCycles> ResourceUsage;
+  unsigned NumResourceUnits; // Number of columns in a ResourceUsage row.
+
+  void printResourcePressurePerIter(llvm::raw_ostream &OS) const;
+  void printResourcePressurePerInst(llvm::raw_ostream &OS) const;
+
+public:
+  ResourcePressureView(const llvm::MCSubtargetInfo &sti,
+                       llvm::MCInstPrinter &Printer,
+                       llvm::ArrayRef<llvm::MCInst> S);
+
+  void onEvent(const HWInstructionEvent &Event) override;
+  void printView(llvm::raw_ostream &OS) const override {
+    printResourcePressurePerIter(OS);
+    printResourcePressurePerInst(OS);
+  }
+};
+} // namespace mca
+} // namespace llvm
+
+#endif
diff --git a/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp b/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp
new file mode 100644
index 000000000000..cb4fbae78039
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp
@@ -0,0 +1,90 @@
+//===--------------------- RetireControlUnitStatistics.cpp ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the RetireControlUnitStatistics interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Views/RetireControlUnitStatistics.h"
+#include "llvm/Support/Format.h"
+
+namespace llvm {
+namespace mca {
+
+RetireControlUnitStatistics::RetireControlUnitStatistics(const MCSchedModel &SM)
+    : NumRetired(0), NumCycles(0), TotalROBEntries(SM.MicroOpBufferSize),
+      EntriesInUse(0), MaxUsedEntries(0), SumOfUsedEntries(0) {
+  // A non-zero ReorderBufferSize from the extra processor info takes priority
+  // over the generic MicroOpBufferSize.
+  if (!SM.hasExtraProcessorInfo())
+    return;
+  if (unsigned ROBSize = SM.getExtraProcessorInfo().ReorderBufferSize)
+    TotalROBEntries = ROBSize;
+}
+
+void RetireControlUnitStatistics::onEvent(const HWInstructionEvent &Event) {
+  // ROB entries are reserved when an instruction is dispatched (one entry per
+  // dispatched micro opcode) and released when it retires (NumMicroOps of the
+  // retired instruction).
+  if (Event.Type == HWInstructionEvent::Dispatched) {
+    const auto &DE = static_cast<const HWInstructionDispatchedEvent &>(Event);
+    EntriesInUse += DE.MicroOpcodes;
+  } else if (Event.Type == HWInstructionEvent::Retired) {
+    const unsigned Freed = Event.IR.getInstruction()->getDesc().NumMicroOps;
+    assert(EntriesInUse >= Freed && "Invalid internal state!");
+    EntriesInUse -= Freed;
+    ++NumRetired;
+  }
+}
+
+void RetireControlUnitStatistics::onCycleEnd() {
+  // Sample ROB occupancy, then bin and reset this cycle's retire count.
+  ++NumCycles;
+  SumOfUsedEntries += EntriesInUse;
+  MaxUsedEntries = std::max(EntriesInUse, MaxUsedEntries);
+  ++RetiredPerCycle[NumRetired];
+  NumRetired = 0;
+}
+
+void RetireControlUnitStatistics::printView(raw_ostream &OS) const {
+  // Prints the retired-per-cycle histogram followed by the ROB usage summary.
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nRetire Control Unit - "
+             << "number of cycles where we saw N instructions retired:\n"
+             << "[# retired], [# cycles]\n";
+
+  // Bind to the map's actual value_type; a pair with a non-const key would
+  // force a temporary copy of every histogram entry.
+  for (const std::pair<const unsigned, unsigned> &Entry : RetiredPerCycle) {
+    TempStream << " " << Entry.first
+               << (Entry.first < 10 ? ",           " : ",          ")
+               << Entry.second << "  ("
+               << format("%.1f", ((double)Entry.second / NumCycles) * 100.0)
+               << "%)\n";
+  }
+
+  // Without any simulated cycle the average is reported as zero: converting
+  // the NaN produced by 0.0 / 0 to unsigned is undefined behavior.
+  unsigned AvgUsage = NumCycles ? (double)SumOfUsedEntries / NumCycles : 0;
+  double MaxUsagePercentage = ((double)MaxUsedEntries / TotalROBEntries) * 100.0;
+  double NormalizedMaxPercentage = floor((MaxUsagePercentage * 10) + 0.5) / 10;
+  double AvgUsagePercentage = ((double)AvgUsage / TotalROBEntries) * 100.0;
+  double NormalizedAvgPercentage = floor((AvgUsagePercentage * 10) + 0.5) / 10;
+  TempStream << "\nTotal ROB Entries:                " << TotalROBEntries
+             << "\nMax Used ROB Entries:             " << MaxUsedEntries
+             << format("  ( %.1f%% )", NormalizedMaxPercentage)
+             << "\nAverage Used ROB Entries per cy:  " << AvgUsage
+             << format("  ( %.1f%% )\n", NormalizedAvgPercentage);
+  TempStream.flush();
+  OS << Buffer;
+}
+
+} // namespace mca
+} // namespace llvm
diff --git a/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.h b/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.h
new file mode 100644
index 000000000000..1a4d3dec5c56
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.h
@@ -0,0 +1,60 @@
+//===--------------------- RetireControlUnitStatistics.h --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines class RetireControlUnitStatistics: a view that knows how
+/// to print general statistics related to the retire control unit.
+///
+/// Example:
+/// ========
+///
+/// Retire Control Unit - number of cycles where we saw N instructions retired:
+/// [# retired], [# cycles]
+///  0,           109  (17.9%)
+///  1,           102  (16.7%)
+///  2,           399  (65.4%)
+///
+/// Total ROB Entries:                64
+/// Max Used ROB Entries:             35  ( 54.7% )
+/// Average Used ROB Entries per cy:  32  ( 50.0% )
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_RETIRECONTROLUNITSTATISTICS_H
+#define LLVM_TOOLS_LLVM_MCA_RETIRECONTROLUNITSTATISTICS_H
+
+#include "Views/View.h"
+#include "llvm/MC/MCSchedule.h"
+#include <map>
+
+namespace llvm {
+namespace mca {
+
+class RetireControlUnitStatistics : public View {
+  using Histogram = std::map<unsigned, unsigned>;
+  Histogram RetiredPerCycle; // Instructions retired in a cycle -> # of cycles.
+
+  unsigned NumRetired;       // Instructions retired in the current cycle.
+  unsigned NumCycles;        // Number of cycles ended so far.
+  unsigned TotalROBEntries;  // Reorder buffer size taken from the sched model.
+  unsigned EntriesInUse;     // Entries currently reserved.
+  unsigned MaxUsedEntries;   // Largest EntriesInUse seen at a cycle end.
+  unsigned SumOfUsedEntries; // Sum of EntriesInUse over all cycle ends.
+
+public:
+  RetireControlUnitStatistics(const MCSchedModel &SM);
+
+  void onEvent(const HWInstructionEvent &Event) override;
+  void onCycleEnd() override;
+  void printView(llvm::raw_ostream &OS) const override;
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif
diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp
new file mode 100644
index 000000000000..bd0ba350ab68
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp
@@ -0,0 +1,178 @@
+//===--------------------- SchedulerStatistics.cpp --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the SchedulerStatistics interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Views/SchedulerStatistics.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+
+namespace llvm {
+namespace mca {
+
+SchedulerStatistics::SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
+    : SM(STI.getSchedModel()), LQResourceID(0), SQResourceID(0), NumIssued(0),
+      NumCycles(0), MostRecentLoadDispatched(~0U),
+      MostRecentStoreDispatched(~0U),
+      Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {
+  // Queue IDs keep their zero default unless extra processor info is present.
+  if (!SM.hasExtraProcessorInfo())
+    return;
+  LQResourceID = SM.getExtraProcessorInfo().LoadQueueID;
+  SQResourceID = SM.getExtraProcessorInfo().StoreQueueID;
+}
+
+// FIXME: This implementation works under the assumption that load/store queue
+// entries are reserved at 'instruction dispatched' stage, and released at
+// 'instruction executed' stage. This currently matches the behavior of LSUnit.
+//
+// The current design minimizes the number of events generated by the
+// Dispatch/Execute stages, at the cost of doing extra bookkeeping in method
+// `onEvent`. However, it introduces a subtle dependency between this view and
+// how the LSUnit works.
+//
+// In future we should add a new "memory queue" event type, so that we stop
+// making assumptions on how LSUnit internally works (See PR39828).
+void SchedulerStatistics::onEvent(const HWInstructionEvent &Event) {
+  if (Event.Type == HWInstructionEvent::Issued) {
+    // Micro opcodes issued this cycle; binned later by updateHistograms().
+    NumIssued += Event.IR.getInstruction()->getDesc().NumMicroOps;
+  } else if (Event.Type == HWInstructionEvent::Dispatched) {
+    // Reserve a queue slot unless this source index was the last one counted.
+    const auto &Desc = Event.IR.getInstruction()->getDesc();
+    const unsigned SrcIdx = Event.IR.getSourceIndex();
+    if (LQResourceID && Desc.MayLoad && MostRecentLoadDispatched != SrcIdx) {
+      ++Usage[LQResourceID].SlotsInUse;
+      MostRecentLoadDispatched = SrcIdx;
+    }
+    if (SQResourceID && Desc.MayStore && MostRecentStoreDispatched != SrcIdx) {
+      ++Usage[SQResourceID].SlotsInUse;
+      MostRecentStoreDispatched = SrcIdx;
+    }
+  } else if (Event.Type == HWInstructionEvent::Executed) {
+    // Give back the queue slots that were reserved at dispatch.
+    const auto &Desc = Event.IR.getInstruction()->getDesc();
+    if (LQResourceID && Desc.MayLoad) {
+      assert(Usage[LQResourceID].SlotsInUse);
+      --Usage[LQResourceID].SlotsInUse;
+    }
+    if (SQResourceID && Desc.MayStore) {
+      assert(Usage[SQResourceID].SlotsInUse);
+      --Usage[SQResourceID].SlotsInUse;
+    }
+  }
+}
+
+void SchedulerStatistics::onReservedBuffers(const InstRef & /* unused */,
+                                            ArrayRef<unsigned> Buffers) {
+  // Load and store queue usage is tracked by onEvent(); every other buffered
+  // resource in Buffers gains one used slot.
+  for (unsigned ResID : Buffers)
+    if (ResID != LQResourceID && ResID != SQResourceID)
+      ++Usage[ResID].SlotsInUse;
+}
+
+void SchedulerStatistics::onReleasedBuffers(const InstRef & /* unused */,
+                                            ArrayRef<unsigned> Buffers) {
+  // Load and store queue usage is tracked by onEvent(); every other buffered
+  // resource in Buffers gives back one used slot.
+  for (unsigned ResID : Buffers)
+    if (ResID != LQResourceID && ResID != SQResourceID)
+      --Usage[ResID].SlotsInUse;
+}
+
+void SchedulerStatistics::updateHistograms() {
+  // Sample each queue's occupancy, then bin this cycle's issued micro opcodes.
+  for (BufferUsage &Queue : Usage) {
+    Queue.MaxUsedSlots = std::max(Queue.SlotsInUse, Queue.MaxUsedSlots);
+    Queue.CumulativeNumUsedSlots += Queue.SlotsInUse;
+  }
+  ++IssueWidthPerCycle[NumIssued];
+  NumIssued = 0;
+}
+
+void SchedulerStatistics::printSchedulerStats(raw_ostream &OS) const {
+  // Prints the issue-width histogram, highlighting the largest width.
+  OS << "\n\nSchedulers - "
+     << "number of cycles where we saw N micro opcodes issued:\n"
+     << "[# issued], [# cycles]\n";
+
+  bool HasColors = OS.has_colors();
+  const auto It =
+      std::max_element(IssueWidthPerCycle.begin(), IssueWidthPerCycle.end());
+  // Use the map's real value_type; a non-const key forces a copy per entry.
+  for (const std::pair<const unsigned, unsigned> &Entry : IssueWidthPerCycle) {
+    const unsigned Width = Entry.first;
+    if (Width == It->first && HasColors)
+      OS.changeColor(raw_ostream::SAVEDCOLOR, true, false);
+    OS << " " << Width << ",          " << Entry.second << "  ("
+       << format("%.1f", ((double)Entry.second / NumCycles) * 100) << "%)\n";
+    if (HasColors)
+      OS.resetColor();
+  }
+}
+
+void SchedulerStatistics::printSchedulerUsage(raw_ostream &OS) const {
+  assert(NumCycles && "Unexpected number of cycles!");
+  OS << "\nScheduler's queue usage:\n";
+  if (all_of(Usage, [](const BufferUsage &BU) { return !BU.MaxUsedSlots; })) {
+    OS << "No scheduler resources used.\n";
+    return;
+  }
+
+  OS << "[1] Resource name.\n"
+     << "[2] Average number of used buffer entries.\n"
+     << "[3] Maximum number of used buffer entries.\n"
+     << "[4] Total number of buffer entries.\n\n"
+     << " [1]            [2]        [3]        [4]\n";
+
+  formatted_raw_ostream FOS(OS);
+  const bool UseColors = FOS.has_colors();
+  for (unsigned ID = 0, End = SM.getNumProcResourceKinds(); ID < End; ++ID) {
+    const MCProcResourceDesc &Desc = *SM.getProcResource(ID);
+    if (Desc.BufferSize <= 0)
+      continue;
+
+    // Both values are truncated to whole entries after rounding to a tenth.
+    const BufferUsage &Queue = Usage[ID];
+    double Avg = (double)Queue.CumulativeNumUsedSlots / NumCycles;
+    double AlmostFull = (double)(Desc.BufferSize * 4) / 5;
+    unsigned ShownAvg = floor((Avg * 10) + 0.5) / 10;
+    unsigned ShownAlmostFull = floor((AlmostFull * 10) + 0.5) / 10;
+
+    // Yellow: average is near capacity (>= 4/5). Red: the buffer got full.
+    FOS << Desc.Name;
+    FOS.PadToColumn(17);
+    if (UseColors && ShownAvg >= ShownAlmostFull)
+      FOS.changeColor(raw_ostream::YELLOW, true, false);
+    FOS << ShownAvg;
+    if (UseColors)
+      FOS.resetColor();
+    FOS.PadToColumn(28);
+    if (UseColors &&
+        Queue.MaxUsedSlots == static_cast<unsigned>(Desc.BufferSize))
+      FOS.changeColor(raw_ostream::RED, true, false);
+    FOS << Queue.MaxUsedSlots;
+    if (UseColors)
+      FOS.resetColor();
+    FOS.PadToColumn(39);
+    FOS << Desc.BufferSize << '\n';
+  }
+  FOS.flush();
+}
+
+void SchedulerStatistics::printView(raw_ostream &OS) const {
+  printSchedulerStats(OS); // Histogram of micro opcodes issued per cycle.
+  printSchedulerUsage(OS); // Per-queue usage table.
+}
+
+} // namespace mca
+} // namespace llvm
diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.h b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
new file mode 100644
index 000000000000..32711b4483b4
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
@@ -0,0 +1,95 @@
+//===--------------------- SchedulerStatistics.h ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines class SchedulerStatistics. Class SchedulerStatistics is a
+/// View that listens to instruction issue events in order to print general
+/// statistics related to the hardware schedulers.
+///
+/// Example:
+/// ========
+///
+/// Schedulers - number of cycles where we saw N micro opcodes issued:
+/// [# issued], [# cycles]
+///  0,          6  (2.9%)
+///  1,          106  (50.7%)
+///  2,          97  (46.4%)
+///
+/// Scheduler's queue usage:
+/// [1] Resource name.
+/// [2] Average number of used buffer entries.
+/// [3] Maximum number of used buffer entries.
+/// [4] Total number of buffer entries.
+///
+///  [1]            [2]        [3]        [4]
+/// JALU01           0          0          20
+/// JFPU01           15         18         18
+/// JLSAGU           0          0          12
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_SCHEDULERSTATISTICS_H
+#define LLVM_TOOLS_LLVM_MCA_SCHEDULERSTATISTICS_H
+
+#include "Views/View.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include <map>
+
+namespace llvm {
+namespace mca {
+
+class SchedulerStatistics final : public View {
+  const llvm::MCSchedModel &SM;
+  unsigned LQResourceID; // Load queue resource ID; zero disables tracking.
+  unsigned SQResourceID; // Store queue resource ID; zero disables tracking.
+
+  unsigned NumIssued; // Micro opcodes issued during the current cycle.
+  unsigned NumCycles; // Incremented at the beginning of every cycle.
+
+  unsigned MostRecentLoadDispatched;  // Source index of last counted load.
+  unsigned MostRecentStoreDispatched; // Source index of last counted store.
+
+  // Tracks the usage of a scheduler's queue.
+  struct BufferUsage {
+    unsigned SlotsInUse;             // Slots currently in use.
+    unsigned MaxUsedSlots;           // Largest SlotsInUse seen at a cycle end.
+    uint64_t CumulativeNumUsedSlots; // Sum of SlotsInUse over all cycle ends.
+  };
+
+  using Histogram = std::map<unsigned, unsigned>;
+  Histogram IssueWidthPerCycle; // Micro opcodes issued -> number of cycles.
+
+  std::vector<BufferUsage> Usage; // Indexed by processor resource ID.
+
+  void updateHistograms();
+  void printSchedulerStats(llvm::raw_ostream &OS) const;
+  void printSchedulerUsage(llvm::raw_ostream &OS) const;
+
+public:
+  SchedulerStatistics(const llvm::MCSubtargetInfo &STI);
+  void onEvent(const HWInstructionEvent &Event) override;
+  void onCycleBegin() override { NumCycles++; }
+  void onCycleEnd() override { updateHistograms(); }
+
+  // Increases by one the number of used queue slots of every buffered resource
+  // in the Buffers set, except for the load and store queues.
+  void onReservedBuffers(const InstRef &IR,
+                         llvm::ArrayRef<unsigned> Buffers) override;
+
+  // Decreases by one the number of used queue slots of every buffered resource
+  // in the Buffers set, except for the load and store queues.
+  void onReleasedBuffers(const InstRef &IR,
+                         llvm::ArrayRef<unsigned> Buffers) override;
+
+  void printView(llvm::raw_ostream &OS) const override;
+};
+} // namespace mca
+} // namespace llvm
+
+#endif
diff --git a/llvm/tools/llvm-mca/Views/SummaryView.cpp b/llvm/tools/llvm-mca/Views/SummaryView.cpp
new file mode 100644
index 000000000000..ef5550048f4c
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/SummaryView.cpp
@@ -0,0 +1,94 @@
+//===--------------------- SummaryView.cpp -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the functionalities used by the SummaryView to print
+/// the report information.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Views/SummaryView.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/Support.h"
+#include "llvm/Support/Format.h"
+
+namespace llvm {
+namespace mca {
+
+#define DEBUG_TYPE "llvm-mca"
+
+SummaryView::SummaryView(const MCSchedModel &Model, ArrayRef<MCInst> S,
+                         unsigned Width)
+    : SM(Model), Source(S), DispatchWidth(Width ? Width : Model.IssueWidth),
+      LastInstructionIdx(0), TotalCycles(0), NumMicroOps(0),
+      ProcResourceUsage(Model.getNumProcResourceKinds(), 0),
+      ProcResourceMasks(Model.getNumProcResourceKinds()),
+      ResIdx2ProcResID(Model.getNumProcResourceKinds(), 0) {
+  // A zero Width falls back to the model's IssueWidth. Then map each resource
+  // state index back to its processor resource ID (ID zero is skipped).
+  computeProcResourceMasks(SM, ProcResourceMasks);
+  const unsigned NumKinds = SM.getNumProcResourceKinds();
+  for (unsigned ResID = 1; ResID < NumKinds; ++ResID)
+    ResIdx2ProcResID[getResourceStateIndex(ProcResourceMasks[ResID])] = ResID;
+}
+
+void SummaryView::onEvent(const HWInstructionEvent &Event) {
+  // Remember the most recently dispatched source index.
+  if (Event.Type == HWInstructionEvent::Dispatched)
+    LastInstructionIdx = Event.IR.getSourceIndex();
+
+  // Only "instruction retired" events for instructions of iteration #0
+  // contribute to the statistics below.
+  if (Event.Type != HWInstructionEvent::Retired ||
+      Event.IR.getSourceIndex() >= Source.size())
+    return;
+
+  // Accumulate the micro opcodes and, for every consumed processor resource,
+  // the resource cycles reported by the instruction descriptor.
+  const InstrDesc &Desc = Event.IR.getInstruction()->getDesc();
+  NumMicroOps += Desc.NumMicroOps;
+  // Deduce the element type: spelling out a pair type that does not match
+  // the container's value_type exactly would copy each entry into a temporary.
+  for (const auto &RU : Desc.Resources) {
+    if (!RU.second.size())
+      continue;
+    unsigned ProcResID = ResIdx2ProcResID[getResourceStateIndex(RU.first)];
+    ProcResourceUsage[ProcResID] += RU.second.size();
+  }
+}
+
+void SummaryView::printView(raw_ostream &OS) const {
+  // Totals are extrapolated from iteration #0 to all iterations.
+  const unsigned BlockSize = Source.size();
+  const unsigned Iterations = (LastInstructionIdx / BlockSize) + 1;
+  const unsigned TotalInstructions = BlockSize * Iterations;
+  const unsigned TotalUOps = NumMicroOps * Iterations;
+  const double IPC = (double)TotalInstructions / TotalCycles;
+  const double UOpsPerCycle = (double)TotalUOps / TotalCycles;
+  const double BlockRThroughput = computeBlockRThroughput(
+      SM, DispatchWidth, NumMicroOps, ProcResourceUsage);
+  const double ShownUOps = floor((UOpsPerCycle * 100) + 0.5) / 100;
+  const double ShownIPC = floor((IPC * 100) + 0.5) / 100;
+  const double ShownRT = floor((BlockRThroughput * 10) + 0.5) / 10;
+
+  std::string Report;
+  raw_string_ostream ReportOS(Report);
+  ReportOS << "Iterations:        " << Iterations
+           << "\nInstructions:      " << TotalInstructions
+           << "\nTotal Cycles:      " << TotalCycles
+           << "\nTotal uOps:        " << TotalUOps << '\n'
+           << "\nDispatch Width:    " << DispatchWidth
+           << "\nuOps Per Cycle:    " << format("%.2f", ShownUOps)
+           << "\nIPC:               " << format("%.2f", ShownIPC)
+           << "\nBlock RThroughput: " << format("%.1f", ShownRT) << '\n';
+  ReportOS.flush();
+  OS << Report;
+}
+
+} // namespace mca
+} // namespace llvm
diff --git a/llvm/tools/llvm-mca/Views/SummaryView.h b/llvm/tools/llvm-mca/Views/SummaryView.h
new file mode 100644
index 000000000000..9be31b7d51bd
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/SummaryView.h
@@ -0,0 +1,80 @@
+//===--------------------- SummaryView.h ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the summary view.
+///
+/// The goal of the summary view is to give a very quick overview of the
+/// performance throughput. Below is an example of summary view:
+///
+///
+/// Iterations:        300
+/// Instructions:      900
+/// Total Cycles:      610
+/// Dispatch Width:    2
+/// IPC:               1.48
+/// Block RThroughput: 2.0
+///
+/// The summary view collects a few performance numbers. The two main
+/// performance indicators are 'Total Cycles' and IPC (Instructions Per Cycle).
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_SUMMARYVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_SUMMARYVIEW_H
+
+#include "Views/View.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+namespace mca {
+
+/// A view that collects and prints a few performance numbers (see printView).
+class SummaryView : public View {
+  const llvm::MCSchedModel &SM;
+  llvm::ArrayRef<llvm::MCInst> Source; // Instructions of the analyzed block.
+  const unsigned DispatchWidth;        // Reported as "Dispatch Width".
+  unsigned LastInstructionIdx; // Used by printView to derive "Iterations".
+  unsigned TotalCycles;        // Number of onCycleEnd() notifications.
+  // The total number of micro-ops (uOps) contributed by one block of
+  unsigned NumMicroOps; // instructions; printView scales it by "Iterations".
+
+  // For each processor resource, this vector stores the cumulative number of
+  // resource cycles consumed by the analyzed code block.
+  llvm::SmallVector<unsigned, 8> ProcResourceUsage;
+
+  // Each processor resource is associated with a so-called processor resource
+  // mask. This vector makes it possible to correlate processor resource IDs
+  // with processor resource masks. There is exactly one element for each
+  // processor resource declared by the scheduling model.
+  llvm::SmallVector<uint64_t, 8> ProcResourceMasks;
+
+  // Used to map resource indices to actual processor resource IDs.
+  llvm::SmallVector<unsigned, 8> ResIdx2ProcResID;
+
+  // Compute the reciprocal throughput for the analyzed code block.
+  // The reciprocal block throughput is computed as the MAX between:
+  //   - NumMicroOps / DispatchWidth
+  //   - Total Resource Cycles / #Units   (for every resource consumed).
+  double getBlockRThroughput() const;
+  // NOTE(review): printView calls computeBlockRThroughput(); is this stale?
+public:
+  SummaryView(const llvm::MCSchedModel &Model, llvm::ArrayRef<llvm::MCInst> S,
+              unsigned Width);
+  // Overridden event callbacks, followed by the report printer.
+  void onCycleEnd() override { ++TotalCycles; }
+  void onEvent(const HWInstructionEvent &Event) override;
+  void printView(llvm::raw_ostream &OS) const override;
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif
diff --git a/llvm/tools/llvm-mca/Views/TimelineView.cpp b/llvm/tools/llvm-mca/Views/TimelineView.cpp
new file mode 100644
index 000000000000..1e7caa297ac6
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/TimelineView.cpp
@@ -0,0 +1,325 @@
+//===--------------------- TimelineView.cpp ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the TimelineView interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Views/TimelineView.h"
+#include <numeric>
+
+namespace llvm {
+namespace mca {
+
+TimelineView::TimelineView(const MCSubtargetInfo &sti, MCInstPrinter &Printer,
+                           llvm::ArrayRef<llvm::MCInst> S, unsigned Iterations,
+                           unsigned Cycles)
+    : STI(sti), MCIP(Printer), Source(S), CurrentCycle(0),
+      MaxCycle(Cycles == 0 ? 80 : Cycles), LastCycle(0), WaitTime(S.size()),
+      UsedBuffer(S.size()) {
+  assert(Iterations && "Invalid number of iterations specified!");
+  // One timeline slot per instruction and iteration. A negative dispatch
+  // cycle marks a slot for which no dispatch event has been recorded yet.
+  const unsigned NumSlots = static_cast<unsigned>(Source.size()) * Iterations;
+  Timeline.assign(NumSlots, TimelineViewEntry{-1, 0, 0, 0, 0});
+
+  // Wait-time counters start at zero for every instruction of the block.
+  for (WaitTimeEntry &WT : WaitTime)
+    WT = WaitTimeEntry{0, 0, 0};
+
+  // Resource ID zero is invalid, and -1 denotes an unknown buffer size.
+  for (std::pair<unsigned, int> &Info : UsedBuffer)
+    Info = std::pair<unsigned, int>(0, -1);
+}
+
+void TimelineView::onReservedBuffers(const InstRef &IR,
+                                     ArrayRef<unsigned> Buffers) {
+  const unsigned SourceIdx = IR.getSourceIndex();
+  if (SourceIdx >= Source.size())
+    return;
+  // Keep the reserved buffer with the smallest size. Any buffer is accepted
+  // while no valid (non-zero) resource ID has been recorded yet.
+  const MCSchedModel &Model = STI.getSchedModel();
+  std::pair<unsigned, int> Smallest(0, -1);
+  for (const unsigned ID : Buffers) {
+    const MCProcResourceDesc &Desc = *Model.getProcResource(ID);
+    if (Smallest.first != 0 && Smallest.second <= Desc.BufferSize)
+      continue;
+    Smallest = std::pair<unsigned, int>(ID, Desc.BufferSize);
+  }
+  UsedBuffer[SourceIdx] = Smallest;
+}
+
+void TimelineView::onEvent(const HWInstructionEvent &Event) {
+  // Ignore events for instructions that have no slot in the timeline.
+  const unsigned Index = Event.IR.getSourceIndex();
+  if (Index >= Timeline.size())
+    return;
+
+  TimelineViewEntry &Entry = Timeline[Index];
+  switch (Event.Type) {
+  case HWInstructionEvent::Dispatched:
+    // Microcoded instructions expanded into several uOps may be dispatched
+    // over multiple cycles; only the first dispatch cycle is recorded.
+    if (Entry.CycleDispatched == -1)
+      Entry.CycleDispatched = static_cast<int>(CurrentCycle);
+    break;
+  case HWInstructionEvent::Ready:
+    Entry.CycleReady = CurrentCycle;
+    break;
+  case HWInstructionEvent::Issued:
+    Entry.CycleIssued = CurrentCycle;
+    break;
+  case HWInstructionEvent::Executed:
+    Entry.CycleExecuted = CurrentCycle;
+    break;
+  case HWInstructionEvent::Retired: {
+    // The retire cycle is only stored while it is below the cycle limit.
+    if (CurrentCycle < MaxCycle)
+      Entry.CycleRetired = CurrentCycle;
+    // Fold this execution into the wait-time counters of the matching
+    // instruction of the source block.
+    assert(Entry.CycleDispatched >= 0 && "Invalid TVEntry found!");
+    const unsigned Dispatched = static_cast<unsigned>(Entry.CycleDispatched);
+    WaitTimeEntry &Stats = WaitTime[Index % Source.size()];
+    Stats.CyclesSpentInSchedulerQueue += Entry.CycleIssued - Dispatched;
+    assert(Dispatched <= Entry.CycleReady &&
+           "Instruction cannot be ready if it hasn't been dispatched yet!");
+    Stats.CyclesSpentInSQWhileReady += Entry.CycleIssued - Entry.CycleReady;
+    Stats.CyclesSpentAfterWBAndBeforeRetire +=
+        (CurrentCycle - 1) - Entry.CycleExecuted;
+    break;
+  }
+  default:
+    return;
+  }
+  // Track the latest cycle with a recorded event, below the cycle limit.
+  if (CurrentCycle < MaxCycle && LastCycle < CurrentCycle)
+    LastCycle = CurrentCycle;
+}
+
+static raw_ostream::Colors chooseColor(unsigned CumulativeCycles,
+                                       unsigned Executions, int BufferSize) {
+  // Cycles were spent, but the size of the buffer is unknown (negative).
+  if (BufferSize < 0 && CumulativeCycles != 0)
+    return raw_ostream::MAGENTA;
+  const unsigned Capacity = static_cast<unsigned>(BufferSize) * Executions;
+  if (CumulativeCycles >= Capacity)
+    return raw_ostream::RED;
+  return (CumulativeCycles * 2) >= Capacity ? raw_ostream::YELLOW
+                                            : raw_ostream::SAVEDCOLOR;
+}
+
+static void tryChangeColor(raw_ostream &OS, unsigned Cycles,
+                           unsigned Executions, int BufferSize) {
+  // Nothing to do for streams without color support.
+  if (!OS.has_colors())
+    return;
+  // chooseColor answers SAVEDCOLOR when no highlight applies: reset then.
+  const raw_ostream::Colors Hue = chooseColor(Cycles, Executions, BufferSize);
+  if (Hue != raw_ostream::SAVEDCOLOR)
+    OS.changeColor(Hue, /* bold */ true, /* BG */ false);
+  else
+    OS.resetColor();
+}
+
+// Print one row of the "Average Wait times" table: the optional instruction
+// index, the number of executions and three averages computed from Entry.
+// A SourceIndex equal to Source.size() selects the <total> row, which has no
+// index label, no colors, and is averaged over the whole timeline.
+void TimelineView::printWaitTimeEntry(formatted_raw_ostream &OS,
+                                      const WaitTimeEntry &Entry,
+                                      unsigned SourceIndex,
+                                      unsigned Executions) const {
+  const bool IsTotalRow = SourceIndex == Source.size();
+  const unsigned Samples = IsTotalRow ? Timeline.size() : Executions;
+
+  // Prints Cycles / Samples rounded to one decimal. Rows of instructions
+  // are colored according to Cycles, Samples and the given buffer size.
+  auto PrintAverage = [&](unsigned Cycles, int BufferSize) {
+    if (!IsTotalRow)
+      tryChangeColor(OS, Cycles, Samples, BufferSize);
+    const double Average = static_cast<double>(Cycles) / Samples;
+    OS << format("%.1f", floor((Average * 10) + 0.5) / 10);
+  };
+
+  // Row label and number of executions.
+  if (!IsTotalRow)
+    OS << SourceIndex << '.';
+  OS.PadToColumn(7);
+  OS << Executions;
+  OS.PadToColumn(13);
+
+  // The two queue columns use the size of the buffer recorded for the
+  // instruction; the retire column uses the size of the micro-op buffer.
+  const int QueueSize = IsTotalRow ? 0 : UsedBuffer[SourceIndex].second;
+  PrintAverage(Entry.CyclesSpentInSchedulerQueue, QueueSize);
+  OS.PadToColumn(20);
+  PrintAverage(Entry.CyclesSpentInSQWhileReady, QueueSize);
+  OS.PadToColumn(27);
+  PrintAverage(Entry.CyclesSpentAfterWBAndBeforeRetire,
+               STI.getSchedModel().MicroOpBufferSize);
+
+  // Restore the default color before padding up to the instruction text.
+  if (OS.has_colors())
+    OS.resetColor();
+  OS.PadToColumn(34);
+}
+
+// Print the "Average Wait times" table: a legend, one row per instruction of
+// the input sequence and, unless the sequence has exactly one instruction, a
+// final <total> row whose counters are the sums of the per-instruction
+// counters.
+void TimelineView::printAverageWaitTimes(raw_ostream &OS) const {
+  // Table legend followed by the column header.
+  OS << "\n\nAverage Wait times (based on the timeline view):\n"
+        "[0]: Executions\n"
+        "[1]: Average time spent waiting in a scheduler's queue\n"
+        "[2]: Average time spent waiting in a scheduler's queue while ready\n"
+        "[3]: Average time elapsed from WB until retire stage\n\n"
+        "      [0]    [1]    [2]    [3]\n";
+
+  // Instructions are first rendered into a scratch string so that leading
+  // whitespace can be stripped before they are appended to a row.
+  std::string Text;
+  raw_string_ostream TextOS(Text);
+
+  formatted_raw_ostream FOS(OS);
+  // Number of complete iterations of the block stored in the timeline.
+  const unsigned Executions = Timeline.size() / Source.size();
+  const unsigned NumRows = Source.size();
+  for (unsigned Row = 0; Row != NumRows; ++Row) {
+    // Wait-time columns first, then the instruction text.
+    printWaitTimeEntry(FOS, WaitTime[Row], Row, Executions);
+    MCIP.printInst(&Source[Row], TextOS, "", STI);
+    TextOS.flush();
+    FOS << "   " << StringRef(Text).ltrim() << '\n';
+    FOS.flush();
+    Text.clear();
+  }
+
+  // A block made of a single instruction gets no separate <total> row.
+  if (Source.size() == 1)
+    return;
+
+  // Sum the per-instruction counters; printWaitTimeEntry averages them over
+  // the whole timeline when the index is one past the last instruction.
+  WaitTimeEntry Total = {0, 0, 0};
+  for (const WaitTimeEntry &WT : WaitTime) {
+    Total.CyclesSpentInSchedulerQueue += WT.CyclesSpentInSchedulerQueue;
+    Total.CyclesSpentInSQWhileReady += WT.CyclesSpentInSQWhileReady;
+    Total.CyclesSpentAfterWBAndBeforeRetire +=
+        WT.CyclesSpentAfterWBAndBeforeRetire;
+  }
+
+  printWaitTimeEntry(FOS, Total, NumRows, Executions);
+  FOS << "   "
+      << "<total>" << '\n';
+  TextOS.flush();
+}
+
+void TimelineView::printTimelineViewEntry(formatted_raw_ostream &OS,
+                                          const TimelineViewEntry &Entry,
+                                          unsigned Iteration,
+                                          unsigned SourceIndex) const {
+  // Emits Symbol once for every cycle in the half-open range [Begin, End).
+  auto Fill = [&OS](char Symbol, unsigned Begin, unsigned End) {
+    for (unsigned Cycle = Begin; Cycle < End; ++Cycle)
+      OS << Symbol;
+  };
+
+  if (Iteration == 0 && SourceIndex == 0)
+    OS << '\n';
+  OS << '[' << Iteration << ',' << SourceIndex << ']';
+  OS.PadToColumn(10);
+  // Cycles before dispatch: a dot every fifth column, blanks elsewhere.
+  assert(Entry.CycleDispatched >= 0 && "Invalid TimelineViewEntry!");
+  const unsigned Dispatched = static_cast<unsigned>(Entry.CycleDispatched);
+  for (unsigned Cycle = 0; Cycle != Dispatched; ++Cycle)
+    OS << ((Cycle % 5 == 0) ? '.' : ' ');
+  OS << DisplayChar::Dispatched;
+
+  // Zero latency instructions are dispatched, issued and executed at once.
+  if (Dispatched != Entry.CycleExecuted) {
+    Fill(DisplayChar::Waiting, Dispatched + 1, Entry.CycleIssued);
+    if (Entry.CycleIssued != Entry.CycleExecuted) {
+      // The issue cycle is not covered by the dispatch symbol: mark it.
+      if (Dispatched != Entry.CycleIssued)
+        OS << DisplayChar::Executing;
+      Fill(DisplayChar::Executing, Entry.CycleIssued + 1, Entry.CycleExecuted);
+    }
+    OS << DisplayChar::Executed;
+  }
+  Fill(DisplayChar::RetireLag, Entry.CycleExecuted + 1, Entry.CycleRetired);
+  OS << DisplayChar::Retired;
+  // Pad the remaining columns; the column of LastCycle is always a dot.
+  for (unsigned Cycle = Entry.CycleRetired + 1; Cycle <= LastCycle; ++Cycle)
+    OS << ((Cycle % 5 == 0 || Cycle == LastCycle) ? '.' : ' ');
+}
+
+static void printTimelineHeader(formatted_raw_ostream &OS, unsigned Cycles) {
+  // Prints one ruler row starting at column 10. Decades of cycles alternate
+  // between the two rows: a row shows the last digit of the cycles that fall
+  // in its own decades, and blanks for the others.
+  auto PrintRuler = [&OS, Cycles](bool OddDecades) {
+    OS.PadToColumn(10);
+    for (unsigned Cycle = 0; Cycle <= Cycles; ++Cycle) {
+      const bool InOddDecade = ((Cycle / 10) & 1) != 0;
+      if (InOddDecade == OddDecades)
+        OS << Cycle % 10;
+      else
+        OS << ' ';
+    }
+    OS << '\n';
+  };
+
+  OS << "\n\nTimeline view:\n";
+  // The upper row is only printed when there are at least ten cycles.
+  if (Cycles >= 10)
+    PrintRuler(/* OddDecades */ true);
+  OS << "Index";
+  PrintRuler(/* OddDecades */ false);
+}
+
+// Print the timeline table: the header, then one row per timeline entry in
+// program order, stopping at the first entry with a retire cycle of zero.
+void TimelineView::printTimeline(raw_ostream &OS) const {
+  formatted_raw_ostream FOS(OS);
+  printTimelineHeader(FOS, LastCycle);
+  FOS.flush();
+
+  // Scratch string used to render each instruction before trimming it.
+  std::string Text;
+  raw_string_ostream TextOS(Text);
+
+  const unsigned NumSources = Source.size();
+  const unsigned NumIterations = Timeline.size() / Source.size();
+  // Slot indexes Timeline, which stores the iterations back to back.
+  unsigned Slot = 0;
+  for (unsigned Iteration = 0; Iteration != NumIterations; ++Iteration) {
+    for (unsigned SrcIdx = 0; SrcIdx != NumSources; ++SrcIdx, ++Slot) {
+      // Entries are created with a retire cycle of zero; the table stops at
+      // the first entry that still holds that value.
+      const TimelineViewEntry &Entry = Timeline[Slot];
+      if (Entry.CycleRetired == 0)
+        return;
+
+      printTimelineViewEntry(FOS, Entry, Iteration, SrcIdx);
+      MCIP.printInst(&Source[SrcIdx], TextOS, "", STI);
+      TextOS.flush();
+
+      // Drop leading tabs and spaces, then append the text to the row.
+      FOS << "   " << StringRef(Text).ltrim() << '\n';
+      FOS.flush();
+      Text.clear();
+    }
+  }
+}
+} // namespace mca
+} // namespace llvm
diff --git a/llvm/tools/llvm-mca/Views/TimelineView.h b/llvm/tools/llvm-mca/Views/TimelineView.h
new file mode 100644
index 000000000000..9bec3b87db45
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/TimelineView.h
@@ -0,0 +1,189 @@
+//===--------------------- TimelineView.h -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements a timeline view for the llvm-mca tool.
+///
+/// Class TimelineView observes events generated by the pipeline. For every
+/// instruction executed by the pipeline, it stores information related to
+/// state transition. It then plots that information in the form of a table
+/// as reported by the example below:
+///
+/// Timeline view:
+///     	          0123456
+/// Index	0123456789
+///
+/// [0,0]	DeER .    .    ..	vmovshdup  %xmm0, %xmm1
+/// [0,1]	DeER .    .    ..	vpermilpd  $1, %xmm0, %xmm2
+/// [0,2]	.DeER.    .    ..	vpermilps  $231, %xmm0, %xmm5
+/// [0,3]	.DeeeER   .    ..	vaddss  %xmm1, %xmm0, %xmm3
+/// [0,4]	. D==eeeER.    ..	vaddss  %xmm3, %xmm2, %xmm4
+/// [0,5]	. D=====eeeER  ..	vaddss  %xmm4, %xmm5, %xmm6
+///
+/// [1,0]	.  DeE------R  ..	vmovshdup  %xmm0, %xmm1
+/// [1,1]	.  DeE------R  ..	vpermilpd  $1, %xmm0, %xmm2
+/// [1,2]	.   DeE-----R  ..	vpermilps  $231, %xmm0, %xmm5
+/// [1,3]	.   D=eeeE--R  ..	vaddss  %xmm1, %xmm0, %xmm3
+/// [1,4]	.    D===eeeER ..	vaddss  %xmm3, %xmm2, %xmm4
+/// [1,5]	.    D======eeeER	vaddss  %xmm4, %xmm5, %xmm6
+///
+/// There is an entry for every instruction in the input assembly sequence.
+/// The first field is a pair of numbers obtained from the instruction index.
+/// The first element of the pair is the iteration index, while the second
+/// element of the pair is a sequence number (i.e. a position in the assembly
+/// sequence).
+/// The second field of the table is the actual timeline information; each
+/// column is the information related to a specific cycle of execution.
+/// The timeline of an instruction is described by a sequence of character
+/// where each character represents the instruction state at a specific cycle.
+///
+/// Possible instruction states are:
+///  D: Instruction Dispatched
+///  e: Instruction Executing
+///  E: Instruction Executed (write-back stage)
+///  R: Instruction retired
+///  =: Instruction waiting in the Scheduler's queue
+///  -: Instruction executed, waiting to retire in order.
+///
+/// dots ('.') and empty spaces are cycles where the instruction is not
+/// in-flight.
+///
+/// The last column is the assembly instruction associated to the entry.
+///
+/// Based on the timeline view information from the example, instruction 0
+/// at iteration 0 was dispatched at cycle 0, and was retired at cycle 3.
+/// Instruction [0,1] was also dispatched at cycle 0, and it retired at
+/// the same cycle as instruction [0,0].
+/// Instruction [0,4] has been dispatched at cycle 2. However, it had to
+/// wait for two cycles before being issued. That is because operands
+/// became ready only at cycle 5.
+///
+/// This view helps further understanding bottlenecks and the impact of
+/// resource pressure on the code.
+///
+/// To better understand why instructions had to wait for multiple cycles in
+/// the scheduler's queue, class TimelineView also reports extra timing info
+/// in another table named "Average Wait times" (see example below).
+///
+///
+/// Average Wait times (based on the timeline view):
+/// [0]: Executions
+/// [1]: Average time spent waiting in a scheduler's queue
+/// [2]: Average time spent waiting in a scheduler's queue while ready
+/// [3]: Average time elapsed from WB until retire stage
+///
+///	[0]	[1]	[2]	[3]
+/// 0.	 2	1.0	1.0	3.0	vmovshdup  %xmm0, %xmm1
+/// 1.	 2	1.0	1.0	3.0	vpermilpd  $1, %xmm0, %xmm2
+/// 2.	 2	1.0	1.0	2.5	vpermilps  $231, %xmm0, %xmm5
+/// 3.	 2	1.5	0.5	1.0	vaddss  %xmm1, %xmm0, %xmm3
+/// 4.	 2	3.5	0.0	0.0	vaddss  %xmm3, %xmm2, %xmm4
+/// 5.	 2	6.5	0.0	0.0	vaddss  %xmm4, %xmm5, %xmm6
+///      2	2.4	0.6	1.6     <total>
+///
+/// By comparing column [2] with column [1], we get an idea about how many
+/// cycles were spent in the scheduler's queue due to data dependencies.
+///
+/// In this example, instruction 5 spent an average of ~6 cycles in the
+/// scheduler's queue. As soon as operands became ready, the instruction
+/// was immediately issued to the pipeline(s).
+/// That is expected because instruction 5 cannot transition to the "ready"
+/// state until %xmm4 is written by instruction 4.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
+
+#include "Views/View.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+namespace mca {
+
+/// This class listens to instruction state transition events
+/// in order to construct timeline information.
+///
+/// For every instruction executed by the Pipeline, this class constructs
+/// a TimelineViewEntry object. TimelineViewEntry objects are then used
+/// to print the timeline information, as well as the "average wait times"
+/// for every instruction in the input assembly sequence.
+class TimelineView : public View {
+  const llvm::MCSubtargetInfo &STI;
+  llvm::MCInstPrinter &MCIP; // Used to print the text of each instruction.
+  llvm::ArrayRef<llvm::MCInst> Source; // The input assembly sequence.
+
+  unsigned CurrentCycle; // Incremented by onCycleEnd().
+  unsigned MaxCycle;     // Cycle limit; 80 if the constructor is given zero.
+  unsigned LastCycle;    // Latest cycle below MaxCycle with a recorded event.
+
+  struct TimelineViewEntry {
+    int CycleDispatched;  // A negative value is an "invalid cycle".
+    unsigned CycleReady;
+    unsigned CycleIssued;
+    unsigned CycleExecuted;
+    unsigned CycleRetired; // Zero until a retire cycle has been recorded.
+  };
+  std::vector<TimelineViewEntry> Timeline; // One entry per instruction and
+                                           // iteration (sized by the ctor).
+  struct WaitTimeEntry {
+    unsigned CyclesSpentInSchedulerQueue;
+    unsigned CyclesSpentInSQWhileReady;
+    unsigned CyclesSpentAfterWBAndBeforeRetire;
+  };
+  std::vector<WaitTimeEntry> WaitTime; // One entry per instruction of Source.
+
+  // This field is used to map instructions to buffered resources.
+  // Elements of this vector are <resourceID, BufferSize> pairs.
+  std::vector<std::pair<unsigned, int>> UsedBuffer;
+
+  void printTimelineViewEntry(llvm::formatted_raw_ostream &OS,
+                              const TimelineViewEntry &E, unsigned Iteration,
+                              unsigned SourceIndex) const;
+  void printWaitTimeEntry(llvm::formatted_raw_ostream &OS,
+                          const WaitTimeEntry &E, unsigned Index,
+                          unsigned Executions) const;
+
+  // Display characters for the TimelineView report output.
+  struct DisplayChar {
+    static const char Dispatched = 'D';
+    static const char Executed = 'E';
+    static const char Retired = 'R';
+    static const char Waiting = '='; // Instruction is waiting in the scheduler.
+    static const char Executing = 'e';
+    static const char RetireLag = '-'; // The instruction is waiting to retire.
+  };
+
+public:
+  TimelineView(const llvm::MCSubtargetInfo &sti, llvm::MCInstPrinter &Printer,
+               llvm::ArrayRef<llvm::MCInst> S, unsigned Iterations,
+               unsigned Cycles);
+
+  // Event handlers.
+  void onCycleEnd() override { ++CurrentCycle; }
+  void onEvent(const HWInstructionEvent &Event) override;
+  void onReservedBuffers(const InstRef &IR,
+                         llvm::ArrayRef<unsigned> Buffers) override;
+
+  // Print functionalities. printView emits the timeline table first and the
+  void printTimeline(llvm::raw_ostream &OS) const; // wait-time table second.
+  void printAverageWaitTimes(llvm::raw_ostream &OS) const;
+  void printView(llvm::raw_ostream &OS) const override {
+    printTimeline(OS);
+    printAverageWaitTimes(OS);
+  }
+};
+} // namespace mca
+} // namespace llvm
+
+#endif
diff --git a/llvm/tools/llvm-mca/Views/View.cpp b/llvm/tools/llvm-mca/Views/View.cpp
new file mode 100644
index 000000000000..8e5c34d2d5c2
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/View.cpp
@@ -0,0 +1,21 @@
+//===----------------------- View.cpp ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the virtual anchor method in View.h to pin the vtable.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Views/View.h"
+
+namespace llvm {
+namespace mca {
+
+void View::anchor() {} // Intentionally empty: defined here to pin the vtable.
+} // namespace mca
+} // namespace llvm
diff --git a/llvm/tools/llvm-mca/Views/View.h b/llvm/tools/llvm-mca/Views/View.h
new file mode 100644
index 000000000000..3b52511b4d29
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/View.h
@@ -0,0 +1,33 @@
+//===----------------------- View.h -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the main interface for Views. Each view contributes a
+/// portion of the final report generated by the tool.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_VIEW_H
+#define LLVM_TOOLS_LLVM_MCA_VIEW_H
+
+#include "llvm/MCA/HWEventListener.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+namespace mca {
+/// Interface of a view: an event listener that prints a part of the report.
+class View : public HWEventListener {
+public:
+  virtual void printView(llvm::raw_ostream &OS) const = 0; // Writes to OS.
+  virtual ~View() = default;
+  void anchor() override; // Defined out of line in View.cpp.
+};
+} // namespace mca
+} // namespace llvm
+
+#endif