Diffstat (limited to 'llvm/tools/llvm-mca')
27 files changed, 4146 insertions, 0 deletions
diff --git a/llvm/tools/llvm-mca/CodeRegion.cpp b/llvm/tools/llvm-mca/CodeRegion.cpp new file mode 100644 index 000000000000..e05517c1ac95 --- /dev/null +++ b/llvm/tools/llvm-mca/CodeRegion.cpp @@ -0,0 +1,117 @@ +//===-------------------------- CodeRegion.cpp -----------------*- C++ -* -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements methods from the CodeRegions interface. +/// +//===----------------------------------------------------------------------===// + +#include "CodeRegion.h" + +namespace llvm { +namespace mca { + +CodeRegions::CodeRegions(llvm::SourceMgr &S) : SM(S), FoundErrors(false) { + // Create a default region for the input code sequence. + Regions.emplace_back(std::make_unique<CodeRegion>("", SMLoc())); +} + +bool CodeRegion::isLocInRange(SMLoc Loc) const { + if (RangeEnd.isValid() && Loc.getPointer() > RangeEnd.getPointer()) + return false; + if (RangeStart.isValid() && Loc.getPointer() < RangeStart.getPointer()) + return false; + return true; +} + +void CodeRegions::beginRegion(StringRef Description, SMLoc Loc) { + if (ActiveRegions.empty()) { + // Remove the default region if there is at least one user defined region. + // By construction, only the default region has an invalid start location. + if (Regions.size() == 1 && !Regions[0]->startLoc().isValid() && + !Regions[0]->endLoc().isValid()) { + ActiveRegions[Description] = 0; + Regions[0] = std::make_unique<CodeRegion>(Description, Loc); + return; + } + } else { + auto It = ActiveRegions.find(Description); + if (It != ActiveRegions.end()) { + const CodeRegion &R = *Regions[It->second]; + if (Description.empty()) { + SM.PrintMessage(Loc, SourceMgr::DK_Error, + "found multiple overlapping anonymous regions"); + SM.PrintMessage(R.startLoc(), SourceMgr::DK_Note, + "Previous anonymous region was defined here"); + FoundErrors = true; + return; + } + + SM.PrintMessage(Loc, SourceMgr::DK_Error, + "overlapping regions cannot have the same name"); + SM.PrintMessage(R.startLoc(), SourceMgr::DK_Note, + "region " + Description + " was previously defined here"); + FoundErrors = true; + return; + } + } + + ActiveRegions[Description] = Regions.size(); + Regions.emplace_back(std::make_unique<CodeRegion>(Description, Loc)); + return; +} + +void CodeRegions::endRegion(StringRef Description, SMLoc Loc) { + if (Description.empty()) { + // Special case where there is only one user defined region, + // and this LLVM-MCA-END directive doesn't provide a region name. + // In this case, we assume that the user simply wanted to just terminate + // the only active region. + if (ActiveRegions.size() == 1) { + auto It = ActiveRegions.begin(); + Regions[It->second]->setEndLocation(Loc); + ActiveRegions.erase(It); + return; + } + + // Special case where the region end marker applies to the default region. 
+ if (ActiveRegions.empty() && Regions.size() == 1 && + !Regions[0]->startLoc().isValid() && !Regions[0]->endLoc().isValid()) { + Regions[0]->setEndLocation(Loc); + return; + } + } + + auto It = ActiveRegions.find(Description); + if (It != ActiveRegions.end()) { + Regions[It->second]->setEndLocation(Loc); + ActiveRegions.erase(It); + return; + } + + FoundErrors = true; + SM.PrintMessage(Loc, SourceMgr::DK_Error, + "found an invalid region end directive"); + if (!Description.empty()) { + SM.PrintMessage(Loc, SourceMgr::DK_Note, + "unable to find an active region named " + Description); + } else { + SM.PrintMessage(Loc, SourceMgr::DK_Note, + "unable to find an active anonymous region"); + } +} + +void CodeRegions::addInstruction(const MCInst &Instruction) { + SMLoc Loc = Instruction.getLoc(); + for (UniqueCodeRegion &Region : Regions) + if (Region->isLocInRange(Loc)) + Region->addInstruction(Instruction); +} + +} // namespace mca +} // namespace llvm diff --git a/llvm/tools/llvm-mca/CodeRegion.h b/llvm/tools/llvm-mca/CodeRegion.h new file mode 100644 index 000000000000..cabb4a5d4484 --- /dev/null +++ b/llvm/tools/llvm-mca/CodeRegion.h @@ -0,0 +1,128 @@ +//===-------------------------- CodeRegion.h -------------------*- C++ -* -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements class CodeRegion and CodeRegions. +/// +/// A CodeRegion describes a region of assembly code guarded by special LLVM-MCA +/// comment directives. +/// +/// # LLVM-MCA-BEGIN foo +/// ... ## asm +/// # LLVM-MCA-END +/// +/// A comment starting with substring LLVM-MCA-BEGIN marks the beginning of a +/// new region of code. +/// A comment starting with substring LLVM-MCA-END marks the end of the +/// last-seen region of code. +/// +/// Code regions are not allowed to overlap. Each region can have a optional +/// description; internally, regions are described by a range of source +/// locations (SMLoc objects). +/// +/// An instruction (a MCInst) is added to a region R only if its location is in +/// range [R.RangeStart, R.RangeEnd]. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_CODEREGION_H +#define LLVM_TOOLS_LLVM_MCA_CODEREGION_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/SMLoc.h" +#include "llvm/Support/SourceMgr.h" +#include <vector> + +namespace llvm { +namespace mca { + +/// A region of assembly code. +/// +/// It identifies a sequence of machine instructions. +class CodeRegion { + // An optional descriptor for this region. + llvm::StringRef Description; + // Instructions that form this region. + llvm::SmallVector<llvm::MCInst, 8> Instructions; + // Source location range. 
+ llvm::SMLoc RangeStart; + llvm::SMLoc RangeEnd; + + CodeRegion(const CodeRegion &) = delete; + CodeRegion &operator=(const CodeRegion &) = delete; + +public: + CodeRegion(llvm::StringRef Desc, llvm::SMLoc Start) + : Description(Desc), RangeStart(Start), RangeEnd() {} + + void addInstruction(const llvm::MCInst &Instruction) { + Instructions.emplace_back(Instruction); + } + + llvm::SMLoc startLoc() const { return RangeStart; } + llvm::SMLoc endLoc() const { return RangeEnd; } + + void setEndLocation(llvm::SMLoc End) { RangeEnd = End; } + bool empty() const { return Instructions.empty(); } + bool isLocInRange(llvm::SMLoc Loc) const; + + llvm::ArrayRef<llvm::MCInst> getInstructions() const { return Instructions; } + + llvm::StringRef getDescription() const { return Description; } +}; + +class CodeRegionParseError final : public Error {}; + +class CodeRegions { + // A source manager. Used by the tool to generate meaningful warnings. + llvm::SourceMgr &SM; + + using UniqueCodeRegion = std::unique_ptr<CodeRegion>; + std::vector<UniqueCodeRegion> Regions; + llvm::StringMap<unsigned> ActiveRegions; + bool FoundErrors; + + CodeRegions(const CodeRegions &) = delete; + CodeRegions &operator=(const CodeRegions &) = delete; + +public: + CodeRegions(llvm::SourceMgr &S); + + typedef std::vector<UniqueCodeRegion>::iterator iterator; + typedef std::vector<UniqueCodeRegion>::const_iterator const_iterator; + + iterator begin() { return Regions.begin(); } + iterator end() { return Regions.end(); } + const_iterator begin() const { return Regions.cbegin(); } + const_iterator end() const { return Regions.cend(); } + + void beginRegion(llvm::StringRef Description, llvm::SMLoc Loc); + void endRegion(llvm::StringRef Description, llvm::SMLoc Loc); + void addInstruction(const llvm::MCInst &Instruction); + llvm::SourceMgr &getSourceMgr() const { return SM; } + + llvm::ArrayRef<llvm::MCInst> getInstructionSequence(unsigned Idx) const { + return Regions[Idx]->getInstructions(); + } + + bool empty() const { + return llvm::all_of(Regions, [](const UniqueCodeRegion &Region) { + return Region->empty(); + }); + } + + bool isValid() const { return !FoundErrors; } +}; + +} // namespace mca +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-mca/CodeRegionGenerator.cpp b/llvm/tools/llvm-mca/CodeRegionGenerator.cpp new file mode 100644 index 000000000000..8ddcd2f4abe2 --- /dev/null +++ b/llvm/tools/llvm-mca/CodeRegionGenerator.cpp @@ -0,0 +1,140 @@ +//===----------------------- CodeRegionGenerator.cpp ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines classes responsible for generating llvm-mca +/// CodeRegions from various types of input. llvm-mca only analyzes CodeRegions, +/// so the classes here provide the input-to-CodeRegions translation. 
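By way of illustration only, and not part of this patch: a driver that has already set up the usual MC objects could perform this input-to-CodeRegions translation roughly as sketched below. The names TheTarget, SrcMgr, Ctx, MAI, STI, MCII and analyzeRegion are placeholders for state the real llvm-mca driver owns, not identifiers introduced by this commit.

  mca::AsmCodeRegionGenerator CRG(*TheTarget, SrcMgr, Ctx, *MAI, *STI, *MCII);
  llvm::Expected<const mca::CodeRegions &> RegionsOrErr = CRG.parseCodeRegions();
  if (!RegionsOrErr) {
    llvm::logAllUnhandledErrors(RegionsOrErr.takeError(), llvm::errs(), "error: ");
    return 1;
  }
  // Every non-empty region becomes an independent input to the simulation.
  for (const std::unique_ptr<mca::CodeRegion> &Region : *RegionsOrErr)
    if (!Region->empty())
      analyzeRegion(Region->getInstructions()); // analyzeRegion is hypothetical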
+// +//===----------------------------------------------------------------------===// + +#include "CodeRegionGenerator.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCParser/MCTargetAsmParser.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/SMLoc.h" +#include <memory> + +namespace llvm { +namespace mca { + +// This virtual dtor serves as the anchor for the CodeRegionGenerator class. +CodeRegionGenerator::~CodeRegionGenerator() {} + +// A comment consumer that parses strings. The only valid tokens are strings. +class MCACommentConsumer : public AsmCommentConsumer { +public: + CodeRegions &Regions; + + MCACommentConsumer(CodeRegions &R) : Regions(R) {} + void HandleComment(SMLoc Loc, StringRef CommentText) override; +}; + +// This class provides the callbacks that occur when parsing input assembly. +class MCStreamerWrapper final : public MCStreamer { + CodeRegions &Regions; + +public: + MCStreamerWrapper(MCContext &Context, mca::CodeRegions &R) + : MCStreamer(Context), Regions(R) {} + + // We only want to intercept the emission of new instructions. + virtual void EmitInstruction(const MCInst &Inst, + const MCSubtargetInfo &/* unused */) override { + Regions.addInstruction(Inst); + } + + bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override { + return true; + } + + void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) override {} + void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr, + uint64_t Size = 0, unsigned ByteAlignment = 0, + SMLoc Loc = SMLoc()) override {} + void EmitGPRel32Value(const MCExpr *Value) override {} + void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {} + void EmitCOFFSymbolStorageClass(int StorageClass) override {} + void EmitCOFFSymbolType(int Type) override {} + void EndCOFFSymbolDef() override {} + + ArrayRef<MCInst> GetInstructionSequence(unsigned Index) const { + return Regions.getInstructionSequence(Index); + } +}; + +void MCACommentConsumer::HandleComment(SMLoc Loc, StringRef CommentText) { + // Skip empty comments. + StringRef Comment(CommentText); + if (Comment.empty()) + return; + + // Skip spaces and tabs. + unsigned Position = Comment.find_first_not_of(" \t"); + if (Position >= Comment.size()) + // We reached the end of the comment. Bail out. + return; + + Comment = Comment.drop_front(Position); + if (Comment.consume_front("LLVM-MCA-END")) { + // Skip spaces and tabs. + Position = Comment.find_first_not_of(" \t"); + if (Position < Comment.size()) + Comment = Comment.drop_front(Position); + Regions.endRegion(Comment, Loc); + return; + } + + // Try to parse the LLVM-MCA-BEGIN comment. + if (!Comment.consume_front("LLVM-MCA-BEGIN")) + return; + + // Skip spaces and tabs. + Position = Comment.find_first_not_of(" \t"); + if (Position < Comment.size()) + Comment = Comment.drop_front(Position); + // Use the rest of the string as a descriptor for this code snippet. + Regions.beginRegion(Comment, Loc); +} + +Expected<const CodeRegions &> AsmCodeRegionGenerator::parseCodeRegions() { + MCTargetOptions Opts; + Opts.PreserveAsmComments = false; + MCStreamerWrapper Str(Ctx, Regions); + + // Create a MCAsmParser and setup the lexer to recognize llvm-mca ASM + // comments. 
+ std::unique_ptr<MCAsmParser> Parser( + createMCAsmParser(Regions.getSourceMgr(), Ctx, Str, MAI)); + MCAsmLexer &Lexer = Parser->getLexer(); + MCACommentConsumer CC(Regions); + Lexer.setCommentConsumer(&CC); + // Enable support for MASM literal numbers (example: 05h, 101b). + Lexer.setLexMasmIntegers(true); + + std::unique_ptr<MCTargetAsmParser> TAP( + TheTarget.createMCAsmParser(STI, *Parser, MCII, Opts)); + if (!TAP) + return make_error<StringError>( + "This target does not support assembly parsing.", + inconvertibleErrorCode()); + Parser->setTargetParser(*TAP); + Parser->Run(false); + + // Set the assembler dialect from the input. llvm-mca will use this as the + // default dialect when printing reports. + AssemblerDialect = Parser->getAssemblerDialect(); + return Regions; +} + +} // namespace mca +} // namespace llvm diff --git a/llvm/tools/llvm-mca/CodeRegionGenerator.h b/llvm/tools/llvm-mca/CodeRegionGenerator.h new file mode 100644 index 000000000000..9a10aa2c148b --- /dev/null +++ b/llvm/tools/llvm-mca/CodeRegionGenerator.h @@ -0,0 +1,69 @@ +//===----------------------- CodeRegionGenerator.h --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file declares classes responsible for generating llvm-mca +/// CodeRegions from various types of input. llvm-mca only analyzes CodeRegions, +/// so the classes here provide the input-to-CodeRegions translation. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_CODEREGION_GENERATOR_H +#define LLVM_TOOLS_LLVM_MCA_CODEREGION_GENERATOR_H + +#include "CodeRegion.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include <memory> + +namespace llvm { +namespace mca { + +/// This class is responsible for parsing the input given to the llvm-mca +/// driver, and converting that into a CodeRegions instance. +class CodeRegionGenerator { +protected: + CodeRegions Regions; + CodeRegionGenerator(const CodeRegionGenerator &) = delete; + CodeRegionGenerator &operator=(const CodeRegionGenerator &) = delete; + +public: + CodeRegionGenerator(SourceMgr &SM) : Regions(SM) {} + virtual ~CodeRegionGenerator(); + virtual Expected<const CodeRegions &> parseCodeRegions() = 0; +}; + +/// This class is responsible for parsing input ASM and generating +/// a CodeRegions instance. +class AsmCodeRegionGenerator final : public CodeRegionGenerator { + const Target &TheTarget; + MCContext &Ctx; + const MCAsmInfo &MAI; + const MCSubtargetInfo &STI; + const MCInstrInfo &MCII; + unsigned AssemblerDialect; // This is set during parsing. 
+ +public: + AsmCodeRegionGenerator(const Target &T, SourceMgr &SM, MCContext &C, + const MCAsmInfo &A, const MCSubtargetInfo &S, + const MCInstrInfo &I) + : CodeRegionGenerator(SM), TheTarget(T), Ctx(C), MAI(A), STI(S), MCII(I), + AssemblerDialect(0) {} + + unsigned getAssemblerDialect() const { return AssemblerDialect; } + Expected<const CodeRegions &> parseCodeRegions() override; +}; + +} // namespace mca +} // namespace llvm + +#endif // LLVM_TOOLS_LLVM_MCA_CODEREGION_GENERATOR_H diff --git a/llvm/tools/llvm-mca/PipelinePrinter.cpp b/llvm/tools/llvm-mca/PipelinePrinter.cpp new file mode 100644 index 000000000000..90d468075996 --- /dev/null +++ b/llvm/tools/llvm-mca/PipelinePrinter.cpp @@ -0,0 +1,25 @@ +//===--------------------- PipelinePrinter.cpp ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements the PipelinePrinter interface. +/// +//===----------------------------------------------------------------------===// + +#include "PipelinePrinter.h" +#include "Views/View.h" + +namespace llvm { +namespace mca { + +void PipelinePrinter::printReport(llvm::raw_ostream &OS) const { + for (const auto &V : Views) + V->printView(OS); +} +} // namespace mca. +} // namespace llvm diff --git a/llvm/tools/llvm-mca/PipelinePrinter.h b/llvm/tools/llvm-mca/PipelinePrinter.h new file mode 100644 index 000000000000..004309cd7b8e --- /dev/null +++ b/llvm/tools/llvm-mca/PipelinePrinter.h @@ -0,0 +1,53 @@ +//===--------------------- PipelinePrinter.h --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements class PipelinePrinter. +/// +/// PipelinePrinter allows the customization of the performance report. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_PIPELINEPRINTER_H +#define LLVM_TOOLS_LLVM_MCA_PIPELINEPRINTER_H + +#include "Views/View.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/MCA/Pipeline.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "llvm-mca" + +namespace llvm { +namespace mca { + +/// A printer class that knows how to collects statistics on the +/// code analyzed by the llvm-mca tool. +/// +/// This class knows how to print out the analysis information collected +/// during the execution of the code. Internally, it delegates to other +/// classes the task of printing out timeline information as well as +/// resource pressure. 
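For context, a minimal usage sketch that is not part of the patch: assuming a constructed Pipeline P together with the STI, IP, Insts and Iterations values the llvm-mca driver already holds (all placeholder names), the report is customized by registering views before the simulation runs and printing afterwards. DispatchStatistics and BottleneckAnalysis are the view classes added elsewhere in this commit.

  mca::PipelinePrinter Printer(P);
  Printer.addView(std::make_unique<mca::DispatchStatistics>());
  Printer.addView(
      std::make_unique<mca::BottleneckAnalysis>(STI, IP, Insts, Iterations));
  // addView() also subscribes each view as an event listener on the Pipeline,
  // so views must be registered before the Pipeline is run. Once the driver
  // has run the simulation to completion:
  Printer.printReport(llvm::outs());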
+class PipelinePrinter { + Pipeline &P; + llvm::SmallVector<std::unique_ptr<View>, 8> Views; + +public: + PipelinePrinter(Pipeline &pipeline) : P(pipeline) {} + + void addView(std::unique_ptr<View> V) { + P.addEventListener(V.get()); + Views.emplace_back(std::move(V)); + } + + void printReport(llvm::raw_ostream &OS) const; +}; +} // namespace mca +} // namespace llvm + +#endif // LLVM_TOOLS_LLVM_MCA_PIPELINEPRINTER_H diff --git a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp new file mode 100644 index 000000000000..feff0cd6d524 --- /dev/null +++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp @@ -0,0 +1,652 @@ +//===--------------------- BottleneckAnalysis.cpp ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements the functionalities used by the BottleneckAnalysis +/// to report bottleneck info. +/// +//===----------------------------------------------------------------------===// + +#include "Views/BottleneckAnalysis.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MCA/Support.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/FormattedStream.h" + +namespace llvm { +namespace mca { + +#define DEBUG_TYPE "llvm-mca" + +PressureTracker::PressureTracker(const MCSchedModel &Model) + : SM(Model), + ResourcePressureDistribution(Model.getNumProcResourceKinds(), 0), + ProcResID2Mask(Model.getNumProcResourceKinds(), 0), + ResIdx2ProcResID(Model.getNumProcResourceKinds(), 0), + ProcResID2ResourceUsersIndex(Model.getNumProcResourceKinds(), 0) { + computeProcResourceMasks(SM, ProcResID2Mask); + + // Ignore the invalid resource at index zero. 
+ unsigned NextResourceUsersIdx = 0; + for (unsigned I = 1, E = Model.getNumProcResourceKinds(); I < E; ++I) { + const MCProcResourceDesc &ProcResource = *SM.getProcResource(I); + ProcResID2ResourceUsersIndex[I] = NextResourceUsersIdx; + NextResourceUsersIdx += ProcResource.NumUnits; + uint64_t ResourceMask = ProcResID2Mask[I]; + ResIdx2ProcResID[getResourceStateIndex(ResourceMask)] = I; + } + + ResourceUsers.resize(NextResourceUsersIdx); + std::fill(ResourceUsers.begin(), ResourceUsers.end(), + std::make_pair<unsigned, unsigned>(~0U, 0U)); +} + +void PressureTracker::getResourceUsers(uint64_t ResourceMask, + SmallVectorImpl<User> &Users) const { + unsigned Index = getResourceStateIndex(ResourceMask); + unsigned ProcResID = ResIdx2ProcResID[Index]; + const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID); + for (unsigned I = 0, E = PRDesc.NumUnits; I < E; ++I) { + const User U = getResourceUser(ProcResID, I); + if (U.second && IPI.find(U.first) != IPI.end()) + Users.emplace_back(U); + } +} + +void PressureTracker::onInstructionDispatched(unsigned IID) { + IPI.insert(std::make_pair(IID, InstructionPressureInfo())); +} + +void PressureTracker::onInstructionExecuted(unsigned IID) { IPI.erase(IID); } + +void PressureTracker::handleInstructionIssuedEvent( + const HWInstructionIssuedEvent &Event) { + unsigned IID = Event.IR.getSourceIndex(); + using ResourceRef = HWInstructionIssuedEvent::ResourceRef; + using ResourceUse = std::pair<ResourceRef, ResourceCycles>; + for (const ResourceUse &Use : Event.UsedResources) { + const ResourceRef &RR = Use.first; + unsigned Index = ProcResID2ResourceUsersIndex[RR.first]; + Index += countTrailingZeros(RR.second); + ResourceUsers[Index] = std::make_pair(IID, Use.second.getNumerator()); + } +} + +void PressureTracker::updateResourcePressureDistribution( + uint64_t CumulativeMask) { + while (CumulativeMask) { + uint64_t Current = CumulativeMask & (-CumulativeMask); + unsigned ResIdx = getResourceStateIndex(Current); + unsigned ProcResID = ResIdx2ProcResID[ResIdx]; + uint64_t Mask = ProcResID2Mask[ProcResID]; + + if (Mask == Current) { + ResourcePressureDistribution[ProcResID]++; + CumulativeMask ^= Current; + continue; + } + + Mask ^= Current; + while (Mask) { + uint64_t SubUnit = Mask & (-Mask); + ResIdx = getResourceStateIndex(SubUnit); + ProcResID = ResIdx2ProcResID[ResIdx]; + ResourcePressureDistribution[ProcResID]++; + Mask ^= SubUnit; + } + + CumulativeMask ^= Current; + } +} + +void PressureTracker::handlePressureEvent(const HWPressureEvent &Event) { + assert(Event.Reason != HWPressureEvent::INVALID && + "Unexpected invalid event!"); + + switch (Event.Reason) { + default: + break; + + case HWPressureEvent::RESOURCES: { + const uint64_t ResourceMask = Event.ResourceMask; + updateResourcePressureDistribution(Event.ResourceMask); + + for (const InstRef &IR : Event.AffectedInstructions) { + const Instruction &IS = *IR.getInstruction(); + unsigned BusyResources = IS.getCriticalResourceMask() & ResourceMask; + if (!BusyResources) + continue; + + unsigned IID = IR.getSourceIndex(); + IPI[IID].ResourcePressureCycles++; + } + break; + } + + case HWPressureEvent::REGISTER_DEPS: + for (const InstRef &IR : Event.AffectedInstructions) { + unsigned IID = IR.getSourceIndex(); + IPI[IID].RegisterPressureCycles++; + } + break; + + case HWPressureEvent::MEMORY_DEPS: + for (const InstRef &IR : Event.AffectedInstructions) { + unsigned IID = IR.getSourceIndex(); + IPI[IID].MemoryPressureCycles++; + } + } +} + +#ifndef NDEBUG +void 
DependencyGraph::dumpDependencyEdge(raw_ostream &OS, + const DependencyEdge &DepEdge, + MCInstPrinter &MCIP) const { + unsigned FromIID = DepEdge.FromIID; + unsigned ToIID = DepEdge.ToIID; + assert(FromIID < ToIID && "Graph should be acyclic!"); + + const DependencyEdge::Dependency &DE = DepEdge.Dep; + assert(DE.Type != DependencyEdge::DT_INVALID && "Unexpected invalid edge!"); + + OS << " FROM: " << FromIID << " TO: " << ToIID << " "; + if (DE.Type == DependencyEdge::DT_REGISTER) { + OS << " - REGISTER: "; + MCIP.printRegName(OS, DE.ResourceOrRegID); + } else if (DE.Type == DependencyEdge::DT_MEMORY) { + OS << " - MEMORY"; + } else { + assert(DE.Type == DependencyEdge::DT_RESOURCE && + "Unsupported dependency type!"); + OS << " - RESOURCE MASK: " << DE.ResourceOrRegID; + } + OS << " - COST: " << DE.Cost << '\n'; +} +#endif // NDEBUG + +void DependencyGraph::pruneEdges(unsigned Iterations) { + for (DGNode &N : Nodes) { + unsigned NumPruned = 0; + const unsigned Size = N.OutgoingEdges.size(); + // Use a cut-off threshold to prune edges with a low frequency. + for (unsigned I = 0, E = Size; I < E; ++I) { + DependencyEdge &Edge = N.OutgoingEdges[I]; + if (Edge.Frequency == Iterations) + continue; + double Factor = (double)Edge.Frequency / Iterations; + if (0.10 < Factor) + continue; + Nodes[Edge.ToIID].NumPredecessors--; + std::swap(Edge, N.OutgoingEdges[E - 1]); + --E; + ++NumPruned; + } + + if (NumPruned) + N.OutgoingEdges.resize(Size - NumPruned); + } +} + +void DependencyGraph::initializeRootSet( + SmallVectorImpl<unsigned> &RootSet) const { + for (unsigned I = 0, E = Nodes.size(); I < E; ++I) { + const DGNode &N = Nodes[I]; + if (N.NumPredecessors == 0 && !N.OutgoingEdges.empty()) + RootSet.emplace_back(I); + } +} + +void DependencyGraph::propagateThroughEdges( + SmallVectorImpl<unsigned> &RootSet, unsigned Iterations) { + SmallVector<unsigned, 8> ToVisit; + + // A critical sequence is computed as the longest path from a node of the + // RootSet to a leaf node (i.e. a node with no successors). The RootSet is + // composed of nodes with at least one successor, and no predecessors. + // + // Each node of the graph starts with an initial default cost of zero. The + // cost of a node is a measure of criticality: the higher the cost, the bigger + // is the performance impact. + // For register and memory dependencies, the cost is a function of the write + // latency as well as the actual delay (in cycles) caused to users. + // For processor resource dependencies, the cost is a function of the resource + // pressure. Resource interferences with low frequency values are ignored. + // + // This algorithm is very similar to a (reverse) Dijkstra. Every iteration of + // the inner loop selects (i.e. visits) a node N from a set of `unvisited + // nodes`, and then propagates the cost of N to all its neighbors. + // + // The `unvisited nodes` set initially contains all the nodes from the + // RootSet. A node N is added to the `unvisited nodes` if all its + // predecessors have been visited already. + // + // For simplicity, every node tracks the number of unvisited incoming edges in + // field `NumVisitedPredecessors`. When the value of that field drops to + // zero, then the corresponding node is added to a `ToVisit` set. + // + // At the end of every iteration of the outer loop, set `ToVisit` becomes our + // new `unvisited nodes` set. + // + // The algorithm terminates when the set of unvisited nodes (i.e. our RootSet) + // is empty. 
+ // is empty.
This algorithm works under the assumption that the graph is + // acyclic. + do { + for (unsigned IID : RootSet) { + const DGNode &N = Nodes[IID]; + for (const DependencyEdge &DepEdge : N.OutgoingEdges) { + unsigned ToIID = DepEdge.ToIID; + DGNode &To = Nodes[ToIID]; + uint64_t Cost = N.Cost + DepEdge.Dep.Cost; + // Check if this is the most expensive incoming edge seen so far. In + // case, update the total cost of the destination node (ToIID), as well + // its field `CriticalPredecessor`. + if (Cost > To.Cost) { + To.CriticalPredecessor = DepEdge; + To.Cost = Cost; + To.Depth = N.Depth + 1; + } + To.NumVisitedPredecessors++; + if (To.NumVisitedPredecessors == To.NumPredecessors) + ToVisit.emplace_back(ToIID); + } + } + + std::swap(RootSet, ToVisit); + ToVisit.clear(); + } while (!RootSet.empty()); +} + +void DependencyGraph::getCriticalSequence( + SmallVectorImpl<const DependencyEdge *> &Seq) const { + // At this stage, nodes of the graph have been already visited, and costs have + // been propagated through the edges (see method `propagateThroughEdges()`). + + // Identify the node N with the highest cost in the graph. By construction, + // that node is the last instruction of our critical sequence. + // Field N.Depth would tell us the total length of the sequence. + // + // To obtain the sequence of critical edges, we simply follow the chain of critical + // predecessors starting from node N (field DGNode::CriticalPredecessor). + const auto It = std::max_element( + Nodes.begin(), Nodes.end(), + [](const DGNode &Lhs, const DGNode &Rhs) { return Lhs.Cost < Rhs.Cost; }); + unsigned IID = std::distance(Nodes.begin(), It); + Seq.resize(Nodes[IID].Depth); + for (unsigned I = Seq.size(), E = 0; I > E; --I) { + const DGNode &N = Nodes[IID]; + Seq[I - 1] = &N.CriticalPredecessor; + IID = N.CriticalPredecessor.FromIID; + } +} + +static void printInstruction(formatted_raw_ostream &FOS, + const MCSubtargetInfo &STI, MCInstPrinter &MCIP, + const MCInst &MCI, + bool UseDifferentColor = false) { + std::string Instruction; + raw_string_ostream InstrStream(Instruction); + + FOS.PadToColumn(14); + + MCIP.printInst(&MCI, InstrStream, "", STI); + InstrStream.flush(); + + if (UseDifferentColor) + FOS.changeColor(raw_ostream::CYAN, true, false); + FOS << StringRef(Instruction).ltrim(); + if (UseDifferentColor) + FOS.resetColor(); +} + +void BottleneckAnalysis::printCriticalSequence(raw_ostream &OS) const { + // Early exit if no bottlenecks were found during the simulation. 
+ if (!SeenStallCycles || !BPI.PressureIncreaseCycles) + return; + + SmallVector<const DependencyEdge *, 16> Seq; + DG.getCriticalSequence(Seq); + if (Seq.empty()) + return; + + OS << "\nCritical sequence based on the simulation:\n\n"; + + const DependencyEdge &FirstEdge = *Seq[0]; + unsigned FromIID = FirstEdge.FromIID % Source.size(); + unsigned ToIID = FirstEdge.ToIID % Source.size(); + bool IsLoopCarried = FromIID >= ToIID; + + formatted_raw_ostream FOS(OS); + FOS.PadToColumn(14); + FOS << "Instruction"; + FOS.PadToColumn(58); + FOS << "Dependency Information"; + + bool HasColors = FOS.has_colors(); + + unsigned CurrentIID = 0; + if (IsLoopCarried) { + FOS << "\n +----< " << FromIID << "."; + printInstruction(FOS, STI, MCIP, Source[FromIID], HasColors); + FOS << "\n |\n | < loop carried > \n |"; + } else { + while (CurrentIID < FromIID) { + FOS << "\n " << CurrentIID << "."; + printInstruction(FOS, STI, MCIP, Source[CurrentIID]); + CurrentIID++; + } + + FOS << "\n +----< " << CurrentIID << "."; + printInstruction(FOS, STI, MCIP, Source[CurrentIID], HasColors); + CurrentIID++; + } + + for (const DependencyEdge *&DE : Seq) { + ToIID = DE->ToIID % Source.size(); + unsigned LastIID = CurrentIID > ToIID ? Source.size() : ToIID; + + while (CurrentIID < LastIID) { + FOS << "\n | " << CurrentIID << "."; + printInstruction(FOS, STI, MCIP, Source[CurrentIID]); + CurrentIID++; + } + + if (CurrentIID == ToIID) { + FOS << "\n +----> " << ToIID << "."; + printInstruction(FOS, STI, MCIP, Source[CurrentIID], HasColors); + } else { + FOS << "\n |\n | < loop carried > \n |" + << "\n +----> " << ToIID << "."; + printInstruction(FOS, STI, MCIP, Source[ToIID], HasColors); + } + FOS.PadToColumn(58); + + const DependencyEdge::Dependency &Dep = DE->Dep; + if (HasColors) + FOS.changeColor(raw_ostream::SAVEDCOLOR, true, false); + + if (Dep.Type == DependencyEdge::DT_REGISTER) { + FOS << "## REGISTER dependency: "; + if (HasColors) + FOS.changeColor(raw_ostream::MAGENTA, true, false); + MCIP.printRegName(FOS, Dep.ResourceOrRegID); + } else if (Dep.Type == DependencyEdge::DT_MEMORY) { + FOS << "## MEMORY dependency."; + } else { + assert(Dep.Type == DependencyEdge::DT_RESOURCE && + "Unsupported dependency type!"); + FOS << "## RESOURCE interference: "; + if (HasColors) + FOS.changeColor(raw_ostream::MAGENTA, true, false); + FOS << Tracker.resolveResourceName(Dep.ResourceOrRegID); + if (HasColors) { + FOS.resetColor(); + FOS.changeColor(raw_ostream::SAVEDCOLOR, true, false); + } + FOS << " [ probability: " << ((DE->Frequency * 100) / Iterations) + << "% ]"; + } + if (HasColors) + FOS.resetColor(); + ++CurrentIID; + } + + while (CurrentIID < Source.size()) { + FOS << "\n " << CurrentIID << "."; + printInstruction(FOS, STI, MCIP, Source[CurrentIID]); + CurrentIID++; + } + + FOS << '\n'; + FOS.flush(); +} + +#ifndef NDEBUG +void DependencyGraph::dump(raw_ostream &OS, MCInstPrinter &MCIP) const { + OS << "\nREG DEPS\n"; + for (const DGNode &Node : Nodes) + for (const DependencyEdge &DE : Node.OutgoingEdges) + if (DE.Dep.Type == DependencyEdge::DT_REGISTER) + dumpDependencyEdge(OS, DE, MCIP); + + OS << "\nMEM DEPS\n"; + for (const DGNode &Node : Nodes) + for (const DependencyEdge &DE : Node.OutgoingEdges) + if (DE.Dep.Type == DependencyEdge::DT_MEMORY) + dumpDependencyEdge(OS, DE, MCIP); + + OS << "\nRESOURCE DEPS\n"; + for (const DGNode &Node : Nodes) + for (const DependencyEdge &DE : Node.OutgoingEdges) + if (DE.Dep.Type == DependencyEdge::DT_RESOURCE) + dumpDependencyEdge(OS, DE, MCIP); +} +#endif // NDEBUG + +void 
DependencyGraph::addDependency(unsigned From, unsigned To, + DependencyEdge::Dependency &&Dep) { + DGNode &NodeFrom = Nodes[From]; + DGNode &NodeTo = Nodes[To]; + SmallVectorImpl<DependencyEdge> &Vec = NodeFrom.OutgoingEdges; + + auto It = find_if(Vec, [To, Dep](DependencyEdge &DE) { + return DE.ToIID == To && DE.Dep.ResourceOrRegID == Dep.ResourceOrRegID; + }); + + if (It != Vec.end()) { + It->Dep.Cost += Dep.Cost; + It->Frequency++; + return; + } + + DependencyEdge DE = {Dep, From, To, 1}; + Vec.emplace_back(DE); + NodeTo.NumPredecessors++; +} + +BottleneckAnalysis::BottleneckAnalysis(const MCSubtargetInfo &sti, + MCInstPrinter &Printer, + ArrayRef<MCInst> S, unsigned NumIter) + : STI(sti), MCIP(Printer), Tracker(STI.getSchedModel()), DG(S.size() * 3), + Source(S), Iterations(NumIter), TotalCycles(0), + PressureIncreasedBecauseOfResources(false), + PressureIncreasedBecauseOfRegisterDependencies(false), + PressureIncreasedBecauseOfMemoryDependencies(false), + SeenStallCycles(false), BPI() {} + +void BottleneckAnalysis::addRegisterDep(unsigned From, unsigned To, + unsigned RegID, unsigned Cost) { + bool IsLoopCarried = From >= To; + unsigned SourceSize = Source.size(); + if (IsLoopCarried) { + DG.addRegisterDep(From, To + SourceSize, RegID, Cost); + DG.addRegisterDep(From + SourceSize, To + (SourceSize * 2), RegID, Cost); + return; + } + DG.addRegisterDep(From + SourceSize, To + SourceSize, RegID, Cost); +} + +void BottleneckAnalysis::addMemoryDep(unsigned From, unsigned To, + unsigned Cost) { + bool IsLoopCarried = From >= To; + unsigned SourceSize = Source.size(); + if (IsLoopCarried) { + DG.addMemoryDep(From, To + SourceSize, Cost); + DG.addMemoryDep(From + SourceSize, To + (SourceSize * 2), Cost); + return; + } + DG.addMemoryDep(From + SourceSize, To + SourceSize, Cost); +} + +void BottleneckAnalysis::addResourceDep(unsigned From, unsigned To, + uint64_t Mask, unsigned Cost) { + bool IsLoopCarried = From >= To; + unsigned SourceSize = Source.size(); + if (IsLoopCarried) { + DG.addResourceDep(From, To + SourceSize, Mask, Cost); + DG.addResourceDep(From + SourceSize, To + (SourceSize * 2), Mask, Cost); + return; + } + DG.addResourceDep(From + SourceSize, To + SourceSize, Mask, Cost); +} + +void BottleneckAnalysis::onEvent(const HWInstructionEvent &Event) { + const unsigned IID = Event.IR.getSourceIndex(); + if (Event.Type == HWInstructionEvent::Dispatched) { + Tracker.onInstructionDispatched(IID); + return; + } + if (Event.Type == HWInstructionEvent::Executed) { + Tracker.onInstructionExecuted(IID); + return; + } + + if (Event.Type != HWInstructionEvent::Issued) + return; + + const Instruction &IS = *Event.IR.getInstruction(); + unsigned To = IID % Source.size(); + + unsigned Cycles = 2 * Tracker.getResourcePressureCycles(IID); + uint64_t ResourceMask = IS.getCriticalResourceMask(); + SmallVector<std::pair<unsigned, unsigned>, 4> Users; + while (ResourceMask) { + uint64_t Current = ResourceMask & (-ResourceMask); + Tracker.getResourceUsers(Current, Users); + for (const std::pair<unsigned, unsigned> &U : Users) + addResourceDep(U.first % Source.size(), To, Current, U.second + Cycles); + Users.clear(); + ResourceMask ^= Current; + } + + const CriticalDependency &RegDep = IS.getCriticalRegDep(); + if (RegDep.Cycles) { + Cycles = RegDep.Cycles + 2 * Tracker.getRegisterPressureCycles(IID); + unsigned From = RegDep.IID % Source.size(); + addRegisterDep(From, To, RegDep.RegID, Cycles); + } + + const CriticalDependency &MemDep = IS.getCriticalMemDep(); + if (MemDep.Cycles) { + Cycles = 
MemDep.Cycles + 2 * Tracker.getMemoryPressureCycles(IID); + unsigned From = MemDep.IID % Source.size(); + addMemoryDep(From, To, Cycles); + } + + Tracker.handleInstructionIssuedEvent( + static_cast<const HWInstructionIssuedEvent &>(Event)); + + // Check if this is the last simulated instruction. + if (IID == ((Iterations * Source.size()) - 1)) + DG.finalizeGraph(Iterations); +} + +void BottleneckAnalysis::onEvent(const HWPressureEvent &Event) { + assert(Event.Reason != HWPressureEvent::INVALID && + "Unexpected invalid event!"); + + Tracker.handlePressureEvent(Event); + + switch (Event.Reason) { + default: + break; + + case HWPressureEvent::RESOURCES: + PressureIncreasedBecauseOfResources = true; + break; + case HWPressureEvent::REGISTER_DEPS: + PressureIncreasedBecauseOfRegisterDependencies = true; + break; + case HWPressureEvent::MEMORY_DEPS: + PressureIncreasedBecauseOfMemoryDependencies = true; + break; + } +} + +void BottleneckAnalysis::onCycleEnd() { + ++TotalCycles; + + bool PressureIncreasedBecauseOfDataDependencies = + PressureIncreasedBecauseOfRegisterDependencies || + PressureIncreasedBecauseOfMemoryDependencies; + if (!PressureIncreasedBecauseOfResources && + !PressureIncreasedBecauseOfDataDependencies) + return; + + ++BPI.PressureIncreaseCycles; + if (PressureIncreasedBecauseOfRegisterDependencies) + ++BPI.RegisterDependencyCycles; + if (PressureIncreasedBecauseOfMemoryDependencies) + ++BPI.MemoryDependencyCycles; + if (PressureIncreasedBecauseOfDataDependencies) + ++BPI.DataDependencyCycles; + if (PressureIncreasedBecauseOfResources) + ++BPI.ResourcePressureCycles; + PressureIncreasedBecauseOfResources = false; + PressureIncreasedBecauseOfRegisterDependencies = false; + PressureIncreasedBecauseOfMemoryDependencies = false; +} + +void BottleneckAnalysis::printBottleneckHints(raw_ostream &OS) const { + if (!SeenStallCycles || !BPI.PressureIncreaseCycles) { + OS << "\n\nNo resource or data dependency bottlenecks discovered.\n"; + return; + } + + double PressurePerCycle = + (double)BPI.PressureIncreaseCycles * 100 / TotalCycles; + double ResourcePressurePerCycle = + (double)BPI.ResourcePressureCycles * 100 / TotalCycles; + double DDPerCycle = (double)BPI.DataDependencyCycles * 100 / TotalCycles; + double RegDepPressurePerCycle = + (double)BPI.RegisterDependencyCycles * 100 / TotalCycles; + double MemDepPressurePerCycle = + (double)BPI.MemoryDependencyCycles * 100 / TotalCycles; + + OS << "\n\nCycles with backend pressure increase [ " + << format("%.2f", floor((PressurePerCycle * 100) + 0.5) / 100) << "% ]"; + + OS << "\nThroughput Bottlenecks: " + << "\n Resource Pressure [ " + << format("%.2f", floor((ResourcePressurePerCycle * 100) + 0.5) / 100) + << "% ]"; + + if (BPI.PressureIncreaseCycles) { + ArrayRef<unsigned> Distribution = Tracker.getResourcePressureDistribution(); + const MCSchedModel &SM = STI.getSchedModel(); + for (unsigned I = 0, E = Distribution.size(); I < E; ++I) { + unsigned ResourceCycles = Distribution[I]; + if (ResourceCycles) { + double Frequency = (double)ResourceCycles * 100 / TotalCycles; + const MCProcResourceDesc &PRDesc = *SM.getProcResource(I); + OS << "\n - " << PRDesc.Name << " [ " + << format("%.2f", floor((Frequency * 100) + 0.5) / 100) << "% ]"; + } + } + } + + OS << "\n Data Dependencies: [ " + << format("%.2f", floor((DDPerCycle * 100) + 0.5) / 100) << "% ]"; + OS << "\n - Register Dependencies [ " + << format("%.2f", floor((RegDepPressurePerCycle * 100) + 0.5) / 100) + << "% ]"; + OS << "\n - Memory Dependencies [ " + << format("%.2f", 
floor((MemDepPressurePerCycle * 100) + 0.5) / 100) + << "% ]\n"; +} + +void BottleneckAnalysis::printView(raw_ostream &OS) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + printBottleneckHints(TempStream); + TempStream.flush(); + OS << Buffer; + printCriticalSequence(OS); +} + +} // namespace mca. +} // namespace llvm diff --git a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h new file mode 100644 index 000000000000..9e3bd5978f09 --- /dev/null +++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h @@ -0,0 +1,343 @@ +//===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements the bottleneck analysis view. +/// +/// This view internally observes backend pressure increase events in order to +/// identify problematic data dependencies and processor resource interferences. +/// +/// Example of bottleneck analysis report for a dot-product on X86 btver2: +/// +/// Cycles with backend pressure increase [ 40.76% ] +/// Throughput Bottlenecks: +/// Resource Pressure [ 39.34% ] +/// - JFPA [ 39.34% ] +/// - JFPU0 [ 39.34% ] +/// Data Dependencies: [ 1.42% ] +/// - Register Dependencies [ 1.42% ] +/// - Memory Dependencies [ 0.00% ] +/// +/// According to the example, backend pressure increased during the 40.76% of +/// the simulated cycles. In particular, the major cause of backend pressure +/// increases was the contention on floating point adder JFPA accessible from +/// pipeline resource JFPU0. +/// +/// At the end of each cycle, if pressure on the simulated out-of-order buffers +/// has increased, a backend pressure event is reported. +/// In particular, this occurs when there is a delta between the number of uOps +/// dispatched and the number of uOps issued to the underlying pipelines. +/// +/// The bottleneck analysis view is also responsible for identifying and printing +/// the most "critical" sequence of dependent instructions according to the +/// simulated run. +/// +/// Below is the critical sequence computed for the dot-product example on +/// btver2: +/// +/// Instruction Dependency Information +/// +----< 2. vhaddps %xmm3, %xmm3, %xmm4 +/// | +/// | < loop carried > +/// | +/// | 0. vmulps %xmm0, %xmm0, %xmm2 +/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] +/// +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3 +/// | +/// | < loop carried > +/// | +/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] +/// +/// +/// The algorithm that computes the critical sequence is very similar to a +/// critical path analysis. +/// +/// A dependency graph is used internally to track dependencies between nodes. +/// Nodes of the graph represent instructions from the input assembly sequence, +/// and edges of the graph represent data dependencies or processor resource +/// interferences. +/// +/// Edges are dynamically 'discovered' by observing instruction state transitions +/// and backend pressure increase events. Edges are internally ranked based on +/// their "criticality". 
A dependency is considered to be critical if it takes a +/// long time to execute, and if it contributes to backend pressure increases. +/// Criticality is internally measured in terms of cycles; it is computed for +/// every edge in the graph as a function of the edge latency and the number of +/// backend pressure increase cycles contributed by that edge. +/// +/// At the end of simulation, costs are propagated to nodes through the edges of +/// the graph, and the most expensive path connecting the root-set (a +/// set of nodes with no predecessors) to a leaf node is reported as critical +/// sequence. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H +#define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H + +#include "Views/View.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +namespace mca { + +class PressureTracker { + const MCSchedModel &SM; + + // Resource pressure distribution. There is an element for every processor + // resource declared by the scheduling model. Quantities are number of cycles. + SmallVector<unsigned, 4> ResourcePressureDistribution; + + // Each processor resource is associated with a so-called processor resource + // mask. This vector allows to correlate processor resource IDs with processor + // resource masks. There is exactly one element per each processor resource + // declared by the scheduling model. + SmallVector<uint64_t, 4> ProcResID2Mask; + + // Maps processor resource state indices (returned by calls to + // `getResourceStateIndex(Mask)` to processor resource identifiers. + SmallVector<unsigned, 4> ResIdx2ProcResID; + + // Maps Processor Resource identifiers to ResourceUsers indices. + SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex; + + // Identifies the last user of a processor resource unit. + // This vector is updated on every instruction issued event. + // There is one entry for every processor resource unit declared by the + // processor model. An all_ones value is treated like an invalid instruction + // identifier. 
+ using User = std::pair<unsigned, unsigned>; + SmallVector<User, 4> ResourceUsers; + + struct InstructionPressureInfo { + unsigned RegisterPressureCycles; + unsigned MemoryPressureCycles; + unsigned ResourcePressureCycles; + }; + DenseMap<unsigned, InstructionPressureInfo> IPI; + + void updateResourcePressureDistribution(uint64_t CumulativeMask); + + User getResourceUser(unsigned ProcResID, unsigned UnitID) const { + unsigned Index = ProcResID2ResourceUsersIndex[ProcResID]; + return ResourceUsers[Index + UnitID]; + } + +public: + PressureTracker(const MCSchedModel &Model); + + ArrayRef<unsigned> getResourcePressureDistribution() const { + return ResourcePressureDistribution; + } + + void getResourceUsers(uint64_t ResourceMask, + SmallVectorImpl<User> &Users) const; + + unsigned getRegisterPressureCycles(unsigned IID) const { + assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!"); + const InstructionPressureInfo &Info = IPI.find(IID)->second; + return Info.RegisterPressureCycles; + } + + unsigned getMemoryPressureCycles(unsigned IID) const { + assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!"); + const InstructionPressureInfo &Info = IPI.find(IID)->second; + return Info.MemoryPressureCycles; + } + + unsigned getResourcePressureCycles(unsigned IID) const { + assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!"); + const InstructionPressureInfo &Info = IPI.find(IID)->second; + return Info.ResourcePressureCycles; + } + + const char *resolveResourceName(uint64_t ResourceMask) const { + unsigned Index = getResourceStateIndex(ResourceMask); + unsigned ProcResID = ResIdx2ProcResID[Index]; + const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID); + return PRDesc.Name; + } + + void onInstructionDispatched(unsigned IID); + void onInstructionExecuted(unsigned IID); + + void handlePressureEvent(const HWPressureEvent &Event); + void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event); +}; + +// A dependency edge. +struct DependencyEdge { + enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE }; + + // Dependency edge descriptor. + // + // It specifies the dependency type, as well as the edge cost in cycles. + struct Dependency { + DependencyType Type; + uint64_t ResourceOrRegID; + uint64_t Cost; + }; + Dependency Dep; + + unsigned FromIID; + unsigned ToIID; + + // Used by the bottleneck analysis to compute the interference + // probability for processor resources. + unsigned Frequency; +}; + +// A dependency graph used by the bottleneck analysis to describe data +// dependencies and processor resource interferences between instructions. +// +// There is a node (an instance of struct DGNode) for every instruction in the +// input assembly sequence. Edges of the graph represent dependencies between +// instructions. +// +// Each edge of the graph is associated with a cost value which is used +// internally to rank dependency based on their impact on the runtime +// performance (see field DependencyEdge::Dependency::Cost). In general, the +// higher the cost of an edge, the higher the impact on performance. +// +// The cost of a dependency is a function of both the latency and the number of +// cycles where the dependency has been seen as critical (i.e. contributing to +// back-pressure increases). +// +// Loop carried dependencies are carefully expanded by the bottleneck analysis +// to guarantee that the graph stays acyclic. 
To this end, extra nodes are +// pre-allocated at construction time to describe instructions from "past and +// future" iterations. The graph is kept acyclic mainly because it simplifies the +// complexity of the algorithm that computes the critical sequence. +class DependencyGraph { + struct DGNode { + unsigned NumPredecessors; + unsigned NumVisitedPredecessors; + uint64_t Cost; + unsigned Depth; + + DependencyEdge CriticalPredecessor; + SmallVector<DependencyEdge, 8> OutgoingEdges; + }; + SmallVector<DGNode, 16> Nodes; + + DependencyGraph(const DependencyGraph &) = delete; + DependencyGraph &operator=(const DependencyGraph &) = delete; + + void addDependency(unsigned From, unsigned To, + DependencyEdge::Dependency &&DE); + + void pruneEdges(unsigned Iterations); + void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const; + void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations); + +#ifndef NDEBUG + void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE, + MCInstPrinter &MCIP) const; +#endif + +public: + DependencyGraph(unsigned Size) : Nodes(Size) {} + + void addRegisterDep(unsigned From, unsigned To, unsigned RegID, + unsigned Cost) { + addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost}); + } + + void addMemoryDep(unsigned From, unsigned To, unsigned Cost) { + addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost}); + } + + void addResourceDep(unsigned From, unsigned To, uint64_t Mask, + unsigned Cost) { + addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost}); + } + + // Called by the bottleneck analysis at the end of simulation to propagate + // costs through the edges of the graph, and compute a critical path. + void finalizeGraph(unsigned Iterations) { + SmallVector<unsigned, 16> RootSet; + pruneEdges(Iterations); + initializeRootSet(RootSet); + propagateThroughEdges(RootSet, Iterations); + } + + // Returns a sequence of edges representing the critical sequence based on the + // simulated run. It assumes that the graph has already been finalized (i.e. + // method `finalizeGraph()` has already been called on this graph). + void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const; + +#ifndef NDEBUG + void dump(raw_ostream &OS, MCInstPrinter &MCIP) const; +#endif +}; + +/// A view that collects and prints a few performance numbers. +class BottleneckAnalysis : public View { + const MCSubtargetInfo &STI; + MCInstPrinter &MCIP; + PressureTracker Tracker; + DependencyGraph DG; + + ArrayRef<MCInst> Source; + unsigned Iterations; + unsigned TotalCycles; + + bool PressureIncreasedBecauseOfResources; + bool PressureIncreasedBecauseOfRegisterDependencies; + bool PressureIncreasedBecauseOfMemoryDependencies; + // True if throughput was affected by dispatch stalls. + bool SeenStallCycles; + + struct BackPressureInfo { + // Cycles where backpressure increased. + unsigned PressureIncreaseCycles; + // Cycles where backpressure increased because of pipeline pressure. + unsigned ResourcePressureCycles; + // Cycles where backpressure increased because of data dependencies. + unsigned DataDependencyCycles; + // Cycles where backpressure increased because of register dependencies. + unsigned RegisterDependencyCycles; + // Cycles where backpressure increased because of memory dependencies. + unsigned MemoryDependencyCycles; + }; + BackPressureInfo BPI; + + // Used to populate the dependency graph DG. 
+ void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy); + void addMemoryDep(unsigned From, unsigned To, unsigned Cy); + void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy); + + // Prints a bottleneck message to OS. + void printBottleneckHints(raw_ostream &OS) const; + void printCriticalSequence(raw_ostream &OS) const; + +public: + BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP, + ArrayRef<MCInst> Sequence, unsigned Iterations); + + void onCycleEnd() override; + void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; } + void onEvent(const HWPressureEvent &Event) override; + void onEvent(const HWInstructionEvent &Event) override; + + void printView(raw_ostream &OS) const override; + +#ifndef NDEBUG + void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); } +#endif +}; + +} // namespace mca +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-mca/Views/DispatchStatistics.cpp b/llvm/tools/llvm-mca/Views/DispatchStatistics.cpp new file mode 100644 index 000000000000..557b8ba17b17 --- /dev/null +++ b/llvm/tools/llvm-mca/Views/DispatchStatistics.cpp @@ -0,0 +1,85 @@ +//===--------------------- DispatchStatistics.cpp ---------------------*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements the DispatchStatistics interface. +/// +//===----------------------------------------------------------------------===// + +#include "Views/DispatchStatistics.h" +#include "llvm/Support/Format.h" + +namespace llvm { +namespace mca { + +void DispatchStatistics::onEvent(const HWStallEvent &Event) { + if (Event.Type < HWStallEvent::LastGenericEvent) + HWStalls[Event.Type]++; +} + +void DispatchStatistics::onEvent(const HWInstructionEvent &Event) { + if (Event.Type != HWInstructionEvent::Dispatched) + return; + + const auto &DE = static_cast<const HWInstructionDispatchedEvent &>(Event); + NumDispatched += DE.MicroOpcodes; +} + +void DispatchStatistics::printDispatchHistogram(raw_ostream &OS) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + TempStream << "\n\nDispatch Logic - " + << "number of cycles where we saw N micro opcodes dispatched:\n"; + TempStream << "[# dispatched], [# cycles]\n"; + for (const std::pair<unsigned, unsigned> &Entry : DispatchGroupSizePerCycle) { + double Percentage = ((double)Entry.second / NumCycles) * 100.0; + TempStream << " " << Entry.first << ", " << Entry.second + << " (" << format("%.1f", floor((Percentage * 10) + 0.5) / 10) + << "%)\n"; + } + + TempStream.flush(); + OS << Buffer; +} + +static void printStalls(raw_ostream &OS, unsigned NumStalls, + unsigned NumCycles) { + if (!NumStalls) { + OS << NumStalls; + return; + } + + double Percentage = ((double)NumStalls / NumCycles) * 100.0; + OS << NumStalls << " (" + << format("%.1f", floor((Percentage * 10) + 0.5) / 10) << "%)"; +} + +void DispatchStatistics::printDispatchStalls(raw_ostream &OS) const { + std::string Buffer; + raw_string_ostream SS(Buffer); + SS << "\n\nDynamic Dispatch Stall Cycles:\n"; + SS << "RAT - Register unavailable: "; + printStalls(SS, HWStalls[HWStallEvent::RegisterFileStall], NumCycles); + SS << "\nRCU - Retire tokens unavailable: "; + printStalls(SS, 
HWStalls[HWStallEvent::RetireControlUnitStall], NumCycles); + SS << "\nSCHEDQ - Scheduler full: "; + printStalls(SS, HWStalls[HWStallEvent::SchedulerQueueFull], NumCycles); + SS << "\nLQ - Load queue full: "; + printStalls(SS, HWStalls[HWStallEvent::LoadQueueFull], NumCycles); + SS << "\nSQ - Store queue full: "; + printStalls(SS, HWStalls[HWStallEvent::StoreQueueFull], NumCycles); + SS << "\nGROUP - Static restrictions on the dispatch group: "; + printStalls(SS, HWStalls[HWStallEvent::DispatchGroupStall], NumCycles); + SS << '\n'; + SS.flush(); + OS << Buffer; +} + +} // namespace mca +} // namespace llvm diff --git a/llvm/tools/llvm-mca/Views/DispatchStatistics.h b/llvm/tools/llvm-mca/Views/DispatchStatistics.h new file mode 100644 index 000000000000..07c0f5a4c68f --- /dev/null +++ b/llvm/tools/llvm-mca/Views/DispatchStatistics.h @@ -0,0 +1,85 @@ +//===--------------------- DispatchStatistics.h -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements a view that prints a few statistics related to the +/// dispatch logic. It collects and analyzes instruction dispatch events as +/// well as static/dynamic dispatch stall events. +/// +/// Example: +/// ======== +/// +/// Dynamic Dispatch Stall Cycles: +/// RAT - Register unavailable: 0 +/// RCU - Retire tokens unavailable: 0 +/// SCHEDQ - Scheduler full: 42 +/// LQ - Load queue full: 0 +/// SQ - Store queue full: 0 +/// GROUP - Static restrictions on the dispatch group: 0 +/// +/// +/// Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: +/// [# dispatched], [# cycles] +/// 0, 15 (11.5%) +/// 2, 4 (3.1%) +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_DISPATCHVIEW_H +#define LLVM_TOOLS_LLVM_MCA_DISPATCHVIEW_H + +#include "Views/View.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include <map> + +namespace llvm { +namespace mca { + +class DispatchStatistics : public View { + unsigned NumDispatched; + unsigned NumCycles; + + // Counts dispatch stall events caused by unavailability of resources. There + // is one counter for every generic stall kind (see class HWStallEvent). 
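Each stall counter is reported as a raw count plus a percentage of the simulated cycles, rounded to one decimal digit. A self-contained sketch of that formatting, with made-up counts:

#include <cmath>
#include <cstdio>

int main() {
  unsigned NumCycles = 610;
  unsigned SchedQFullCycles = 42; // e.g. cycles stalled on a full SCHEDQ
  if (!SchedQFullCycles) {
    std::printf("SCHEDQ - Scheduler full: 0\n");
  } else {
    double Percentage = (double)SchedQFullCycles / NumCycles * 100.0;
    // Same floor-based rounding to one decimal digit used by the views.
    std::printf("SCHEDQ - Scheduler full: %u (%.1f%%)\n", SchedQFullCycles,
                std::floor((Percentage * 10) + 0.5) / 10);
  }
  return 0;
}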
+ llvm::SmallVector<unsigned, 8> HWStalls; + + using Histogram = std::map<unsigned, unsigned>; + Histogram DispatchGroupSizePerCycle; + + void updateHistograms() { + DispatchGroupSizePerCycle[NumDispatched]++; + NumDispatched = 0; + } + + void printDispatchHistogram(llvm::raw_ostream &OS) const; + + void printDispatchStalls(llvm::raw_ostream &OS) const; + +public: + DispatchStatistics() + : NumDispatched(0), NumCycles(0), + HWStalls(HWStallEvent::LastGenericEvent) {} + + void onEvent(const HWStallEvent &Event) override; + + void onEvent(const HWInstructionEvent &Event) override; + + void onCycleBegin() override { NumCycles++; } + + void onCycleEnd() override { updateHistograms(); } + + void printView(llvm::raw_ostream &OS) const override { + printDispatchStalls(OS); + printDispatchHistogram(OS); + } +}; +} // namespace mca +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp b/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp new file mode 100644 index 000000000000..a6f9153b4945 --- /dev/null +++ b/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp @@ -0,0 +1,112 @@ +//===--------------------- InstructionInfoView.cpp --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements the InstructionInfoView API. +/// +//===----------------------------------------------------------------------===// + +#include "Views/InstructionInfoView.h" +#include "llvm/Support/FormattedStream.h" + +namespace llvm { +namespace mca { + +void InstructionInfoView::printView(raw_ostream &OS) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + const MCSchedModel &SM = STI.getSchedModel(); + + std::string Instruction; + raw_string_ostream InstrStream(Instruction); + + TempStream << "\n\nInstruction Info:\n"; + TempStream << "[1]: #uOps\n[2]: Latency\n[3]: RThroughput\n" + << "[4]: MayLoad\n[5]: MayStore\n[6]: HasSideEffects (U)\n"; + if (PrintEncodings) { + TempStream << "[7]: Encoding Size\n"; + TempStream << "\n[1] [2] [3] [4] [5] [6] [7] " + << "Encodings: Instructions:\n"; + } else { + TempStream << "\n[1] [2] [3] [4] [5] [6] Instructions:\n"; + } + + for (unsigned I = 0, E = Source.size(); I < E; ++I) { + const MCInst &Inst = Source[I]; + const MCInstrDesc &MCDesc = MCII.get(Inst.getOpcode()); + + // Obtain the scheduling class information from the instruction. + unsigned SchedClassID = MCDesc.getSchedClass(); + unsigned CPUID = SM.getProcessorID(); + + // Try to solve variant scheduling classes. + while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant()) + SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &Inst, CPUID); + + const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID); + unsigned NumMicroOpcodes = SCDesc.NumMicroOps; + unsigned Latency = MCSchedModel::computeInstrLatency(STI, SCDesc); + // Add extra latency due to delays in the forwarding data paths. 
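The variant-resolution loop above is the usual pattern for querying scheduling information for a concrete MCInst. A sketch of the same steps factored into a helper, assuming the standard MC headers and using only the calls that already appear in the surrounding code:

#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MC/MCSubtargetInfo.h"

// Resolve variant scheduling classes for Inst and return the final
// scheduling class descriptor, mirroring the loop in printView().
const llvm::MCSchedClassDesc &
resolveSchedClass(const llvm::MCSubtargetInfo &STI,
                  const llvm::MCInstrInfo &MCII, const llvm::MCInst &Inst) {
  const llvm::MCSchedModel &SM = STI.getSchedModel();
  unsigned SchedClassID = MCII.get(Inst.getOpcode()).getSchedClass();
  unsigned CPUID = SM.getProcessorID();
  while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant())
    SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &Inst, CPUID);
  return *SM.getSchedClassDesc(SchedClassID);
}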
+ Latency += MCSchedModel::getForwardingDelayCycles( + STI.getReadAdvanceEntries(SCDesc)); + Optional<double> RThroughput = + MCSchedModel::getReciprocalThroughput(STI, SCDesc); + + TempStream << ' ' << NumMicroOpcodes << " "; + if (NumMicroOpcodes < 10) + TempStream << " "; + else if (NumMicroOpcodes < 100) + TempStream << ' '; + TempStream << Latency << " "; + if (Latency < 10) + TempStream << " "; + else if (Latency < 100) + TempStream << ' '; + + if (RThroughput.hasValue()) { + double RT = RThroughput.getValue(); + TempStream << format("%.2f", RT) << ' '; + if (RT < 10.0) + TempStream << " "; + else if (RT < 100.0) + TempStream << ' '; + } else { + TempStream << " - "; + } + TempStream << (MCDesc.mayLoad() ? " * " : " "); + TempStream << (MCDesc.mayStore() ? " * " : " "); + TempStream << (MCDesc.hasUnmodeledSideEffects() ? " U " : " "); + + if (PrintEncodings) { + StringRef Encoding(CE.getEncoding(I)); + unsigned EncodingSize = Encoding.size(); + TempStream << " " << EncodingSize + << (EncodingSize < 10 ? " " : " "); + TempStream.flush(); + formatted_raw_ostream FOS(TempStream); + for (unsigned i = 0, e = Encoding.size(); i != e; ++i) + FOS << format("%02x ", (uint8_t)Encoding[i]); + FOS.PadToColumn(30); + FOS.flush(); + } + + MCIP.printInst(&Inst, InstrStream, "", STI); + InstrStream.flush(); + + // Consume any tabs or spaces at the beginning of the string. + StringRef Str(Instruction); + Str = Str.ltrim(); + TempStream << Str << '\n'; + Instruction = ""; + } + + TempStream.flush(); + OS << Buffer; +} +} // namespace mca. +} // namespace llvm diff --git a/llvm/tools/llvm-mca/Views/InstructionInfoView.h b/llvm/tools/llvm-mca/Views/InstructionInfoView.h new file mode 100644 index 000000000000..0e948304119f --- /dev/null +++ b/llvm/tools/llvm-mca/Views/InstructionInfoView.h @@ -0,0 +1,73 @@ +//===--------------------- InstructionInfoView.h ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements the instruction info view. +/// +/// The goal fo the instruction info view is to print the latency and reciprocal +/// throughput information for every instruction in the input sequence. +/// This section also reports extra information related to the number of micro +/// opcodes, and opcode properties (i.e. 'MayLoad', 'MayStore', 'HasSideEffects) +/// +/// Example: +/// +/// Instruction Info: +/// [1]: #uOps +/// [2]: Latency +/// [3]: RThroughput +/// [4]: MayLoad +/// [5]: MayStore +/// [6]: HasSideEffects +/// +/// [1] [2] [3] [4] [5] [6] Instructions: +/// 1 2 1.00 vmulps %xmm0, %xmm1, %xmm2 +/// 1 3 1.00 vhaddps %xmm2, %xmm2, %xmm3 +/// 1 3 1.00 vhaddps %xmm3, %xmm3, %xmm4 +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTIONINFOVIEW_H +#define LLVM_TOOLS_LLVM_MCA_INSTRUCTIONINFOVIEW_H + +#include "Views/View.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MCA/CodeEmitter.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "llvm-mca" + +namespace llvm { +namespace mca { + +/// A view that prints out generic instruction information. 
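When PrintEncodings is set, the encoding bytes are dumped as two hex digits each and the mnemonic is pushed out to a fixed column. A rough standalone approximation of that formatting; the bytes and the mnemonic are made up for illustration:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Made-up encoding bytes; a real driver would obtain them from CodeEmitter.
  std::vector<uint8_t> Encoding = {0xc5, 0xf0, 0x58, 0xd9};
  int Column = 0;
  for (uint8_t Byte : Encoding)
    Column += std::printf("%02x ", Byte);
  while (Column < 30) // roughly what FOS.PadToColumn(30) does above
    Column += std::printf(" ");
  std::printf("vmulps %%xmm0, %%xmm1, %%xmm2\n");
  return 0;
}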
+class InstructionInfoView : public View { + const llvm::MCSubtargetInfo &STI; + const llvm::MCInstrInfo &MCII; + CodeEmitter &CE; + bool PrintEncodings; + llvm::ArrayRef<llvm::MCInst> Source; + llvm::MCInstPrinter &MCIP; + +public: + InstructionInfoView(const llvm::MCSubtargetInfo &ST, + const llvm::MCInstrInfo &II, CodeEmitter &C, + bool ShouldPrintEncodings, llvm::ArrayRef<llvm::MCInst> S, + llvm::MCInstPrinter &IP) + : STI(ST), MCII(II), CE(C), PrintEncodings(ShouldPrintEncodings), + Source(S), MCIP(IP) {} + + void printView(llvm::raw_ostream &OS) const override; +}; +} // namespace mca +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-mca/Views/RegisterFileStatistics.cpp b/llvm/tools/llvm-mca/Views/RegisterFileStatistics.cpp new file mode 100644 index 000000000000..58736ee0d18c --- /dev/null +++ b/llvm/tools/llvm-mca/Views/RegisterFileStatistics.cpp @@ -0,0 +1,167 @@ +//===--------------------- RegisterFileStatistics.cpp -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements the RegisterFileStatistics interface. +/// +//===----------------------------------------------------------------------===// + +#include "Views/RegisterFileStatistics.h" +#include "llvm/Support/Format.h" + +namespace llvm { +namespace mca { + +RegisterFileStatistics::RegisterFileStatistics(const MCSubtargetInfo &sti) + : STI(sti) { + const MCSchedModel &SM = STI.getSchedModel(); + RegisterFileUsage RFUEmpty = {0, 0, 0}; + MoveEliminationInfo MEIEmpty = {0, 0, 0, 0, 0}; + if (!SM.hasExtraProcessorInfo()) { + // Assume a single register file. + PRFUsage.emplace_back(RFUEmpty); + MoveElimInfo.emplace_back(MEIEmpty); + return; + } + + // Initialize a RegisterFileUsage for every user defined register file, plus + // the default register file which is always at index #0. + const MCExtraProcessorInfo &PI = SM.getExtraProcessorInfo(); + // There is always an "InvalidRegisterFile" entry in tablegen. That entry can + // be skipped. If there are no user defined register files, then reserve a + // single entry for the default register file at index #0. 
+ unsigned NumRegFiles = std::max(PI.NumRegisterFiles, 1U); + + PRFUsage.resize(NumRegFiles); + std::fill(PRFUsage.begin(), PRFUsage.end(), RFUEmpty); + + MoveElimInfo.resize(NumRegFiles); + std::fill(MoveElimInfo.begin(), MoveElimInfo.end(), MEIEmpty); +} + +void RegisterFileStatistics::updateRegisterFileUsage( + ArrayRef<unsigned> UsedPhysRegs) { + for (unsigned I = 0, E = PRFUsage.size(); I < E; ++I) { + RegisterFileUsage &RFU = PRFUsage[I]; + unsigned NumUsedPhysRegs = UsedPhysRegs[I]; + RFU.CurrentlyUsedMappings += NumUsedPhysRegs; + RFU.TotalMappings += NumUsedPhysRegs; + RFU.MaxUsedMappings = + std::max(RFU.MaxUsedMappings, RFU.CurrentlyUsedMappings); + } +} + +void RegisterFileStatistics::updateMoveElimInfo(const Instruction &Inst) { + if (!Inst.isOptimizableMove()) + return; + + assert(Inst.getDefs().size() == 1 && "Expected a single definition!"); + assert(Inst.getUses().size() == 1 && "Expected a single register use!"); + const WriteState &WS = Inst.getDefs()[0]; + const ReadState &RS = Inst.getUses()[0]; + + MoveEliminationInfo &Info = + MoveElimInfo[Inst.getDefs()[0].getRegisterFileID()]; + Info.TotalMoveEliminationCandidates++; + if (WS.isEliminated()) + Info.CurrentMovesEliminated++; + if (WS.isWriteZero() && RS.isReadZero()) + Info.TotalMovesThatPropagateZero++; +} + +void RegisterFileStatistics::onEvent(const HWInstructionEvent &Event) { + switch (Event.Type) { + default: + break; + case HWInstructionEvent::Retired: { + const auto &RE = static_cast<const HWInstructionRetiredEvent &>(Event); + for (unsigned I = 0, E = PRFUsage.size(); I < E; ++I) + PRFUsage[I].CurrentlyUsedMappings -= RE.FreedPhysRegs[I]; + break; + } + case HWInstructionEvent::Dispatched: { + const auto &DE = static_cast<const HWInstructionDispatchedEvent &>(Event); + updateRegisterFileUsage(DE.UsedPhysRegs); + updateMoveElimInfo(*DE.IR.getInstruction()); + } + } +} + +void RegisterFileStatistics::onCycleEnd() { + for (MoveEliminationInfo &MEI : MoveElimInfo) { + unsigned &CurrentMax = MEI.MaxMovesEliminatedPerCycle; + CurrentMax = std::max(CurrentMax, MEI.CurrentMovesEliminated); + MEI.TotalMovesEliminated += MEI.CurrentMovesEliminated; + MEI.CurrentMovesEliminated = 0; + } +} + +void RegisterFileStatistics::printView(raw_ostream &OS) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + + TempStream << "\n\nRegister File statistics:"; + const RegisterFileUsage &GlobalUsage = PRFUsage[0]; + TempStream << "\nTotal number of mappings created: " + << GlobalUsage.TotalMappings; + TempStream << "\nMax number of mappings used: " + << GlobalUsage.MaxUsedMappings << '\n'; + + for (unsigned I = 1, E = PRFUsage.size(); I < E; ++I) { + const RegisterFileUsage &RFU = PRFUsage[I]; + // Obtain the register file descriptor from the scheduling model. + assert(STI.getSchedModel().hasExtraProcessorInfo() && + "Unable to find register file info!"); + const MCExtraProcessorInfo &PI = + STI.getSchedModel().getExtraProcessorInfo(); + assert(I <= PI.NumRegisterFiles && "Unexpected register file index!"); + const MCRegisterFileDesc &RFDesc = PI.RegisterFiles[I]; + // Skip invalid register files. 
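The per-register-file numbers reported by printView() come from simple bookkeeping: mappings are claimed at dispatch, released at retire, and a running maximum is kept. A minimal standalone version of that accounting, with illustrative names:

#include <algorithm>

struct MappingUsage {
  unsigned TotalMappings = 0;
  unsigned CurrentlyUsed = 0;
  unsigned MaxUsed = 0;
};

// Called when a dispatched instruction allocates physical registers.
void onDispatch(MappingUsage &U, unsigned NewMappings) {
  U.TotalMappings += NewMappings;
  U.CurrentlyUsed += NewMappings;
  U.MaxUsed = std::max(U.MaxUsed, U.CurrentlyUsed);
}

// Called when a retired instruction frees its physical registers.
void onRetire(MappingUsage &U, unsigned FreedMappings) {
  U.CurrentlyUsed -= FreedMappings;
}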
+ if (!RFDesc.NumPhysRegs) + continue; + + TempStream << "\n* Register File #" << I; + TempStream << " -- " << StringRef(RFDesc.Name) << ':'; + TempStream << "\n Number of physical registers: "; + if (!RFDesc.NumPhysRegs) + TempStream << "unbounded"; + else + TempStream << RFDesc.NumPhysRegs; + TempStream << "\n Total number of mappings created: " + << RFU.TotalMappings; + TempStream << "\n Max number of mappings used: " + << RFU.MaxUsedMappings << '\n'; + const MoveEliminationInfo &MEI = MoveElimInfo[I]; + + if (MEI.TotalMoveEliminationCandidates) { + TempStream << " Number of optimizable moves: " + << MEI.TotalMoveEliminationCandidates; + double EliminatedMovProportion = (double)MEI.TotalMovesEliminated / + MEI.TotalMoveEliminationCandidates * + 100.0; + double ZeroMovProportion = (double)MEI.TotalMovesThatPropagateZero / + MEI.TotalMoveEliminationCandidates * 100.0; + TempStream << "\n Number of moves eliminated: " + << MEI.TotalMovesEliminated << " " + << format("(%.1f%%)", + floor((EliminatedMovProportion * 10) + 0.5) / 10); + TempStream << "\n Number of zero moves: " + << MEI.TotalMovesThatPropagateZero << " " + << format("(%.1f%%)", + floor((ZeroMovProportion * 10) + 0.5) / 10); + TempStream << "\n Max moves eliminated per cycle: " + << MEI.MaxMovesEliminatedPerCycle << '\n'; + } + } + + TempStream.flush(); + OS << Buffer; +} + +} // namespace mca +} // namespace llvm diff --git a/llvm/tools/llvm-mca/Views/RegisterFileStatistics.h b/llvm/tools/llvm-mca/Views/RegisterFileStatistics.h new file mode 100644 index 000000000000..a2273dd48b22 --- /dev/null +++ b/llvm/tools/llvm-mca/Views/RegisterFileStatistics.h @@ -0,0 +1,80 @@ +//===--------------------- RegisterFileStatistics.h -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This view collects and prints register file usage statistics. +/// +/// Example (-mcpu=btver2): +/// ======================== +/// +/// Register File statistics: +/// Total number of mappings created: 6 +/// Max number of mappings used: 3 +/// +/// * Register File #1 -- FpuPRF: +/// Number of physical registers: 72 +/// Total number of mappings created: 0 +/// Max number of mappings used: 0 +/// Number of optimizable moves: 200 +/// Number of moves eliminated: 200 (100.0%) +/// Number of zero moves: 200 (100.0%) +/// Max moves eliminated per cycle: 2 +/// +/// * Register File #2 -- IntegerPRF: +/// Number of physical registers: 64 +/// Total number of mappings created: 6 +/// Max number of mappings used: 3 +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_REGISTERFILESTATISTICS_H +#define LLVM_TOOLS_LLVM_MCA_REGISTERFILESTATISTICS_H + +#include "Views/View.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/MC/MCSubtargetInfo.h" + +namespace llvm { +namespace mca { + +class RegisterFileStatistics : public View { + const llvm::MCSubtargetInfo &STI; + + // Used to track the number of physical registers used in a register file. 
+ struct RegisterFileUsage { + unsigned TotalMappings; + unsigned MaxUsedMappings; + unsigned CurrentlyUsedMappings; + }; + + struct MoveEliminationInfo { + unsigned TotalMoveEliminationCandidates; + unsigned TotalMovesEliminated; + unsigned TotalMovesThatPropagateZero; + unsigned MaxMovesEliminatedPerCycle; + unsigned CurrentMovesEliminated; + }; + + // There is one entry for each register file implemented by the processor. + llvm::SmallVector<RegisterFileUsage, 4> PRFUsage; + llvm::SmallVector<MoveEliminationInfo, 4> MoveElimInfo; + + void updateRegisterFileUsage(ArrayRef<unsigned> UsedPhysRegs); + void updateMoveElimInfo(const Instruction &Inst); + +public: + RegisterFileStatistics(const llvm::MCSubtargetInfo &sti); + + void onCycleEnd() override; + void onEvent(const HWInstructionEvent &Event) override; + void printView(llvm::raw_ostream &OS) const override; +}; +} // namespace mca +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-mca/Views/ResourcePressureView.cpp b/llvm/tools/llvm-mca/Views/ResourcePressureView.cpp new file mode 100644 index 000000000000..38a2478cf4fe --- /dev/null +++ b/llvm/tools/llvm-mca/Views/ResourcePressureView.cpp @@ -0,0 +1,184 @@ +//===--------------------- ResourcePressureView.cpp -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements methods in the ResourcePressureView interface. +/// +//===----------------------------------------------------------------------===// + +#include "Views/ResourcePressureView.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +namespace mca { + +ResourcePressureView::ResourcePressureView(const llvm::MCSubtargetInfo &sti, + MCInstPrinter &Printer, + ArrayRef<MCInst> S) + : STI(sti), MCIP(Printer), Source(S), LastInstructionIdx(0) { + // Populate the map of resource descriptors. + unsigned R2VIndex = 0; + const MCSchedModel &SM = STI.getSchedModel(); + for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) { + const MCProcResourceDesc &ProcResource = *SM.getProcResource(I); + unsigned NumUnits = ProcResource.NumUnits; + // Skip groups and invalid resources with zero units. + if (ProcResource.SubUnitsIdxBegin || !NumUnits) + continue; + + Resource2VecIndex.insert(std::pair<unsigned, unsigned>(I, R2VIndex)); + R2VIndex += ProcResource.NumUnits; + } + + NumResourceUnits = R2VIndex; + ResourceUsage.resize(NumResourceUnits * (Source.size() + 1)); + std::fill(ResourceUsage.begin(), ResourceUsage.end(), 0.0); +} + +void ResourcePressureView::onEvent(const HWInstructionEvent &Event) { + if (Event.Type == HWInstructionEvent::Dispatched) { + LastInstructionIdx = Event.IR.getSourceIndex(); + return; + } + + // We're only interested in Issue events. 
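ResourceUsage is laid out as a flattened matrix: one row per source instruction plus one extra row that accumulates totals for the whole block, indexed as unit + NumResourceUnits * row. A small standalone sketch of that indexing scheme, with made-up sizes and values:

#include <vector>

int main() {
  const unsigned NumResourceUnits = 10; // e.g. JALU0..JVIMUL on btver2
  const unsigned NumInstructions = 4;
  std::vector<double> ResourceUsage(NumResourceUnits * (NumInstructions + 1),
                                    0.0);

  auto AddUsage = [&](unsigned UnitIdx, unsigned InstIdx, double Cycles) {
    ResourceUsage[UnitIdx + NumResourceUnits * InstIdx] += Cycles;
    ResourceUsage[UnitIdx + NumResourceUnits * NumInstructions] += Cycles;
  };

  AddUsage(/*UnitIdx=*/5, /*InstIdx=*/0, 1.0); // instruction 0, one cycle on unit #5
  AddUsage(/*UnitIdx=*/4, /*InstIdx=*/1, 1.0); // instruction 1, one cycle on unit #4

  double TotalOnUnit5 = ResourceUsage[5 + NumResourceUnits * NumInstructions];
  return TotalOnUnit5 > 0.0 ? 0 : 1;
}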
+ if (Event.Type != HWInstructionEvent::Issued) + return; + + const auto &IssueEvent = static_cast<const HWInstructionIssuedEvent &>(Event); + const unsigned SourceIdx = Event.IR.getSourceIndex() % Source.size(); + for (const std::pair<ResourceRef, ResourceCycles> &Use : + IssueEvent.UsedResources) { + const ResourceRef &RR = Use.first; + assert(Resource2VecIndex.find(RR.first) != Resource2VecIndex.end()); + unsigned R2VIndex = Resource2VecIndex[RR.first]; + R2VIndex += countTrailingZeros(RR.second); + ResourceUsage[R2VIndex + NumResourceUnits * SourceIdx] += Use.second; + ResourceUsage[R2VIndex + NumResourceUnits * Source.size()] += Use.second; + } +} + +static void printColumnNames(formatted_raw_ostream &OS, + const MCSchedModel &SM) { + unsigned Column = OS.getColumn(); + for (unsigned I = 1, ResourceIndex = 0, E = SM.getNumProcResourceKinds(); + I < E; ++I) { + const MCProcResourceDesc &ProcResource = *SM.getProcResource(I); + unsigned NumUnits = ProcResource.NumUnits; + // Skip groups and invalid resources with zero units. + if (ProcResource.SubUnitsIdxBegin || !NumUnits) + continue; + + for (unsigned J = 0; J < NumUnits; ++J) { + Column += 7; + OS << "[" << ResourceIndex; + if (NumUnits > 1) + OS << '.' << J; + OS << ']'; + OS.PadToColumn(Column); + } + + ResourceIndex++; + } +} + +static void printResourcePressure(formatted_raw_ostream &OS, double Pressure, + unsigned Col) { + if (!Pressure || Pressure < 0.005) { + OS << " - "; + } else { + // Round to the value to the nearest hundredth and then print it. + OS << format("%.2f", floor((Pressure * 100) + 0.5) / 100); + } + OS.PadToColumn(Col); +} + +void ResourcePressureView::printResourcePressurePerIter(raw_ostream &OS) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + formatted_raw_ostream FOS(TempStream); + + FOS << "\n\nResources:\n"; + const MCSchedModel &SM = STI.getSchedModel(); + for (unsigned I = 1, ResourceIndex = 0, E = SM.getNumProcResourceKinds(); + I < E; ++I) { + const MCProcResourceDesc &ProcResource = *SM.getProcResource(I); + unsigned NumUnits = ProcResource.NumUnits; + // Skip groups and invalid resources with zero units. + if (ProcResource.SubUnitsIdxBegin || !NumUnits) + continue; + + for (unsigned J = 0; J < NumUnits; ++J) { + FOS << '[' << ResourceIndex; + if (NumUnits > 1) + FOS << '.' 
<< J; + FOS << ']'; + FOS.PadToColumn(6); + FOS << "- " << ProcResource.Name << '\n'; + } + + ResourceIndex++; + } + + FOS << "\n\nResource pressure per iteration:\n"; + FOS.flush(); + printColumnNames(FOS, SM); + FOS << '\n'; + FOS.flush(); + + const unsigned Executions = LastInstructionIdx / Source.size() + 1; + for (unsigned I = 0, E = NumResourceUnits; I < E; ++I) { + double Usage = ResourceUsage[I + Source.size() * E]; + printResourcePressure(FOS, Usage / Executions, (I + 1) * 7); + } + + FOS.flush(); + OS << Buffer; +} + +void ResourcePressureView::printResourcePressurePerInst(raw_ostream &OS) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + formatted_raw_ostream FOS(TempStream); + + FOS << "\n\nResource pressure by instruction:\n"; + printColumnNames(FOS, STI.getSchedModel()); + FOS << "Instructions:\n"; + + std::string Instruction; + raw_string_ostream InstrStream(Instruction); + + unsigned InstrIndex = 0; + const unsigned Executions = LastInstructionIdx / Source.size() + 1; + for (const MCInst &MCI : Source) { + unsigned BaseEltIdx = InstrIndex * NumResourceUnits; + for (unsigned J = 0; J < NumResourceUnits; ++J) { + double Usage = ResourceUsage[J + BaseEltIdx]; + printResourcePressure(FOS, Usage / Executions, (J + 1) * 7); + } + + MCIP.printInst(&MCI, InstrStream, "", STI); + InstrStream.flush(); + StringRef Str(Instruction); + + // Remove any tabs or spaces at the beginning of the instruction. + Str = Str.ltrim(); + + FOS << Str << '\n'; + Instruction = ""; + + FOS.flush(); + OS << Buffer; + Buffer = ""; + + ++InstrIndex; + } +} +} // namespace mca +} // namespace llvm diff --git a/llvm/tools/llvm-mca/Views/ResourcePressureView.h b/llvm/tools/llvm-mca/Views/ResourcePressureView.h new file mode 100644 index 000000000000..0fa0b9a36aa3 --- /dev/null +++ b/llvm/tools/llvm-mca/Views/ResourcePressureView.h @@ -0,0 +1,103 @@ +//===--------------------- ResourcePressureView.h ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file define class ResourcePressureView. +/// Class ResourcePressureView observes hardware events generated by +/// the Pipeline object and collects statistics related to resource usage at +/// instruction granularity. +/// Resource pressure information is then printed out to a stream in the +/// form of a table like the one from the example below: +/// +/// Resources: +/// [0] - JALU0 +/// [1] - JALU1 +/// [2] - JDiv +/// [3] - JFPM +/// [4] - JFPU0 +/// [5] - JFPU1 +/// [6] - JLAGU +/// [7] - JSAGU +/// [8] - JSTC +/// [9] - JVIMUL +/// +/// Resource pressure per iteration: +/// [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] +/// 0.00 0.00 0.00 0.00 2.00 2.00 0.00 0.00 0.00 0.00 +/// +/// Resource pressure by instruction: +/// [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: +/// - - - - - 1.00 - - - - vpermilpd $1, %xmm0, +/// %xmm1 +/// - - - - 1.00 - - - - - vaddps %xmm0, %xmm1, +/// %xmm2 +/// - - - - - 1.00 - - - - vmovshdup %xmm2, %xmm3 +/// - - - - 1.00 - - - - - vaddss %xmm2, %xmm3, +/// %xmm4 +/// +/// In this example, we have AVX code executed on AMD Jaguar (btver2). +/// Both shuffles and vector floating point add operations on XMM registers have +/// a reciprocal throughput of 1cy. 
+/// Each add is issued to pipeline JFPU0, while each shuffle is issued to +/// pipeline JFPU1. The overall pressure per iteration is reported by two +/// tables: the first smaller table is the resource pressure per iteration; +/// the second table reports resource pressure per instruction. Values are the +/// average resource cycles consumed by an instruction. +/// Every vector add from the example uses resource JFPU0 for an average of 1cy +/// per iteration. Consequently, the resource pressure on JFPU0 is of 2cy per +/// iteration. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H +#define LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H + +#include "Views/View.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCSubtargetInfo.h" + +namespace llvm { +namespace mca { + +/// This class collects resource pressure statistics and it is able to print +/// out all the collected information as a table to an output stream. +class ResourcePressureView : public View { + const llvm::MCSubtargetInfo &STI; + llvm::MCInstPrinter &MCIP; + llvm::ArrayRef<llvm::MCInst> Source; + unsigned LastInstructionIdx; + + // Map to quickly obtain the ResourceUsage column index from a processor + // resource ID. + llvm::DenseMap<unsigned, unsigned> Resource2VecIndex; + + // Table of resources used by instructions. + std::vector<ResourceCycles> ResourceUsage; + unsigned NumResourceUnits; + + void printResourcePressurePerIter(llvm::raw_ostream &OS) const; + void printResourcePressurePerInst(llvm::raw_ostream &OS) const; + +public: + ResourcePressureView(const llvm::MCSubtargetInfo &sti, + llvm::MCInstPrinter &Printer, + llvm::ArrayRef<llvm::MCInst> S); + + void onEvent(const HWInstructionEvent &Event) override; + void printView(llvm::raw_ostream &OS) const override { + printResourcePressurePerIter(OS); + printResourcePressurePerInst(OS); + } +}; +} // namespace mca +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp b/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp new file mode 100644 index 000000000000..cb4fbae78039 --- /dev/null +++ b/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp @@ -0,0 +1,90 @@ +//===--------------------- RetireControlUnitStatistics.cpp ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements the RetireControlUnitStatistics interface. 
+/// +//===----------------------------------------------------------------------===// + +#include "Views/RetireControlUnitStatistics.h" +#include "llvm/Support/Format.h" + +namespace llvm { +namespace mca { + +RetireControlUnitStatistics::RetireControlUnitStatistics(const MCSchedModel &SM) + : NumRetired(0), NumCycles(0), EntriesInUse(0), MaxUsedEntries(0), + SumOfUsedEntries(0) { + TotalROBEntries = SM.MicroOpBufferSize; + if (SM.hasExtraProcessorInfo()) { + const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo(); + if (EPI.ReorderBufferSize) + TotalROBEntries = EPI.ReorderBufferSize; + } +} + +void RetireControlUnitStatistics::onEvent(const HWInstructionEvent &Event) { + if (Event.Type == HWInstructionEvent::Dispatched) { + unsigned NumEntries = + static_cast<const HWInstructionDispatchedEvent &>(Event).MicroOpcodes; + EntriesInUse += NumEntries; + } + + if (Event.Type == HWInstructionEvent::Retired) { + unsigned ReleasedEntries = Event.IR.getInstruction()->getDesc().NumMicroOps; + assert(EntriesInUse >= ReleasedEntries && "Invalid internal state!"); + EntriesInUse -= ReleasedEntries; + ++NumRetired; + } +} + +void RetireControlUnitStatistics::onCycleEnd() { + // Update histogram + RetiredPerCycle[NumRetired]++; + NumRetired = 0; + ++NumCycles; + MaxUsedEntries = std::max(MaxUsedEntries, EntriesInUse); + SumOfUsedEntries += EntriesInUse; +} + +void RetireControlUnitStatistics::printView(raw_ostream &OS) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + TempStream << "\n\nRetire Control Unit - " + << "number of cycles where we saw N instructions retired:\n"; + TempStream << "[# retired], [# cycles]\n"; + + for (const std::pair<unsigned, unsigned> &Entry : RetiredPerCycle) { + TempStream << " " << Entry.first; + if (Entry.first < 10) + TempStream << ", "; + else + TempStream << ", "; + TempStream << Entry.second << " (" + << format("%.1f", ((double)Entry.second / NumCycles) * 100.0) + << "%)\n"; + } + + unsigned AvgUsage = (double)SumOfUsedEntries / NumCycles; + double MaxUsagePercentage = ((double)MaxUsedEntries / TotalROBEntries) * 100.0; + double NormalizedMaxPercentage = floor((MaxUsagePercentage * 10) + 0.5) / 10; + double AvgUsagePercentage = ((double)AvgUsage / TotalROBEntries) * 100.0; + double NormalizedAvgPercentage = floor((AvgUsagePercentage * 10) + 0.5) / 10; + + TempStream << "\nTotal ROB Entries: " << TotalROBEntries + << "\nMax Used ROB Entries: " << MaxUsedEntries + << format(" ( %.1f%% )", NormalizedMaxPercentage) + << "\nAverage Used ROB Entries per cy: " << AvgUsage + << format(" ( %.1f%% )\n", NormalizedAvgPercentage); + + TempStream.flush(); + OS << Buffer; +} + +} // namespace mca +} // namespace llvm diff --git a/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.h b/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.h new file mode 100644 index 000000000000..1a4d3dec5c56 --- /dev/null +++ b/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.h @@ -0,0 +1,60 @@ +//===--------------------- RetireControlUnitStatistics.h --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines class RetireControlUnitStatistics: a view that knows how +/// to print general statistics related to the retire control unit. 
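Behind this view there is only a handful of counters: micro-opcodes claim ROB entries at dispatch, release them at retire, and every cycle contributes one sample to the maximum/average report. A minimal standalone sketch of that bookkeeping, not the actual implementation:

#include <algorithm>
#include <cstdint>

struct ROBUsage {
  unsigned EntriesInUse = 0;
  unsigned MaxUsedEntries = 0;
  uint64_t SumOfUsedEntries = 0;
  uint64_t NumCycles = 0;

  void onDispatch(unsigned MicroOps) { EntriesInUse += MicroOps; }
  void onRetire(unsigned MicroOps) { EntriesInUse -= MicroOps; }
  void onCycleEnd() {
    MaxUsedEntries = std::max(MaxUsedEntries, EntriesInUse);
    SumOfUsedEntries += EntriesInUse;
    ++NumCycles;
  }
  double averageUsedEntries() const {
    return NumCycles ? double(SumOfUsedEntries) / NumCycles : 0.0;
  }
};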
+/// +/// Example: +/// ======== +/// +/// Retire Control Unit - number of cycles where we saw N instructions retired: +/// [# retired], [# cycles] +/// 0, 109 (17.9%) +/// 1, 102 (16.7%) +/// 2, 399 (65.4%) +/// +/// Total ROB Entries: 64 +/// Max Used ROB Entries: 35 ( 54.7% ) +/// Average Used ROB Entries per cy: 32 ( 50.0% ) +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_RETIRECONTROLUNITSTATISTICS_H +#define LLVM_TOOLS_LLVM_MCA_RETIRECONTROLUNITSTATISTICS_H + +#include "Views/View.h" +#include "llvm/MC/MCSchedule.h" +#include <map> + +namespace llvm { +namespace mca { + +class RetireControlUnitStatistics : public View { + using Histogram = std::map<unsigned, unsigned>; + Histogram RetiredPerCycle; + + unsigned NumRetired; + unsigned NumCycles; + unsigned TotalROBEntries; + unsigned EntriesInUse; + unsigned MaxUsedEntries; + unsigned SumOfUsedEntries; + +public: + RetireControlUnitStatistics(const MCSchedModel &SM); + + void onEvent(const HWInstructionEvent &Event) override; + void onCycleEnd() override; + void printView(llvm::raw_ostream &OS) const override; +}; + +} // namespace mca +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp new file mode 100644 index 000000000000..bd0ba350ab68 --- /dev/null +++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp @@ -0,0 +1,178 @@ +//===--------------------- SchedulerStatistics.cpp --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements the SchedulerStatistics interface. +/// +//===----------------------------------------------------------------------===// + +#include "Views/SchedulerStatistics.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/FormattedStream.h" + +namespace llvm { +namespace mca { + +SchedulerStatistics::SchedulerStatistics(const llvm::MCSubtargetInfo &STI) + : SM(STI.getSchedModel()), LQResourceID(0), SQResourceID(0), NumIssued(0), + NumCycles(0), MostRecentLoadDispatched(~0U), + MostRecentStoreDispatched(~0U), + Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) { + if (SM.hasExtraProcessorInfo()) { + const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo(); + LQResourceID = EPI.LoadQueueID; + SQResourceID = EPI.StoreQueueID; + } +} + +// FIXME: This implementation works under the assumption that load/store queue +// entries are reserved at 'instruction dispatched' stage, and released at +// 'instruction executed' stage. This currently matches the behavior of LSUnit. +// +// The current design minimizes the number of events generated by the +// Dispatch/Execute stages, at the cost of doing extra bookkeeping in method +// `onEvent`. However, it introduces a subtle dependency between this view and +// how the LSUnit works. +// +// In future we should add a new "memory queue" event type, so that we stop +// making assumptions on how LSUnit internally works (See PR39828). 
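As the FIXME above notes, a load/store queue slot is assumed to be taken when the instruction is dispatched and handed back once it has executed. A standalone sketch of that assumption for a single queue; the field names mirror the BufferUsage counters used by this view, but the code is only illustrative:

#include <algorithm>
#include <cstdint>

struct QueueUsage {
  unsigned SlotsInUse = 0;
  unsigned MaxUsedSlots = 0;
  uint64_t CumulativeNumUsedSlots = 0;
};

// A dispatched load reserves one slot; the slot is released once the load
// has executed. Per-cycle sampling feeds the average/maximum report.
void onLoadDispatched(QueueUsage &LQ) { ++LQ.SlotsInUse; }
void onLoadExecuted(QueueUsage &LQ) { --LQ.SlotsInUse; }
void onCycleEnd(QueueUsage &LQ) {
  LQ.CumulativeNumUsedSlots += LQ.SlotsInUse;
  LQ.MaxUsedSlots = std::max(LQ.MaxUsedSlots, LQ.SlotsInUse);
}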
+void SchedulerStatistics::onEvent(const HWInstructionEvent &Event) { + if (Event.Type == HWInstructionEvent::Issued) { + const Instruction &Inst = *Event.IR.getInstruction(); + NumIssued += Inst.getDesc().NumMicroOps; + } else if (Event.Type == HWInstructionEvent::Dispatched) { + const Instruction &Inst = *Event.IR.getInstruction(); + const unsigned Index = Event.IR.getSourceIndex(); + if (LQResourceID && Inst.getDesc().MayLoad && + MostRecentLoadDispatched != Index) { + Usage[LQResourceID].SlotsInUse++; + MostRecentLoadDispatched = Index; + } + if (SQResourceID && Inst.getDesc().MayStore && + MostRecentStoreDispatched != Index) { + Usage[SQResourceID].SlotsInUse++; + MostRecentStoreDispatched = Index; + } + } else if (Event.Type == HWInstructionEvent::Executed) { + const Instruction &Inst = *Event.IR.getInstruction(); + if (LQResourceID && Inst.getDesc().MayLoad) { + assert(Usage[LQResourceID].SlotsInUse); + Usage[LQResourceID].SlotsInUse--; + } + if (SQResourceID && Inst.getDesc().MayStore) { + assert(Usage[SQResourceID].SlotsInUse); + Usage[SQResourceID].SlotsInUse--; + } + } +} + +void SchedulerStatistics::onReservedBuffers(const InstRef & /* unused */, + ArrayRef<unsigned> Buffers) { + for (const unsigned Buffer : Buffers) { + if (Buffer == LQResourceID || Buffer == SQResourceID) + continue; + Usage[Buffer].SlotsInUse++; + } +} + +void SchedulerStatistics::onReleasedBuffers(const InstRef & /* unused */, + ArrayRef<unsigned> Buffers) { + for (const unsigned Buffer : Buffers) { + if (Buffer == LQResourceID || Buffer == SQResourceID) + continue; + Usage[Buffer].SlotsInUse--; + } +} + +void SchedulerStatistics::updateHistograms() { + for (BufferUsage &BU : Usage) { + BU.CumulativeNumUsedSlots += BU.SlotsInUse; + BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse); + } + + IssueWidthPerCycle[NumIssued]++; + NumIssued = 0; +} + +void SchedulerStatistics::printSchedulerStats(raw_ostream &OS) const { + OS << "\n\nSchedulers - " + << "number of cycles where we saw N micro opcodes issued:\n"; + OS << "[# issued], [# cycles]\n"; + + bool HasColors = OS.has_colors(); + const auto It = + std::max_element(IssueWidthPerCycle.begin(), IssueWidthPerCycle.end()); + for (const std::pair<unsigned, unsigned> &Entry : IssueWidthPerCycle) { + unsigned NumIssued = Entry.first; + if (NumIssued == It->first && HasColors) + OS.changeColor(raw_ostream::SAVEDCOLOR, true, false); + + unsigned IPC = Entry.second; + OS << " " << NumIssued << ", " << IPC << " (" + << format("%.1f", ((double)IPC / NumCycles) * 100) << "%)\n"; + if (HasColors) + OS.resetColor(); + } +} + +void SchedulerStatistics::printSchedulerUsage(raw_ostream &OS) const { + assert(NumCycles && "Unexpected number of cycles!"); + + OS << "\nScheduler's queue usage:\n"; + if (all_of(Usage, [](const BufferUsage &BU) { return !BU.MaxUsedSlots; })) { + OS << "No scheduler resources used.\n"; + return; + } + + OS << "[1] Resource name.\n" + << "[2] Average number of used buffer entries.\n" + << "[3] Maximum number of used buffer entries.\n" + << "[4] Total number of buffer entries.\n\n" + << " [1] [2] [3] [4]\n"; + + formatted_raw_ostream FOS(OS); + bool HasColors = FOS.has_colors(); + for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) { + const MCProcResourceDesc &ProcResource = *SM.getProcResource(I); + if (ProcResource.BufferSize <= 0) + continue; + + const BufferUsage &BU = Usage[I]; + double AvgUsage = (double)BU.CumulativeNumUsedSlots / NumCycles; + double AlmostFullThreshold = (double)(ProcResource.BufferSize * 4) / 5; + 
unsigned NormalizedAvg = floor((AvgUsage * 10) + 0.5) / 10; + unsigned NormalizedThreshold = floor((AlmostFullThreshold * 10) + 0.5) / 10; + + FOS << ProcResource.Name; + FOS.PadToColumn(17); + if (HasColors && NormalizedAvg >= NormalizedThreshold) + FOS.changeColor(raw_ostream::YELLOW, true, false); + FOS << NormalizedAvg; + if (HasColors) + FOS.resetColor(); + FOS.PadToColumn(28); + if (HasColors && + BU.MaxUsedSlots == static_cast<unsigned>(ProcResource.BufferSize)) + FOS.changeColor(raw_ostream::RED, true, false); + FOS << BU.MaxUsedSlots; + if (HasColors) + FOS.resetColor(); + FOS.PadToColumn(39); + FOS << ProcResource.BufferSize << '\n'; + } + + FOS.flush(); +} + +void SchedulerStatistics::printView(raw_ostream &OS) const { + printSchedulerStats(OS); + printSchedulerUsage(OS); +} + +} // namespace mca +} // namespace llvm diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.h b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h new file mode 100644 index 000000000000..32711b4483b4 --- /dev/null +++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h @@ -0,0 +1,95 @@ +//===--------------------- SchedulerStatistics.h ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines class SchedulerStatistics. Class SchedulerStatistics is a +/// View that listens to instruction issue events in order to print general +/// statistics related to the hardware schedulers. +/// +/// Example: +/// ======== +/// +/// Schedulers - number of cycles where we saw N instructions issued: +/// [# issued], [# cycles] +/// 0, 6 (2.9%) +/// 1, 106 (50.7%) +/// 2, 97 (46.4%) +/// +/// Scheduler's queue usage: +/// [1] Resource name. +/// [2] Average number of used buffer entries. +/// [3] Maximum number of used buffer entries. +/// [4] Total number of buffer entries. +/// +/// [1] [2] [3] [4] +/// JALU01 0 0 20 +/// JFPU01 15 18 18 +/// JLSAGU 0 0 12 +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_SCHEDULERSTATISTICS_H +#define LLVM_TOOLS_LLVM_MCA_SCHEDULERSTATISTICS_H + +#include "Views/View.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include <map> + +namespace llvm { +namespace mca { + +class SchedulerStatistics final : public View { + const llvm::MCSchedModel &SM; + unsigned LQResourceID; + unsigned SQResourceID; + + unsigned NumIssued; + unsigned NumCycles; + + unsigned MostRecentLoadDispatched; + unsigned MostRecentStoreDispatched; + + // Tracks the usage of a scheduler's queue. + struct BufferUsage { + unsigned SlotsInUse; + unsigned MaxUsedSlots; + uint64_t CumulativeNumUsedSlots; + }; + + using Histogram = std::map<unsigned, unsigned>; + Histogram IssueWidthPerCycle; + + std::vector<BufferUsage> Usage; + + void updateHistograms(); + void printSchedulerStats(llvm::raw_ostream &OS) const; + void printSchedulerUsage(llvm::raw_ostream &OS) const; + +public: + SchedulerStatistics(const llvm::MCSubtargetInfo &STI); + void onEvent(const HWInstructionEvent &Event) override; + void onCycleBegin() override { NumCycles++; } + void onCycleEnd() override { updateHistograms(); } + + // Increases the number of used scheduler queue slots of every buffered + // resource in the Buffers set. 
+ void onReservedBuffers(const InstRef &IR, + llvm::ArrayRef<unsigned> Buffers) override; + + // Decreases by one the number of used scheduler queue slots of every + // buffered resource in the Buffers set. + void onReleasedBuffers(const InstRef &IR, + llvm::ArrayRef<unsigned> Buffers) override; + + void printView(llvm::raw_ostream &OS) const override; +}; +} // namespace mca +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-mca/Views/SummaryView.cpp b/llvm/tools/llvm-mca/Views/SummaryView.cpp new file mode 100644 index 000000000000..ef5550048f4c --- /dev/null +++ b/llvm/tools/llvm-mca/Views/SummaryView.cpp @@ -0,0 +1,94 @@ +//===--------------------- SummaryView.cpp -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements the functionalities used by the SummaryView to print +/// the report information. +/// +//===----------------------------------------------------------------------===// + +#include "Views/SummaryView.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/MCA/Support.h" +#include "llvm/Support/Format.h" + +namespace llvm { +namespace mca { + +#define DEBUG_TYPE "llvm-mca" + +SummaryView::SummaryView(const MCSchedModel &Model, ArrayRef<MCInst> S, + unsigned Width) + : SM(Model), Source(S), DispatchWidth(Width?Width: Model.IssueWidth), + LastInstructionIdx(0), + TotalCycles(0), NumMicroOps(0), + ProcResourceUsage(Model.getNumProcResourceKinds(), 0), + ProcResourceMasks(Model.getNumProcResourceKinds()), + ResIdx2ProcResID(Model.getNumProcResourceKinds(), 0) { + computeProcResourceMasks(SM, ProcResourceMasks); + for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) { + unsigned Index = getResourceStateIndex(ProcResourceMasks[I]); + ResIdx2ProcResID[Index] = I; + } +} + +void SummaryView::onEvent(const HWInstructionEvent &Event) { + if (Event.Type == HWInstructionEvent::Dispatched) + LastInstructionIdx = Event.IR.getSourceIndex(); + + // We are only interested in the "instruction retired" events generated by + // the retire stage for instructions that are part of iteration #0. + if (Event.Type != HWInstructionEvent::Retired || + Event.IR.getSourceIndex() >= Source.size()) + return; + + // Update the cumulative number of resource cycles based on the processor + // resource usage information available from the instruction descriptor. We + // need to compute the cumulative number of resource cycles for every + // processor resource which is consumed by an instruction of the block. 
+ const Instruction &Inst = *Event.IR.getInstruction(); + const InstrDesc &Desc = Inst.getDesc(); + NumMicroOps += Desc.NumMicroOps; + for (const std::pair<uint64_t, const ResourceUsage> &RU : Desc.Resources) { + if (RU.second.size()) { + unsigned ProcResID = ResIdx2ProcResID[getResourceStateIndex(RU.first)]; + ProcResourceUsage[ProcResID] += RU.second.size(); + } + } +} + +void SummaryView::printView(raw_ostream &OS) const { + unsigned Instructions = Source.size(); + unsigned Iterations = (LastInstructionIdx / Instructions) + 1; + unsigned TotalInstructions = Instructions * Iterations; + unsigned TotalUOps = NumMicroOps * Iterations; + double IPC = (double)TotalInstructions / TotalCycles; + double UOpsPerCycle = (double)TotalUOps / TotalCycles; + double BlockRThroughput = computeBlockRThroughput( + SM, DispatchWidth, NumMicroOps, ProcResourceUsage); + + std::string Buffer; + raw_string_ostream TempStream(Buffer); + TempStream << "Iterations: " << Iterations; + TempStream << "\nInstructions: " << TotalInstructions; + TempStream << "\nTotal Cycles: " << TotalCycles; + TempStream << "\nTotal uOps: " << TotalUOps << '\n'; + TempStream << "\nDispatch Width: " << DispatchWidth; + TempStream << "\nuOps Per Cycle: " + << format("%.2f", floor((UOpsPerCycle * 100) + 0.5) / 100); + TempStream << "\nIPC: " + << format("%.2f", floor((IPC * 100) + 0.5) / 100); + TempStream << "\nBlock RThroughput: " + << format("%.1f", floor((BlockRThroughput * 10) + 0.5) / 10) + << '\n'; + TempStream.flush(); + OS << Buffer; +} + +} // namespace mca. +} // namespace llvm diff --git a/llvm/tools/llvm-mca/Views/SummaryView.h b/llvm/tools/llvm-mca/Views/SummaryView.h new file mode 100644 index 000000000000..9be31b7d51bd --- /dev/null +++ b/llvm/tools/llvm-mca/Views/SummaryView.h @@ -0,0 +1,80 @@ +//===--------------------- SummaryView.h ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements the summary view. +/// +/// The goal of the summary view is to give a very quick overview of the +/// performance throughput. Below is an example of summary view: +/// +/// +/// Iterations: 300 +/// Instructions: 900 +/// Total Cycles: 610 +/// Dispatch Width: 2 +/// IPC: 1.48 +/// Block RThroughput: 2.0 +/// +/// The summary view collects a few performance numbers. The two main +/// performance indicators are 'Total Cycles' and IPC (Instructions Per Cycle). +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_SUMMARYVIEW_H +#define LLVM_TOOLS_LLVM_MCA_SUMMARYVIEW_H + +#include "Views/View.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +namespace mca { + +/// A view that collects and prints a few performance numbers. +class SummaryView : public View { + const llvm::MCSchedModel &SM; + llvm::ArrayRef<llvm::MCInst> Source; + const unsigned DispatchWidth; + unsigned LastInstructionIdx; + unsigned TotalCycles; + // The total number of micro opcodes contributed by a block of instructions. + unsigned NumMicroOps; + + // For each processor resource, this vector stores the cumulative number of + // resource cycles consumed by the analyzed code block. 
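The headline numbers printed by this view (IPC, uOps per cycle, block reciprocal throughput) are simple ratios over the simulated run. A standalone sketch of the arithmetic, with made-up inputs chosen to line up with the SummaryView example output shown earlier (300 iterations, 900 instructions, 610 cycles, dispatch width 2); the three-uop block and the two-cycle resource bound are assumptions made only to reproduce those numbers:

#include <algorithm>
#include <cstdio>

int main() {
  const unsigned Iterations = 300, Instructions = 3, NumMicroOps = 3;
  const unsigned TotalCycles = 610, DispatchWidth = 2;

  const unsigned TotalInstructions = Instructions * Iterations; // 900
  const double IPC = double(TotalInstructions) / TotalCycles;   // ~1.48
  const double UOpsPerCycle = double(NumMicroOps * Iterations) / TotalCycles;

  // Block reciprocal throughput: the dispatch-width bound vs. the bound from
  // the most contended resource (assumed: two cycles on a single unit).
  const double ResourceBound = 2.0 / 1.0;
  const double BlockRThroughput =
      std::max(double(NumMicroOps) / DispatchWidth, ResourceBound);

  std::printf("Instructions: %u\nTotal Cycles: %u\n", TotalInstructions,
              TotalCycles);
  std::printf("uOps Per Cycle: %.2f\nIPC: %.2f\nBlock RThroughput: %.1f\n",
              UOpsPerCycle, IPC, BlockRThroughput);
  return 0;
}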
+ llvm::SmallVector<unsigned, 8> ProcResourceUsage; + + // Each processor resource is associated with a so-called processor resource + // mask. This vector allows to correlate processor resource IDs with processor + // resource masks. There is exactly one element per each processor resource + // declared by the scheduling model. + llvm::SmallVector<uint64_t, 8> ProcResourceMasks; + + // Used to map resource indices to actual processor resource IDs. + llvm::SmallVector<unsigned, 8> ResIdx2ProcResID; + + // Compute the reciprocal throughput for the analyzed code block. + // The reciprocal block throughput is computed as the MAX between: + // - NumMicroOps / DispatchWidth + // - Total Resource Cycles / #Units (for every resource consumed). + double getBlockRThroughput() const; + +public: + SummaryView(const llvm::MCSchedModel &Model, llvm::ArrayRef<llvm::MCInst> S, + unsigned Width); + + void onCycleEnd() override { ++TotalCycles; } + void onEvent(const HWInstructionEvent &Event) override; + void printView(llvm::raw_ostream &OS) const override; +}; + +} // namespace mca +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-mca/Views/TimelineView.cpp b/llvm/tools/llvm-mca/Views/TimelineView.cpp new file mode 100644 index 000000000000..1e7caa297ac6 --- /dev/null +++ b/llvm/tools/llvm-mca/Views/TimelineView.cpp @@ -0,0 +1,325 @@ +//===--------------------- TimelineView.cpp ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \brief +/// +/// This file implements the TimelineView interface. +/// +//===----------------------------------------------------------------------===// + +#include "Views/TimelineView.h" +#include <numeric> + +namespace llvm { +namespace mca { + +TimelineView::TimelineView(const MCSubtargetInfo &sti, MCInstPrinter &Printer, + llvm::ArrayRef<llvm::MCInst> S, unsigned Iterations, + unsigned Cycles) + : STI(sti), MCIP(Printer), Source(S), CurrentCycle(0), + MaxCycle(Cycles == 0 ? 
80 : Cycles), LastCycle(0), WaitTime(S.size()), + UsedBuffer(S.size()) { + unsigned NumInstructions = Source.size(); + assert(Iterations && "Invalid number of iterations specified!"); + NumInstructions *= Iterations; + Timeline.resize(NumInstructions); + TimelineViewEntry InvalidTVEntry = {-1, 0, 0, 0, 0}; + std::fill(Timeline.begin(), Timeline.end(), InvalidTVEntry); + + WaitTimeEntry NullWTEntry = {0, 0, 0}; + std::fill(WaitTime.begin(), WaitTime.end(), NullWTEntry); + + std::pair<unsigned, int> NullUsedBufferEntry = {/* Invalid resource ID*/ 0, + /* unknown buffer size */ -1}; + std::fill(UsedBuffer.begin(), UsedBuffer.end(), NullUsedBufferEntry); +} + +void TimelineView::onReservedBuffers(const InstRef &IR, + ArrayRef<unsigned> Buffers) { + if (IR.getSourceIndex() >= Source.size()) + return; + + const MCSchedModel &SM = STI.getSchedModel(); + std::pair<unsigned, int> BufferInfo = {0, -1}; + for (const unsigned Buffer : Buffers) { + const MCProcResourceDesc &MCDesc = *SM.getProcResource(Buffer); + if (!BufferInfo.first || BufferInfo.second > MCDesc.BufferSize) { + BufferInfo.first = Buffer; + BufferInfo.second = MCDesc.BufferSize; + } + } + + UsedBuffer[IR.getSourceIndex()] = BufferInfo; +} + +void TimelineView::onEvent(const HWInstructionEvent &Event) { + const unsigned Index = Event.IR.getSourceIndex(); + if (Index >= Timeline.size()) + return; + + switch (Event.Type) { + case HWInstructionEvent::Retired: { + TimelineViewEntry &TVEntry = Timeline[Index]; + if (CurrentCycle < MaxCycle) + TVEntry.CycleRetired = CurrentCycle; + + // Update the WaitTime entry which corresponds to this Index. + assert(TVEntry.CycleDispatched >= 0 && "Invalid TVEntry found!"); + unsigned CycleDispatched = static_cast<unsigned>(TVEntry.CycleDispatched); + WaitTimeEntry &WTEntry = WaitTime[Index % Source.size()]; + WTEntry.CyclesSpentInSchedulerQueue += + TVEntry.CycleIssued - CycleDispatched; + assert(CycleDispatched <= TVEntry.CycleReady && + "Instruction cannot be ready if it hasn't been dispatched yet!"); + WTEntry.CyclesSpentInSQWhileReady += + TVEntry.CycleIssued - TVEntry.CycleReady; + WTEntry.CyclesSpentAfterWBAndBeforeRetire += + (CurrentCycle - 1) - TVEntry.CycleExecuted; + break; + } + case HWInstructionEvent::Ready: + Timeline[Index].CycleReady = CurrentCycle; + break; + case HWInstructionEvent::Issued: + Timeline[Index].CycleIssued = CurrentCycle; + break; + case HWInstructionEvent::Executed: + Timeline[Index].CycleExecuted = CurrentCycle; + break; + case HWInstructionEvent::Dispatched: + // There may be multiple dispatch events. Microcoded instructions that are + // expanded into multiple uOps may require multiple dispatch cycles. Here, + // we want to capture the first dispatch cycle. 
+ if (Timeline[Index].CycleDispatched == -1) + Timeline[Index].CycleDispatched = static_cast<int>(CurrentCycle); + break; + default: + return; + } + if (CurrentCycle < MaxCycle) + LastCycle = std::max(LastCycle, CurrentCycle); +} + +static raw_ostream::Colors chooseColor(unsigned CumulativeCycles, + unsigned Executions, int BufferSize) { + if (CumulativeCycles && BufferSize < 0) + return raw_ostream::MAGENTA; + unsigned Size = static_cast<unsigned>(BufferSize); + if (CumulativeCycles >= Size * Executions) + return raw_ostream::RED; + if ((CumulativeCycles * 2) >= Size * Executions) + return raw_ostream::YELLOW; + return raw_ostream::SAVEDCOLOR; +} + +static void tryChangeColor(raw_ostream &OS, unsigned Cycles, + unsigned Executions, int BufferSize) { + if (!OS.has_colors()) + return; + + raw_ostream::Colors Color = chooseColor(Cycles, Executions, BufferSize); + if (Color == raw_ostream::SAVEDCOLOR) { + OS.resetColor(); + return; + } + OS.changeColor(Color, /* bold */ true, /* BG */ false); +} + +void TimelineView::printWaitTimeEntry(formatted_raw_ostream &OS, + const WaitTimeEntry &Entry, + unsigned SourceIndex, + unsigned Executions) const { + bool PrintingTotals = SourceIndex == Source.size(); + unsigned CumulativeExecutions = PrintingTotals ? Timeline.size() : Executions; + + if (!PrintingTotals) + OS << SourceIndex << '.'; + + OS.PadToColumn(7); + + double AverageTime1, AverageTime2, AverageTime3; + AverageTime1 = + (double)Entry.CyclesSpentInSchedulerQueue / CumulativeExecutions; + AverageTime2 = (double)Entry.CyclesSpentInSQWhileReady / CumulativeExecutions; + AverageTime3 = + (double)Entry.CyclesSpentAfterWBAndBeforeRetire / CumulativeExecutions; + + OS << Executions; + OS.PadToColumn(13); + + int BufferSize = PrintingTotals ? 0 : UsedBuffer[SourceIndex].second; + if (!PrintingTotals) + tryChangeColor(OS, Entry.CyclesSpentInSchedulerQueue, CumulativeExecutions, + BufferSize); + OS << format("%.1f", floor((AverageTime1 * 10) + 0.5) / 10); + OS.PadToColumn(20); + if (!PrintingTotals) + tryChangeColor(OS, Entry.CyclesSpentInSQWhileReady, CumulativeExecutions, + BufferSize); + OS << format("%.1f", floor((AverageTime2 * 10) + 0.5) / 10); + OS.PadToColumn(27); + if (!PrintingTotals) + tryChangeColor(OS, Entry.CyclesSpentAfterWBAndBeforeRetire, + CumulativeExecutions, STI.getSchedModel().MicroOpBufferSize); + OS << format("%.1f", floor((AverageTime3 * 10) + 0.5) / 10); + + if (OS.has_colors()) + OS.resetColor(); + OS.PadToColumn(34); +} + +void TimelineView::printAverageWaitTimes(raw_ostream &OS) const { + std::string Header = + "\n\nAverage Wait times (based on the timeline view):\n" + "[0]: Executions\n" + "[1]: Average time spent waiting in a scheduler's queue\n" + "[2]: Average time spent waiting in a scheduler's queue while ready\n" + "[3]: Average time elapsed from WB until retire stage\n\n" + " [0] [1] [2] [3]\n"; + OS << Header; + + // Use a different string stream for printing instructions. + std::string Instruction; + raw_string_ostream InstrStream(Instruction); + + formatted_raw_ostream FOS(OS); + unsigned Executions = Timeline.size() / Source.size(); + unsigned IID = 0; + for (const MCInst &Inst : Source) { + printWaitTimeEntry(FOS, WaitTime[IID], IID, Executions); + // Append the instruction info at the end of the line. + MCIP.printInst(&Inst, InstrStream, "", STI); + InstrStream.flush(); + + // Consume any tabs or spaces at the beginning of the string. 
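Columns [1]-[3] are plain averages: cycles accumulated across every execution of a source instruction, divided by the number of executions and rounded to one decimal digit like the rest of the view. A tiny standalone illustration with made-up totals:

#include <cmath>
#include <cstdio>

int main() {
  const unsigned Executions = 10;               // iterations of the block
  const unsigned CyclesInSchedulerQueue = 52;   // column [1] numerator
  const unsigned CyclesInSQWhileReady = 30;     // column [2] numerator
  const unsigned CyclesAfterWBBeforeRetire = 7; // column [3] numerator

  auto Avg = [&](unsigned Total) {
    double A = double(Total) / Executions;
    return std::floor((A * 10) + 0.5) / 10;
  };
  std::printf("%-6u%-7.1f%-7.1f%-7.1f\n", Executions,
              Avg(CyclesInSchedulerQueue), Avg(CyclesInSQWhileReady),
              Avg(CyclesAfterWBBeforeRetire));
  return 0;
}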
+    StringRef Str(Instruction);
+    Str = Str.ltrim();
+    FOS << " " << Str << '\n';
+    FOS.flush();
+    Instruction = "";
+
+    ++IID;
+  }
+
+  // If the timeline contains more than one instruction,
+  // let's also print global averages.
+  if (Source.size() != 1) {
+    WaitTimeEntry TotalWaitTime = std::accumulate(
+        WaitTime.begin(), WaitTime.end(), WaitTimeEntry{0, 0, 0},
+        [](const WaitTimeEntry &A, const WaitTimeEntry &B) {
+          return WaitTimeEntry{
+              A.CyclesSpentInSchedulerQueue + B.CyclesSpentInSchedulerQueue,
+              A.CyclesSpentInSQWhileReady + B.CyclesSpentInSQWhileReady,
+              A.CyclesSpentAfterWBAndBeforeRetire +
+                  B.CyclesSpentAfterWBAndBeforeRetire};
+        });
+    printWaitTimeEntry(FOS, TotalWaitTime, IID, Executions);
+    FOS << " "
+        << "<total>" << '\n';
+    InstrStream.flush();
+  }
+}
+
+void TimelineView::printTimelineViewEntry(formatted_raw_ostream &OS,
+                                          const TimelineViewEntry &Entry,
+                                          unsigned Iteration,
+                                          unsigned SourceIndex) const {
+  if (Iteration == 0 && SourceIndex == 0)
+    OS << '\n';
+  OS << '[' << Iteration << ',' << SourceIndex << ']';
+  OS.PadToColumn(10);
+  assert(Entry.CycleDispatched >= 0 && "Invalid TimelineViewEntry!");
+  unsigned CycleDispatched = static_cast<unsigned>(Entry.CycleDispatched);
+  for (unsigned I = 0, E = CycleDispatched; I < E; ++I)
+    OS << ((I % 5 == 0) ? '.' : ' ');
+  OS << TimelineView::DisplayChar::Dispatched;
+  if (CycleDispatched != Entry.CycleExecuted) {
+    // Zero latency instructions have the same value for CycleDispatched,
+    // CycleIssued and CycleExecuted.
+    for (unsigned I = CycleDispatched + 1, E = Entry.CycleIssued; I < E; ++I)
+      OS << TimelineView::DisplayChar::Waiting;
+    if (Entry.CycleIssued == Entry.CycleExecuted)
+      OS << TimelineView::DisplayChar::Executed;
+    else {
+      if (CycleDispatched != Entry.CycleIssued)
+        OS << TimelineView::DisplayChar::Executing;
+      for (unsigned I = Entry.CycleIssued + 1, E = Entry.CycleExecuted; I < E;
+           ++I)
+        OS << TimelineView::DisplayChar::Executing;
+      OS << TimelineView::DisplayChar::Executed;
+    }
+  }
+
+  for (unsigned I = Entry.CycleExecuted + 1, E = Entry.CycleRetired; I < E; ++I)
+    OS << TimelineView::DisplayChar::RetireLag;
+  OS << TimelineView::DisplayChar::Retired;
+
+  // Skip other columns.
+  for (unsigned I = Entry.CycleRetired + 1, E = LastCycle; I <= E; ++I)
+    OS << ((I % 5 == 0 || I == LastCycle) ? '.' : ' ');
+}
+
+static void printTimelineHeader(formatted_raw_ostream &OS, unsigned Cycles) {
+  OS << "\n\nTimeline view:\n";
+  if (Cycles >= 10) {
+    OS.PadToColumn(10);
+    for (unsigned I = 0; I <= Cycles; ++I) {
+      if (((I / 10) & 1) == 0)
+        OS << ' ';
+      else
+        OS << I % 10;
+    }
+    OS << '\n';
+  }
+
+  OS << "Index";
+  OS.PadToColumn(10);
+  for (unsigned I = 0; I <= Cycles; ++I) {
+    if (((I / 10) & 1) == 0)
+      OS << I % 10;
+    else
+      OS << ' ';
+  }
+  OS << '\n';
+}
+
+void TimelineView::printTimeline(raw_ostream &OS) const {
+  formatted_raw_ostream FOS(OS);
+  printTimelineHeader(FOS, LastCycle);
+  FOS.flush();
+
+  // Use a different string stream for the instruction.
+  std::string Instruction;
+  raw_string_ostream InstrStream(Instruction);
+
+  unsigned IID = 0;
+  const unsigned Iterations = Timeline.size() / Source.size();
+  for (unsigned Iteration = 0; Iteration < Iterations; ++Iteration) {
+    for (const MCInst &Inst : Source) {
+      const TimelineViewEntry &Entry = Timeline[IID];
+      if (Entry.CycleRetired == 0)
+        return;
+
+      unsigned SourceIndex = IID % Source.size();
+      printTimelineViewEntry(FOS, Entry, Iteration, SourceIndex);
+      // Append the instruction info at the end of the line.
+      MCIP.printInst(&Inst, InstrStream, "", STI);
+      InstrStream.flush();
+
+      // Consume any tabs or spaces at the beginning of the string.
+      StringRef Str(Instruction);
+      Str = Str.ltrim();
+      FOS << " " << Str << '\n';
+      FOS.flush();
+      Instruction = "";
+
+      ++IID;
+    }
+  }
+}
+} // namespace mca
+} // namespace llvm
diff --git a/llvm/tools/llvm-mca/Views/TimelineView.h b/llvm/tools/llvm-mca/Views/TimelineView.h
new file mode 100644
index 000000000000..9bec3b87db45
--- /dev/null
+++ b/llvm/tools/llvm-mca/Views/TimelineView.h
@@ -0,0 +1,189 @@
+//===--------------------- TimelineView.h -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements a timeline view for the llvm-mca tool.
+///
+/// Class TimelineView observes events generated by the pipeline. For every
+/// instruction executed by the pipeline, it stores information related to
+/// state transitions. It then plots that information in the form of a table
+/// as reported by the example below:
+///
+/// Timeline view:
+/// 0123456
+/// Index 0123456789
+///
+/// [0,0] DeER . . .. vmovshdup %xmm0, %xmm1
+/// [0,1] DeER . . .. vpermilpd $1, %xmm0, %xmm2
+/// [0,2] .DeER. . .. vpermilps $231, %xmm0, %xmm5
+/// [0,3] .DeeeER . .. vaddss %xmm1, %xmm0, %xmm3
+/// [0,4] . D==eeeER. .. vaddss %xmm3, %xmm2, %xmm4
+/// [0,5] . D=====eeeER .. vaddss %xmm4, %xmm5, %xmm6
+///
+/// [1,0] . DeE------R .. vmovshdup %xmm0, %xmm1
+/// [1,1] . DeE------R .. vpermilpd $1, %xmm0, %xmm2
+/// [1,2] . DeE-----R .. vpermilps $231, %xmm0, %xmm5
+/// [1,3] . D=eeeE--R .. vaddss %xmm1, %xmm0, %xmm3
+/// [1,4] . D===eeeER .. vaddss %xmm3, %xmm2, %xmm4
+/// [1,5] . D======eeeER vaddss %xmm4, %xmm5, %xmm6
+///
+/// There is an entry for every instruction in the input assembly sequence.
+/// The first field is a pair of numbers obtained from the instruction index.
+/// The first element of the pair is the iteration index, while the second
+/// element of the pair is a sequence number (i.e. a position in the assembly
+/// sequence).
+/// The second field of the table is the actual timeline information; each
+/// column is the information related to a specific cycle of execution.
+/// The timeline of an instruction is described by a sequence of characters
+/// where each character represents the instruction state at a specific cycle.
+///
+/// Possible instruction states are:
+/// D: Instruction Dispatched
+/// e: Instruction Executing
+/// E: Instruction Executed (write-back stage)
+/// R: Instruction retired
+/// =: Instruction waiting in the Scheduler's queue
+/// -: Instruction executed, waiting to retire in order.
+///
+/// Dots ('.') and empty spaces are cycles where the instruction is not
+/// in-flight.
+///
+/// The last column is the assembly instruction associated with the entry.
+///
+/// Based on the timeline view information from the example, instruction 0
+/// at iteration 0 was dispatched at cycle 0, and was retired at cycle 3.
+/// Instruction [0,1] was also dispatched at cycle 0, and it retired at
+/// the same cycle as instruction [0,0].
+/// Instruction [0,4] was dispatched at cycle 2. However, it had to
+/// wait for two cycles before being issued. That is because its operands
+/// became ready only at cycle 5.
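+/// Reading that same entry character by character: 'D' appears in the column
+/// for cycle 2 (dispatch), '=' in the columns for cycles 3 and 4 (waiting in
+/// the scheduler's queue), 'e' in the columns for cycles 5 to 7 (executing),
+/// 'E' at cycle 8 (write-back), and 'R' at cycle 9 (retirement).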
+///
+/// This view helps to further understand bottlenecks and the impact of
+/// resource pressure on the code.
+///
+/// To better understand why instructions had to wait for multiple cycles in
+/// the scheduler's queue, class TimelineView also reports extra timing info
+/// in another table named "Average Wait times" (see example below).
+///
+///
+/// Average Wait times (based on the timeline view):
+/// [0]: Executions
+/// [1]: Average time spent waiting in a scheduler's queue
+/// [2]: Average time spent waiting in a scheduler's queue while ready
+/// [3]: Average time elapsed from WB until retire stage
+///
+/// [0] [1] [2] [3]
+/// 0. 2 1.0 1.0 3.0 vmovshdup %xmm0, %xmm1
+/// 1. 2 1.0 1.0 3.0 vpermilpd $1, %xmm0, %xmm2
+/// 2. 2 1.0 1.0 2.5 vpermilps $231, %xmm0, %xmm5
+/// 3. 2 1.5 0.5 1.0 vaddss %xmm1, %xmm0, %xmm3
+/// 4. 2 3.5 0.0 0.0 vaddss %xmm3, %xmm2, %xmm4
+/// 5. 2 6.5 0.0 0.0 vaddss %xmm4, %xmm5, %xmm6
+/// 2 2.4 0.6 1.6 <total>
+///
+/// By comparing column [2] with column [1], we get an idea of how many
+/// cycles were spent in the scheduler's queue due to data dependencies.
+///
+/// In this example, instruction 5 spent an average of ~6 cycles in the
+/// scheduler's queue. As soon as its operands became ready, the instruction
+/// was immediately issued to the pipeline(s).
+/// That is expected because instruction 5 cannot transition to the "ready"
+/// state until %xmm4 is written by instruction 4.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
+
+#include "Views/View.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+namespace mca {
+
+/// This class listens to instruction state transition events
+/// in order to construct timeline information.
+///
+/// For every instruction executed by the Pipeline, this class constructs
+/// a TimelineViewEntry object. TimelineViewEntry objects are then used
+/// to print the timeline information, as well as the "average wait times"
+/// for every instruction in the input assembly sequence.
+class TimelineView : public View {
+  const llvm::MCSubtargetInfo &STI;
+  llvm::MCInstPrinter &MCIP;
+  llvm::ArrayRef<llvm::MCInst> Source;
+
+  unsigned CurrentCycle;
+  unsigned MaxCycle;
+  unsigned LastCycle;
+
+  struct TimelineViewEntry {
+    int CycleDispatched; // A negative value is an "invalid cycle".
+    unsigned CycleReady;
+    unsigned CycleIssued;
+    unsigned CycleExecuted;
+    unsigned CycleRetired;
+  };
+  std::vector<TimelineViewEntry> Timeline;
+
+  struct WaitTimeEntry {
+    unsigned CyclesSpentInSchedulerQueue;
+    unsigned CyclesSpentInSQWhileReady;
+    unsigned CyclesSpentAfterWBAndBeforeRetire;
+  };
+  std::vector<WaitTimeEntry> WaitTime;
+
+  // This field is used to map instructions to buffered resources.
+  // Elements of this vector are <resourceID, BufferSize> pairs.
+  std::vector<std::pair<unsigned, int>> UsedBuffer;
+
+  void printTimelineViewEntry(llvm::formatted_raw_ostream &OS,
+                              const TimelineViewEntry &E, unsigned Iteration,
+                              unsigned SourceIndex) const;
+  void printWaitTimeEntry(llvm::formatted_raw_ostream &OS,
+                          const WaitTimeEntry &E, unsigned Index,
+                          unsigned Executions) const;
+
+  // Display characters for the TimelineView report output.
+ struct DisplayChar { + static const char Dispatched = 'D'; + static const char Executed = 'E'; + static const char Retired = 'R'; + static const char Waiting = '='; // Instruction is waiting in the scheduler. + static const char Executing = 'e'; + static const char RetireLag = '-'; // The instruction is waiting to retire. + }; + +public: + TimelineView(const llvm::MCSubtargetInfo &sti, llvm::MCInstPrinter &Printer, + llvm::ArrayRef<llvm::MCInst> S, unsigned Iterations, + unsigned Cycles); + + // Event handlers. + void onCycleEnd() override { ++CurrentCycle; } + void onEvent(const HWInstructionEvent &Event) override; + void onReservedBuffers(const InstRef &IR, + llvm::ArrayRef<unsigned> Buffers) override; + + // print functionalities. + void printTimeline(llvm::raw_ostream &OS) const; + void printAverageWaitTimes(llvm::raw_ostream &OS) const; + void printView(llvm::raw_ostream &OS) const override { + printTimeline(OS); + printAverageWaitTimes(OS); + } +}; +} // namespace mca +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-mca/Views/View.cpp b/llvm/tools/llvm-mca/Views/View.cpp new file mode 100644 index 000000000000..8e5c34d2d5c2 --- /dev/null +++ b/llvm/tools/llvm-mca/Views/View.cpp @@ -0,0 +1,21 @@ +//===----------------------- View.cpp ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines the virtual anchor method in View.h to pin the vtable. +/// +//===----------------------------------------------------------------------===// + +#include "Views/View.h" + +namespace llvm { +namespace mca { + +void View::anchor() {} +} // namespace mca +} // namespace llvm diff --git a/llvm/tools/llvm-mca/Views/View.h b/llvm/tools/llvm-mca/Views/View.h new file mode 100644 index 000000000000..3b52511b4d29 --- /dev/null +++ b/llvm/tools/llvm-mca/Views/View.h @@ -0,0 +1,33 @@ +//===----------------------- View.h -----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines the main interface for Views. Each view contributes a +/// portion of the final report generated by the tool. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_VIEW_H +#define LLVM_TOOLS_LLVM_MCA_VIEW_H + +#include "llvm/MCA/HWEventListener.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +namespace mca { + +class View : public HWEventListener { +public: + virtual void printView(llvm::raw_ostream &OS) const = 0; + virtual ~View() = default; + void anchor() override; +}; +} // namespace mca +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp new file mode 100644 index 000000000000..99c45eebdd88 --- /dev/null +++ b/llvm/tools/llvm-mca/llvm-mca.cpp @@ -0,0 +1,565 @@ +//===-- llvm-mca.cpp - Machine Code Analyzer -------------------*- C++ -* -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This utility is a simple driver that allows static performance analysis on +// machine code similarly to how IACA (Intel Architecture Code Analyzer) works. +// +// llvm-mca [options] <file-name> +// -march <type> +// -mcpu <cpu> +// -o <file> +// +// The target defaults to the host target. +// The cpu defaults to the 'native' host cpu. +// The output defaults to standard output. +// +//===----------------------------------------------------------------------===// + +#include "CodeRegion.h" +#include "CodeRegionGenerator.h" +#include "PipelinePrinter.h" +#include "Views/BottleneckAnalysis.h" +#include "Views/DispatchStatistics.h" +#include "Views/InstructionInfoView.h" +#include "Views/RegisterFileStatistics.h" +#include "Views/ResourcePressureView.h" +#include "Views/RetireControlUnitStatistics.h" +#include "Views/SchedulerStatistics.h" +#include "Views/SummaryView.h" +#include "Views/TimelineView.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetOptionsCommandFlags.inc" +#include "llvm/MCA/CodeEmitter.h" +#include "llvm/MCA/Context.h" +#include "llvm/MCA/InstrBuilder.h" +#include "llvm/MCA/Pipeline.h" +#include "llvm/MCA/Stages/EntryStage.h" +#include "llvm/MCA/Stages/InstructionTables.h" +#include "llvm/MCA/Support.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/WithColor.h" + +using namespace llvm; + +static cl::OptionCategory ToolOptions("Tool Options"); +static cl::OptionCategory ViewOptions("View Options"); + +static cl::opt<std::string> InputFilename(cl::Positional, + cl::desc("<input file>"), + cl::cat(ToolOptions), cl::init("-")); + +static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"), + cl::init("-"), cl::cat(ToolOptions), + cl::value_desc("filename")); + +static cl::opt<std::string> + ArchName("march", + cl::desc("Target architecture. " + "See -version for available targets"), + cl::cat(ToolOptions)); + +static cl::opt<std::string> + TripleName("mtriple", + cl::desc("Target triple. 
See -version for available targets"), + cl::cat(ToolOptions)); + +static cl::opt<std::string> + MCPU("mcpu", + cl::desc("Target a specific cpu type (-mcpu=help for details)"), + cl::value_desc("cpu-name"), cl::cat(ToolOptions), cl::init("native")); + +static cl::opt<std::string> + MATTR("mattr", + cl::desc("Additional target features."), + cl::cat(ToolOptions)); + +static cl::opt<int> + OutputAsmVariant("output-asm-variant", + cl::desc("Syntax variant to use for output printing"), + cl::cat(ToolOptions), cl::init(-1)); + +static cl::opt<bool> + PrintImmHex("print-imm-hex", cl::cat(ToolOptions), cl::init(false), + cl::desc("Prefer hex format when printing immediate values")); + +static cl::opt<unsigned> Iterations("iterations", + cl::desc("Number of iterations to run"), + cl::cat(ToolOptions), cl::init(0)); + +static cl::opt<unsigned> + DispatchWidth("dispatch", cl::desc("Override the processor dispatch width"), + cl::cat(ToolOptions), cl::init(0)); + +static cl::opt<unsigned> + RegisterFileSize("register-file-size", + cl::desc("Maximum number of physical registers which can " + "be used for register mappings"), + cl::cat(ToolOptions), cl::init(0)); + +static cl::opt<unsigned> + MicroOpQueue("micro-op-queue-size", cl::Hidden, + cl::desc("Number of entries in the micro-op queue"), + cl::cat(ToolOptions), cl::init(0)); + +static cl::opt<unsigned> + DecoderThroughput("decoder-throughput", cl::Hidden, + cl::desc("Maximum throughput from the decoders " + "(instructions per cycle)"), + cl::cat(ToolOptions), cl::init(0)); + +static cl::opt<bool> + PrintRegisterFileStats("register-file-stats", + cl::desc("Print register file statistics"), + cl::cat(ViewOptions), cl::init(false)); + +static cl::opt<bool> PrintDispatchStats("dispatch-stats", + cl::desc("Print dispatch statistics"), + cl::cat(ViewOptions), cl::init(false)); + +static cl::opt<bool> + PrintSummaryView("summary-view", cl::Hidden, + cl::desc("Print summary view (enabled by default)"), + cl::cat(ViewOptions), cl::init(true)); + +static cl::opt<bool> PrintSchedulerStats("scheduler-stats", + cl::desc("Print scheduler statistics"), + cl::cat(ViewOptions), cl::init(false)); + +static cl::opt<bool> + PrintRetireStats("retire-stats", + cl::desc("Print retire control unit statistics"), + cl::cat(ViewOptions), cl::init(false)); + +static cl::opt<bool> PrintResourcePressureView( + "resource-pressure", + cl::desc("Print the resource pressure view (enabled by default)"), + cl::cat(ViewOptions), cl::init(true)); + +static cl::opt<bool> PrintTimelineView("timeline", + cl::desc("Print the timeline view"), + cl::cat(ViewOptions), cl::init(false)); + +static cl::opt<unsigned> TimelineMaxIterations( + "timeline-max-iterations", + cl::desc("Maximum number of iterations to print in timeline view"), + cl::cat(ViewOptions), cl::init(0)); + +static cl::opt<unsigned> TimelineMaxCycles( + "timeline-max-cycles", + cl::desc( + "Maximum number of cycles in the timeline view. 
Defaults to 80 cycles"), + cl::cat(ViewOptions), cl::init(80)); + +static cl::opt<bool> + AssumeNoAlias("noalias", + cl::desc("If set, assume that loads and stores do not alias"), + cl::cat(ToolOptions), cl::init(true)); + +static cl::opt<unsigned> LoadQueueSize("lqueue", + cl::desc("Size of the load queue"), + cl::cat(ToolOptions), cl::init(0)); + +static cl::opt<unsigned> StoreQueueSize("squeue", + cl::desc("Size of the store queue"), + cl::cat(ToolOptions), cl::init(0)); + +static cl::opt<bool> + PrintInstructionTables("instruction-tables", + cl::desc("Print instruction tables"), + cl::cat(ToolOptions), cl::init(false)); + +static cl::opt<bool> PrintInstructionInfoView( + "instruction-info", + cl::desc("Print the instruction info view (enabled by default)"), + cl::cat(ViewOptions), cl::init(true)); + +static cl::opt<bool> EnableAllStats("all-stats", + cl::desc("Print all hardware statistics"), + cl::cat(ViewOptions), cl::init(false)); + +static cl::opt<bool> + EnableAllViews("all-views", + cl::desc("Print all views including hardware statistics"), + cl::cat(ViewOptions), cl::init(false)); + +static cl::opt<bool> EnableBottleneckAnalysis( + "bottleneck-analysis", + cl::desc("Enable bottleneck analysis (disabled by default)"), + cl::cat(ViewOptions), cl::init(false)); + +static cl::opt<bool> ShowEncoding( + "show-encoding", + cl::desc("Print encoding information in the instruction info view"), + cl::cat(ViewOptions), cl::init(false)); + +namespace { + +const Target *getTarget(const char *ProgName) { + if (TripleName.empty()) + TripleName = Triple::normalize(sys::getDefaultTargetTriple()); + Triple TheTriple(TripleName); + + // Get the target specific parser. + std::string Error; + const Target *TheTarget = + TargetRegistry::lookupTarget(ArchName, TheTriple, Error); + if (!TheTarget) { + errs() << ProgName << ": " << Error; + return nullptr; + } + + // Return the found target. + return TheTarget; +} + +ErrorOr<std::unique_ptr<ToolOutputFile>> getOutputStream() { + if (OutputFilename == "") + OutputFilename = "-"; + std::error_code EC; + auto Out = + std::make_unique<ToolOutputFile>(OutputFilename, EC, sys::fs::OF_Text); + if (!EC) + return std::move(Out); + return EC; +} +} // end of anonymous namespace + +static void processOptionImpl(cl::opt<bool> &O, const cl::opt<bool> &Default) { + if (!O.getNumOccurrences() || O.getPosition() < Default.getPosition()) + O = Default.getValue(); +} + +static void processViewOptions() { + if (!EnableAllViews.getNumOccurrences() && + !EnableAllStats.getNumOccurrences()) + return; + + if (EnableAllViews.getNumOccurrences()) { + processOptionImpl(PrintSummaryView, EnableAllViews); + processOptionImpl(EnableBottleneckAnalysis, EnableAllViews); + processOptionImpl(PrintResourcePressureView, EnableAllViews); + processOptionImpl(PrintTimelineView, EnableAllViews); + processOptionImpl(PrintInstructionInfoView, EnableAllViews); + } + + const cl::opt<bool> &Default = + EnableAllViews.getPosition() < EnableAllStats.getPosition() + ? EnableAllStats + : EnableAllViews; + processOptionImpl(PrintRegisterFileStats, Default); + processOptionImpl(PrintDispatchStats, Default); + processOptionImpl(PrintSchedulerStats, Default); + processOptionImpl(PrintRetireStats, Default); +} + +// Returns true on success. +static bool runPipeline(mca::Pipeline &P) { + // Handle pipeline errors here. 
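+  // Pipeline::run() returns the number of simulated cycles wrapped in an
+  // Expected<unsigned>; on failure, the error must be consumed (done below
+  // via takeError()) before the tool exits.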
+  Expected<unsigned> Cycles = P.run();
+  if (!Cycles) {
+    WithColor::error() << toString(Cycles.takeError());
+    return false;
+  }
+  return true;
+}
+
+int main(int argc, char **argv) {
+  InitLLVM X(argc, argv);
+
+  // Initialize targets and assembly parsers.
+  InitializeAllTargetInfos();
+  InitializeAllTargetMCs();
+  InitializeAllAsmParsers();
+
+  // Enable printing of available targets when flag --version is specified.
+  cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
+
+  cl::HideUnrelatedOptions({&ToolOptions, &ViewOptions});
+
+  // Parse flags and initialize target options.
+  cl::ParseCommandLineOptions(argc, argv,
+                              "llvm machine code performance analyzer.\n");
+
+  // Get the target from the triple. If a triple is not specified, then select
+  // the default triple for the host. If the triple doesn't correspond to any
+  // registered target, then exit with an error message.
+  const char *ProgName = argv[0];
+  const Target *TheTarget = getTarget(ProgName);
+  if (!TheTarget)
+    return 1;
+
+  // getTarget() may have replaced TripleName with a default triple.
+  // For safety, reconstruct the Triple object.
+  Triple TheTriple(TripleName);
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> BufferPtr =
+      MemoryBuffer::getFileOrSTDIN(InputFilename);
+  if (std::error_code EC = BufferPtr.getError()) {
+    WithColor::error() << InputFilename << ": " << EC.message() << '\n';
+    return 1;
+  }
+
+  // Apply overrides to llvm-mca specific options.
+  processViewOptions();
+
+  if (!MCPU.compare("native"))
+    MCPU = llvm::sys::getHostCPUName();
+
+  std::unique_ptr<MCSubtargetInfo> STI(
+      TheTarget->createMCSubtargetInfo(TripleName, MCPU, MATTR));
+  if (!STI->isCPUStringValid(MCPU))
+    return 1;
+
+  if (!PrintInstructionTables && !STI->getSchedModel().isOutOfOrder()) {
+    WithColor::error() << "please specify an out-of-order cpu. '" << MCPU
+                       << "' is an in-order cpu.\n";
+    return 1;
+  }
+
+  if (!STI->getSchedModel().hasInstrSchedModel()) {
+    WithColor::error()
+        << "unable to find instruction-level scheduling information for"
+        << " target triple '" << TheTriple.normalize() << "' and cpu '" << MCPU
+        << "'.\n";
+
+    if (STI->getSchedModel().InstrItineraries)
+      WithColor::note()
+          << "cpu '" << MCPU << "' provides itineraries. However, "
+          << "instruction itineraries are currently unsupported.\n";
+    return 1;
+  }
+
+  std::unique_ptr<MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TripleName));
+  assert(MRI && "Unable to create target register info!");
+
+  std::unique_ptr<MCAsmInfo> MAI(TheTarget->createMCAsmInfo(*MRI, TripleName));
+  assert(MAI && "Unable to create target asm info!");
+
+  MCObjectFileInfo MOFI;
+  SourceMgr SrcMgr;
+
+  // Tell SrcMgr about this buffer, which is what the parser will pick up.
+  SrcMgr.AddNewSourceBuffer(std::move(*BufferPtr), SMLoc());
+
+  MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr);
+
+  MOFI.InitMCObjectFileInfo(TheTriple, /* PIC= */ false, Ctx);
+
+  std::unique_ptr<buffer_ostream> BOS;
+
+  std::unique_ptr<MCInstrInfo> MCII(TheTarget->createMCInstrInfo());
+
+  std::unique_ptr<MCInstrAnalysis> MCIA(
+      TheTarget->createMCInstrAnalysis(MCII.get()));
+
+  // Parse the input and create CodeRegions that llvm-mca can analyze.
+  mca::AsmCodeRegionGenerator CRG(*TheTarget, SrcMgr, Ctx, *MAI, *STI, *MCII);
+  Expected<const mca::CodeRegions &> RegionsOrErr = CRG.parseCodeRegions();
+  if (!RegionsOrErr) {
+    if (auto Err =
+            handleErrors(RegionsOrErr.takeError(), [](const StringError &E) {
+              WithColor::error() << E.getMessage() << '\n';
+            })) {
+      // Default case.
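+      // handleErrors() consumed any StringError in the handler above; whatever
+      // is left in Err is reported generically here.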
+ WithColor::error() << toString(std::move(Err)) << '\n'; + } + return 1; + } + const mca::CodeRegions &Regions = *RegionsOrErr; + + // Early exit if errors were found by the code region parsing logic. + if (!Regions.isValid()) + return 1; + + if (Regions.empty()) { + WithColor::error() << "no assembly instructions found.\n"; + return 1; + } + + // Now initialize the output file. + auto OF = getOutputStream(); + if (std::error_code EC = OF.getError()) { + WithColor::error() << EC.message() << '\n'; + return 1; + } + + unsigned AssemblerDialect = CRG.getAssemblerDialect(); + if (OutputAsmVariant >= 0) + AssemblerDialect = static_cast<unsigned>(OutputAsmVariant); + std::unique_ptr<MCInstPrinter> IP(TheTarget->createMCInstPrinter( + Triple(TripleName), AssemblerDialect, *MAI, *MCII, *MRI)); + if (!IP) { + WithColor::error() + << "unable to create instruction printer for target triple '" + << TheTriple.normalize() << "' with assembly variant " + << AssemblerDialect << ".\n"; + return 1; + } + + // Set the display preference for hex vs. decimal immediates. + IP->setPrintImmHex(PrintImmHex); + + std::unique_ptr<ToolOutputFile> TOF = std::move(*OF); + + const MCSchedModel &SM = STI->getSchedModel(); + + // Create an instruction builder. + mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get()); + + // Create a context to control ownership of the pipeline hardware. + mca::Context MCA(*MRI, *STI); + + mca::PipelineOptions PO(MicroOpQueue, DecoderThroughput, DispatchWidth, + RegisterFileSize, LoadQueueSize, StoreQueueSize, + AssumeNoAlias, EnableBottleneckAnalysis); + + // Number each region in the sequence. + unsigned RegionIdx = 0; + + std::unique_ptr<MCCodeEmitter> MCE( + TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx)); + + std::unique_ptr<MCAsmBackend> MAB(TheTarget->createMCAsmBackend( + *STI, *MRI, InitMCTargetOptionsFromFlags())); + + for (const std::unique_ptr<mca::CodeRegion> &Region : Regions) { + // Skip empty code regions. + if (Region->empty()) + continue; + + // Don't print the header of this region if it is the default region, and + // it doesn't have an end location. + if (Region->startLoc().isValid() || Region->endLoc().isValid()) { + TOF->os() << "\n[" << RegionIdx++ << "] Code Region"; + StringRef Desc = Region->getDescription(); + if (!Desc.empty()) + TOF->os() << " - " << Desc; + TOF->os() << "\n\n"; + } + + // Lower the MCInst sequence into an mca::Instruction sequence. + ArrayRef<MCInst> Insts = Region->getInstructions(); + mca::CodeEmitter CE(*STI, *MAB, *MCE, Insts); + std::vector<std::unique_ptr<mca::Instruction>> LoweredSequence; + for (const MCInst &MCI : Insts) { + Expected<std::unique_ptr<mca::Instruction>> Inst = + IB.createInstruction(MCI); + if (!Inst) { + if (auto NewE = handleErrors( + Inst.takeError(), + [&IP, &STI](const mca::InstructionError<MCInst> &IE) { + std::string InstructionStr; + raw_string_ostream SS(InstructionStr); + WithColor::error() << IE.Message << '\n'; + IP->printInst(&IE.Inst, SS, "", *STI); + SS.flush(); + WithColor::note() + << "instruction: " << InstructionStr << '\n'; + })) { + // Default case. + WithColor::error() << toString(std::move(NewE)); + } + return 1; + } + + LoweredSequence.emplace_back(std::move(Inst.get())); + } + + mca::SourceMgr S(LoweredSequence, PrintInstructionTables ? 1 : Iterations); + + if (PrintInstructionTables) { + // Create a pipeline, stages, and a printer. 
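+      // In -instruction-tables mode nothing is simulated: the pipeline only
+      // consists of the entry stage and the InstructionTables stage, and the
+      // report is computed from the scheduling model alone.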
+ auto P = std::make_unique<mca::Pipeline>(); + P->appendStage(std::make_unique<mca::EntryStage>(S)); + P->appendStage(std::make_unique<mca::InstructionTables>(SM)); + mca::PipelinePrinter Printer(*P); + + // Create the views for this pipeline, execute, and emit a report. + if (PrintInstructionInfoView) { + Printer.addView(std::make_unique<mca::InstructionInfoView>( + *STI, *MCII, CE, ShowEncoding, Insts, *IP)); + } + Printer.addView( + std::make_unique<mca::ResourcePressureView>(*STI, *IP, Insts)); + + if (!runPipeline(*P)) + return 1; + + Printer.printReport(TOF->os()); + continue; + } + + // Create a basic pipeline simulating an out-of-order backend. + auto P = MCA.createDefaultPipeline(PO, S); + mca::PipelinePrinter Printer(*P); + + if (PrintSummaryView) + Printer.addView( + std::make_unique<mca::SummaryView>(SM, Insts, DispatchWidth)); + + if (EnableBottleneckAnalysis) { + Printer.addView(std::make_unique<mca::BottleneckAnalysis>( + *STI, *IP, Insts, S.getNumIterations())); + } + + if (PrintInstructionInfoView) + Printer.addView(std::make_unique<mca::InstructionInfoView>( + *STI, *MCII, CE, ShowEncoding, Insts, *IP)); + + if (PrintDispatchStats) + Printer.addView(std::make_unique<mca::DispatchStatistics>()); + + if (PrintSchedulerStats) + Printer.addView(std::make_unique<mca::SchedulerStatistics>(*STI)); + + if (PrintRetireStats) + Printer.addView(std::make_unique<mca::RetireControlUnitStatistics>(SM)); + + if (PrintRegisterFileStats) + Printer.addView(std::make_unique<mca::RegisterFileStatistics>(*STI)); + + if (PrintResourcePressureView) + Printer.addView( + std::make_unique<mca::ResourcePressureView>(*STI, *IP, Insts)); + + if (PrintTimelineView) { + unsigned TimelineIterations = + TimelineMaxIterations ? TimelineMaxIterations : 10; + Printer.addView(std::make_unique<mca::TimelineView>( + *STI, *IP, Insts, std::min(TimelineIterations, S.getNumIterations()), + TimelineMaxCycles)); + } + + if (!runPipeline(*P)) + return 1; + + Printer.printReport(TOF->os()); + + // Clear the InstrBuilder internal state in preparation for another round. + IB.clear(); + } + + TOF->keep(); + return 0; +} |
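
A closing note on extensibility: the View interface introduced by this patch is small, so adding a new section to the report mostly amounts to subclassing View and registering the object with the PipelinePrinter, the same way main() does above. The sketch below is illustrative only and is not part of the patch; the CycleCounterView name and its output string are invented for the example, and it assumes the Views/View.h header added above.

#include "Views/View.h"
#include "llvm/Support/raw_ostream.h"

namespace llvm {
namespace mca {

// A deliberately tiny view: it observes cycle-end events to count how many
// cycles were simulated, and prints a one-line summary in the final report.
class CycleCounterView : public View {
  unsigned Cycles = 0;

public:
  void onCycleEnd() override { ++Cycles; }
  void printView(llvm::raw_ostream &OS) const override {
    OS << "\nSimulated cycles: " << Cycles << '\n';
  }
};

} // namespace mca
} // namespace llvm

Registering it next to the other views in main() would then be a single call, e.g. Printer.addView(std::make_unique<mca::CycleCounterView>());.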