diff options
| author | Dimitry Andric <dim@FreeBSD.org> | 2023-12-09 13:28:42 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2023-12-09 13:28:42 +0000 |
| commit | b1c73532ee8997fe5dfbeb7d223027bdf99758a0 (patch) | |
| tree | 7d6e51c294ab6719475d660217aa0c0ad0526292 /lld/ELF/CallGraphSort.cpp | |
| parent | 7fa27ce4a07f19b07799a767fc29416f3b625afb (diff) | |
Diffstat (limited to 'lld/ELF/CallGraphSort.cpp')
| -rw-r--r-- | lld/ELF/CallGraphSort.cpp | 139 |
1 files changed, 106 insertions, 33 deletions
diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp index ff72731b1f38..a0cf491bbae3 100644 --- a/lld/ELF/CallGraphSort.cpp +++ b/lld/ELF/CallGraphSort.cpp @@ -6,38 +6,21 @@ // //===----------------------------------------------------------------------===// /// -/// Implementation of Call-Chain Clustering from: Optimizing Function Placement -/// for Large-Scale Data-Center Applications -/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf -/// -/// The goal of this algorithm is to improve runtime performance of the final -/// executable by arranging code sections such that page table and i-cache -/// misses are minimized. -/// -/// Definitions: -/// * Cluster -/// * An ordered list of input sections which are laid out as a unit. At the -/// beginning of the algorithm each input section has its own cluster and -/// the weight of the cluster is the sum of the weight of all incoming -/// edges. -/// * Call-Chain Clustering (C³) Heuristic -/// * Defines when and how clusters are combined. Pick the highest weighted -/// input section then add it to its most likely predecessor if it wouldn't -/// penalize it too much. -/// * Density -/// * The weight of the cluster divided by the size of the cluster. This is a -/// proxy for the amount of execution time spent per byte of the cluster. -/// -/// It does so given a call graph profile by the following: -/// * Build a weighted call graph from the call graph profile -/// * Sort input sections by weight -/// * For each input section starting with the highest weight -/// * Find its most likely predecessor cluster -/// * Check if the combined cluster would be too large, or would have too low -/// a density. -/// * If not, then combine the clusters. -/// * Sort non-empty clusters by density +/// The file is responsible for sorting sections using LLVM call graph profile +/// data by placing frequently executed code sections together. The goal of the +/// placement is to improve the runtime performance of the final executable by +/// arranging code sections so that i-TLB misses and i-cache misses are reduced. /// +/// The algorithm first builds a call graph based on the profile data and then +/// iteratively merges "chains" (ordered lists) of input sections which will be +/// laid out as a unit. There are two implementations for deciding how to +/// merge a pair of chains: +/// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows +/// "Optimizing Function Placement for Large-Scale Data-Center Applications" +/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +/// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which +/// typically produces layouts with higher locality, and hence, yields fewer +/// instruction cache misses on large binaries. //===----------------------------------------------------------------------===// #include "CallGraphSort.h" @@ -45,6 +28,7 @@ #include "InputSection.h" #include "Symbols.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Transforms/Utils/CodeLayout.h" #include <numeric> @@ -75,6 +59,33 @@ struct Cluster { Edge bestPred = {-1, 0}; }; +/// Implementation of the Call-Chain Clustering (C^3). The goal of this +/// algorithm is to improve runtime performance of the executable by arranging +/// code sections such that page table and i-cache misses are minimized. +/// +/// Definitions: +/// * Cluster +/// * An ordered list of input sections which are laid out as a unit. At the +/// beginning of the algorithm each input section has its own cluster and +/// the weight of the cluster is the sum of the weight of all incoming +/// edges. +/// * Call-Chain Clustering (C³) Heuristic +/// * Defines when and how clusters are combined. Pick the highest weighted +/// input section then add it to its most likely predecessor if it wouldn't +/// penalize it too much. +/// * Density +/// * The weight of the cluster divided by the size of the cluster. This is a +/// proxy for the amount of execution time spent per byte of the cluster. +/// +/// It does so given a call graph profile by the following: +/// * Build a weighted call graph from the call graph profile +/// * Sort input sections by weight +/// * For each input section starting with the highest weight +/// * Find its most likely predecessor cluster +/// * Check if the combined cluster would be too large, or would have too low +/// a density. +/// * If not, then combine the clusters. +/// * Sort non-empty clusters by density class CallGraphSort { public: CallGraphSort(); @@ -260,11 +271,73 @@ DenseMap<const InputSectionBase *, int> CallGraphSort::run() { return orderMap; } +// Sort sections by the profile data using the Cache-Directed Sort algorithm. +// The placement is done by optimizing the locality by co-locating frequently +// executed code sections together. +DenseMap<const InputSectionBase *, int> elf::computeCacheDirectedSortOrder() { + SmallVector<uint64_t, 0> funcSizes; + SmallVector<uint64_t, 0> funcCounts; + SmallVector<codelayout::EdgeCount, 0> callCounts; + SmallVector<uint64_t, 0> callOffsets; + SmallVector<const InputSectionBase *, 0> sections; + DenseMap<const InputSectionBase *, size_t> secToTargetId; + + auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t { + auto res = secToTargetId.try_emplace(inSec, sections.size()); + if (res.second) { + // inSec does not appear before in the graph. + sections.push_back(inSec); + funcSizes.push_back(inSec->getSize()); + funcCounts.push_back(0); + } + return res.first->second; + }; + + // Create the graph. + for (std::pair<SectionPair, uint64_t> &c : config->callGraphProfile) { + const InputSectionBase *fromSB = cast<InputSectionBase>(c.first.first); + const InputSectionBase *toSB = cast<InputSectionBase>(c.first.second); + // Ignore edges between input sections belonging to different sections. + if (fromSB->getOutputSection() != toSB->getOutputSection()) + continue; + + uint64_t weight = c.second; + // Ignore edges with zero weight. + if (weight == 0) + continue; + + size_t from = getOrCreateNode(fromSB); + size_t to = getOrCreateNode(toSB); + // Ignore self-edges (recursive calls). + if (from == to) + continue; + + callCounts.push_back({from, to, weight}); + // Assume that the jump is at the middle of the input section. The profile + // data does not contain jump offsets. + callOffsets.push_back((funcSizes[from] + 1) / 2); + funcCounts[to] += weight; + } + + // Run the layout algorithm. + std::vector<uint64_t> sortedSections = codelayout::computeCacheDirectedLayout( + funcSizes, funcCounts, callCounts, callOffsets); + + // Create the final order. + DenseMap<const InputSectionBase *, int> orderMap; + int curOrder = 1; + for (uint64_t secIdx : sortedSections) + orderMap[sections[secIdx]] = curOrder++; + + return orderMap; +} + // Sort sections by the profile data provided by --callgraph-profile-file. // // This first builds a call graph based on the profile data then merges sections -// according to the C³ heuristic. All clusters are then sorted by a density -// metric to further improve locality. +// according either to the C³ or Cache-Directed-Sort ordering algorithm. DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() { + if (config->callGraphProfileSort == CGProfileSortKind::Cdsort) + return computeCacheDirectedSortOrder(); return CallGraphSort().run(); } |
