aboutsummaryrefslogtreecommitdiff
path: root/lld/ELF/CallGraphSort.cpp
diff options
context:
space:
mode:
authorDimitry Andric <dim@FreeBSD.org>2023-12-09 13:28:42 +0000
committerDimitry Andric <dim@FreeBSD.org>2023-12-09 13:28:42 +0000
commitb1c73532ee8997fe5dfbeb7d223027bdf99758a0 (patch)
tree7d6e51c294ab6719475d660217aa0c0ad0526292 /lld/ELF/CallGraphSort.cpp
parent7fa27ce4a07f19b07799a767fc29416f3b625afb (diff)
Diffstat (limited to 'lld/ELF/CallGraphSort.cpp')
-rw-r--r--lld/ELF/CallGraphSort.cpp139
1 files changed, 106 insertions, 33 deletions
diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp
index ff72731b1f38..a0cf491bbae3 100644
--- a/lld/ELF/CallGraphSort.cpp
+++ b/lld/ELF/CallGraphSort.cpp
@@ -6,38 +6,21 @@
//
//===----------------------------------------------------------------------===//
///
-/// Implementation of Call-Chain Clustering from: Optimizing Function Placement
-/// for Large-Scale Data-Center Applications
-/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
-///
-/// The goal of this algorithm is to improve runtime performance of the final
-/// executable by arranging code sections such that page table and i-cache
-/// misses are minimized.
-///
-/// Definitions:
-/// * Cluster
-/// * An ordered list of input sections which are laid out as a unit. At the
-/// beginning of the algorithm each input section has its own cluster and
-/// the weight of the cluster is the sum of the weight of all incoming
-/// edges.
-/// * Call-Chain Clustering (C³) Heuristic
-/// * Defines when and how clusters are combined. Pick the highest weighted
-/// input section then add it to its most likely predecessor if it wouldn't
-/// penalize it too much.
-/// * Density
-/// * The weight of the cluster divided by the size of the cluster. This is a
-/// proxy for the amount of execution time spent per byte of the cluster.
-///
-/// It does so given a call graph profile by the following:
-/// * Build a weighted call graph from the call graph profile
-/// * Sort input sections by weight
-/// * For each input section starting with the highest weight
-/// * Find its most likely predecessor cluster
-/// * Check if the combined cluster would be too large, or would have too low
-/// a density.
-/// * If not, then combine the clusters.
-/// * Sort non-empty clusters by density
+/// The file is responsible for sorting sections using LLVM call graph profile
+/// data by placing frequently executed code sections together. The goal of the
+/// placement is to improve the runtime performance of the final executable by
+/// arranging code sections so that i-TLB misses and i-cache misses are reduced.
///
+/// The algorithm first builds a call graph based on the profile data and then
+/// iteratively merges "chains" (ordered lists) of input sections which will be
+/// laid out as a unit. There are two implementations for deciding how to
+/// merge a pair of chains:
+/// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows
+/// "Optimizing Function Placement for Large-Scale Data-Center Applications"
+/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
+/// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which
+/// typically produces layouts with higher locality, and hence, yields fewer
+/// instruction cache misses on large binaries.
//===----------------------------------------------------------------------===//
#include "CallGraphSort.h"
@@ -45,6 +28,7 @@
#include "InputSection.h"
#include "Symbols.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Transforms/Utils/CodeLayout.h"
#include <numeric>
@@ -75,6 +59,33 @@ struct Cluster {
Edge bestPred = {-1, 0};
};
+/// Implementation of the Call-Chain Clustering (C^3). The goal of this
+/// algorithm is to improve runtime performance of the executable by arranging
+/// code sections such that page table and i-cache misses are minimized.
+///
+/// Definitions:
+/// * Cluster
+/// * An ordered list of input sections which are laid out as a unit. At the
+/// beginning of the algorithm each input section has its own cluster and
+/// the weight of the cluster is the sum of the weight of all incoming
+/// edges.
+/// * Call-Chain Clustering (C³) Heuristic
+/// * Defines when and how clusters are combined. Pick the highest weighted
+/// input section then add it to its most likely predecessor if it wouldn't
+/// penalize it too much.
+/// * Density
+/// * The weight of the cluster divided by the size of the cluster. This is a
+/// proxy for the amount of execution time spent per byte of the cluster.
+///
+/// It does so given a call graph profile by the following:
+/// * Build a weighted call graph from the call graph profile
+/// * Sort input sections by weight
+/// * For each input section starting with the highest weight
+/// * Find its most likely predecessor cluster
+/// * Check if the combined cluster would be too large, or would have too low
+/// a density.
+/// * If not, then combine the clusters.
+/// * Sort non-empty clusters by density
class CallGraphSort {
public:
CallGraphSort();
@@ -260,11 +271,73 @@ DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
return orderMap;
}
+// Sort sections by the profile data using the Cache-Directed Sort algorithm.
+// The placement is done by optimizing the locality by co-locating frequently
+// executed code sections together.
+DenseMap<const InputSectionBase *, int> elf::computeCacheDirectedSortOrder() {
+ SmallVector<uint64_t, 0> funcSizes;
+ SmallVector<uint64_t, 0> funcCounts;
+ SmallVector<codelayout::EdgeCount, 0> callCounts;
+ SmallVector<uint64_t, 0> callOffsets;
+ SmallVector<const InputSectionBase *, 0> sections;
+ DenseMap<const InputSectionBase *, size_t> secToTargetId;
+
+ auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t {
+ auto res = secToTargetId.try_emplace(inSec, sections.size());
+ if (res.second) {
+ // inSec does not appear before in the graph.
+ sections.push_back(inSec);
+ funcSizes.push_back(inSec->getSize());
+ funcCounts.push_back(0);
+ }
+ return res.first->second;
+ };
+
+ // Create the graph.
+ for (std::pair<SectionPair, uint64_t> &c : config->callGraphProfile) {
+ const InputSectionBase *fromSB = cast<InputSectionBase>(c.first.first);
+ const InputSectionBase *toSB = cast<InputSectionBase>(c.first.second);
+ // Ignore edges between input sections belonging to different sections.
+ if (fromSB->getOutputSection() != toSB->getOutputSection())
+ continue;
+
+ uint64_t weight = c.second;
+ // Ignore edges with zero weight.
+ if (weight == 0)
+ continue;
+
+ size_t from = getOrCreateNode(fromSB);
+ size_t to = getOrCreateNode(toSB);
+ // Ignore self-edges (recursive calls).
+ if (from == to)
+ continue;
+
+ callCounts.push_back({from, to, weight});
+ // Assume that the jump is at the middle of the input section. The profile
+ // data does not contain jump offsets.
+ callOffsets.push_back((funcSizes[from] + 1) / 2);
+ funcCounts[to] += weight;
+ }
+
+ // Run the layout algorithm.
+ std::vector<uint64_t> sortedSections = codelayout::computeCacheDirectedLayout(
+ funcSizes, funcCounts, callCounts, callOffsets);
+
+ // Create the final order.
+ DenseMap<const InputSectionBase *, int> orderMap;
+ int curOrder = 1;
+ for (uint64_t secIdx : sortedSections)
+ orderMap[sections[secIdx]] = curOrder++;
+
+ return orderMap;
+}
+
// Sort sections by the profile data provided by --callgraph-profile-file.
//
// This first builds a call graph based on the profile data then merges sections
-// according to the C³ heuristic. All clusters are then sorted by a density
-// metric to further improve locality.
+// according either to the C³ or Cache-Directed-Sort ordering algorithm.
DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
+ if (config->callGraphProfileSort == CGProfileSortKind::Cdsort)
+ return computeCacheDirectedSortOrder();
return CallGraphSort().run();
}