Vendor import of llvm-project main llvmorg-18-init-14265-ga17671084db1. - src

diff options


context:
space:
mode:

author	Dimitry Andric <dim@FreeBSD.org>	2023-12-09 13:28:42 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2023-12-09 13:28:42 +0000
commit	b1c73532ee8997fe5dfbeb7d223027bdf99758a0 (patch)
tree	7d6e51c294ab6719475d660217aa0c0ad0526292 /lld/ELF/CallGraphSort.cpp
parent	7fa27ce4a07f19b07799a767fc29416f3b625afb (diff)

vendor/llvm-project/llvmorg-18-init-14265-ga17671084db1

Diffstat (limited to 'lld/ELF/CallGraphSort.cpp')

-rw-r--r--

lld/ELF/CallGraphSort.cpp

139

1 files changed, 106 insertions, 33 deletions

diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp
index ff72731b1f38..a0cf491bbae3 100644
--- a/lld/ELF/CallGraphSort.cpp
+++ b/lld/ELF/CallGraphSort.cpp

@@ -6,38 +6,21 @@

//===----------------------------------------------------------------------===//

///

-/// Implementation of Call-Chain Clustering from: Optimizing Function Placement

-/// for Large-Scale Data-Center Applications

-/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf

-///

-/// The goal of this algorithm is to improve runtime performance of the final

-/// executable by arranging code sections such that page table and i-cache

-/// misses are minimized.

-///

-/// Definitions:

-/// * Cluster

-/// * An ordered list of input sections which are laid out as a unit. At the

-/// beginning of the algorithm each input section has its own cluster and

-/// the weight of the cluster is the sum of the weight of all incoming

-/// edges.

-/// * Call-Chain Clustering (C³) Heuristic

-/// * Defines when and how clusters are combined. Pick the highest weighted

-/// input section then add it to its most likely predecessor if it wouldn't

-/// penalize it too much.

-/// * Density

-/// * The weight of the cluster divided by the size of the cluster. This is a

-/// proxy for the amount of execution time spent per byte of the cluster.

-///

-/// It does so given a call graph profile by the following:

-/// * Build a weighted call graph from the call graph profile

-/// * Sort input sections by weight

-/// * For each input section starting with the highest weight

-/// * Find its most likely predecessor cluster

-/// * Check if the combined cluster would be too large, or would have too low

-/// a density.

-/// * If not, then combine the clusters.

-/// * Sort non-empty clusters by density

+/// The file is responsible for sorting sections using LLVM call graph profile

+/// data by placing frequently executed code sections together. The goal of the

+/// placement is to improve the runtime performance of the final executable by

+/// arranging code sections so that i-TLB misses and i-cache misses are reduced.

///

+/// The algorithm first builds a call graph based on the profile data and then

+/// iteratively merges "chains" (ordered lists) of input sections which will be

+/// laid out as a unit. There are two implementations for deciding how to

+/// merge a pair of chains:

+/// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows

+/// "Optimizing Function Placement for Large-Scale Data-Center Applications"

+/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf

+/// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which

+/// typically produces layouts with higher locality, and hence, yields fewer

+/// instruction cache misses on large binaries.

//===----------------------------------------------------------------------===//

#include "CallGraphSort.h"

@@ -45,6 +28,7 @@

#include "InputSection.h"

#include "Symbols.h"

#include "llvm/Support/FileSystem.h"

+#include "llvm/Transforms/Utils/CodeLayout.h"

#include <numeric>

@@ -75,6 +59,33 @@ struct Cluster {

Edge bestPred = {-1, 0};

};

+/// Implementation of the Call-Chain Clustering (C^3). The goal of this

+/// algorithm is to improve runtime performance of the executable by arranging

+/// code sections such that page table and i-cache misses are minimized.

+///

+/// Definitions:

+/// * Cluster

+/// * An ordered list of input sections which are laid out as a unit. At the

+/// beginning of the algorithm each input section has its own cluster and

+/// the weight of the cluster is the sum of the weight of all incoming

+/// edges.

+/// * Call-Chain Clustering (C³) Heuristic

+/// * Defines when and how clusters are combined. Pick the highest weighted

+/// input section then add it to its most likely predecessor if it wouldn't

+/// penalize it too much.

+/// * Density

+/// * The weight of the cluster divided by the size of the cluster. This is a

+/// proxy for the amount of execution time spent per byte of the cluster.

+///

+/// It does so given a call graph profile by the following:

+/// * Build a weighted call graph from the call graph profile

+/// * Sort input sections by weight

+/// * For each input section starting with the highest weight

+/// * Find its most likely predecessor cluster

+/// * Check if the combined cluster would be too large, or would have too low

+/// a density.

+/// * If not, then combine the clusters.

+/// * Sort non-empty clusters by density

class CallGraphSort {

public:

CallGraphSort();

@@ -260,11 +271,73 @@ DenseMap<const InputSectionBase *, int> CallGraphSort::run() {

return orderMap;

}

+// Sort sections by the profile data using the Cache-Directed-Sort algorithm.

+// The placement is done by optimizing the locality by co-locating frequently

+// executed code sections together.

+DenseMap<const InputSectionBase *, int> elf::computeCacheDirectedSortOrder() {

+ SmallVector<uint64_t, 0> funcSizes;

+ SmallVector<uint64_t, 0> funcCounts;

+ SmallVector<codelayout::EdgeCount, 0> callCounts;

+ SmallVector<uint64_t, 0> callOffsets;

+ SmallVector<const InputSectionBase *, 0> sections;

+ DenseMap<const InputSectionBase *, size_t> secToTargetId;

+ auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t {

+ auto res = secToTargetId.try_emplace(inSec, sections.size());

+ if (res.second) {

+ // inSec has not appeared in the graph before.

+ sections.push_back(inSec);

+ funcSizes.push_back(inSec->getSize());

+ funcCounts.push_back(0);

+ }

+ return res.first->second;

+ };

+ // Create the graph.

+ for (std::pair<SectionPair, uint64_t> &c : config->callGraphProfile) {

+ const InputSectionBase *fromSB = cast<InputSectionBase>(c.first.first);

+ const InputSectionBase *toSB = cast<InputSectionBase>(c.first.second);

+ // Ignore edges between input sections belonging to different output sections.

+ if (fromSB->getOutputSection() != toSB->getOutputSection())

+ continue;

+ uint64_t weight = c.second;

+ // Ignore edges with zero weight.

+ if (weight == 0)

+ continue;

+ size_t from = getOrCreateNode(fromSB);

+ size_t to = getOrCreateNode(toSB);

+ // Ignore self-edges (recursive calls).

+ if (from == to)

+ continue;

+ callCounts.push_back({from, to, weight});

+ // Assume that the jump is at the middle of the input section. The profile

+ // data does not contain jump offsets.

+ callOffsets.push_back((funcSizes[from] + 1) / 2);

+ funcCounts[to] += weight;

+ }

+ // Run the layout algorithm.

+ std::vector<uint64_t> sortedSections = codelayout::computeCacheDirectedLayout(

+ funcSizes, funcCounts, callCounts, callOffsets);

+ // Create the final order.

+ DenseMap<const InputSectionBase *, int> orderMap;

+ int curOrder = 1;

+ for (uint64_t secIdx : sortedSections)

+ orderMap[sections[secIdx]] = curOrder++;

+ return orderMap;

+}

// Sort sections by the profile data provided by --callgraph-profile-file.

// This first builds a call graph based on the profile data then merges sections

-// according to the C³ heuristic. All clusters are then sorted by a density

-// metric to further improve locality.

+// according to either the C³ or the Cache-Directed-Sort ordering algorithm.

DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {

+ if (config->callGraphProfileSort == CGProfileSortKind::Cdsort)

+ return computeCacheDirectedSortOrder();

return CallGraphSort().run();

}