diff options
Diffstat (limited to 'ELF/CallGraphSort.cpp')
-rw-r--r-- | ELF/CallGraphSort.cpp | 249 |
1 files changed, 249 insertions, 0 deletions
diff --git a/ELF/CallGraphSort.cpp b/ELF/CallGraphSort.cpp new file mode 100644 index 000000000000..33ac159a6e26 --- /dev/null +++ b/ELF/CallGraphSort.cpp @@ -0,0 +1,249 @@ +//===- CallGraphSort.cpp --------------------------------------------------===// +// +// The LLVM Linker +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// Implementation of Call-Chain Clustering from: Optimizing Function Placement +/// for Large-Scale Data-Center Applications +/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +/// +/// The goal of this algorithm is to improve runtime performance of the final +/// executable by arranging code sections such that page table and i-cache +/// misses are minimized. +/// +/// Definitions: +/// * Cluster +/// * An ordered list of input sections which are layed out as a unit. At the +/// beginning of the algorithm each input section has its own cluster and +/// the weight of the cluster is the sum of the weight of all incomming +/// edges. +/// * Call-Chain Clustering (C³) Heuristic +/// * Defines when and how clusters are combined. Pick the highest weighted +/// input section then add it to its most likely predecessor if it wouldn't +/// penalize it too much. +/// * Density +/// * The weight of the cluster divided by the size of the cluster. This is a +/// proxy for the ammount of execution time spent per byte of the cluster. +/// +/// It does so given a call graph profile by the following: +/// * Build a weighted call graph from the call graph profile +/// * Sort input sections by weight +/// * For each input section starting with the highest weight +/// * Find its most likely predecessor cluster +/// * Check if the combined cluster would be too large, or would have too low +/// a density. +/// * If not, then combine the clusters. +/// * Sort non-empty clusters by density +/// +//===----------------------------------------------------------------------===// + +#include "CallGraphSort.h" +#include "OutputSections.h" +#include "SymbolTable.h" +#include "Symbols.h" + +using namespace llvm; +using namespace lld; +using namespace lld::elf; + +namespace { +struct Edge { + int From; + uint64_t Weight; +}; + +struct Cluster { + Cluster(int Sec, size_t S) { + Sections.push_back(Sec); + Size = S; + } + + double getDensity() const { + if (Size == 0) + return 0; + return double(Weight) / double(Size); + } + + std::vector<int> Sections; + size_t Size = 0; + uint64_t Weight = 0; + uint64_t InitialWeight = 0; + std::vector<Edge> Preds; +}; + +class CallGraphSort { +public: + CallGraphSort(); + + DenseMap<const InputSectionBase *, int> run(); + +private: + std::vector<Cluster> Clusters; + std::vector<const InputSectionBase *> Sections; + + void groupClusters(); +}; + +// Maximum ammount the combined cluster density can be worse than the original +// cluster to consider merging. +constexpr int MAX_DENSITY_DEGRADATION = 8; + +// Maximum cluster size in bytes. +constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024; +} // end anonymous namespace + +// Take the edge list in Config->CallGraphProfile, resolve symbol names to +// Symbols, and generate a graph between InputSections with the provided +// weights. +CallGraphSort::CallGraphSort() { + llvm::MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>, + uint64_t> &Profile = Config->CallGraphProfile; + DenseMap<const InputSectionBase *, int> SecToCluster; + + auto GetOrCreateNode = [&](const InputSectionBase *IS) -> int { + auto Res = SecToCluster.insert(std::make_pair(IS, Clusters.size())); + if (Res.second) { + Sections.push_back(IS); + Clusters.emplace_back(Clusters.size(), IS->getSize()); + } + return Res.first->second; + }; + + // Create the graph. + for (const auto &C : Profile) { + const auto *FromSB = cast<InputSectionBase>(C.first.first->Repl); + const auto *ToSB = cast<InputSectionBase>(C.first.second->Repl); + uint64_t Weight = C.second; + + // Ignore edges between input sections belonging to different output + // sections. This is done because otherwise we would end up with clusters + // containing input sections that can't actually be placed adjacently in the + // output. This messes with the cluster size and density calculations. We + // would also end up moving input sections in other output sections without + // moving them closer to what calls them. + if (FromSB->getOutputSection() != ToSB->getOutputSection()) + continue; + + int From = GetOrCreateNode(FromSB); + int To = GetOrCreateNode(ToSB); + + Clusters[To].Weight += Weight; + + if (From == To) + continue; + + // Add an edge + Clusters[To].Preds.push_back({From, Weight}); + } + for (Cluster &C : Clusters) + C.InitialWeight = C.Weight; +} + +// It's bad to merge clusters which would degrade the density too much. +static bool isNewDensityBad(Cluster &A, Cluster &B) { + double NewDensity = double(A.Weight + B.Weight) / double(A.Size + B.Size); + if (NewDensity < A.getDensity() / MAX_DENSITY_DEGRADATION) + return true; + return false; +} + +static void mergeClusters(Cluster &Into, Cluster &From) { + Into.Sections.insert(Into.Sections.end(), From.Sections.begin(), + From.Sections.end()); + Into.Size += From.Size; + Into.Weight += From.Weight; + From.Sections.clear(); + From.Size = 0; + From.Weight = 0; +} + +// Group InputSections into clusters using the Call-Chain Clustering heuristic +// then sort the clusters by density. +void CallGraphSort::groupClusters() { + std::vector<int> SortedSecs(Clusters.size()); + std::vector<Cluster *> SecToCluster(Clusters.size()); + + for (int SI = 0, SE = Clusters.size(); SI != SE; ++SI) { + SortedSecs[SI] = SI; + SecToCluster[SI] = &Clusters[SI]; + } + + std::stable_sort(SortedSecs.begin(), SortedSecs.end(), [&](int A, int B) { + return Clusters[B].getDensity() < Clusters[A].getDensity(); + }); + + for (int SI : SortedSecs) { + // Clusters[SI] is the same as SecToClusters[SI] here because it has not + // been merged into another cluster yet. + Cluster &C = Clusters[SI]; + + int BestPred = -1; + uint64_t BestWeight = 0; + + for (Edge &E : C.Preds) { + if (BestPred == -1 || E.Weight > BestWeight) { + BestPred = E.From; + BestWeight = E.Weight; + } + } + + // don't consider merging if the edge is unlikely. + if (BestWeight * 10 <= C.InitialWeight) + continue; + + Cluster *PredC = SecToCluster[BestPred]; + if (PredC == &C) + continue; + + if (C.Size + PredC->Size > MAX_CLUSTER_SIZE) + continue; + + if (isNewDensityBad(*PredC, C)) + continue; + + // NOTE: Consider using a disjoint-set to track section -> cluster mapping + // if this is ever slow. + for (int SI : C.Sections) + SecToCluster[SI] = PredC; + + mergeClusters(*PredC, C); + } + + // Remove empty or dead nodes. Invalidates all cluster indices. + llvm::erase_if(Clusters, [](const Cluster &C) { + return C.Size == 0 || C.Sections.empty(); + }); + + // Sort by density. + std::stable_sort(Clusters.begin(), Clusters.end(), + [](const Cluster &A, const Cluster &B) { + return A.getDensity() > B.getDensity(); + }); +} + +DenseMap<const InputSectionBase *, int> CallGraphSort::run() { + groupClusters(); + + // Generate order. + llvm::DenseMap<const InputSectionBase *, int> OrderMap; + ssize_t CurOrder = 1; + + for (const Cluster &C : Clusters) + for (int SecIndex : C.Sections) + OrderMap[Sections[SecIndex]] = CurOrder++; + + return OrderMap; +} + +// Sort sections by the profile data provided by -callgraph-profile-file +// +// This first builds a call graph based on the profile data then merges sections +// according to the C³ huristic. All clusters are then sorted by a density +// metric to further improve locality. +DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() { + return CallGraphSort().run(); +} |