summaryrefslogtreecommitdiff
path: root/lld/MachO/ExportTrie.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lld/MachO/ExportTrie.cpp')
-rw-r--r--lld/MachO/ExportTrie.cpp283
1 files changed, 283 insertions, 0 deletions
diff --git a/lld/MachO/ExportTrie.cpp b/lld/MachO/ExportTrie.cpp
new file mode 100644
index 000000000000..7cc81bcfd5f1
--- /dev/null
+++ b/lld/MachO/ExportTrie.cpp
@@ -0,0 +1,283 @@
+//===- ExportTrie.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a partial implementation of the Mach-O export trie format. It's
+// essentially a symbol table encoded as a compressed prefix trie, meaning that
+// the common prefixes of each symbol name are shared for a more compact
+// representation. The prefixes are stored on the edges of the trie, and one
+// edge can represent multiple characters. For example, given two exported
+// symbols _bar and _baz, we will have a trie like this (terminal nodes are
+// marked with an asterisk):
+//
+// +-+-+
+// | | // root node
+// +-+-+
+// |
+// | _ba
+// |
+// +-+-+
+// | |
+// +-+-+
+// r / \ z
+// / \
+// +-+-+ +-+-+
+// | * | | * |
+// +-+-+ +-+-+
+//
+// More documentation of the format can be found in
+// llvm/tools/obj2yaml/macho2yaml.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ExportTrie.h"
+#include "Symbols.h"
+
+#include "lld/Common/ErrorHandler.h"
+#include "lld/Common/Memory.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/Support/LEB128.h"
+
+using namespace llvm;
+using namespace llvm::MachO;
+using namespace lld;
+using namespace lld::macho;
+
+namespace {
+
+struct Edge {
+ Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
+
+ StringRef substring;
+ struct TrieNode *child;
+};
+
+struct ExportInfo {
+ uint64_t address;
+ // TODO: Add proper support for re-exports & stub-and-resolver flags.
+};
+
+} // namespace
+
+struct macho::TrieNode {
+ std::vector<Edge> edges;
+ Optional<ExportInfo> info;
+ // Estimated offset from the start of the serialized trie to the current node.
+ // This will converge to the true offset when updateOffset() is run to a
+ // fixpoint.
+ size_t offset = 0;
+
+ // Returns whether the new estimated offset differs from the old one.
+ bool updateOffset(size_t &nextOffset);
+ void writeTo(uint8_t *buf) const;
+};
+
+bool TrieNode::updateOffset(size_t &nextOffset) {
+ // Size of the whole node (including the terminalSize and the outgoing edges.)
+ // In contrast, terminalSize only records the size of the other data in the
+ // node.
+ size_t nodeSize;
+ if (info) {
+ uint64_t flags = 0;
+ uint32_t terminalSize =
+ getULEB128Size(flags) + getULEB128Size(info->address);
+ // Overall node size so far is the uleb128 size of the length of the symbol
+ // info + the symbol info itself.
+ nodeSize = terminalSize + getULEB128Size(terminalSize);
+ } else {
+ nodeSize = 1; // Size of terminalSize (which has a value of 0)
+ }
+ // Compute size of all child edges.
+ ++nodeSize; // Byte for number of children.
+ for (Edge &edge : edges) {
+ nodeSize += edge.substring.size() + 1 // String length.
+ + getULEB128Size(edge.child->offset); // Offset len.
+ }
+ // On input, 'nextOffset' is the new preferred location for this node.
+ bool result = (offset != nextOffset);
+ // Store new location in node object for use by parents.
+ offset = nextOffset;
+ nextOffset += nodeSize;
+ return result;
+}
+
+void TrieNode::writeTo(uint8_t *buf) const {
+ buf += offset;
+ if (info) {
+ // TrieNodes with Symbol info: size, flags address
+ uint64_t flags = 0; // TODO: emit proper flags
+ uint32_t terminalSize =
+ getULEB128Size(flags) + getULEB128Size(info->address);
+ buf += encodeULEB128(terminalSize, buf);
+ buf += encodeULEB128(flags, buf);
+ buf += encodeULEB128(info->address, buf);
+ } else {
+ // TrieNode with no Symbol info.
+ *buf++ = 0; // terminalSize
+ }
+ // Add number of children. TODO: Handle case where we have more than 256.
+ assert(edges.size() < 256);
+ *buf++ = edges.size();
+ // Append each child edge substring and node offset.
+ for (const Edge &edge : edges) {
+ memcpy(buf, edge.substring.data(), edge.substring.size());
+ buf += edge.substring.size();
+ *buf++ = '\0';
+ buf += encodeULEB128(edge.child->offset, buf);
+ }
+}
+
+TrieNode *TrieBuilder::makeNode() {
+ auto *node = make<TrieNode>();
+ nodes.emplace_back(node);
+ return node;
+}
+
+static int charAt(const Symbol *sym, size_t pos) {
+ StringRef str = sym->getName();
+ if (pos >= str.size())
+ return -1;
+ return str[pos];
+}
+
+// Build the trie by performing a three-way radix quicksort: We start by sorting
+// the strings by their first characters, then sort the strings with the same
+// first characters by their second characters, and so on recursively. Each
+// time the prefixes diverge, we add a node to the trie.
+//
+// node: The most recently created node along this path in the trie (i.e.
+// the furthest from the root.)
+// lastPos: The prefix length of the most recently created node, i.e. the number
+// of characters along its path from the root.
+// pos: The string index we are currently sorting on. Note that each symbol
+// S contained in vec has the same prefix S[0...pos).
+void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
+ TrieNode *node, size_t lastPos, size_t pos) {
+tailcall:
+ if (vec.empty())
+ return;
+
+ // Partition items so that items in [0, i) are less than the pivot,
+ // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
+ // the pivot.
+ const Symbol *pivotSymbol = vec[vec.size() / 2];
+ int pivot = charAt(pivotSymbol, pos);
+ size_t i = 0;
+ size_t j = vec.size();
+ for (size_t k = 0; k < j;) {
+ int c = charAt(vec[k], pos);
+ if (c < pivot)
+ std::swap(vec[i++], vec[k++]);
+ else if (c > pivot)
+ std::swap(vec[--j], vec[k]);
+ else
+ k++;
+ }
+
+ bool isTerminal = pivot == -1;
+ bool prefixesDiverge = i != 0 || j != vec.size();
+ if (lastPos != pos && (isTerminal || prefixesDiverge)) {
+ TrieNode *newNode = makeNode();
+ node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
+ newNode);
+ node = newNode;
+ lastPos = pos;
+ }
+
+ sortAndBuild(vec.slice(0, i), node, lastPos, pos);
+ sortAndBuild(vec.slice(j), node, lastPos, pos);
+
+ if (isTerminal) {
+ assert(j - i == 1); // no duplicate symbols
+ node->info = {pivotSymbol->getVA()};
+ } else {
+ // This is the tail-call-optimized version of the following:
+ // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
+ vec = vec.slice(i, j - i);
+ ++pos;
+ goto tailcall;
+ }
+}
+
+size_t TrieBuilder::build() {
+ if (exported.empty())
+ return 0;
+
+ TrieNode *root = makeNode();
+ sortAndBuild(exported, root, 0, 0);
+
+ // Assign each node in the vector an offset in the trie stream, iterating
+ // until all uleb128 sizes have stabilized.
+ size_t offset;
+ bool more;
+ do {
+ offset = 0;
+ more = false;
+ for (TrieNode *node : nodes)
+ more |= node->updateOffset(offset);
+ } while (more);
+
+ return offset;
+}
+
+void TrieBuilder::writeTo(uint8_t *buf) const {
+ for (TrieNode *node : nodes)
+ node->writeTo(buf);
+}
+
+namespace {
+
+// Parse a serialized trie and invoke a callback for each entry.
+class TrieParser {
+public:
+ TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
+ : start(buf), end(start + size), callback(callback) {}
+
+ void parse(const uint8_t *buf, const Twine &cumulativeString);
+
+ void parse() { parse(start, ""); }
+
+ const uint8_t *start;
+ const uint8_t *end;
+ const TrieEntryCallback &callback;
+};
+
+} // namespace
+
+void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
+ if (buf >= end)
+ fatal("Node offset points outside export section");
+
+ unsigned ulebSize;
+ uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
+ buf += ulebSize;
+ uint64_t flags = 0;
+ size_t offset;
+ if (terminalSize != 0) {
+ flags = decodeULEB128(buf, &ulebSize);
+ callback(cumulativeString, flags);
+ }
+ buf += terminalSize;
+ uint8_t numEdges = *buf++;
+ for (uint8_t i = 0; i < numEdges; ++i) {
+ const char *cbuf = reinterpret_cast<const char *>(buf);
+ StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
+ buf += substring.size() + 1;
+ offset = decodeULEB128(buf, &ulebSize);
+ buf += ulebSize;
+ parse(start + offset, cumulativeString + substring);
+ }
+}
+
+void macho::parseTrie(const uint8_t *buf, size_t size,
+ const TrieEntryCallback &callback) {
+ if (size == 0)
+ return;
+
+ TrieParser(buf, size, callback).parse();
+}