diff options
Diffstat (limited to 'llvm/include/llvm/Support/SuffixTree.h')
-rw-r--r-- | llvm/include/llvm/Support/SuffixTree.h | 350 |
1 files changed, 350 insertions, 0 deletions
diff --git a/llvm/include/llvm/Support/SuffixTree.h b/llvm/include/llvm/Support/SuffixTree.h new file mode 100644 index 0000000000000..67d513d032cef --- /dev/null +++ b/llvm/include/llvm/Support/SuffixTree.h @@ -0,0 +1,350 @@ +//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the Suffix Tree class and Suffix Tree Node struct. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_SUPPORT_SUFFIXTREE_H +#define LLVM_SUPPORT_SUFFIXTREE_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/Allocator.h" +#include <vector> + +namespace llvm { + +/// Represents an undefined index in the suffix tree. +const unsigned EmptyIdx = -1; + +/// A node in a suffix tree which represents a substring or suffix. +/// +/// Each node has either no children or at least two children, with the root +/// being a exception in the empty tree. +/// +/// Children are represented as a map between unsigned integers and nodes. If +/// a node N has a child M on unsigned integer k, then the mapping represented +/// by N is a proper prefix of the mapping represented by M. Note that this, +/// although similar to a trie is somewhat different: each node stores a full +/// substring of the full mapping rather than a single character state. +/// +/// Each internal node contains a pointer to the internal node representing +/// the same string, but with the first character chopped off. This is stored +/// in \p Link. Each leaf node stores the start index of its respective +/// suffix in \p SuffixIdx. +struct SuffixTreeNode { + + /// The children of this node. + /// + /// A child existing on an unsigned integer implies that from the mapping + /// represented by the current node, there is a way to reach another + /// mapping by tacking that character on the end of the current string. + llvm::DenseMap<unsigned, SuffixTreeNode *> Children; + + /// The start index of this node's substring in the main string. + unsigned StartIdx = EmptyIdx; + + /// The end index of this node's substring in the main string. + /// + /// Every leaf node must have its \p EndIdx incremented at the end of every + /// step in the construction algorithm. To avoid having to update O(N) + /// nodes individually at the end of every step, the end index is stored + /// as a pointer. + unsigned *EndIdx = nullptr; + + /// For leaves, the start index of the suffix represented by this node. + /// + /// For all other nodes, this is ignored. + unsigned SuffixIdx = EmptyIdx; + + /// For internal nodes, a pointer to the internal node representing + /// the same sequence with the first character chopped off. + /// + /// This acts as a shortcut in Ukkonen's algorithm. One of the things that + /// Ukkonen's algorithm does to achieve linear-time construction is + /// keep track of which node the next insert should be at. This makes each + /// insert O(1), and there are a total of O(N) inserts. The suffix link + /// helps with inserting children of internal nodes. + /// + /// Say we add a child to an internal node with associated mapping S. The + /// next insertion must be at the node representing S - its first character. + /// This is given by the way that we iteratively build the tree in Ukkonen's + /// algorithm. The main idea is to look at the suffixes of each prefix in the + /// string, starting with the longest suffix of the prefix, and ending with + /// the shortest. Therefore, if we keep pointers between such nodes, we can + /// move to the next insertion point in O(1) time. If we don't, then we'd + /// have to query from the root, which takes O(N) time. This would make the + /// construction algorithm O(N^2) rather than O(N). + SuffixTreeNode *Link = nullptr; + + /// The length of the string formed by concatenating the edge labels from the + /// root to this node. + unsigned ConcatLen = 0; + + /// Returns true if this node is a leaf. + bool isLeaf() const { return SuffixIdx != EmptyIdx; } + + /// Returns true if this node is the root of its owning \p SuffixTree. + bool isRoot() const { return StartIdx == EmptyIdx; } + + /// Return the number of elements in the substring associated with this node. + size_t size() const { + + // Is it the root? If so, it's the empty string so return 0. + if (isRoot()) + return 0; + + assert(*EndIdx != EmptyIdx && "EndIdx is undefined!"); + + // Size = the number of elements in the string. + // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1. + return *EndIdx - StartIdx + 1; + } + + SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link) + : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {} + + SuffixTreeNode() {} +}; + +/// A data structure for fast substring queries. +/// +/// Suffix trees represent the suffixes of their input strings in their leaves. +/// A suffix tree is a type of compressed trie structure where each node +/// represents an entire substring rather than a single character. Each leaf +/// of the tree is a suffix. +/// +/// A suffix tree can be seen as a type of state machine where each state is a +/// substring of the full string. The tree is structured so that, for a string +/// of length N, there are exactly N leaves in the tree. This structure allows +/// us to quickly find repeated substrings of the input string. +/// +/// In this implementation, a "string" is a vector of unsigned integers. +/// These integers may result from hashing some data type. A suffix tree can +/// contain 1 or many strings, which can then be queried as one large string. +/// +/// The suffix tree is implemented using Ukkonen's algorithm for linear-time +/// suffix tree construction. Ukkonen's algorithm is explained in more detail +/// in the paper by Esko Ukkonen "On-line construction of suffix trees. The +/// paper is available at +/// +/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf +class SuffixTree { +public: + /// Each element is an integer representing an instruction in the module. + llvm::ArrayRef<unsigned> Str; + + /// A repeated substring in the tree. + struct RepeatedSubstring { + /// The length of the string. + unsigned Length; + + /// The start indices of each occurrence. + std::vector<unsigned> StartIndices; + }; + +private: + /// Maintains each node in the tree. + llvm::SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator; + + /// The root of the suffix tree. + /// + /// The root represents the empty string. It is maintained by the + /// \p NodeAllocator like every other node in the tree. + SuffixTreeNode *Root = nullptr; + + /// Maintains the end indices of the internal nodes in the tree. + /// + /// Each internal node is guaranteed to never have its end index change + /// during the construction algorithm; however, leaves must be updated at + /// every step. Therefore, we need to store leaf end indices by reference + /// to avoid updating O(N) leaves at every step of construction. Thus, + /// every internal node must be allocated its own end index. + llvm::BumpPtrAllocator InternalEndIdxAllocator; + + /// The end index of each leaf in the tree. + unsigned LeafEndIdx = -1; + + /// Helper struct which keeps track of the next insertion point in + /// Ukkonen's algorithm. + struct ActiveState { + /// The next node to insert at. + SuffixTreeNode *Node = nullptr; + + /// The index of the first character in the substring currently being added. + unsigned Idx = EmptyIdx; + + /// The length of the substring we have to add at the current step. + unsigned Len = 0; + }; + + /// The point the next insertion will take place at in the + /// construction algorithm. + ActiveState Active; + + /// Allocate a leaf node and add it to the tree. + /// + /// \param Parent The parent of this node. + /// \param StartIdx The start index of this node's associated string. + /// \param Edge The label on the edge leaving \p Parent to this node. + /// + /// \returns A pointer to the allocated leaf node. + SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx, + unsigned Edge); + + /// Allocate an internal node and add it to the tree. + /// + /// \param Parent The parent of this node. Only null when allocating the root. + /// \param StartIdx The start index of this node's associated string. + /// \param EndIdx The end index of this node's associated string. + /// \param Edge The label on the edge leaving \p Parent to this node. + /// + /// \returns A pointer to the allocated internal node. + SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx, + unsigned EndIdx, unsigned Edge); + + /// Set the suffix indices of the leaves to the start indices of their + /// respective suffixes. + void setSuffixIndices(); + + /// Construct the suffix tree for the prefix of the input ending at + /// \p EndIdx. + /// + /// Used to construct the full suffix tree iteratively. At the end of each + /// step, the constructed suffix tree is either a valid suffix tree, or a + /// suffix tree with implicit suffixes. At the end of the final step, the + /// suffix tree is a valid tree. + /// + /// \param EndIdx The end index of the current prefix in the main string. + /// \param SuffixesToAdd The number of suffixes that must be added + /// to complete the suffix tree at the current phase. + /// + /// \returns The number of suffixes that have not been added at the end of + /// this step. + unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd); + +public: + /// Construct a suffix tree from a sequence of unsigned integers. + /// + /// \param Str The string to construct the suffix tree for. + SuffixTree(const std::vector<unsigned> &Str); + + /// Iterator for finding all repeated substrings in the suffix tree. + struct RepeatedSubstringIterator { + private: + /// The current node we're visiting. + SuffixTreeNode *N = nullptr; + + /// The repeated substring associated with this node. + RepeatedSubstring RS; + + /// The nodes left to visit. + std::vector<SuffixTreeNode *> ToVisit; + + /// The minimum length of a repeated substring to find. + /// Since we're outlining, we want at least two instructions in the range. + /// FIXME: This may not be true for targets like X86 which support many + /// instruction lengths. + const unsigned MinLength = 2; + + /// Move the iterator to the next repeated substring. + void advance() { + // Clear the current state. If we're at the end of the range, then this + // is the state we want to be in. + RS = RepeatedSubstring(); + N = nullptr; + + // Each leaf node represents a repeat of a string. + std::vector<SuffixTreeNode *> LeafChildren; + + // Continue visiting nodes until we find one which repeats more than once. + while (!ToVisit.empty()) { + SuffixTreeNode *Curr = ToVisit.back(); + ToVisit.pop_back(); + LeafChildren.clear(); + + // Keep track of the length of the string associated with the node. If + // it's too short, we'll quit. + unsigned Length = Curr->ConcatLen; + + // Iterate over each child, saving internal nodes for visiting, and + // leaf nodes in LeafChildren. Internal nodes represent individual + // strings, which may repeat. + for (auto &ChildPair : Curr->Children) { + // Save all of this node's children for processing. + if (!ChildPair.second->isLeaf()) + ToVisit.push_back(ChildPair.second); + + // It's not an internal node, so it must be a leaf. If we have a + // long enough string, then save the leaf children. + else if (Length >= MinLength) + LeafChildren.push_back(ChildPair.second); + } + + // The root never represents a repeated substring. If we're looking at + // that, then skip it. + if (Curr->isRoot()) + continue; + + // Do we have any repeated substrings? + if (LeafChildren.size() >= 2) { + // Yes. Update the state to reflect this, and then bail out. + N = Curr; + RS.Length = Length; + for (SuffixTreeNode *Leaf : LeafChildren) + RS.StartIndices.push_back(Leaf->SuffixIdx); + break; + } + } + + // At this point, either NewRS is an empty RepeatedSubstring, or it was + // set in the above loop. Similarly, N is either nullptr, or the node + // associated with NewRS. + } + + public: + /// Return the current repeated substring. + RepeatedSubstring &operator*() { return RS; } + + RepeatedSubstringIterator &operator++() { + advance(); + return *this; + } + + RepeatedSubstringIterator operator++(int I) { + RepeatedSubstringIterator It(*this); + advance(); + return It; + } + + bool operator==(const RepeatedSubstringIterator &Other) { + return N == Other.N; + } + bool operator!=(const RepeatedSubstringIterator &Other) { + return !(*this == Other); + } + + RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) { + // Do we have a non-null node? + if (N) { + // Yes. At the first step, we need to visit all of N's children. + // Note: This means that we visit N last. + ToVisit.push_back(N); + advance(); + } + } + }; + + typedef RepeatedSubstringIterator iterator; + iterator begin() { return iterator(Root); } + iterator end() { return iterator(nullptr); } +}; + +} // namespace llvm + +#endif // LLVM_SUPPORT_SUFFIXTREE_H |