1//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// A data structure for fast substring queries.
9//
10// Suffix trees represent the suffixes of their input strings in their leaves.
11// A suffix tree is a type of compressed trie structure where each node
12// represents an entire substring rather than a single character. Each leaf
13// of the tree is a suffix.
14//
15// A suffix tree can be seen as a type of state machine where each state is a
16// substring of the full string. The tree is structured so that, for a string
17// of length N, there are exactly N leaves in the tree. This structure allows
18// us to quickly find repeated substrings of the input string.
19//
20// In this implementation, a "string" is a vector of unsigned integers.
21// These integers may result from hashing some data type. A suffix tree can
22// contain 1 or many strings, which can then be queried as one large string.
23//
24// The suffix tree is implemented using Ukkonen's algorithm for linear-time
25// suffix tree construction. Ukkonen's algorithm is explained in more detail
26// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
27// paper is available at
28//
29// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
30//===----------------------------------------------------------------------===//
31
32#ifndef LLVM_SUPPORT_SUFFIXTREE_H
33#define LLVM_SUPPORT_SUFFIXTREE_H
34
35#include "llvm/ADT/ArrayRef.h"
36#include "llvm/Support/Allocator.h"
37#include "llvm/Support/SuffixTreeNode.h"
38
39namespace llvm {
40class SuffixTree {
41public:
42 /// Each element is an integer representing an instruction in the module.
43 ArrayRef<unsigned> Str;
44
45 /// A repeated substring in the tree.
46 struct RepeatedSubstring {
47 /// The length of the string.
48 unsigned Length;
49
50 /// The start indices of each occurrence.
51 SmallVector<unsigned> StartIndices;
52 };
53
54private:
55 /// Maintains internal nodes in the tree.
56 SpecificBumpPtrAllocator<SuffixTreeInternalNode> InternalNodeAllocator;
57 /// Maintains leaf nodes in the tree.
58 SpecificBumpPtrAllocator<SuffixTreeLeafNode> LeafNodeAllocator;
59
60 /// The root of the suffix tree.
61 ///
62 /// The root represents the empty string. It is maintained by the
63 /// \p NodeAllocator like every other node in the tree.
64 SuffixTreeInternalNode *Root = nullptr;
65
66 /// The end index of each leaf in the tree.
67 unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx;
68
69 /// Helper struct which keeps track of the next insertion point in
70 /// Ukkonen's algorithm.
71 struct ActiveState {
72 /// The next node to insert at.
73 SuffixTreeInternalNode *Node = nullptr;
74
75 /// The index of the first character in the substring currently being added.
76 unsigned Idx = SuffixTreeNode::EmptyIdx;
77
78 /// The length of the substring we have to add at the current step.
79 unsigned Len = 0;
80 };
81
82 /// The point the next insertion will take place at in the
83 /// construction algorithm.
84 ActiveState Active;
85
86 /// Allocate a leaf node and add it to the tree.
87 ///
88 /// \param Parent The parent of this node.
89 /// \param StartIdx The start index of this node's associated string.
90 /// \param Edge The label on the edge leaving \p Parent to this node.
91 ///
92 /// \returns A pointer to the allocated leaf node.
93 SuffixTreeNode *insertLeaf(SuffixTreeInternalNode &Parent, unsigned StartIdx,
94 unsigned Edge);
95
96 /// Allocate an internal node and add it to the tree.
97 ///
98 /// \param Parent The parent of this node. Only null when allocating the root.
99 /// \param StartIdx The start index of this node's associated string.
100 /// \param EndIdx The end index of this node's associated string.
101 /// \param Edge The label on the edge leaving \p Parent to this node.
102 ///
103 /// \returns A pointer to the allocated internal node.
104 SuffixTreeInternalNode *insertInternalNode(SuffixTreeInternalNode *Parent,
105 unsigned StartIdx, unsigned EndIdx,
106 unsigned Edge);
107
108 /// Allocate the root node and add it to the tree.
109 ///
110 /// \returns A pointer to the root.
111 SuffixTreeInternalNode *insertRoot();
112
113 /// Set the suffix indices of the leaves to the start indices of their
114 /// respective suffixes.
115 void setSuffixIndices();
116
117 /// Construct the suffix tree for the prefix of the input ending at
118 /// \p EndIdx.
119 ///
120 /// Used to construct the full suffix tree iteratively. At the end of each
121 /// step, the constructed suffix tree is either a valid suffix tree, or a
122 /// suffix tree with implicit suffixes. At the end of the final step, the
123 /// suffix tree is a valid tree.
124 ///
125 /// \param EndIdx The end index of the current prefix in the main string.
126 /// \param SuffixesToAdd The number of suffixes that must be added
127 /// to complete the suffix tree at the current phase.
128 ///
129 /// \returns The number of suffixes that have not been added at the end of
130 /// this step.
131 unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
132
133public:
134 /// Construct a suffix tree from a sequence of unsigned integers.
135 ///
136 /// \param Str The string to construct the suffix tree for.
137 SuffixTree(const ArrayRef<unsigned> &Str);
138
139 /// Iterator for finding all repeated substrings in the suffix tree.
140 struct RepeatedSubstringIterator {
141 private:
142 /// The current node we're visiting.
143 SuffixTreeNode *N = nullptr;
144
145 /// The repeated substring associated with this node.
146 RepeatedSubstring RS;
147
148 /// The nodes left to visit.
149 SmallVector<SuffixTreeInternalNode *> InternalNodesToVisit;
150
151 /// The minimum length of a repeated substring to find.
152 /// Since we're outlining, we want at least two instructions in the range.
153 /// FIXME: This may not be true for targets like X86 which support many
154 /// instruction lengths.
155 const unsigned MinLength = 2;
156
157 /// Move the iterator to the next repeated substring.
158 void advance();
159
160 public:
161 /// Return the current repeated substring.
162 RepeatedSubstring &operator*() { return RS; }
163
164 RepeatedSubstringIterator &operator++() {
165 advance();
166 return *this;
167 }
168
169 RepeatedSubstringIterator operator++(int I) {
170 RepeatedSubstringIterator It(*this);
171 advance();
172 return It;
173 }
174
175 bool operator==(const RepeatedSubstringIterator &Other) const {
176 return N == Other.N;
177 }
178 bool operator!=(const RepeatedSubstringIterator &Other) const {
179 return !(*this == Other);
180 }
181
182 RepeatedSubstringIterator(SuffixTreeInternalNode *N) : N(N) {
183 // Do we have a non-null node?
184 if (!N)
185 return;
186 // Yes. At the first step, we need to visit all of N's children.
187 // Note: This means that we visit N last.
188 InternalNodesToVisit.push_back(Elt: N);
189 advance();
190 }
191 };
192
193 typedef RepeatedSubstringIterator iterator;
194 iterator begin() { return iterator(Root); }
195 iterator end() { return iterator(nullptr); }
196};
197
198} // namespace llvm
199
200#endif // LLVM_SUPPORT_SUFFIXTREE_H
201

source code of llvm/include/llvm/Support/SuffixTree.h