1 | //===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // A data structure for fast substring queries. |
9 | // |
10 | // Suffix trees represent the suffixes of their input strings in their leaves. |
11 | // A suffix tree is a type of compressed trie structure where each node |
12 | // represents an entire substring rather than a single character. Each leaf |
13 | // of the tree is a suffix. |
14 | // |
15 | // A suffix tree can be seen as a type of state machine where each state is a |
16 | // substring of the full string. The tree is structured so that, for a string |
17 | // of length N, there are exactly N leaves in the tree. This structure allows |
18 | // us to quickly find repeated substrings of the input string. |
19 | // |
20 | // In this implementation, a "string" is a vector of unsigned integers. |
21 | // These integers may result from hashing some data type. A suffix tree can |
22 | // contain 1 or many strings, which can then be queried as one large string. |
23 | // |
24 | // The suffix tree is implemented using Ukkonen's algorithm for linear-time |
25 | // suffix tree construction. Ukkonen's algorithm is explained in more detail |
26 | // in the paper by Esko Ukkonen "On-line construction of suffix trees. The |
27 | // paper is available at |
28 | // |
29 | // https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf |
30 | //===----------------------------------------------------------------------===// |
31 | |
32 | #ifndef LLVM_SUPPORT_SUFFIXTREE_H |
33 | #define LLVM_SUPPORT_SUFFIXTREE_H |
34 | |
35 | #include "llvm/ADT/ArrayRef.h" |
36 | #include "llvm/Support/Allocator.h" |
37 | #include "llvm/Support/SuffixTreeNode.h" |
38 | |
39 | namespace llvm { |
40 | class SuffixTree { |
41 | public: |
42 | /// Each element is an integer representing an instruction in the module. |
43 | ArrayRef<unsigned> Str; |
44 | |
45 | /// A repeated substring in the tree. |
46 | struct RepeatedSubstring { |
47 | /// The length of the string. |
48 | unsigned Length; |
49 | |
50 | /// The start indices of each occurrence. |
51 | SmallVector<unsigned> StartIndices; |
52 | }; |
53 | |
54 | private: |
55 | /// Maintains internal nodes in the tree. |
56 | SpecificBumpPtrAllocator<SuffixTreeInternalNode> InternalNodeAllocator; |
57 | /// Maintains leaf nodes in the tree. |
58 | SpecificBumpPtrAllocator<SuffixTreeLeafNode> LeafNodeAllocator; |
59 | |
60 | /// The root of the suffix tree. |
61 | /// |
62 | /// The root represents the empty string. It is maintained by the |
63 | /// \p NodeAllocator like every other node in the tree. |
64 | SuffixTreeInternalNode *Root = nullptr; |
65 | |
66 | /// The end index of each leaf in the tree. |
67 | unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx; |
68 | |
69 | /// Helper struct which keeps track of the next insertion point in |
70 | /// Ukkonen's algorithm. |
71 | struct ActiveState { |
72 | /// The next node to insert at. |
73 | SuffixTreeInternalNode *Node = nullptr; |
74 | |
75 | /// The index of the first character in the substring currently being added. |
76 | unsigned Idx = SuffixTreeNode::EmptyIdx; |
77 | |
78 | /// The length of the substring we have to add at the current step. |
79 | unsigned Len = 0; |
80 | }; |
81 | |
82 | /// The point the next insertion will take place at in the |
83 | /// construction algorithm. |
84 | ActiveState Active; |
85 | |
86 | /// Allocate a leaf node and add it to the tree. |
87 | /// |
88 | /// \param Parent The parent of this node. |
89 | /// \param StartIdx The start index of this node's associated string. |
90 | /// \param Edge The label on the edge leaving \p Parent to this node. |
91 | /// |
92 | /// \returns A pointer to the allocated leaf node. |
93 | SuffixTreeNode *insertLeaf(SuffixTreeInternalNode &Parent, unsigned StartIdx, |
94 | unsigned Edge); |
95 | |
96 | /// Allocate an internal node and add it to the tree. |
97 | /// |
98 | /// \param Parent The parent of this node. Only null when allocating the root. |
99 | /// \param StartIdx The start index of this node's associated string. |
100 | /// \param EndIdx The end index of this node's associated string. |
101 | /// \param Edge The label on the edge leaving \p Parent to this node. |
102 | /// |
103 | /// \returns A pointer to the allocated internal node. |
104 | SuffixTreeInternalNode *insertInternalNode(SuffixTreeInternalNode *Parent, |
105 | unsigned StartIdx, unsigned EndIdx, |
106 | unsigned Edge); |
107 | |
108 | /// Allocate the root node and add it to the tree. |
109 | /// |
110 | /// \returns A pointer to the root. |
111 | SuffixTreeInternalNode *insertRoot(); |
112 | |
113 | /// Set the suffix indices of the leaves to the start indices of their |
114 | /// respective suffixes. |
115 | void setSuffixIndices(); |
116 | |
117 | /// Construct the suffix tree for the prefix of the input ending at |
118 | /// \p EndIdx. |
119 | /// |
120 | /// Used to construct the full suffix tree iteratively. At the end of each |
121 | /// step, the constructed suffix tree is either a valid suffix tree, or a |
122 | /// suffix tree with implicit suffixes. At the end of the final step, the |
123 | /// suffix tree is a valid tree. |
124 | /// |
125 | /// \param EndIdx The end index of the current prefix in the main string. |
126 | /// \param SuffixesToAdd The number of suffixes that must be added |
127 | /// to complete the suffix tree at the current phase. |
128 | /// |
129 | /// \returns The number of suffixes that have not been added at the end of |
130 | /// this step. |
131 | unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd); |
132 | |
133 | public: |
134 | /// Construct a suffix tree from a sequence of unsigned integers. |
135 | /// |
136 | /// \param Str The string to construct the suffix tree for. |
137 | SuffixTree(const ArrayRef<unsigned> &Str); |
138 | |
139 | /// Iterator for finding all repeated substrings in the suffix tree. |
140 | struct RepeatedSubstringIterator { |
141 | private: |
142 | /// The current node we're visiting. |
143 | SuffixTreeNode *N = nullptr; |
144 | |
145 | /// The repeated substring associated with this node. |
146 | RepeatedSubstring RS; |
147 | |
148 | /// The nodes left to visit. |
149 | SmallVector<SuffixTreeInternalNode *> InternalNodesToVisit; |
150 | |
151 | /// The minimum length of a repeated substring to find. |
152 | /// Since we're outlining, we want at least two instructions in the range. |
153 | /// FIXME: This may not be true for targets like X86 which support many |
154 | /// instruction lengths. |
155 | const unsigned MinLength = 2; |
156 | |
157 | /// Move the iterator to the next repeated substring. |
158 | void advance(); |
159 | |
160 | public: |
161 | /// Return the current repeated substring. |
162 | RepeatedSubstring &operator*() { return RS; } |
163 | |
164 | RepeatedSubstringIterator &operator++() { |
165 | advance(); |
166 | return *this; |
167 | } |
168 | |
169 | RepeatedSubstringIterator operator++(int I) { |
170 | RepeatedSubstringIterator It(*this); |
171 | advance(); |
172 | return It; |
173 | } |
174 | |
175 | bool operator==(const RepeatedSubstringIterator &Other) const { |
176 | return N == Other.N; |
177 | } |
178 | bool operator!=(const RepeatedSubstringIterator &Other) const { |
179 | return !(*this == Other); |
180 | } |
181 | |
182 | RepeatedSubstringIterator(SuffixTreeInternalNode *N) : N(N) { |
183 | // Do we have a non-null node? |
184 | if (!N) |
185 | return; |
186 | // Yes. At the first step, we need to visit all of N's children. |
187 | // Note: This means that we visit N last. |
188 | InternalNodesToVisit.push_back(Elt: N); |
189 | advance(); |
190 | } |
191 | }; |
192 | |
193 | typedef RepeatedSubstringIterator iterator; |
194 | iterator begin() { return iterator(Root); } |
195 | iterator end() { return iterator(nullptr); } |
196 | }; |
197 | |
198 | } // namespace llvm |
199 | |
200 | #endif // LLVM_SUPPORT_SUFFIXTREE_H |
201 | |