1 | //===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines nodes for use within a SuffixTree. |
10 | // |
11 | // Each node has either no children or at least two children, with the root |
12 | // being a exception in the empty tree. |
13 | // |
14 | // Children are represented as a map between unsigned integers and nodes. If |
15 | // a node N has a child M on unsigned integer k, then the mapping represented |
16 | // by N is a proper prefix of the mapping represented by M. Note that this, |
17 | // although similar to a trie is somewhat different: each node stores a full |
18 | // substring of the full mapping rather than a single character state. |
19 | // |
20 | // Each internal node contains a pointer to the internal node representing |
21 | // the same string, but with the first character chopped off. This is stored |
22 | // in \p Link. Each leaf node stores the start index of its respective |
23 | // suffix in \p SuffixIdx. |
24 | //===----------------------------------------------------------------------===// |
25 | |
26 | #ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H |
27 | #define LLVM_SUPPORT_SUFFIXTREE_NODE_H |
28 | #include "llvm/ADT/DenseMap.h" |
29 | |
30 | namespace llvm { |
31 | |
32 | /// A node in a suffix tree which represents a substring or suffix. |
33 | struct SuffixTreeNode { |
34 | public: |
35 | /// Represents an undefined index in the suffix tree. |
36 | static const unsigned EmptyIdx = -1; |
37 | enum class NodeKind { ST_Leaf, ST_Internal }; |
38 | |
39 | private: |
40 | const NodeKind Kind; |
41 | |
42 | /// The start index of this node's substring in the main string. |
43 | unsigned StartIdx = EmptyIdx; |
44 | |
45 | /// The length of the string formed by concatenating the edge labels from |
46 | /// the root to this node. |
47 | unsigned ConcatLen = 0; |
48 | |
49 | public: |
50 | // LLVM RTTI boilerplate. |
51 | NodeKind getKind() const { return Kind; } |
52 | |
53 | /// \return the start index of this node's substring in the entire string. |
54 | unsigned getStartIdx() const; |
55 | |
56 | /// \returns the end index of this node. |
57 | virtual unsigned getEndIdx() const = 0; |
58 | |
59 | /// Advance this node's StartIdx by \p Inc. |
60 | void incrementStartIdx(unsigned Inc); |
61 | |
62 | /// Set the length of the string from the root to this node to \p Len. |
63 | void setConcatLen(unsigned Len); |
64 | |
65 | /// \returns the length of the string from the root to this node. |
66 | unsigned getConcatLen() const; |
67 | |
68 | SuffixTreeNode(NodeKind Kind, unsigned StartIdx) |
69 | : Kind(Kind), StartIdx(StartIdx) {} |
70 | virtual ~SuffixTreeNode() = default; |
71 | }; |
72 | |
73 | // A node with two or more children, or the root. |
74 | struct SuffixTreeInternalNode : SuffixTreeNode { |
75 | private: |
76 | /// The end index of this node's substring in the main string. |
77 | /// |
78 | /// Every leaf node must have its \p EndIdx incremented at the end of every |
79 | /// step in the construction algorithm. To avoid having to update O(N) |
80 | /// nodes individually at the end of every step, the end index is stored |
81 | /// as a pointer. |
82 | unsigned EndIdx = EmptyIdx; |
83 | |
84 | /// A pointer to the internal node representing the same sequence with the |
85 | /// first character chopped off. |
86 | /// |
87 | /// This acts as a shortcut in Ukkonen's algorithm. One of the things that |
88 | /// Ukkonen's algorithm does to achieve linear-time construction is |
89 | /// keep track of which node the next insert should be at. This makes each |
90 | /// insert O(1), and there are a total of O(N) inserts. The suffix link |
91 | /// helps with inserting children of internal nodes. |
92 | /// |
93 | /// Say we add a child to an internal node with associated mapping S. The |
94 | /// next insertion must be at the node representing S - its first character. |
95 | /// This is given by the way that we iteratively build the tree in Ukkonen's |
96 | /// algorithm. The main idea is to look at the suffixes of each prefix in the |
97 | /// string, starting with the longest suffix of the prefix, and ending with |
98 | /// the shortest. Therefore, if we keep pointers between such nodes, we can |
99 | /// move to the next insertion point in O(1) time. If we don't, then we'd |
100 | /// have to query from the root, which takes O(N) time. This would make the |
101 | /// construction algorithm O(N^2) rather than O(N). |
102 | SuffixTreeInternalNode *Link = nullptr; |
103 | |
104 | public: |
105 | // LLVM RTTI boilerplate. |
106 | static bool classof(const SuffixTreeNode *N) { |
107 | return N->getKind() == NodeKind::ST_Internal; |
108 | } |
109 | |
110 | /// \returns true if this node is the root of its owning \p SuffixTree. |
111 | bool isRoot() const; |
112 | |
113 | /// \returns the end index of this node's substring in the entire string. |
114 | unsigned getEndIdx() const override; |
115 | |
116 | /// Sets \p Link to \p L. Assumes \p L is not null. |
117 | void setLink(SuffixTreeInternalNode *L); |
118 | |
119 | /// \returns the pointer to the Link node. |
120 | SuffixTreeInternalNode *getLink() const; |
121 | |
122 | /// The children of this node. |
123 | /// |
124 | /// A child existing on an unsigned integer implies that from the mapping |
125 | /// represented by the current node, there is a way to reach another |
126 | /// mapping by tacking that character on the end of the current string. |
127 | DenseMap<unsigned, SuffixTreeNode *> Children; |
128 | |
129 | SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx, |
130 | SuffixTreeInternalNode *Link) |
131 | : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx), |
132 | Link(Link) {} |
133 | |
134 | virtual ~SuffixTreeInternalNode() = default; |
135 | }; |
136 | |
137 | // A node representing a suffix. |
138 | struct SuffixTreeLeafNode : SuffixTreeNode { |
139 | private: |
140 | /// The start index of the suffix represented by this leaf. |
141 | unsigned SuffixIdx = EmptyIdx; |
142 | |
143 | /// The end index of this node's substring in the main string. |
144 | /// |
145 | /// Every leaf node must have its \p EndIdx incremented at the end of every |
146 | /// step in the construction algorithm. To avoid having to update O(N) |
147 | /// nodes individually at the end of every step, the end index is stored |
148 | /// as a pointer. |
149 | unsigned *EndIdx = nullptr; |
150 | |
151 | public: |
152 | // LLVM RTTI boilerplate. |
153 | static bool classof(const SuffixTreeNode *N) { |
154 | return N->getKind() == NodeKind::ST_Leaf; |
155 | } |
156 | |
157 | /// \returns the end index of this node's substring in the entire string. |
158 | unsigned getEndIdx() const override; |
159 | |
160 | /// \returns the start index of the suffix represented by this leaf. |
161 | unsigned getSuffixIdx() const; |
162 | |
163 | /// Sets the start index of the suffix represented by this leaf to \p Idx. |
164 | void setSuffixIdx(unsigned Idx); |
165 | SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx) |
166 | : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {} |
167 | |
168 | virtual ~SuffixTreeLeafNode() = default; |
169 | }; |
170 | } // namespace llvm |
171 | #endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H |