1//===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines nodes for use within a SuffixTree.
10//
11// Each node has either no children or at least two children, with the root
12// being a exception in the empty tree.
13//
14// Children are represented as a map between unsigned integers and nodes. If
15// a node N has a child M on unsigned integer k, then the mapping represented
16// by N is a proper prefix of the mapping represented by M. Note that this,
17// although similar to a trie is somewhat different: each node stores a full
18// substring of the full mapping rather than a single character state.
19//
20// Each internal node contains a pointer to the internal node representing
21// the same string, but with the first character chopped off. This is stored
22// in \p Link. Each leaf node stores the start index of its respective
23// suffix in \p SuffixIdx.
24//===----------------------------------------------------------------------===//
25
26#ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H
27#define LLVM_SUPPORT_SUFFIXTREE_NODE_H
28#include "llvm/ADT/DenseMap.h"
29
30namespace llvm {
31
32/// A node in a suffix tree which represents a substring or suffix.
33struct SuffixTreeNode {
34public:
35 /// Represents an undefined index in the suffix tree.
36 static const unsigned EmptyIdx = -1;
37 enum class NodeKind { ST_Leaf, ST_Internal };
38
39private:
40 const NodeKind Kind;
41
42 /// The start index of this node's substring in the main string.
43 unsigned StartIdx = EmptyIdx;
44
45 /// The length of the string formed by concatenating the edge labels from
46 /// the root to this node.
47 unsigned ConcatLen = 0;
48
49public:
50 // LLVM RTTI boilerplate.
51 NodeKind getKind() const { return Kind; }
52
53 /// \return the start index of this node's substring in the entire string.
54 unsigned getStartIdx() const;
55
56 /// \returns the end index of this node.
57 virtual unsigned getEndIdx() const = 0;
58
59 /// Advance this node's StartIdx by \p Inc.
60 void incrementStartIdx(unsigned Inc);
61
62 /// Set the length of the string from the root to this node to \p Len.
63 void setConcatLen(unsigned Len);
64
65 /// \returns the length of the string from the root to this node.
66 unsigned getConcatLen() const;
67
68 SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
69 : Kind(Kind), StartIdx(StartIdx) {}
70 virtual ~SuffixTreeNode() = default;
71};
72
73// A node with two or more children, or the root.
74struct SuffixTreeInternalNode : SuffixTreeNode {
75private:
76 /// The end index of this node's substring in the main string.
77 ///
78 /// Every leaf node must have its \p EndIdx incremented at the end of every
79 /// step in the construction algorithm. To avoid having to update O(N)
80 /// nodes individually at the end of every step, the end index is stored
81 /// as a pointer.
82 unsigned EndIdx = EmptyIdx;
83
84 /// A pointer to the internal node representing the same sequence with the
85 /// first character chopped off.
86 ///
87 /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
88 /// Ukkonen's algorithm does to achieve linear-time construction is
89 /// keep track of which node the next insert should be at. This makes each
90 /// insert O(1), and there are a total of O(N) inserts. The suffix link
91 /// helps with inserting children of internal nodes.
92 ///
93 /// Say we add a child to an internal node with associated mapping S. The
94 /// next insertion must be at the node representing S - its first character.
95 /// This is given by the way that we iteratively build the tree in Ukkonen's
96 /// algorithm. The main idea is to look at the suffixes of each prefix in the
97 /// string, starting with the longest suffix of the prefix, and ending with
98 /// the shortest. Therefore, if we keep pointers between such nodes, we can
99 /// move to the next insertion point in O(1) time. If we don't, then we'd
100 /// have to query from the root, which takes O(N) time. This would make the
101 /// construction algorithm O(N^2) rather than O(N).
102 SuffixTreeInternalNode *Link = nullptr;
103
104public:
105 // LLVM RTTI boilerplate.
106 static bool classof(const SuffixTreeNode *N) {
107 return N->getKind() == NodeKind::ST_Internal;
108 }
109
110 /// \returns true if this node is the root of its owning \p SuffixTree.
111 bool isRoot() const;
112
113 /// \returns the end index of this node's substring in the entire string.
114 unsigned getEndIdx() const override;
115
116 /// Sets \p Link to \p L. Assumes \p L is not null.
117 void setLink(SuffixTreeInternalNode *L);
118
119 /// \returns the pointer to the Link node.
120 SuffixTreeInternalNode *getLink() const;
121
122 /// The children of this node.
123 ///
124 /// A child existing on an unsigned integer implies that from the mapping
125 /// represented by the current node, there is a way to reach another
126 /// mapping by tacking that character on the end of the current string.
127 DenseMap<unsigned, SuffixTreeNode *> Children;
128
129 SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
130 SuffixTreeInternalNode *Link)
131 : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
132 Link(Link) {}
133
134 virtual ~SuffixTreeInternalNode() = default;
135};
136
137// A node representing a suffix.
138struct SuffixTreeLeafNode : SuffixTreeNode {
139private:
140 /// The start index of the suffix represented by this leaf.
141 unsigned SuffixIdx = EmptyIdx;
142
143 /// The end index of this node's substring in the main string.
144 ///
145 /// Every leaf node must have its \p EndIdx incremented at the end of every
146 /// step in the construction algorithm. To avoid having to update O(N)
147 /// nodes individually at the end of every step, the end index is stored
148 /// as a pointer.
149 unsigned *EndIdx = nullptr;
150
151public:
152 // LLVM RTTI boilerplate.
153 static bool classof(const SuffixTreeNode *N) {
154 return N->getKind() == NodeKind::ST_Leaf;
155 }
156
157 /// \returns the end index of this node's substring in the entire string.
158 unsigned getEndIdx() const override;
159
160 /// \returns the start index of the suffix represented by this leaf.
161 unsigned getSuffixIdx() const;
162
163 /// Sets the start index of the suffix represented by this leaf to \p Idx.
164 void setSuffixIdx(unsigned Idx);
165 SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
166 : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
167
168 virtual ~SuffixTreeLeafNode() = default;
169};
170} // namespace llvm
171#endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H

source code of llvm/include/llvm/Support/SuffixTreeNode.h