1 | //===- llvm/ADT/SparseMultiSet.h - Sparse multiset --------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// \file |
10 | /// This file defines the SparseMultiSet class, which adds multiset behavior to |
11 | /// the SparseSet. |
12 | /// |
13 | /// A sparse multiset holds a small number of objects identified by integer keys |
14 | /// from a moderately sized universe. The sparse multiset uses more memory than |
15 | /// other containers in order to provide faster operations. Any key can map to |
16 | /// multiple values. A SparseMultiSetNode class is provided, which serves as a |
17 | /// convenient base class for the contents of a SparseMultiSet. |
18 | /// |
19 | //===----------------------------------------------------------------------===// |
20 | |
21 | #ifndef LLVM_ADT_SPARSEMULTISET_H |
22 | #define LLVM_ADT_SPARSEMULTISET_H |
23 | |
24 | #include "llvm/ADT/identity.h" |
25 | #include "llvm/ADT/SmallVector.h" |
26 | #include "llvm/ADT/SparseSet.h" |
27 | #include <cassert> |
28 | #include <cstdint> |
29 | #include <cstdlib> |
30 | #include <iterator> |
31 | #include <limits> |
32 | #include <utility> |
33 | |
34 | namespace llvm { |
35 | |
36 | /// Fast multiset implementation for objects that can be identified by small |
37 | /// unsigned keys. |
38 | /// |
39 | /// SparseMultiSet allocates memory proportional to the size of the key |
40 | /// universe, so it is not recommended for building composite data structures. |
41 | /// It is useful for algorithms that require a single set with fast operations. |
42 | /// |
43 | /// Compared to DenseSet and DenseMap, SparseMultiSet provides constant-time |
44 | /// fast clear() as fast as a vector. The find(), insert(), and erase() |
45 | /// operations are all constant time, and typically faster than a hash table. |
46 | /// The iteration order doesn't depend on numerical key values, it only depends |
47 | /// on the order of insert() and erase() operations. Iteration order is the |
48 | /// insertion order. Iteration is only provided over elements of equivalent |
49 | /// keys, but iterators are bidirectional. |
50 | /// |
51 | /// Compared to BitVector, SparseMultiSet<unsigned> uses 8x-40x more memory, but |
52 | /// offers constant-time clear() and size() operations as well as fast iteration |
53 | /// independent on the size of the universe. |
54 | /// |
55 | /// SparseMultiSet contains a dense vector holding all the objects and a sparse |
56 | /// array holding indexes into the dense vector. Most of the memory is used by |
57 | /// the sparse array which is the size of the key universe. The SparseT template |
58 | /// parameter provides a space/speed tradeoff for sets holding many elements. |
59 | /// |
60 | /// When SparseT is uint32_t, find() only touches up to 3 cache lines, but the |
61 | /// sparse array uses 4 x Universe bytes. |
62 | /// |
63 | /// When SparseT is uint8_t (the default), find() touches up to 3+[N/256] cache |
64 | /// lines, but the sparse array is 4x smaller. N is the number of elements in |
65 | /// the set. |
66 | /// |
67 | /// For sets that may grow to thousands of elements, SparseT should be set to |
68 | /// uint16_t or uint32_t. |
69 | /// |
70 | /// Multiset behavior is provided by providing doubly linked lists for values |
71 | /// that are inlined in the dense vector. SparseMultiSet is a good choice when |
72 | /// one desires a growable number of entries per key, as it will retain the |
73 | /// SparseSet algorithmic properties despite being growable. Thus, it is often a |
74 | /// better choice than a SparseSet of growable containers or a vector of |
75 | /// vectors. SparseMultiSet also keeps iterators valid after erasure (provided |
76 | /// the iterators don't point to the element erased), allowing for more |
77 | /// intuitive and fast removal. |
78 | /// |
79 | /// @tparam ValueT The type of objects in the set. |
80 | /// @tparam KeyFunctorT A functor that computes an unsigned index from KeyT. |
81 | /// @tparam SparseT An unsigned integer type. See above. |
82 | /// |
83 | template<typename ValueT, |
84 | typename KeyFunctorT = identity<unsigned>, |
85 | typename SparseT = uint8_t> |
86 | class SparseMultiSet { |
87 | static_assert(std::is_unsigned_v<SparseT>, |
88 | "SparseT must be an unsigned integer type" ); |
89 | |
90 | /// The actual data that's stored, as a doubly-linked list implemented via |
91 | /// indices into the DenseVector. The doubly linked list is implemented |
92 | /// circular in Prev indices, and INVALID-terminated in Next indices. This |
93 | /// provides efficient access to list tails. These nodes can also be |
94 | /// tombstones, in which case they are actually nodes in a single-linked |
95 | /// freelist of recyclable slots. |
96 | struct SMSNode { |
97 | static constexpr unsigned INVALID = ~0U; |
98 | |
99 | ValueT Data; |
100 | unsigned Prev; |
101 | unsigned Next; |
102 | |
103 | SMSNode(ValueT D, unsigned P, unsigned N) : Data(D), Prev(P), Next(N) {} |
104 | |
105 | /// List tails have invalid Nexts. |
106 | bool isTail() const { |
107 | return Next == INVALID; |
108 | } |
109 | |
110 | /// Whether this node is a tombstone node, and thus is in our freelist. |
111 | bool isTombstone() const { |
112 | return Prev == INVALID; |
113 | } |
114 | |
115 | /// Since the list is circular in Prev, all non-tombstone nodes have a valid |
116 | /// Prev. |
117 | bool isValid() const { return Prev != INVALID; } |
118 | }; |
119 | |
120 | using KeyT = typename KeyFunctorT::argument_type; |
121 | using DenseT = SmallVector<SMSNode, 8>; |
122 | DenseT Dense; |
123 | SparseT *Sparse = nullptr; |
124 | unsigned Universe = 0; |
125 | KeyFunctorT KeyIndexOf; |
126 | SparseSetValFunctor<KeyT, ValueT, KeyFunctorT> ValIndexOf; |
127 | |
128 | /// We have a built-in recycler for reusing tombstone slots. This recycler |
129 | /// puts a singly-linked free list into tombstone slots, allowing us quick |
130 | /// erasure, iterator preservation, and dense size. |
131 | unsigned FreelistIdx = SMSNode::INVALID; |
132 | unsigned NumFree = 0; |
133 | |
134 | unsigned sparseIndex(const ValueT &Val) const { |
135 | assert(ValIndexOf(Val) < Universe && |
136 | "Invalid key in set. Did object mutate?" ); |
137 | return ValIndexOf(Val); |
138 | } |
139 | unsigned sparseIndex(const SMSNode &N) const { return sparseIndex(N.Data); } |
140 | |
141 | /// Whether the given entry is the head of the list. List heads's previous |
142 | /// pointers are to the tail of the list, allowing for efficient access to the |
143 | /// list tail. D must be a valid entry node. |
144 | bool (const SMSNode &D) const { |
145 | assert(D.isValid() && "Invalid node for head" ); |
146 | return Dense[D.Prev].isTail(); |
147 | } |
148 | |
149 | /// Whether the given entry is a singleton entry, i.e. the only entry with |
150 | /// that key. |
151 | bool isSingleton(const SMSNode &N) const { |
152 | assert(N.isValid() && "Invalid node for singleton" ); |
153 | // Is N its own predecessor? |
154 | return &Dense[N.Prev] == &N; |
155 | } |
156 | |
157 | /// Add in the given SMSNode. Uses a free entry in our freelist if |
158 | /// available. Returns the index of the added node. |
159 | unsigned addValue(const ValueT& V, unsigned Prev, unsigned Next) { |
160 | if (NumFree == 0) { |
161 | Dense.push_back(SMSNode(V, Prev, Next)); |
162 | return Dense.size() - 1; |
163 | } |
164 | |
165 | // Peel off a free slot |
166 | unsigned Idx = FreelistIdx; |
167 | unsigned NextFree = Dense[Idx].Next; |
168 | assert(Dense[Idx].isTombstone() && "Non-tombstone free?" ); |
169 | |
170 | Dense[Idx] = SMSNode(V, Prev, Next); |
171 | FreelistIdx = NextFree; |
172 | --NumFree; |
173 | return Idx; |
174 | } |
175 | |
176 | /// Make the current index a new tombstone. Pushes it onto the freelist. |
177 | void makeTombstone(unsigned Idx) { |
178 | Dense[Idx].Prev = SMSNode::INVALID; |
179 | Dense[Idx].Next = FreelistIdx; |
180 | FreelistIdx = Idx; |
181 | ++NumFree; |
182 | } |
183 | |
184 | public: |
185 | using value_type = ValueT; |
186 | using reference = ValueT &; |
187 | using const_reference = const ValueT &; |
188 | using pointer = ValueT *; |
189 | using const_pointer = const ValueT *; |
190 | using size_type = unsigned; |
191 | |
192 | SparseMultiSet() = default; |
193 | SparseMultiSet(const SparseMultiSet &) = delete; |
194 | SparseMultiSet &operator=(const SparseMultiSet &) = delete; |
195 | ~SparseMultiSet() { free(Sparse); } |
196 | |
197 | /// Set the universe size which determines the largest key the set can hold. |
198 | /// The universe must be sized before any elements can be added. |
199 | /// |
200 | /// @param U Universe size. All object keys must be less than U. |
201 | /// |
202 | void setUniverse(unsigned U) { |
203 | // It's not hard to resize the universe on a non-empty set, but it doesn't |
204 | // seem like a likely use case, so we can add that code when we need it. |
205 | assert(empty() && "Can only resize universe on an empty map" ); |
206 | // Hysteresis prevents needless reallocations. |
207 | if (U >= Universe/4 && U <= Universe) |
208 | return; |
209 | free(Sparse); |
210 | // The Sparse array doesn't actually need to be initialized, so malloc |
211 | // would be enough here, but that will cause tools like valgrind to |
212 | // complain about branching on uninitialized data. |
213 | Sparse = static_cast<SparseT*>(safe_calloc(Count: U, Sz: sizeof(SparseT))); |
214 | Universe = U; |
215 | } |
216 | |
217 | /// Our iterators are iterators over the collection of objects that share a |
218 | /// key. |
219 | template <typename SMSPtrTy> class iterator_base { |
220 | friend class SparseMultiSet; |
221 | |
222 | public: |
223 | using iterator_category = std::bidirectional_iterator_tag; |
224 | using value_type = ValueT; |
225 | using difference_type = std::ptrdiff_t; |
226 | using pointer = value_type *; |
227 | using reference = value_type &; |
228 | |
229 | private: |
230 | SMSPtrTy SMS; |
231 | unsigned Idx; |
232 | unsigned SparseIdx; |
233 | |
234 | iterator_base(SMSPtrTy P, unsigned I, unsigned SI) |
235 | : SMS(P), Idx(I), SparseIdx(SI) {} |
236 | |
237 | /// Whether our iterator has fallen outside our dense vector. |
238 | bool isEnd() const { |
239 | if (Idx == SMSNode::INVALID) |
240 | return true; |
241 | |
242 | assert(Idx < SMS->Dense.size() && "Out of range, non-INVALID Idx?" ); |
243 | return false; |
244 | } |
245 | |
246 | /// Whether our iterator is properly keyed, i.e. the SparseIdx is valid |
247 | bool isKeyed() const { return SparseIdx < SMS->Universe; } |
248 | |
249 | unsigned Prev() const { return SMS->Dense[Idx].Prev; } |
250 | unsigned Next() const { return SMS->Dense[Idx].Next; } |
251 | |
252 | void setPrev(unsigned P) { SMS->Dense[Idx].Prev = P; } |
253 | void setNext(unsigned N) { SMS->Dense[Idx].Next = N; } |
254 | |
255 | public: |
256 | reference operator*() const { |
257 | assert(isKeyed() && SMS->sparseIndex(SMS->Dense[Idx].Data) == SparseIdx && |
258 | "Dereferencing iterator of invalid key or index" ); |
259 | |
260 | return SMS->Dense[Idx].Data; |
261 | } |
262 | pointer operator->() const { return &operator*(); } |
263 | |
264 | /// Comparison operators |
265 | bool operator==(const iterator_base &RHS) const { |
266 | // end compares equal |
267 | if (SMS == RHS.SMS && Idx == RHS.Idx) { |
268 | assert((isEnd() || SparseIdx == RHS.SparseIdx) && |
269 | "Same dense entry, but different keys?" ); |
270 | return true; |
271 | } |
272 | |
273 | return false; |
274 | } |
275 | |
276 | bool operator!=(const iterator_base &RHS) const { |
277 | return !operator==(RHS); |
278 | } |
279 | |
280 | /// Increment and decrement operators |
281 | iterator_base &operator--() { // predecrement - Back up |
282 | assert(isKeyed() && "Decrementing an invalid iterator" ); |
283 | assert((isEnd() || !SMS->isHead(SMS->Dense[Idx])) && |
284 | "Decrementing head of list" ); |
285 | |
286 | // If we're at the end, then issue a new find() |
287 | if (isEnd()) |
288 | Idx = SMS->findIndex(SparseIdx).Prev(); |
289 | else |
290 | Idx = Prev(); |
291 | |
292 | return *this; |
293 | } |
294 | iterator_base &operator++() { // preincrement - Advance |
295 | assert(!isEnd() && isKeyed() && "Incrementing an invalid/end iterator" ); |
296 | Idx = Next(); |
297 | return *this; |
298 | } |
299 | iterator_base operator--(int) { // postdecrement |
300 | iterator_base I(*this); |
301 | --*this; |
302 | return I; |
303 | } |
304 | iterator_base operator++(int) { // postincrement |
305 | iterator_base I(*this); |
306 | ++*this; |
307 | return I; |
308 | } |
309 | }; |
310 | |
311 | using iterator = iterator_base<SparseMultiSet *>; |
312 | using const_iterator = iterator_base<const SparseMultiSet *>; |
313 | |
314 | // Convenience types |
315 | using RangePair = std::pair<iterator, iterator>; |
316 | |
317 | /// Returns an iterator past this container. Note that such an iterator cannot |
318 | /// be decremented, but will compare equal to other end iterators. |
319 | iterator end() { return iterator(this, SMSNode::INVALID, SMSNode::INVALID); } |
320 | const_iterator end() const { |
321 | return const_iterator(this, SMSNode::INVALID, SMSNode::INVALID); |
322 | } |
323 | |
324 | /// Returns true if the set is empty. |
325 | /// |
326 | /// This is not the same as BitVector::empty(). |
327 | /// |
328 | bool empty() const { return size() == 0; } |
329 | |
330 | /// Returns the number of elements in the set. |
331 | /// |
332 | /// This is not the same as BitVector::size() which returns the size of the |
333 | /// universe. |
334 | /// |
335 | size_type size() const { |
336 | assert(NumFree <= Dense.size() && "Out-of-bounds free entries" ); |
337 | return Dense.size() - NumFree; |
338 | } |
339 | |
340 | /// Clears the set. This is a very fast constant time operation. |
341 | /// |
342 | void clear() { |
343 | // Sparse does not need to be cleared, see find(). |
344 | Dense.clear(); |
345 | NumFree = 0; |
346 | FreelistIdx = SMSNode::INVALID; |
347 | } |
348 | |
349 | /// Find an element by its index. |
350 | /// |
351 | /// @param Idx A valid index to find. |
352 | /// @returns An iterator to the element identified by key, or end(). |
353 | /// |
354 | iterator findIndex(unsigned Idx) { |
355 | assert(Idx < Universe && "Key out of range" ); |
356 | const unsigned Stride = std::numeric_limits<SparseT>::max() + 1u; |
357 | for (unsigned i = Sparse[Idx], e = Dense.size(); i < e; i += Stride) { |
358 | const unsigned FoundIdx = sparseIndex(Dense[i]); |
359 | // Check that we're pointing at the correct entry and that it is the head |
360 | // of a valid list. |
361 | if (Idx == FoundIdx && Dense[i].isValid() && isHead(D: Dense[i])) |
362 | return iterator(this, i, Idx); |
363 | // Stride is 0 when SparseT >= unsigned. We don't need to loop. |
364 | if (!Stride) |
365 | break; |
366 | } |
367 | return end(); |
368 | } |
369 | |
370 | /// Find an element by its key. |
371 | /// |
372 | /// @param Key A valid key to find. |
373 | /// @returns An iterator to the element identified by key, or end(). |
374 | /// |
375 | iterator find(const KeyT &Key) { |
376 | return findIndex(Idx: KeyIndexOf(Key)); |
377 | } |
378 | |
379 | const_iterator find(const KeyT &Key) const { |
380 | iterator I = const_cast<SparseMultiSet*>(this)->findIndex(KeyIndexOf(Key)); |
381 | return const_iterator(I.SMS, I.Idx, KeyIndexOf(Key)); |
382 | } |
383 | |
384 | /// Returns the number of elements identified by Key. This will be linear in |
385 | /// the number of elements of that key. |
386 | size_type count(const KeyT &Key) const { |
387 | unsigned Ret = 0; |
388 | for (const_iterator It = find(Key); It != end(); ++It) |
389 | ++Ret; |
390 | |
391 | return Ret; |
392 | } |
393 | |
394 | /// Returns true if this set contains an element identified by Key. |
395 | bool contains(const KeyT &Key) const { |
396 | return find(Key) != end(); |
397 | } |
398 | |
399 | /// Return the head and tail of the subset's list, otherwise returns end(). |
400 | iterator (const KeyT &Key) { return find(Key); } |
401 | iterator getTail(const KeyT &Key) { |
402 | iterator I = find(Key); |
403 | if (I != end()) |
404 | I = iterator(this, I.Prev(), KeyIndexOf(Key)); |
405 | return I; |
406 | } |
407 | |
408 | /// The bounds of the range of items sharing Key K. First member is the head |
409 | /// of the list, and the second member is a decrementable end iterator for |
410 | /// that key. |
411 | RangePair equal_range(const KeyT &K) { |
412 | iterator B = find(K); |
413 | iterator E = iterator(this, SMSNode::INVALID, B.SparseIdx); |
414 | return std::make_pair(B, E); |
415 | } |
416 | |
417 | /// Insert a new element at the tail of the subset list. Returns an iterator |
418 | /// to the newly added entry. |
419 | iterator insert(const ValueT &Val) { |
420 | unsigned Idx = sparseIndex(Val); |
421 | iterator I = findIndex(Idx); |
422 | |
423 | unsigned NodeIdx = addValue(V: Val, Prev: SMSNode::INVALID, Next: SMSNode::INVALID); |
424 | |
425 | if (I == end()) { |
426 | // Make a singleton list |
427 | Sparse[Idx] = NodeIdx; |
428 | Dense[NodeIdx].Prev = NodeIdx; |
429 | return iterator(this, NodeIdx, Idx); |
430 | } |
431 | |
432 | // Stick it at the end. |
433 | unsigned HeadIdx = I.Idx; |
434 | unsigned TailIdx = I.Prev(); |
435 | Dense[TailIdx].Next = NodeIdx; |
436 | Dense[HeadIdx].Prev = NodeIdx; |
437 | Dense[NodeIdx].Prev = TailIdx; |
438 | |
439 | return iterator(this, NodeIdx, Idx); |
440 | } |
441 | |
442 | /// Erases an existing element identified by a valid iterator. |
443 | /// |
444 | /// This invalidates iterators pointing at the same entry, but erase() returns |
445 | /// an iterator pointing to the next element in the subset's list. This makes |
446 | /// it possible to erase selected elements while iterating over the subset: |
447 | /// |
448 | /// tie(I, E) = Set.equal_range(Key); |
449 | /// while (I != E) |
450 | /// if (test(*I)) |
451 | /// I = Set.erase(I); |
452 | /// else |
453 | /// ++I; |
454 | /// |
455 | /// Note that if the last element in the subset list is erased, this will |
456 | /// return an end iterator which can be decremented to get the new tail (if it |
457 | /// exists): |
458 | /// |
459 | /// tie(B, I) = Set.equal_range(Key); |
460 | /// for (bool isBegin = B == I; !isBegin; /* empty */) { |
461 | /// isBegin = (--I) == B; |
462 | /// if (test(I)) |
463 | /// break; |
464 | /// I = erase(I); |
465 | /// } |
466 | iterator erase(iterator I) { |
467 | assert(I.isKeyed() && !I.isEnd() && !Dense[I.Idx].isTombstone() && |
468 | "erasing invalid/end/tombstone iterator" ); |
469 | |
470 | // First, unlink the node from its list. Then swap the node out with the |
471 | // dense vector's last entry |
472 | iterator NextI = unlink(N: Dense[I.Idx]); |
473 | |
474 | // Put in a tombstone. |
475 | makeTombstone(Idx: I.Idx); |
476 | |
477 | return NextI; |
478 | } |
479 | |
480 | /// Erase all elements with the given key. This invalidates all |
481 | /// iterators of that key. |
482 | void eraseAll(const KeyT &K) { |
483 | for (iterator I = find(K); I != end(); /* empty */) |
484 | I = erase(I); |
485 | } |
486 | |
487 | private: |
488 | /// Unlink the node from its list. Returns the next node in the list. |
489 | iterator unlink(const SMSNode &N) { |
490 | if (isSingleton(N)) { |
491 | // Singleton is already unlinked |
492 | assert(N.Next == SMSNode::INVALID && "Singleton has next?" ); |
493 | return iterator(this, SMSNode::INVALID, ValIndexOf(N.Data)); |
494 | } |
495 | |
496 | if (isHead(D: N)) { |
497 | // If we're the head, then update the sparse array and our next. |
498 | Sparse[sparseIndex(N)] = N.Next; |
499 | Dense[N.Next].Prev = N.Prev; |
500 | return iterator(this, N.Next, ValIndexOf(N.Data)); |
501 | } |
502 | |
503 | if (N.isTail()) { |
504 | // If we're the tail, then update our head and our previous. |
505 | findIndex(Idx: sparseIndex(N)).setPrev(N.Prev); |
506 | Dense[N.Prev].Next = N.Next; |
507 | |
508 | // Give back an end iterator that can be decremented |
509 | iterator I(this, N.Prev, ValIndexOf(N.Data)); |
510 | return ++I; |
511 | } |
512 | |
513 | // Otherwise, just drop us |
514 | Dense[N.Next].Prev = N.Prev; |
515 | Dense[N.Prev].Next = N.Next; |
516 | return iterator(this, N.Next, ValIndexOf(N.Data)); |
517 | } |
518 | }; |
519 | |
520 | } // end namespace llvm |
521 | |
522 | #endif // LLVM_ADT_SPARSEMULTISET_H |
523 | |