1//===--- PostingList.h - Symbol identifiers storage interface --*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This defines posting list interface: a storage for identifiers of symbols
11/// which can be characterized by a specific feature (such as fuzzy-find
12/// trigram, scope, type or any other Search Token). Posting lists can be
13/// traversed in order using an iterator and are values for inverted index,
14/// which maps search tokens to corresponding posting lists.
15///
16/// In order to decrease size of Index in-memory representation, Variable Byte
17/// Encoding (VByte) is used for PostingLists compression. An overview of VByte
18/// algorithm can be found in "Introduction to Information Retrieval" book:
19/// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html
20///
21//===----------------------------------------------------------------------===//
22
23#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
24#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
25
26#include "Iterator.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/SmallVector.h"
29#include <cstdint>
30#include <vector>
31
32namespace clang {
33namespace clangd {
34namespace dex {
35class Token;
36
37/// NOTE: This is an implementation detail.
38///
39/// Chunk is a fixed-width piece of PostingList which contains the first DocID
40/// in uncompressed format (Head) and delta-encoded Payload. It can be
41/// decompressed upon request.
42struct Chunk {
43 /// Keep sizeof(Chunk) == 32.
44 static constexpr size_t PayloadSize = 32 - sizeof(DocID);
45
46 llvm::SmallVector<DocID, PayloadSize + 1> decompress() const;
47
48 /// The first element of decompressed Chunk.
49 DocID Head;
50 /// VByte-encoded deltas.
51 std::array<uint8_t, PayloadSize> Payload;
52};
53static_assert(sizeof(Chunk) == 32, "Chunk should take 32 bytes of memory.");
54
55/// PostingList is the storage of DocIDs which can be inserted to the Query
56/// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs
57/// are stored in underlying chunks. Compression saves memory at a small cost
58/// in access time, which is still fast enough in practice.
59class PostingList {
60public:
61 explicit PostingList(llvm::ArrayRef<DocID> Documents);
62
63 /// Constructs DocumentIterator over given posting list. DocumentIterator will
64 /// go through the chunks and decompress them on-the-fly when necessary.
65 /// If given, Tok is only used for the string representation.
66 std::unique_ptr<Iterator> iterator(const Token *Tok = nullptr) const;
67
68 /// Returns in-memory size of external storage.
69 size_t bytes() const { return Chunks.capacity() * sizeof(Chunk); }
70
71private:
72 const std::vector<Chunk> Chunks;
73};
74
75} // namespace dex
76} // namespace clangd
77} // namespace clang
78
79#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
80

source code of clang-tools-extra/clangd/index/dex/PostingList.h