1//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Tokens are the first level of abstraction above bytes used in pseudoparsing.
10// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
// The tokens are wrapped into pseudo::Token, along with line/indent info.
12//
13// Unlike clang, we make multiple passes over the whole file, out-of-order.
14// Therefore we retain the whole token sequence in memory. (This is feasible as
15// we process one file at a time). pseudo::TokenStream holds such a stream.
16// The initial stream holds the raw tokens read from the file, later passes
17// operate on derived TokenStreams (e.g. with directives stripped).
18//
19// Similar facilities from clang that are *not* used:
20// - SourceManager: designed around multiple files and precise macro expansion.
21// - clang::Token: coupled to SourceManager, doesn't retain layout info.
22// (pseudo::Token is similar, but without SourceLocations).
23// - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
24// (pseudo::TokenStream is similar, but a flat token list).
25//
26//===----------------------------------------------------------------------===//
27
28#ifndef CLANG_PSEUDO_TOKEN_H
29#define CLANG_PSEUDO_TOKEN_H
30
31#include "clang/Basic/LLVM.h"
32#include "clang/Basic/LangStandard.h"
33#include "clang/Basic/TokenKinds.h"
34#include "llvm/ADT/ArrayRef.h"
35#include "llvm/ADT/STLForwardCompat.h"
36#include "llvm/Support/raw_ostream.h"
37#include <cstdint>
38#include <limits>
39#include <memory>
40#include <vector>
41
42namespace clang {
43class LangOptions;
44namespace pseudo {
45
46/// A single C++ or preprocessor token.
47///
48/// Unlike clang::Token and syntax::Token, these tokens are not connected to a
49/// SourceManager - we are not dealing with multiple files.
50struct Token {
51 /// An Index identifies a token within a stream.
52 using Index = uint32_t;
53 /// A sentinel Index indicating no token.
54 constexpr static Index Invalid = std::numeric_limits<Index>::max();
55 struct Range;
56
57 /// The token text.
58 ///
59 /// Typically from the original source file, but may have been synthesized.
60 StringRef text() const { return StringRef(Data, Length); }
61 const char *Data = nullptr;
62 uint32_t Length = 0;
63
64 /// Zero-based line number for the start of the token.
65 /// This refers to the original source file as written.
66 uint32_t Line = 0;
67 /// Width of whitespace before the first token on this line.
68 uint8_t Indent = 0;
69 /// Flags have some meaning defined by the function that produced this stream.
70 uint8_t Flags = 0;
71 /// Index into the original token stream (as raw-lexed from the source code).
72 Index OriginalIndex = Invalid;
73 // Helpers to get/set Flags based on `enum class`.
74 template <class T> bool flag(T Mask) const {
75 return Flags & uint8_t{llvm::to_underlying(Mask)};
76 }
77 template <class T> void setFlag(T Mask) {
78 Flags |= uint8_t{llvm::to_underlying(Mask)};
79 }
80
81 /// Returns the next token in the stream. this may not be a sentinel.
82 const Token &next() const {
83 assert(Kind != tok::eof);
84 return *(this + 1);
85 }
86 /// Returns the next token in the stream, skipping over comments.
87 const Token &nextNC() const {
88 const Token *T = this;
89 do
90 T = &T->next();
91 while (T->Kind == tok::comment);
92 return *T;
93 }
94 /// Returns the previous token in the stream. this may not be a sentinel.
95 const Token &prev() const {
96 assert(Kind != tok::eof);
97 return *(this - 1);
98 }
99 /// Returns the bracket paired with this one, if any.
100 const Token *pair() const { return Pair == 0 ? nullptr : this + Pair; }
101
102 /// The type of token as determined by clang's lexer.
103 clang::tok::TokenKind Kind = clang::tok::unknown;
104 /// If this token is a paired bracket, the offset of the pair in the stream.
105 int32_t Pair = 0;
106};
107static_assert(sizeof(Token) <= sizeof(char *) + 24, "Careful with layout!");
108llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
109
110/// A half-open range of tokens within a stream.
111struct Token::Range {
112 Index Begin = 0;
113 Index End = 0;
114
115 uint32_t size() const { return End - Begin; }
116 static Range emptyAt(Index Index) { return Range{.Begin: Index, .End: Index}; }
117};
118llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);
119
120/// A complete sequence of Tokens representing a source file.
121///
122/// This may match a raw file from disk, or be derived from a previous stream.
123/// For example, stripping comments from a TokenStream results in a new stream.
124///
125/// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes:
126/// int main ( ) ;
127/// eof kw_int ident l_paren r_paren semi eof
128/// front() back()
129/// 0 1 2 3 4 5
130class TokenStream {
131public:
132 /// Create an empty stream.
133 ///
134 /// Initially, the stream is appendable and not finalized.
135 /// The token sequence may only be accessed after finalize() is called.
136 ///
137 /// Payload is an opaque object which will be owned by the stream.
138 /// e.g. an allocator to hold backing storage for synthesized token text.
139 explicit TokenStream(std::shared_ptr<void> Payload = nullptr);
140
141 /// Append a token to the stream, which must not be finalized.
142 void push(Token T) {
143 assert(!isFinalized());
144 Storage.push_back(x: std::move(T));
145 }
146
147 /// Finalize the token stream, allowing tokens to be accessed.
148 /// Tokens may no longer be appended.
149 void finalize();
150 bool isFinalized() const;
151
152 /// Returns the index of T within the stream.
153 ///
154 /// T must be within the stream or the end sentinel (not the start sentinel).
155 Token::Index index(const Token &T) const {
156 assert(isFinalized());
157 assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
158 assert(&T != Storage.data() && "start sentinel");
159 return &T - Tokens.data();
160 }
161
162 ArrayRef<Token> tokens() const {
163 assert(isFinalized());
164 return Tokens;
165 }
166 ArrayRef<Token> tokens(Token::Range R) const {
167 return tokens().slice(N: R.Begin, M: R.End - R.Begin);
168 }
169
170 MutableArrayRef<Token> tokens() {
171 assert(isFinalized());
172 return Tokens;
173 }
174
175 /// May return the end sentinel if the stream is empty.
176 const Token &front() const {
177 assert(isFinalized());
178 return Storage[1];
179 }
180
181 /// Returns the shared payload.
182 std::shared_ptr<void> getPayload() const { return Payload; }
183 /// Adds the given payload to the stream.
184 void addPayload(std::shared_ptr<void> P) {
185 if (!Payload)
186 Payload = std::move(P);
187 else
188 Payload = std::make_shared<
189 std::pair<std::shared_ptr<void>, std::shared_ptr<void>>>(
190 args: std::move(P), args: std::move(Payload));
191 }
192
193 /// Print the tokens in this stream to the output stream.
194 ///
195 /// The presence of newlines/spaces is preserved, but not the quantity.
196 void print(llvm::raw_ostream &) const;
197
198private:
199 std::shared_ptr<void> Payload;
200
201 MutableArrayRef<Token> Tokens;
202 std::vector<Token> Storage; // eof + Tokens + eof
203};
204llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);
205
206/// Extracts a raw token stream from the source code.
207///
208/// All tokens will reference the data of the provided string.
209/// "word-like" tokens such as identifiers and keywords will be raw_identifier.
210TokenStream lex(const std::string &, const clang::LangOptions &);
/// Bit flags stored in Token::Flags by lex().
enum class LexFlags : uint8_t {
  /// Marks the token at the start of a logical preprocessor line.
  /// This is a position where a directive might start.
  ///
  /// Here, the first # is StartsPPLine, but second is not (same logical line).
  ///   #define X(error) \
  ///   #error // not a directive!
  ///
  /// Careful, the directive may not start exactly on the StartsPPLine token:
  ///   /*comment*/ #include <foo.h>
  StartsPPLine = 1 << 0,
  /// Marks tokens containing trigraphs, escaped newlines, UCNs etc.
  /// The text() of such tokens will contain the raw trigraph.
  NeedsCleaning = 1 << 1,
};
226/// A generic lang options suitable for lexing/parsing a langage.
227clang::LangOptions genericLangOpts(
228 clang::Language = clang::Language::CXX,
229 clang::LangStandard::Kind = clang::LangStandard::lang_unspecified);
230
231/// Decoding raw tokens written in the source code, returning a derived stream.
232///
233/// - escaped newlines within tokens are removed
234/// - trigraphs are replaced with the characters they encode
235/// - UCNs within raw_identifiers are replaced by the characters they encode
236/// (UCNs within strings, comments etc are not translated)
237/// - raw_identifier tokens are assigned their correct keyword type
238/// - the >> token is split into separate > > tokens
239/// (we use a modified grammar where >> is a nonterminal, not a token)
240///
241/// The StartsPPLine flag is preserved.
242///
243/// Formally the identifier correctly happens before preprocessing, while we
244/// should only cook raw_identifiers that survive preprocessing.
245/// However, ignoring the Token::Kind of tokens in directives achieves the same.
246/// (And having cooked token kinds in PP-disabled sections is useful for us).
247TokenStream cook(const TokenStream &, const clang::LangOptions &);
248
249/// Drops comment tokens.
250TokenStream stripComments(const TokenStream &);
251
252} // namespace pseudo
253} // namespace clang
254
255#endif // CLANG_PSEUDO_TOKEN_H
256

source code of clang-tools-extra/pseudo/include/clang-pseudo/Token.h