1 | //===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // Record tokens that a preprocessor emits and define operations to map between |
9 | // the tokens written in a file and tokens produced by the preprocessor. |
10 | // |
11 | // When running the compiler, there are two token streams we are interested in: |
12 | // - "spelled" tokens directly correspond to a substring written in some |
13 | // source file. |
14 | // - "expanded" tokens represent the result of preprocessing, parses consumes |
15 | // this token stream to produce the AST. |
16 | // |
17 | // Expanded tokens correspond directly to locations found in the AST, allowing |
18 | // to find subranges of the token stream covered by various AST nodes. Spelled |
19 | // tokens correspond directly to the source code written by the user. |
20 | // |
21 | // To allow composing these two use-cases, we also define operations that map |
22 | // between expanded and spelled tokens that produced them (macro calls, |
23 | // directives, etc). |
24 | // |
25 | //===----------------------------------------------------------------------===// |
26 | |
27 | #ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H |
28 | #define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H |
29 | |
30 | #include "clang/Basic/LangOptions.h" |
31 | #include "clang/Basic/SourceLocation.h" |
32 | #include "clang/Basic/SourceManager.h" |
33 | #include "clang/Basic/TokenKinds.h" |
34 | #include "clang/Lex/Token.h" |
35 | #include "llvm/ADT/ArrayRef.h" |
36 | #include "llvm/ADT/DenseMap.h" |
37 | #include "llvm/ADT/StringRef.h" |
38 | #include "llvm/Support/Compiler.h" |
39 | #include "llvm/Support/raw_ostream.h" |
40 | #include <cstdint> |
41 | #include <tuple> |
42 | |
43 | namespace clang { |
44 | class Preprocessor; |
45 | |
46 | namespace syntax { |
47 | |
48 | /// A half-open character range inside a particular file, the start offset is |
49 | /// included and the end offset is excluded from the range. |
50 | struct FileRange { |
51 | /// EXPECTS: File.isValid() && Begin <= End. |
52 | FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset); |
53 | /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(). |
54 | FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length); |
55 | /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files |
56 | /// are the same. |
57 | FileRange(const SourceManager &SM, SourceLocation BeginLoc, |
58 | SourceLocation EndLoc); |
59 | |
60 | FileID file() const { return File; } |
61 | /// Start is a start offset (inclusive) in the corresponding file. |
62 | unsigned beginOffset() const { return Begin; } |
63 | /// End offset (exclusive) in the corresponding file. |
64 | unsigned endOffset() const { return End; } |
65 | |
66 | unsigned length() const { return End - Begin; } |
67 | |
68 | /// Check if \p Offset is inside the range. |
69 | bool contains(unsigned Offset) const { |
70 | return Begin <= Offset && Offset < End; |
71 | } |
72 | /// Check \p Offset is inside the range or equal to its endpoint. |
73 | bool touches(unsigned Offset) const { |
74 | return Begin <= Offset && Offset <= End; |
75 | } |
76 | |
77 | /// Gets the substring that this FileRange refers to. |
78 | llvm::StringRef text(const SourceManager &SM) const; |
79 | |
80 | /// Convert to the clang range. The returned range is always a char range, |
81 | /// never a token range. |
82 | CharSourceRange toCharRange(const SourceManager &SM) const; |
83 | |
84 | friend bool operator==(const FileRange &L, const FileRange &R) { |
85 | return std::tie(args: L.File, args: L.Begin, args: L.End) == std::tie(args: R.File, args: R.Begin, args: R.End); |
86 | } |
87 | friend bool operator!=(const FileRange &L, const FileRange &R) { |
88 | return !(L == R); |
89 | } |
90 | |
91 | private: |
92 | FileID File; |
93 | unsigned Begin; |
94 | unsigned End; |
95 | }; |
96 | |
97 | /// For debugging purposes. |
98 | llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R); |
99 | |
100 | /// A token coming directly from a file or from a macro invocation. Has just |
101 | /// enough information to locate the token in the source code. |
102 | /// Can represent both expanded and spelled tokens. |
103 | class Token { |
104 | public: |
105 | Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind); |
106 | /// EXPECTS: clang::Token is not an annotation token. |
107 | explicit Token(const clang::Token &T); |
108 | |
109 | tok::TokenKind kind() const { return Kind; } |
110 | /// Location of the first character of a token. |
111 | SourceLocation location() const { return Location; } |
112 | /// Location right after the last character of a token. |
113 | SourceLocation endLocation() const { |
114 | return Location.getLocWithOffset(Offset: Length); |
115 | } |
116 | unsigned length() const { return Length; } |
117 | |
118 | /// Get the substring covered by the token. Note that will include all |
119 | /// digraphs, newline continuations, etc. E.g. tokens for 'int' and |
120 | /// in\ |
121 | /// t |
122 | /// both have the same kind tok::kw_int, but results of text() are different. |
123 | llvm::StringRef text(const SourceManager &SM) const; |
124 | |
125 | /// Gets a range of this token. |
126 | /// EXPECTS: token comes from a file, not from a macro expansion. |
127 | FileRange range(const SourceManager &SM) const; |
128 | |
129 | /// Given two tokens inside the same file, returns a file range that starts at |
130 | /// \p First and ends at \p Last. |
131 | /// EXPECTS: First and Last are file tokens from the same file, Last starts |
132 | /// after First. |
133 | static FileRange range(const SourceManager &SM, const syntax::Token &First, |
134 | const syntax::Token &Last); |
135 | |
136 | std::string dumpForTests(const SourceManager &SM) const; |
137 | /// For debugging purposes. |
138 | std::string str() const; |
139 | |
140 | private: |
141 | SourceLocation Location; |
142 | unsigned Length; |
143 | tok::TokenKind Kind; |
144 | }; |
145 | /// For debugging purposes. Equivalent to a call to Token::str(). |
146 | llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T); |
147 | |
148 | /// A list of tokens obtained by preprocessing a text buffer and operations to |
149 | /// map between the expanded and spelled tokens, i.e. TokenBuffer has |
150 | /// information about two token streams: |
151 | /// 1. Expanded tokens: tokens produced by the preprocessor after all macro |
152 | /// replacements, |
153 | /// 2. Spelled tokens: corresponding directly to the source code of a file |
154 | /// before any macro replacements occurred. |
155 | /// Here's an example to illustrate a difference between those two: |
156 | /// #define FOO 10 |
157 | /// int a = FOO; |
158 | /// |
159 | /// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}. |
160 | /// Expanded tokens are {'int','a','=','10',';','eof'}. |
161 | /// |
162 | /// Note that the expanded token stream has a tok::eof token at the end, the |
163 | /// spelled tokens never store a 'eof' token. |
164 | /// |
165 | /// The full list expanded tokens can be obtained with expandedTokens(). Spelled |
166 | /// tokens for each of the files can be obtained via spelledTokens(FileID). |
167 | /// |
168 | /// To map between the expanded and spelled tokens use findSpelledByExpanded(). |
169 | /// |
170 | /// To build a token buffer use the TokenCollector class. You can also compute |
171 | /// the spelled tokens of a file using the tokenize() helper. |
172 | /// |
173 | /// FIXME: allow mappings into macro arguments. |
174 | class TokenBuffer { |
175 | public: |
176 | TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {} |
177 | |
178 | TokenBuffer(TokenBuffer &&) = default; |
179 | TokenBuffer(const TokenBuffer &) = delete; |
180 | TokenBuffer &operator=(TokenBuffer &&) = default; |
181 | TokenBuffer &operator=(const TokenBuffer &) = delete; |
182 | |
183 | /// All tokens produced by the preprocessor after all macro replacements, |
184 | /// directives, etc. Source locations found in the clang AST will always |
185 | /// point to one of these tokens. |
186 | /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()). |
187 | /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split |
188 | /// into two '>' tokens by the parser. However, TokenBuffer currently |
189 | /// keeps it as a single '>>' token. |
190 | llvm::ArrayRef<syntax::Token> expandedTokens() const { |
191 | return ExpandedTokens; |
192 | } |
193 | |
194 | /// Builds a cache to make future calls to expandedToken(SourceRange) faster. |
195 | /// Creates an index only once. Further calls to it will be no-op. |
196 | void indexExpandedTokens(); |
197 | |
198 | /// Returns the subrange of expandedTokens() corresponding to the closed |
199 | /// token range R. |
200 | /// Consider calling indexExpandedTokens() before for faster lookups. |
201 | llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const; |
202 | |
203 | /// Returns the subrange of spelled tokens corresponding to AST node spanning |
204 | /// \p Expanded. This is the text that should be replaced if a refactoring |
205 | /// were to rewrite the node. If \p Expanded is empty, the returned value is |
206 | /// std::nullopt. |
207 | /// |
208 | /// Will fail if the expanded tokens do not correspond to a sequence of |
209 | /// spelled tokens. E.g. for the following example: |
210 | /// |
211 | /// #define FIRST f1 f2 f3 |
212 | /// #define SECOND s1 s2 s3 |
213 | /// #define ID2(X, Y) X Y |
214 | /// |
215 | /// a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c |
216 | /// d ID2(e f g, h) i // expanded tokens are: d e f g h i |
217 | /// |
218 | /// the results would be: |
219 | /// expanded => spelled |
220 | /// ------------------------ |
221 | /// a => a |
222 | /// s1 s2 s3 => SECOND |
223 | /// a f1 f2 f3 => a FIRST |
224 | /// a f1 => can't map |
225 | /// s1 s2 => can't map |
226 | /// e f => e f |
227 | /// g h => can't map |
228 | /// |
229 | /// EXPECTS: \p Expanded is a subrange of expandedTokens(). |
230 | /// Complexity is logarithmic. |
231 | std::optional<llvm::ArrayRef<syntax::Token>> |
232 | spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const; |
233 | |
234 | /// Find the subranges of expanded tokens, corresponding to \p Spelled. |
235 | /// |
236 | /// Some spelled tokens may not be present in the expanded token stream, so |
237 | /// this function can return an empty vector, e.g. for tokens of macro |
238 | /// directives or disabled preprocessor branches. |
239 | /// |
240 | /// Some spelled tokens can be duplicated in the expanded token stream |
241 | /// multiple times and this function will return multiple results in those |
242 | /// cases. This happens when \p Spelled is inside a macro argument. |
243 | /// |
244 | /// FIXME: return correct results on macro arguments. For now, we return an |
245 | /// empty list. |
246 | /// |
247 | /// (!) will return empty vector on tokens from #define body: |
248 | /// E.g. for the following example: |
249 | /// |
250 | /// #define FIRST(A) f1 A = A f2 |
251 | /// #define SECOND s |
252 | /// |
253 | /// a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s |
254 | /// The results would be |
255 | /// spelled => expanded |
256 | /// ------------------------ |
257 | /// #define FIRST => {} |
258 | /// a FIRST(arg) => {a f1 arg = arg f2} |
259 | /// arg => {arg, arg} // arg #1 is before `=` and arg #2 is |
260 | /// // after `=` in the expanded tokens. |
261 | llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1> |
262 | expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const; |
263 | |
264 | /// An expansion produced by the preprocessor, includes macro expansions and |
265 | /// preprocessor directives. Preprocessor always maps a non-empty range of |
266 | /// spelled tokens to a (possibly empty) range of expanded tokens. Here is a |
267 | /// few examples of expansions: |
268 | /// #pragma once // Expands to an empty range. |
269 | /// #define FOO 1 2 3 // Expands an empty range. |
270 | /// FOO // Expands to "1 2 3". |
271 | /// FIXME(ibiryukov): implement this, currently #include expansions are empty. |
272 | /// #include <vector> // Expands to tokens produced by the include. |
273 | struct Expansion { |
274 | llvm::ArrayRef<syntax::Token> Spelled; |
275 | llvm::ArrayRef<syntax::Token> Expanded; |
276 | }; |
277 | /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting |
278 | /// a preprocessor directive) return the subrange of expanded tokens that the |
279 | /// macro expands to. |
280 | std::optional<Expansion> |
281 | expansionStartingAt(const syntax::Token *Spelled) const; |
282 | /// Returns all expansions (partially) expanded from the specified tokens. |
283 | /// This is the expansions whose Spelled range intersects \p Spelled. |
284 | std::vector<Expansion> |
285 | expansionsOverlapping(llvm::ArrayRef<syntax::Token> Spelled) const; |
286 | |
287 | /// Lexed tokens of a file before preprocessing. E.g. for the following input |
288 | /// #define DECL(name) int name = 10 |
289 | /// DECL(a); |
290 | /// spelledTokens() returns |
291 | /// {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10", |
292 | /// "DECL", "(", "a", ")", ";"} |
293 | llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const; |
294 | |
295 | /// Returns the spelled Token starting at Loc, if there are no such tokens |
296 | /// returns nullptr. |
297 | const syntax::Token *spelledTokenAt(SourceLocation Loc) const; |
298 | |
299 | /// Get all tokens that expand a macro in \p FID. For the following input |
300 | /// #define FOO B |
301 | /// #define FOO2(X) int X |
302 | /// FOO2(XY) |
303 | /// int B; |
304 | /// FOO; |
305 | /// macroExpansions() returns {"FOO2", "FOO"} (from line 3 and 5 |
306 | /// respecitvely). |
307 | std::vector<const syntax::Token *> macroExpansions(FileID FID) const; |
308 | |
309 | const SourceManager &sourceManager() const { return *SourceMgr; } |
310 | |
311 | std::string dumpForTests() const; |
312 | |
313 | private: |
314 | /// Describes a mapping between a continuous subrange of spelled tokens and |
315 | /// expanded tokens. Represents macro expansions, preprocessor directives, |
316 | /// conditionally disabled pp regions, etc. |
317 | /// #define FOO 1+2 |
318 | /// #define BAR(a) a + 1 |
319 | /// FOO // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}. |
320 | /// BAR(1) // invocation #2, tokens = {'a', '+', '1'}, |
321 | /// macroTokens = {'BAR', '(', '1', ')'}. |
322 | struct Mapping { |
323 | // Positions in the corresponding spelled token stream. The corresponding |
324 | // range is never empty. |
325 | unsigned BeginSpelled = 0; |
326 | unsigned EndSpelled = 0; |
327 | // Positions in the expanded token stream. The corresponding range can be |
328 | // empty. |
329 | unsigned BeginExpanded = 0; |
330 | unsigned EndExpanded = 0; |
331 | |
332 | /// For debugging purposes. |
333 | std::string str() const; |
334 | }; |
335 | /// Spelled tokens of the file with information about the subranges. |
336 | struct MarkedFile { |
337 | /// Lexed, but not preprocessed, tokens of the file. These map directly to |
338 | /// text in the corresponding files and include tokens of all preprocessor |
339 | /// directives. |
340 | /// FIXME: spelled tokens don't change across FileID that map to the same |
341 | /// FileEntry. We could consider deduplicating them to save memory. |
342 | std::vector<syntax::Token> SpelledTokens; |
343 | /// A sorted list to convert between the spelled and expanded token streams. |
344 | std::vector<Mapping> Mappings; |
345 | /// The first expanded token produced for this FileID. |
346 | unsigned BeginExpanded = 0; |
347 | unsigned EndExpanded = 0; |
348 | }; |
349 | |
350 | friend class TokenCollector; |
351 | |
352 | /// Maps a single expanded token to its spelled counterpart or a mapping that |
353 | /// produced it. |
354 | std::pair<const syntax::Token *, const Mapping *> |
355 | spelledForExpandedToken(const syntax::Token *Expanded) const; |
356 | |
357 | /// Returns a mapping starting before \p Spelled token, or nullptr if no |
358 | /// such mapping exists. |
359 | static const Mapping * |
360 | mappingStartingBeforeSpelled(const MarkedFile &F, |
361 | const syntax::Token *Spelled); |
362 | |
363 | /// Convert a private Mapping to a public Expansion. |
364 | Expansion makeExpansion(const MarkedFile &, const Mapping &) const; |
365 | /// Returns the file that the Spelled tokens are taken from. |
366 | /// Asserts that they are non-empty, from a tracked file, and in-bounds. |
367 | const MarkedFile &fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const; |
368 | |
369 | /// Token stream produced after preprocessing, conceputally this captures the |
370 | /// same stream as 'clang -E' (excluding the preprocessor directives like |
371 | /// #file, etc.). |
372 | std::vector<syntax::Token> ExpandedTokens; |
373 | // Index of ExpandedTokens for faster lookups by SourceLocation. |
374 | llvm::DenseMap<SourceLocation, unsigned> ExpandedTokIndex; |
375 | llvm::DenseMap<FileID, MarkedFile> Files; |
376 | // The value is never null, pointer instead of reference to avoid disabling |
377 | // implicit assignment operator. |
378 | const SourceManager *SourceMgr; |
379 | }; |
380 | |
381 | /// The spelled tokens that overlap or touch a spelling location Loc. |
382 | /// This always returns 0-2 tokens. |
383 | llvm::ArrayRef<syntax::Token> |
384 | spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens); |
385 | llvm::ArrayRef<syntax::Token> |
386 | spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef<syntax::Token> Tokens); |
387 | |
388 | /// The identifier token that overlaps or touches a spelling location Loc. |
389 | /// If there is none, returns nullptr. |
390 | const syntax::Token * |
391 | spelledIdentifierTouching(SourceLocation Loc, |
392 | llvm::ArrayRef<syntax::Token> Tokens); |
393 | const syntax::Token * |
394 | spelledIdentifierTouching(SourceLocation Loc, |
395 | const syntax::TokenBuffer &Tokens); |
396 | |
397 | /// Lex the text buffer, corresponding to \p FID, in raw mode and record the |
398 | /// resulting spelled tokens. Does minimal post-processing on raw identifiers, |
399 | /// setting the appropriate token kind (instead of the raw_identifier reported |
400 | /// by lexer in raw mode). This is a very low-level function, most users should |
401 | /// prefer to use TokenCollector. Lexing in raw mode produces wildly different |
402 | /// results from what one might expect when running a C++ frontend, e.g. |
403 | /// preprocessor does not run at all. |
404 | /// The result will *not* have a 'eof' token at the end. |
405 | std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM, |
406 | const LangOptions &LO); |
407 | /// Similar to one above, instead of whole file tokenizes a part of it. Note |
408 | /// that, the first token might be incomplete if FR.startOffset is not at the |
409 | /// beginning of a token, and the last token returned will start before the |
410 | /// FR.endOffset but might end after it. |
411 | std::vector<syntax::Token> |
412 | tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO); |
413 | |
414 | /// Collects tokens for the main file while running the frontend action. An |
415 | /// instance of this object should be created on |
416 | /// FrontendAction::BeginSourceFile() and the results should be consumed after |
417 | /// FrontendAction::Execute() finishes. |
418 | class TokenCollector { |
419 | public: |
420 | /// Adds the hooks to collect the tokens. Should be called before the |
421 | /// preprocessing starts, i.e. as a part of BeginSourceFile() or |
422 | /// CreateASTConsumer(). |
423 | TokenCollector(Preprocessor &P); |
424 | |
425 | /// Finalizes token collection. Should be called after preprocessing is |
426 | /// finished, i.e. after running Execute(). |
427 | [[nodiscard]] TokenBuffer consume() &&; |
428 | |
429 | private: |
430 | /// Maps from a start to an end spelling location of transformations |
431 | /// performed by the preprocessor. These include: |
432 | /// 1. range from '#' to the last token in the line for PP directives, |
433 | /// 2. macro name and arguments for macro expansions. |
434 | /// Note that we record only top-level macro expansions, intermediate |
435 | /// expansions (e.g. inside macro arguments) are ignored. |
436 | /// |
437 | /// Used to find correct boundaries of macro calls and directives when |
438 | /// building mappings from spelled to expanded tokens. |
439 | /// |
440 | /// Logically, at each point of the preprocessor execution there is a stack of |
441 | /// macro expansions being processed and we could use it to recover the |
442 | /// location information we need. However, the public preprocessor API only |
443 | /// exposes the points when macro expansions start (when we push a macro onto |
444 | /// the stack) and not when they end (when we pop a macro from the stack). |
445 | /// To workaround this limitation, we rely on source location information |
446 | /// stored in this map. |
447 | using PPExpansions = llvm::DenseMap<SourceLocation, SourceLocation>; |
448 | class Builder; |
449 | class CollectPPExpansions; |
450 | |
451 | std::vector<syntax::Token> Expanded; |
452 | // FIXME: we only store macro expansions, also add directives(#pragma, etc.) |
453 | PPExpansions Expansions; |
454 | Preprocessor &PP; |
455 | CollectPPExpansions *Collector; |
456 | }; |
457 | |
458 | } // namespace syntax |
459 | } // namespace clang |
460 | |
461 | #endif |
462 | |