1 | //===--- Token.h - Token interface ------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines the Token interface. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #ifndef LLVM_CLANG_LEX_TOKEN_H |
14 | #define LLVM_CLANG_LEX_TOKEN_H |
15 | |
16 | #include "clang/Basic/SourceLocation.h" |
17 | #include "clang/Basic/TokenKinds.h" |
18 | #include "llvm/ADT/StringRef.h" |
19 | #include <cassert> |
20 | |
21 | namespace clang { |
22 | |
23 | class IdentifierInfo; |
24 | |
25 | /// Token - This structure provides full information about a lexed token. |
26 | /// It is not intended to be space efficient, it is intended to return as much |
27 | /// information as possible about each returned token. This is expected to be |
28 | /// compressed into a smaller form if memory footprint is important. |
29 | /// |
30 | /// The parser can create a special "annotation token" representing a stream of |
31 | /// tokens that were parsed and semantically resolved, e.g.: "foo::MyClass<int>" |
32 | /// can be represented by a single typename annotation token that carries |
33 | /// information about the SourceRange of the tokens and the type object. |
34 | class Token { |
35 | /// The location of the token. This is actually a SourceLocation. |
36 | unsigned Loc; |
37 | |
38 | // Conceptually these next two fields could be in a union. However, this |
39 | // causes gcc 4.2 to pessimize LexTokenInternal, a very performance critical |
40 | // routine. Keeping as separate members with casts until a more beautiful fix |
41 | // presents itself. |
42 | |
43 | /// UintData - This holds either the length of the token text, when |
44 | /// a normal token, or the end of the SourceRange when an annotation |
45 | /// token. |
46 | unsigned UintData; |
47 | |
48 | /// PtrData - This is a union of four different pointer types, which depends |
49 | /// on what type of token this is: |
50 | /// Identifiers, keywords, etc: |
51 | /// This is an IdentifierInfo*, which contains the uniqued identifier |
52 | /// spelling. |
53 | /// Literals: isLiteral() returns true. |
54 | /// This is a pointer to the start of the token in a text buffer, which |
55 | /// may be dirty (have trigraphs / escaped newlines). |
56 | /// Annotations (resolved type names, C++ scopes, etc): isAnnotation(). |
57 | /// This is a pointer to sema-specific data for the annotation token. |
58 | /// Eof: |
59 | // This is a pointer to a Decl. |
60 | /// Other: |
61 | /// This is null. |
62 | void *PtrData; |
63 | |
64 | /// Kind - The actual flavor of token this is. |
65 | tok::TokenKind Kind; |
66 | |
67 | /// Flags - Bits we track about this token, members of the TokenFlags enum. |
68 | unsigned short Flags; |
69 | |
70 | public: |
71 | // Various flags set per token: |
72 | enum TokenFlags { |
73 | StartOfLine = 0x01, // At start of line or only after whitespace |
74 | // (considering the line after macro expansion). |
75 | LeadingSpace = 0x02, // Whitespace exists before this token (considering |
76 | // whitespace after macro expansion). |
77 | DisableExpand = 0x04, // This identifier may never be macro expanded. |
78 | NeedsCleaning = 0x08, // Contained an escaped newline or trigraph. |
79 | LeadingEmptyMacro = 0x10, // Empty macro exists before this token. |
80 | HasUDSuffix = 0x20, // This string or character literal has a ud-suffix. |
81 | HasUCN = 0x40, // This identifier contains a UCN. |
82 | IgnoredComma = 0x80, // This comma is not a macro argument separator (MS). |
83 | StringifiedInMacro = 0x100, // This string or character literal is formed by |
84 | // macro stringizing or charizing operator. |
85 | CommaAfterElided = 0x200, // The comma following this token was elided (MS). |
86 | IsEditorPlaceholder = 0x400, // This identifier is a placeholder. |
87 | }; |
88 | |
89 | tok::TokenKind getKind() const { return Kind; } |
90 | void setKind(tok::TokenKind K) { Kind = K; } |
91 | |
92 | /// is/isNot - Predicates to check if this token is a specific kind, as in |
93 | /// "if (Tok.is(tok::l_brace)) {...}". |
94 | bool is(tok::TokenKind K) const { return Kind == K; } |
95 | bool isNot(tok::TokenKind K) const { return Kind != K; } |
96 | bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const { |
97 | return is(K1) || is(K2); |
98 | } |
99 | template <typename... Ts> |
100 | bool isOneOf(tok::TokenKind K1, tok::TokenKind K2, Ts... Ks) const { |
101 | return is(K1) || isOneOf(K2, Ks...); |
102 | } |
103 | |
104 | /// Return true if this is a raw identifier (when lexing |
105 | /// in raw mode) or a non-keyword identifier (when lexing in non-raw mode). |
106 | bool isAnyIdentifier() const { |
107 | return tok::isAnyIdentifier(getKind()); |
108 | } |
109 | |
110 | /// Return true if this is a "literal", like a numeric |
111 | /// constant, string, etc. |
112 | bool isLiteral() const { |
113 | return tok::isLiteral(getKind()); |
114 | } |
115 | |
116 | /// Return true if this is any of tok::annot_* kind tokens. |
117 | bool isAnnotation() const { |
118 | return tok::isAnnotation(getKind()); |
119 | } |
120 | |
121 | /// Return a source location identifier for the specified |
122 | /// offset in the current file. |
123 | SourceLocation getLocation() const { |
124 | return SourceLocation::getFromRawEncoding(Loc); |
125 | } |
126 | unsigned getLength() const { |
127 | assert(!isAnnotation() && "Annotation tokens have no length field" ); |
128 | return UintData; |
129 | } |
130 | |
131 | void setLocation(SourceLocation L) { Loc = L.getRawEncoding(); } |
132 | void setLength(unsigned Len) { |
133 | assert(!isAnnotation() && "Annotation tokens have no length field" ); |
134 | UintData = Len; |
135 | } |
136 | |
137 | SourceLocation getAnnotationEndLoc() const { |
138 | assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token" ); |
139 | return SourceLocation::getFromRawEncoding(UintData ? UintData : Loc); |
140 | } |
141 | void setAnnotationEndLoc(SourceLocation L) { |
142 | assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token" ); |
143 | UintData = L.getRawEncoding(); |
144 | } |
145 | |
146 | SourceLocation getLastLoc() const { |
147 | return isAnnotation() ? getAnnotationEndLoc() : getLocation(); |
148 | } |
149 | |
150 | SourceLocation getEndLoc() const { |
151 | return isAnnotation() ? getAnnotationEndLoc() |
152 | : getLocation().getLocWithOffset(getLength()); |
153 | } |
154 | |
155 | /// SourceRange of the group of tokens that this annotation token |
156 | /// represents. |
157 | SourceRange getAnnotationRange() const { |
158 | return SourceRange(getLocation(), getAnnotationEndLoc()); |
159 | } |
160 | void setAnnotationRange(SourceRange R) { |
161 | setLocation(R.getBegin()); |
162 | setAnnotationEndLoc(R.getEnd()); |
163 | } |
164 | |
165 | const char *getName() const { return tok::getTokenName(Kind); } |
166 | |
167 | /// Reset all flags to cleared. |
168 | void startToken() { |
169 | Kind = tok::unknown; |
170 | Flags = 0; |
171 | PtrData = nullptr; |
172 | UintData = 0; |
173 | Loc = SourceLocation().getRawEncoding(); |
174 | } |
175 | |
176 | IdentifierInfo *getIdentifierInfo() const { |
177 | assert(isNot(tok::raw_identifier) && |
178 | "getIdentifierInfo() on a tok::raw_identifier token!" ); |
179 | assert(!isAnnotation() && |
180 | "getIdentifierInfo() on an annotation token!" ); |
181 | if (isLiteral()) return nullptr; |
182 | if (is(tok::eof)) return nullptr; |
183 | return (IdentifierInfo*) PtrData; |
184 | } |
185 | void setIdentifierInfo(IdentifierInfo *II) { |
186 | PtrData = (void*) II; |
187 | } |
188 | |
189 | const void *getEofData() const { |
190 | assert(is(tok::eof)); |
191 | return reinterpret_cast<const void *>(PtrData); |
192 | } |
193 | void setEofData(const void *D) { |
194 | assert(is(tok::eof)); |
195 | assert(!PtrData); |
196 | PtrData = const_cast<void *>(D); |
197 | } |
198 | |
199 | /// getRawIdentifier - For a raw identifier token (i.e., an identifier |
200 | /// lexed in raw mode), returns a reference to the text substring in the |
201 | /// buffer if known. |
202 | StringRef getRawIdentifier() const { |
203 | assert(is(tok::raw_identifier)); |
204 | return StringRef(reinterpret_cast<const char *>(PtrData), getLength()); |
205 | } |
206 | void setRawIdentifierData(const char *Ptr) { |
207 | assert(is(tok::raw_identifier)); |
208 | PtrData = const_cast<char*>(Ptr); |
209 | } |
210 | |
211 | /// getLiteralData - For a literal token (numeric constant, string, etc), this |
212 | /// returns a pointer to the start of it in the text buffer if known, null |
213 | /// otherwise. |
214 | const char *getLiteralData() const { |
215 | assert(isLiteral() && "Cannot get literal data of non-literal" ); |
216 | return reinterpret_cast<const char*>(PtrData); |
217 | } |
218 | void setLiteralData(const char *Ptr) { |
219 | assert(isLiteral() && "Cannot set literal data of non-literal" ); |
220 | PtrData = const_cast<char*>(Ptr); |
221 | } |
222 | |
223 | void *getAnnotationValue() const { |
224 | assert(isAnnotation() && "Used AnnotVal on non-annotation token" ); |
225 | return PtrData; |
226 | } |
227 | void setAnnotationValue(void *val) { |
228 | assert(isAnnotation() && "Used AnnotVal on non-annotation token" ); |
229 | PtrData = val; |
230 | } |
231 | |
232 | /// Set the specified flag. |
233 | void setFlag(TokenFlags Flag) { |
234 | Flags |= Flag; |
235 | } |
236 | |
237 | /// Get the specified flag. |
238 | bool getFlag(TokenFlags Flag) const { |
239 | return (Flags & Flag) != 0; |
240 | } |
241 | |
242 | /// Unset the specified flag. |
243 | void clearFlag(TokenFlags Flag) { |
244 | Flags &= ~Flag; |
245 | } |
246 | |
247 | /// Return the internal represtation of the flags. |
248 | /// |
249 | /// This is only intended for low-level operations such as writing tokens to |
250 | /// disk. |
251 | unsigned getFlags() const { |
252 | return Flags; |
253 | } |
254 | |
255 | /// Set a flag to either true or false. |
256 | void setFlagValue(TokenFlags Flag, bool Val) { |
257 | if (Val) |
258 | setFlag(Flag); |
259 | else |
260 | clearFlag(Flag); |
261 | } |
262 | |
263 | /// isAtStartOfLine - Return true if this token is at the start of a line. |
264 | /// |
265 | bool isAtStartOfLine() const { return getFlag(StartOfLine); } |
266 | |
267 | /// Return true if this token has whitespace before it. |
268 | /// |
269 | bool hasLeadingSpace() const { return getFlag(LeadingSpace); } |
270 | |
271 | /// Return true if this identifier token should never |
272 | /// be expanded in the future, due to C99 6.10.3.4p2. |
273 | bool isExpandDisabled() const { return getFlag(DisableExpand); } |
274 | |
275 | /// Return true if we have an ObjC keyword identifier. |
276 | bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const; |
277 | |
278 | /// Return the ObjC keyword kind. |
279 | tok::ObjCKeywordKind getObjCKeywordID() const; |
280 | |
281 | /// Return true if this token has trigraphs or escaped newlines in it. |
282 | bool needsCleaning() const { return getFlag(NeedsCleaning); } |
283 | |
284 | /// Return true if this token has an empty macro before it. |
285 | /// |
286 | bool hasLeadingEmptyMacro() const { return getFlag(LeadingEmptyMacro); } |
287 | |
288 | /// Return true if this token is a string or character literal which |
289 | /// has a ud-suffix. |
290 | bool hasUDSuffix() const { return getFlag(HasUDSuffix); } |
291 | |
292 | /// Returns true if this token contains a universal character name. |
293 | bool hasUCN() const { return getFlag(HasUCN); } |
294 | |
295 | /// Returns true if this token is formed by macro by stringizing or charizing |
296 | /// operator. |
297 | bool stringifiedInMacro() const { return getFlag(StringifiedInMacro); } |
298 | |
299 | /// Returns true if the comma after this token was elided. |
300 | bool commaAfterElided() const { return getFlag(CommaAfterElided); } |
301 | |
302 | /// Returns true if this token is an editor placeholder. |
303 | /// |
304 | /// Editor placeholders are produced by the code-completion engine and are |
305 | /// represented as characters between '<#' and '#>' in the source code. The |
306 | /// lexer uses identifier tokens to represent placeholders. |
307 | bool isEditorPlaceholder() const { return getFlag(IsEditorPlaceholder); } |
308 | }; |
309 | |
310 | /// Information about the conditional stack (\#if directives) |
311 | /// currently active. |
312 | struct PPConditionalInfo { |
313 | /// Location where the conditional started. |
314 | SourceLocation IfLoc; |
315 | |
316 | /// True if this was contained in a skipping directive, e.g., |
317 | /// in a "\#if 0" block. |
318 | bool WasSkipping; |
319 | |
320 | /// True if we have emitted tokens already, and now we're in |
321 | /// an \#else block or something. Only useful in Skipping blocks. |
322 | bool FoundNonSkip; |
323 | |
324 | /// True if we've seen a \#else in this block. If so, |
325 | /// \#elif/\#else directives are not allowed. |
326 | bool FoundElse; |
327 | }; |
328 | |
329 | } // end namespace clang |
330 | |
331 | #endif // LLVM_CLANG_LEX_TOKEN_H |
332 | |