1 | //===--- LiteralSupport.h ---------------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines the NumericLiteralParser, CharLiteralParser, and |
10 | // StringLiteralParser interfaces. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #ifndef LLVM_CLANG_LEX_LITERALSUPPORT_H |
15 | #define LLVM_CLANG_LEX_LITERALSUPPORT_H |
16 | |
17 | #include "clang/Basic/CharInfo.h" |
18 | #include "clang/Basic/LLVM.h" |
19 | #include "clang/Basic/TokenKinds.h" |
20 | #include "llvm/ADT/APFloat.h" |
21 | #include "llvm/ADT/ArrayRef.h" |
22 | #include "llvm/ADT/SmallString.h" |
23 | #include "llvm/ADT/StringRef.h" |
24 | #include "llvm/Support/DataTypes.h" |
25 | |
26 | namespace clang { |
27 | |
28 | class DiagnosticsEngine; |
29 | class Preprocessor; |
30 | class Token; |
31 | class SourceLocation; |
32 | class TargetInfo; |
33 | class SourceManager; |
34 | class LangOptions; |
35 | |
/// Copy characters from Input to Buf, expanding any UCNs.
///
/// Buf is the destination buffer that receives the characters; Input is the
/// source text to copy from. Nothing is returned: the result is delivered
/// through Buf.
void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input);
38 | |
/// Return true if the token kind K corresponds to a function local predefined
/// macro, which expands to a string literal, that can be concatenated with
/// other string literals (only in Microsoft mode).
///
/// NOTE(review): LO is presumably consulted to decide whether Microsoft mode
/// is active -- confirm against the implementation.
bool isFunctionLocalStringLiteralMacro(tok::TokenKind K, const LangOptions &LO);
43 | |
/// Return true if the token Tok is a string literal, or a function local
/// predefined macro, which expands to a string literal.
///
/// LO supplies the language options the check is performed under.
bool tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO);
47 | |
48 | /// NumericLiteralParser - This performs strict semantic analysis of the content |
49 | /// of a ppnumber, classifying it as either integer, floating, or erroneous, |
50 | /// determines the radix of the value and can convert it to a useful value. |
51 | class NumericLiteralParser { |
52 | const SourceManager &SM; |
53 | const LangOptions &LangOpts; |
54 | DiagnosticsEngine &Diags; |
55 | |
56 | const char *const ThisTokBegin; |
57 | const char *const ThisTokEnd; |
58 | const char *DigitsBegin, *SuffixBegin; // markers |
59 | const char *s; // cursor |
60 | |
61 | unsigned radix; |
62 | |
63 | bool saw_exponent, saw_period, saw_ud_suffix, saw_fixed_point_suffix; |
64 | |
65 | SmallString<32> UDSuffixBuf; |
66 | |
67 | public: |
68 | NumericLiteralParser(StringRef TokSpelling, SourceLocation TokLoc, |
69 | const SourceManager &SM, const LangOptions &LangOpts, |
70 | const TargetInfo &Target, DiagnosticsEngine &Diags); |
71 | bool hadError : 1; |
72 | bool isUnsigned : 1; |
73 | bool isLong : 1; // This is *not* set for long long. |
74 | bool isLongLong : 1; |
75 | bool isSizeT : 1; // 1z, 1uz (C++23) |
76 | bool isHalf : 1; // 1.0h |
77 | bool isFloat : 1; // 1.0f |
78 | bool isImaginary : 1; // 1.0i |
79 | bool isFloat16 : 1; // 1.0f16 |
80 | bool isFloat128 : 1; // 1.0q |
81 | bool isFract : 1; // 1.0hr/r/lr/uhr/ur/ulr |
82 | bool isAccum : 1; // 1.0hk/k/lk/uhk/uk/ulk |
83 | bool isBitInt : 1; // 1wb, 1uwb (C23) |
84 | uint8_t MicrosoftInteger; // Microsoft suffix extension i8, i16, i32, or i64. |
85 | |
86 | |
87 | bool isFixedPointLiteral() const { |
88 | return (saw_period || saw_exponent) && saw_fixed_point_suffix; |
89 | } |
90 | |
91 | bool isIntegerLiteral() const { |
92 | return !saw_period && !saw_exponent && !isFixedPointLiteral(); |
93 | } |
94 | bool isFloatingLiteral() const { |
95 | return (saw_period || saw_exponent) && !isFixedPointLiteral(); |
96 | } |
97 | |
98 | bool hasUDSuffix() const { |
99 | return saw_ud_suffix; |
100 | } |
101 | StringRef getUDSuffix() const { |
102 | assert(saw_ud_suffix); |
103 | return UDSuffixBuf; |
104 | } |
105 | unsigned getUDSuffixOffset() const { |
106 | assert(saw_ud_suffix); |
107 | return SuffixBegin - ThisTokBegin; |
108 | } |
109 | |
110 | static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix); |
111 | |
112 | unsigned getRadix() const { return radix; } |
113 | |
114 | /// GetIntegerValue - Convert this numeric literal value to an APInt that |
115 | /// matches Val's input width. If there is an overflow (i.e., if the unsigned |
116 | /// value read is larger than the APInt's bits will hold), set Val to the low |
117 | /// bits of the result and return true. Otherwise, return false. |
118 | bool GetIntegerValue(llvm::APInt &Val); |
119 | |
120 | /// GetFloatValue - Convert this numeric literal to a floating value, using |
121 | /// the specified APFloat fltSemantics (specifying float, double, etc). |
122 | /// The optional bool isExact (passed-by-reference) has its value |
123 | /// set to true if the returned APFloat can represent the number in the |
124 | /// literal exactly, and false otherwise. |
125 | llvm::APFloat::opStatus GetFloatValue(llvm::APFloat &Result); |
126 | |
127 | /// GetFixedPointValue - Convert this numeric literal value into a |
128 | /// scaled integer that represents this value. Returns true if an overflow |
129 | /// occurred when calculating the integral part of the scaled integer or |
130 | /// calculating the digit sequence of the exponent. |
131 | bool GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale); |
132 | |
133 | /// Get the digits that comprise the literal. This excludes any prefix or |
134 | /// suffix associated with the literal. |
135 | StringRef getLiteralDigits() const { |
136 | assert(!hadError && "cannot reliably get the literal digits with an error" ); |
137 | return StringRef(DigitsBegin, SuffixBegin - DigitsBegin); |
138 | } |
139 | |
140 | private: |
141 | |
142 | void ParseNumberStartingWithZero(SourceLocation TokLoc); |
143 | void ParseDecimalOrOctalCommon(SourceLocation TokLoc); |
144 | |
145 | static bool isDigitSeparator(char C) { return C == '\''; } |
146 | |
147 | /// Determine whether the sequence of characters [Start, End) contains |
148 | /// any real digits (not digit separators). |
149 | bool containsDigits(const char *Start, const char *End) { |
150 | return Start != End && (Start + 1 != End || !isDigitSeparator(C: Start[0])); |
151 | } |
152 | |
153 | enum CheckSeparatorKind { CSK_BeforeDigits, CSK_AfterDigits }; |
154 | |
155 | /// Ensure that we don't have a digit separator here. |
156 | void checkSeparator(SourceLocation TokLoc, const char *Pos, |
157 | CheckSeparatorKind IsAfterDigits); |
158 | |
159 | /// SkipHexDigits - Read and skip over any hex digits, up to End. |
160 | /// Return a pointer to the first non-hex digit or End. |
161 | const char *SkipHexDigits(const char *ptr) { |
162 | while (ptr != ThisTokEnd && (isHexDigit(c: *ptr) || isDigitSeparator(C: *ptr))) |
163 | ptr++; |
164 | return ptr; |
165 | } |
166 | |
167 | /// SkipOctalDigits - Read and skip over any octal digits, up to End. |
168 | /// Return a pointer to the first non-hex digit or End. |
169 | const char *SkipOctalDigits(const char *ptr) { |
170 | while (ptr != ThisTokEnd && |
171 | ((*ptr >= '0' && *ptr <= '7') || isDigitSeparator(C: *ptr))) |
172 | ptr++; |
173 | return ptr; |
174 | } |
175 | |
176 | /// SkipDigits - Read and skip over any digits, up to End. |
177 | /// Return a pointer to the first non-hex digit or End. |
178 | const char *SkipDigits(const char *ptr) { |
179 | while (ptr != ThisTokEnd && (isDigit(c: *ptr) || isDigitSeparator(C: *ptr))) |
180 | ptr++; |
181 | return ptr; |
182 | } |
183 | |
184 | /// SkipBinaryDigits - Read and skip over any binary digits, up to End. |
185 | /// Return a pointer to the first non-binary digit or End. |
186 | const char *SkipBinaryDigits(const char *ptr) { |
187 | while (ptr != ThisTokEnd && |
188 | (*ptr == '0' || *ptr == '1' || isDigitSeparator(C: *ptr))) |
189 | ptr++; |
190 | return ptr; |
191 | } |
192 | |
193 | }; |
194 | |
195 | /// CharLiteralParser - Perform interpretation and semantic analysis of a |
196 | /// character literal. |
197 | class CharLiteralParser { |
198 | uint64_t Value; |
199 | tok::TokenKind Kind; |
200 | bool IsMultiChar; |
201 | bool HadError; |
202 | SmallString<32> UDSuffixBuf; |
203 | unsigned UDSuffixOffset; |
204 | public: |
205 | CharLiteralParser(const char *begin, const char *end, |
206 | SourceLocation Loc, Preprocessor &PP, |
207 | tok::TokenKind kind); |
208 | |
209 | bool hadError() const { return HadError; } |
210 | bool isOrdinary() const { return Kind == tok::char_constant; } |
211 | bool isWide() const { return Kind == tok::wide_char_constant; } |
212 | bool isUTF8() const { return Kind == tok::utf8_char_constant; } |
213 | bool isUTF16() const { return Kind == tok::utf16_char_constant; } |
214 | bool isUTF32() const { return Kind == tok::utf32_char_constant; } |
215 | bool isMultiChar() const { return IsMultiChar; } |
216 | uint64_t getValue() const { return Value; } |
217 | StringRef getUDSuffix() const { return UDSuffixBuf; } |
218 | unsigned getUDSuffixOffset() const { |
219 | assert(!UDSuffixBuf.empty() && "no ud-suffix" ); |
220 | return UDSuffixOffset; |
221 | } |
222 | }; |
223 | |
/// Mode selector for string literal parsing. Evaluated is the first
/// enumerator and Unevaluated the second; no explicit values are assigned.
enum class StringLiteralEvalMethod {
  /// First enumerator (value 0).
  Evaluated,

  /// Second enumerator (value 1).
  Unevaluated
};
228 | |
229 | /// StringLiteralParser - This decodes string escape characters and performs |
230 | /// wide string analysis and Translation Phase #6 (concatenation of string |
231 | /// literals) (C99 5.1.1.2p1). |
232 | class StringLiteralParser { |
233 | const SourceManager &SM; |
234 | const LangOptions &Features; |
235 | const TargetInfo &Target; |
236 | DiagnosticsEngine *Diags; |
237 | |
238 | unsigned MaxTokenLength; |
239 | unsigned SizeBound; |
240 | unsigned CharByteWidth; |
241 | tok::TokenKind Kind; |
242 | SmallString<512> ResultBuf; |
243 | char *ResultPtr; // cursor |
244 | SmallString<32> UDSuffixBuf; |
245 | unsigned UDSuffixToken; |
246 | unsigned UDSuffixOffset; |
247 | StringLiteralEvalMethod EvalMethod; |
248 | |
249 | public: |
250 | StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP, |
251 | StringLiteralEvalMethod StringMethod = |
252 | StringLiteralEvalMethod::Evaluated); |
253 | StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm, |
254 | const LangOptions &features, const TargetInfo &target, |
255 | DiagnosticsEngine *diags = nullptr) |
256 | : SM(sm), Features(features), Target(target), Diags(diags), |
257 | MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), |
258 | ResultPtr(ResultBuf.data()), |
259 | EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false), |
260 | Pascal(false) { |
261 | init(StringToks); |
262 | } |
263 | |
264 | bool hadError; |
265 | bool Pascal; |
266 | |
267 | StringRef GetString() const { |
268 | return StringRef(ResultBuf.data(), GetStringLength()); |
269 | } |
270 | unsigned GetStringLength() const { return ResultPtr-ResultBuf.data(); } |
271 | |
272 | unsigned GetNumStringChars() const { |
273 | return GetStringLength() / CharByteWidth; |
274 | } |
275 | /// getOffsetOfStringByte - This function returns the offset of the |
276 | /// specified byte of the string data represented by Token. This handles |
277 | /// advancing over escape sequences in the string. |
278 | /// |
279 | /// If the Diagnostics pointer is non-null, then this will do semantic |
280 | /// checking of the string literal and emit errors and warnings. |
281 | unsigned getOffsetOfStringByte(const Token &TheTok, unsigned ByteNo) const; |
282 | |
283 | bool isOrdinary() const { return Kind == tok::string_literal; } |
284 | bool isWide() const { return Kind == tok::wide_string_literal; } |
285 | bool isUTF8() const { return Kind == tok::utf8_string_literal; } |
286 | bool isUTF16() const { return Kind == tok::utf16_string_literal; } |
287 | bool isUTF32() const { return Kind == tok::utf32_string_literal; } |
288 | bool isPascal() const { return Pascal; } |
289 | bool isUnevaluated() const { |
290 | return EvalMethod == StringLiteralEvalMethod::Unevaluated; |
291 | } |
292 | |
293 | StringRef getUDSuffix() const { return UDSuffixBuf; } |
294 | |
295 | /// Get the index of a token containing a ud-suffix. |
296 | unsigned getUDSuffixToken() const { |
297 | assert(!UDSuffixBuf.empty() && "no ud-suffix" ); |
298 | return UDSuffixToken; |
299 | } |
300 | /// Get the spelling offset of the first byte of the ud-suffix. |
301 | unsigned getUDSuffixOffset() const { |
302 | assert(!UDSuffixBuf.empty() && "no ud-suffix" ); |
303 | return UDSuffixOffset; |
304 | } |
305 | |
306 | static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix); |
307 | |
308 | private: |
309 | void init(ArrayRef<Token> StringToks); |
310 | bool CopyStringFragment(const Token &Tok, const char *TokBegin, |
311 | StringRef Fragment); |
312 | void DiagnoseLexingError(SourceLocation Loc); |
313 | }; |
314 | |
315 | } // end namespace clang |
316 | |
317 | #endif |
318 | |