//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This class implements the lexer for assembly files.
//
//===----------------------------------------------------------------------===//
12
13#include "llvm/MC/MCParser/AsmLexer.h"
14#include "llvm/ADT/APInt.h"
15#include "llvm/ADT/ArrayRef.h"
16#include "llvm/ADT/StringExtras.h"
17#include "llvm/ADT/StringRef.h"
18#include "llvm/ADT/StringSwitch.h"
19#include "llvm/MC/MCAsmInfo.h"
20#include "llvm/MC/MCParser/MCAsmLexer.h"
21#include "llvm/Support/Compiler.h"
22#include "llvm/Support/SMLoc.h"
23#include "llvm/Support/SaveAndRestore.h"
24#include <cassert>
25#include <cctype>
26#include <cstdio>
27#include <cstring>
28#include <string>
29#include <tuple>
30#include <utility>
31
32using namespace llvm;
33
34AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
35 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with(Prefix: "@");
36 LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
37}
38
39AsmLexer::~AsmLexer() = default;
40
41void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
42 bool EndStatementAtEOF) {
43 CurBuf = Buf;
44
45 if (ptr)
46 CurPtr = ptr;
47 else
48 CurPtr = CurBuf.begin();
49
50 TokStart = nullptr;
51 this->EndStatementAtEOF = EndStatementAtEOF;
52}
53
54/// ReturnError - Set the error to the specified string at the specified
55/// location. This is defined to always return AsmToken::Error.
56AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
57 SetError(errLoc: SMLoc::getFromPointer(Ptr: Loc), err: Msg);
58
59 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
60}
61
62int AsmLexer::getNextChar() {
63 if (CurPtr == CurBuf.end())
64 return EOF;
65 return (unsigned char)*CurPtr++;
66}
67
68int AsmLexer::peekNextChar() {
69 if (CurPtr == CurBuf.end())
70 return EOF;
71 return (unsigned char)*CurPtr;
72}
73
74/// The leading integral digit sequence and dot should have already been
75/// consumed, some or all of the fractional digit sequence *can* have been
76/// consumed.
77AsmToken AsmLexer::LexFloatLiteral() {
78 // Skip the fractional digit sequence.
79 while (isDigit(C: *CurPtr))
80 ++CurPtr;
81
82 if (*CurPtr == '-' || *CurPtr == '+')
83 return ReturnError(Loc: CurPtr, Msg: "invalid sign in float literal");
84
85 // Check for exponent
86 if ((*CurPtr == 'e' || *CurPtr == 'E')) {
87 ++CurPtr;
88
89 if (*CurPtr == '-' || *CurPtr == '+')
90 ++CurPtr;
91
92 while (isDigit(C: *CurPtr))
93 ++CurPtr;
94 }
95
96 return AsmToken(AsmToken::Real,
97 StringRef(TokStart, CurPtr - TokStart));
98}
99
100/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
101/// while making sure there are enough actual digits around for the constant to
102/// be valid.
103///
104/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
105/// before we get here.
106AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
107 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
108 "unexpected parse state in floating hex");
109 bool NoFracDigits = true;
110
111 // Skip the fractional part if there is one
112 if (*CurPtr == '.') {
113 ++CurPtr;
114
115 const char *FracStart = CurPtr;
116 while (isHexDigit(C: *CurPtr))
117 ++CurPtr;
118
119 NoFracDigits = CurPtr == FracStart;
120 }
121
122 if (NoIntDigits && NoFracDigits)
123 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
124 "expected at least one significand digit");
125
126 // Make sure we do have some kind of proper exponent part
127 if (*CurPtr != 'p' && *CurPtr != 'P')
128 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
129 "expected exponent part 'p'");
130 ++CurPtr;
131
132 if (*CurPtr == '+' || *CurPtr == '-')
133 ++CurPtr;
134
135 // N.b. exponent digits are *not* hex
136 const char *ExpStart = CurPtr;
137 while (isDigit(C: *CurPtr))
138 ++CurPtr;
139
140 if (CurPtr == ExpStart)
141 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
142 "expected at least one exponent digit");
143
144 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
145}
146
147/// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
148static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
149 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
150 (AllowAt && C == '@') || (AllowHash && C == '#');
151}
152
153AsmToken AsmLexer::LexIdentifier() {
154 // Check for floating point literals.
155 if (CurPtr[-1] == '.' && isDigit(C: *CurPtr)) {
156 // Disambiguate a .1243foo identifier from a floating literal.
157 while (isDigit(C: *CurPtr))
158 ++CurPtr;
159
160 if (!isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier,
161 AllowHash: AllowHashInIdentifier) ||
162 *CurPtr == 'e' || *CurPtr == 'E')
163 return LexFloatLiteral();
164 }
165
166 while (isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier, AllowHash: AllowHashInIdentifier))
167 ++CurPtr;
168
169 // Handle . as a special case.
170 if (CurPtr == TokStart+1 && TokStart[0] == '.')
171 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
172
173 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
174}
175
176/// LexSlash: Slash: /
177/// C-Style Comment: /* ... */
178/// C-style Comment: // ...
179AsmToken AsmLexer::LexSlash() {
180 if (!MAI.shouldAllowAdditionalComments()) {
181 IsAtStartOfStatement = false;
182 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
183 }
184
185 switch (*CurPtr) {
186 case '*':
187 IsAtStartOfStatement = false;
188 break; // C style comment.
189 case '/':
190 ++CurPtr;
191 return LexLineComment();
192 default:
193 IsAtStartOfStatement = false;
194 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
195 }
196
197 // C Style comment.
198 ++CurPtr; // skip the star.
199 const char *CommentTextStart = CurPtr;
200 while (CurPtr != CurBuf.end()) {
201 switch (*CurPtr++) {
202 case '*':
203 // End of the comment?
204 if (*CurPtr != '/')
205 break;
206 // If we have a CommentConsumer, notify it about the comment.
207 if (CommentConsumer) {
208 CommentConsumer->HandleComment(
209 Loc: SMLoc::getFromPointer(Ptr: CommentTextStart),
210 CommentText: StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
211 }
212 ++CurPtr; // End the */.
213 return AsmToken(AsmToken::Comment,
214 StringRef(TokStart, CurPtr - TokStart));
215 }
216 }
217 return ReturnError(Loc: TokStart, Msg: "unterminated comment");
218}
219
220/// LexLineComment: Comment: #[^\n]*
221/// : //[^\n]*
222AsmToken AsmLexer::LexLineComment() {
223 // Mark This as an end of statement with a body of the
224 // comment. While it would be nicer to leave this two tokens,
225 // backwards compatability with TargetParsers makes keeping this in this form
226 // better.
227 const char *CommentTextStart = CurPtr;
228 int CurChar = getNextChar();
229 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
230 CurChar = getNextChar();
231 const char *NewlinePtr = CurPtr;
232 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
233 ++CurPtr;
234
235 // If we have a CommentConsumer, notify it about the comment.
236 if (CommentConsumer) {
237 CommentConsumer->HandleComment(
238 Loc: SMLoc::getFromPointer(Ptr: CommentTextStart),
239 CommentText: StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
240 }
241
242 IsAtStartOfLine = true;
243 // This is a whole line comment. leave newline
244 if (IsAtStartOfStatement)
245 return AsmToken(AsmToken::EndOfStatement,
246 StringRef(TokStart, CurPtr - TokStart));
247 IsAtStartOfStatement = true;
248
249 return AsmToken(AsmToken::EndOfStatement,
250 StringRef(TokStart, CurPtr - 1 - TokStart));
251}
252
/// Advance \p CurPtr past an optional, case-insensitive integer type suffix:
/// at most one 'U' followed by at most two 'L's (U, L, UL, LL, ULL).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  const auto Accept = [&CurPtr](char Upper, char Lower) {
    if (*CurPtr == Upper || *CurPtr == Lower)
      ++CurPtr;
  };
  Accept('U', 'u');
  Accept('L', 'l');
  Accept('L', 'l');
}
262
263// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
264// integer as a hexadecimal, possibly with leading zeroes.
265static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
266 bool LexHex) {
267 const char *FirstNonDec = nullptr;
268 const char *LookAhead = CurPtr;
269 while (true) {
270 if (isDigit(C: *LookAhead)) {
271 ++LookAhead;
272 } else {
273 if (!FirstNonDec)
274 FirstNonDec = LookAhead;
275
276 // Keep going if we are looking for a 'h' suffix.
277 if (LexHex && isHexDigit(C: *LookAhead))
278 ++LookAhead;
279 else
280 break;
281 }
282 }
283 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
284 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
285 if (isHex)
286 return 16;
287 return DefaultRadix;
288}
289
290static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
291 while (hexDigitValue(C: *CurPtr) < DefaultRadix) {
292 ++CurPtr;
293 }
294 return CurPtr;
295}
296
297static AsmToken intToken(StringRef Ref, APInt &Value) {
298 if (Value.isIntN(N: 64))
299 return AsmToken(AsmToken::Integer, Ref, Value);
300 return AsmToken(AsmToken::BigNum, Ref, Value);
301}
302
/// Return a human-readable name for \p Radix, for use in diagnostics. Radices
/// without a conventional name are rendered as "base-N".
static std::string radixName(unsigned Radix) {
  if (Radix == 2)
    return "binary";
  if (Radix == 8)
    return "octal";
  if (Radix == 10)
    return "decimal";
  if (Radix == 16)
    return "hexadecimal";
  return "base-" + std::to_string(Radix);
}
317
318/// LexDigit: First character is [0-9].
319/// Local Label: [0-9][:]
320/// Forward/Backward Label: [0-9][fb]
321/// Binary integer: 0b[01]+
322/// Octal integer: 0[0-7]+
323/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
324/// Decimal integer: [1-9][0-9]*
325AsmToken AsmLexer::LexDigit() {
326 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
327 // MASM-flavor octal integer: [0-7]+[oOqQ]
328 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
329 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
330 if (LexMasmIntegers && isdigit(CurPtr[-1])) {
331 const char *FirstNonBinary =
332 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
333 const char *FirstNonDecimal =
334 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
335 const char *OldCurPtr = CurPtr;
336 while (isHexDigit(C: *CurPtr)) {
337 switch (*CurPtr) {
338 default:
339 if (!FirstNonDecimal) {
340 FirstNonDecimal = CurPtr;
341 }
342 [[fallthrough]];
343 case '9':
344 case '8':
345 case '7':
346 case '6':
347 case '5':
348 case '4':
349 case '3':
350 case '2':
351 if (!FirstNonBinary) {
352 FirstNonBinary = CurPtr;
353 }
354 break;
355 case '1':
356 case '0':
357 break;
358 }
359 ++CurPtr;
360 }
361 if (*CurPtr == '.') {
362 // MASM float literals (other than hex floats) always contain a ".", and
363 // are always written in decimal.
364 ++CurPtr;
365 return LexFloatLiteral();
366 }
367
368 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
369 ++CurPtr;
370 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
371 }
372
373 unsigned Radix = 0;
374 if (*CurPtr == 'h' || *CurPtr == 'H') {
375 // hexadecimal number
376 ++CurPtr;
377 Radix = 16;
378 } else if (*CurPtr == 't' || *CurPtr == 'T') {
379 // decimal number
380 ++CurPtr;
381 Radix = 10;
382 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
383 *CurPtr == 'Q') {
384 // octal number
385 ++CurPtr;
386 Radix = 8;
387 } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
388 // binary number
389 ++CurPtr;
390 Radix = 2;
391 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
392 DefaultRadix < 14 &&
393 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
394 Radix = 10;
395 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
396 DefaultRadix < 12 &&
397 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
398 Radix = 2;
399 }
400
401 if (Radix) {
402 StringRef Result(TokStart, CurPtr - TokStart);
403 APInt Value(128, 0, true);
404
405 if (Result.drop_back().getAsInteger(Radix, Result&: Value))
406 return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
407
408 // MSVC accepts and ignores type suffices on integer literals.
409 SkipIgnoredIntegerSuffix(CurPtr);
410
411 return intToken(Ref: Result, Value);
412 }
413
414 // default-radix integers, or floating point numbers, fall through
415 CurPtr = OldCurPtr;
416 }
417
418 // MASM default-radix integers: [0-9a-fA-F]+
419 // (All other integer literals have a radix specifier.)
420 if (LexMasmIntegers && UseMasmDefaultRadix) {
421 CurPtr = findLastDigit(CurPtr, DefaultRadix: 16);
422 StringRef Result(TokStart, CurPtr - TokStart);
423
424 APInt Value(128, 0, true);
425 if (Result.getAsInteger(Radix: DefaultRadix, Result&: Value)) {
426 return ReturnError(Loc: TokStart,
427 Msg: "invalid " + radixName(Radix: DefaultRadix) + " number");
428 }
429
430 return intToken(Ref: Result, Value);
431 }
432
433 // Motorola hex integers: $[0-9a-fA-F]+
434 if (LexMotorolaIntegers && CurPtr[-1] == '$') {
435 const char *NumStart = CurPtr;
436 while (isHexDigit(C: CurPtr[0]))
437 ++CurPtr;
438
439 APInt Result(128, 0);
440 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(Radix: 16, Result))
441 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number");
442
443 return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result);
444 }
445
446 // Motorola binary integers: %[01]+
447 if (LexMotorolaIntegers && CurPtr[-1] == '%') {
448 const char *NumStart = CurPtr;
449 while (*CurPtr == '0' || *CurPtr == '1')
450 ++CurPtr;
451
452 APInt Result(128, 0);
453 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(Radix: 2, Result))
454 return ReturnError(Loc: TokStart, Msg: "invalid binary number");
455
456 return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result);
457 }
458
459 // Decimal integer: [1-9][0-9]*
460 // HLASM-flavour decimal integer: [0-9][0-9]*
461 // FIXME: Later on, support for fb for HLASM has to be added in
462 // as they probably would be needed for asm goto
463 if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
464 unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: 10, LexHex: LexMasmIntegers);
465
466 if (!LexHLASMIntegers) {
467 bool IsHex = Radix == 16;
468 // Check for floating point literals.
469 if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
470 if (*CurPtr == '.')
471 ++CurPtr;
472 return LexFloatLiteral();
473 }
474 }
475
476 StringRef Result(TokStart, CurPtr - TokStart);
477
478 APInt Value(128, 0, true);
479 if (Result.getAsInteger(Radix, Result&: Value))
480 return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
481
482 if (!LexHLASMIntegers)
483 // The darwin/x86 (and x86-64) assembler accepts and ignores type
484 // suffices on integer literals.
485 SkipIgnoredIntegerSuffix(CurPtr);
486
487 return intToken(Ref: Result, Value);
488 }
489
490 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
491 ++CurPtr;
492 // See if we actually have "0b" as part of something like "jmp 0b\n"
493 if (!isDigit(C: CurPtr[0])) {
494 --CurPtr;
495 StringRef Result(TokStart, CurPtr - TokStart);
496 return AsmToken(AsmToken::Integer, Result, 0);
497 }
498 const char *NumStart = CurPtr;
499 while (CurPtr[0] == '0' || CurPtr[0] == '1')
500 ++CurPtr;
501
502 // Requires at least one binary digit.
503 if (CurPtr == NumStart)
504 return ReturnError(Loc: TokStart, Msg: "invalid binary number");
505
506 StringRef Result(TokStart, CurPtr - TokStart);
507
508 APInt Value(128, 0, true);
509 if (Result.substr(Start: 2).getAsInteger(Radix: 2, Result&: Value))
510 return ReturnError(Loc: TokStart, Msg: "invalid binary number");
511
512 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
513 // suffixes on integer literals.
514 SkipIgnoredIntegerSuffix(CurPtr);
515
516 return intToken(Ref: Result, Value);
517 }
518
519 if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
520 ++CurPtr;
521 const char *NumStart = CurPtr;
522 while (isHexDigit(C: CurPtr[0]))
523 ++CurPtr;
524
525 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
526 // diagnosed by LexHexFloatLiteral).
527 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
528 return LexHexFloatLiteral(NoIntDigits: NumStart == CurPtr);
529
530 // Otherwise requires at least one hex digit.
531 if (CurPtr == NumStart)
532 return ReturnError(Loc: CurPtr-2, Msg: "invalid hexadecimal number");
533
534 APInt Result(128, 0);
535 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(Radix: 0, Result))
536 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number");
537
538 // Consume the optional [hH].
539 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
540 ++CurPtr;
541
542 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
543 // suffixes on integer literals.
544 SkipIgnoredIntegerSuffix(CurPtr);
545
546 return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result);
547 }
548
549 // Either octal or hexadecimal.
550 APInt Value(128, 0, true);
551 unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: 8, LexHex: LexMasmIntegers);
552 StringRef Result(TokStart, CurPtr - TokStart);
553 if (Result.getAsInteger(Radix, Result&: Value))
554 return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
555
556 // Consume the [hH].
557 if (Radix == 16)
558 ++CurPtr;
559
560 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
561 // suffixes on integer literals.
562 SkipIgnoredIntegerSuffix(CurPtr);
563
564 return intToken(Ref: Result, Value);
565}
566
567/// LexSingleQuote: Integer: 'b'
568AsmToken AsmLexer::LexSingleQuote() {
569 int CurChar = getNextChar();
570
571 if (LexHLASMStrings)
572 return ReturnError(Loc: TokStart, Msg: "invalid usage of character literals");
573
574 if (LexMasmStrings) {
575 while (CurChar != EOF) {
576 if (CurChar != '\'') {
577 CurChar = getNextChar();
578 } else if (peekNextChar() == '\'') {
579 // In MASM single-quote strings, doubled single-quotes mean an escaped
580 // single quote, so should be lexed in.
581 (void)getNextChar();
582 CurChar = getNextChar();
583 } else {
584 break;
585 }
586 }
587 if (CurChar == EOF)
588 return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
589 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
590 }
591
592 if (CurChar == '\\')
593 CurChar = getNextChar();
594
595 if (CurChar == EOF)
596 return ReturnError(Loc: TokStart, Msg: "unterminated single quote");
597
598 CurChar = getNextChar();
599
600 if (CurChar != '\'')
601 return ReturnError(Loc: TokStart, Msg: "single quote way too long");
602
603 // The idea here being that 'c' is basically just an integral
604 // constant.
605 StringRef Res = StringRef(TokStart,CurPtr - TokStart);
606 long long Value;
607
608 if (Res.starts_with(Prefix: "\'\\")) {
609 char theChar = Res[2];
610 switch (theChar) {
611 default: Value = theChar; break;
612 case '\'': Value = '\''; break;
613 case 't': Value = '\t'; break;
614 case 'n': Value = '\n'; break;
615 case 'b': Value = '\b'; break;
616 case 'f': Value = '\f'; break;
617 case 'r': Value = '\r'; break;
618 }
619 } else
620 Value = TokStart[1];
621
622 return AsmToken(AsmToken::Integer, Res, Value);
623}
624
625/// LexQuote: String: "..."
626AsmToken AsmLexer::LexQuote() {
627 int CurChar = getNextChar();
628 if (LexHLASMStrings)
629 return ReturnError(Loc: TokStart, Msg: "invalid usage of string literals");
630
631 if (LexMasmStrings) {
632 while (CurChar != EOF) {
633 if (CurChar != '"') {
634 CurChar = getNextChar();
635 } else if (peekNextChar() == '"') {
636 // In MASM double-quoted strings, doubled double-quotes mean an escaped
637 // double quote, so should be lexed in.
638 (void)getNextChar();
639 CurChar = getNextChar();
640 } else {
641 break;
642 }
643 }
644 if (CurChar == EOF)
645 return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
646 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
647 }
648
649 // TODO: does gas allow multiline string constants?
650 while (CurChar != '"') {
651 if (CurChar == '\\') {
652 // Allow \", etc.
653 CurChar = getNextChar();
654 }
655
656 if (CurChar == EOF)
657 return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
658
659 CurChar = getNextChar();
660 }
661
662 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
663}
664
665StringRef AsmLexer::LexUntilEndOfStatement() {
666 TokStart = CurPtr;
667
668 while (!isAtStartOfComment(Ptr: CurPtr) && // Start of line comment.
669 !isAtStatementSeparator(Ptr: CurPtr) && // End of statement marker.
670 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
671 ++CurPtr;
672 }
673 return StringRef(TokStart, CurPtr-TokStart);
674}
675
676StringRef AsmLexer::LexUntilEndOfLine() {
677 TokStart = CurPtr;
678
679 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
680 ++CurPtr;
681 }
682 return StringRef(TokStart, CurPtr-TokStart);
683}
684
685size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
686 bool ShouldSkipSpace) {
687 SaveAndRestore SavedTokenStart(TokStart);
688 SaveAndRestore SavedCurPtr(CurPtr);
689 SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
690 SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
691 SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
692 SaveAndRestore SavedIsPeeking(IsPeeking, true);
693 std::string SavedErr = getErr();
694 SMLoc SavedErrLoc = getErrLoc();
695
696 size_t ReadCount;
697 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
698 AsmToken Token = LexToken();
699
700 Buf[ReadCount] = Token;
701
702 if (Token.is(K: AsmToken::Eof))
703 break;
704 }
705
706 SetError(errLoc: SavedErrLoc, err: SavedErr);
707 return ReadCount;
708}
709
710bool AsmLexer::isAtStartOfComment(const char *Ptr) {
711 if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement)
712 return false;
713
714 StringRef CommentString = MAI.getCommentString();
715
716 if (CommentString.size() == 1)
717 return CommentString[0] == Ptr[0];
718
719 // Allow # preprocessor comments also be counted as comments for "##" cases
720 if (CommentString[1] == '#')
721 return CommentString[0] == Ptr[0];
722
723 return strncmp(s1: Ptr, s2: CommentString.data(), n: CommentString.size()) == 0;
724}
725
726bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
727 return strncmp(s1: Ptr, s2: MAI.getSeparatorString(),
728 n: strlen(s: MAI.getSeparatorString())) == 0;
729}
730
731AsmToken AsmLexer::LexToken() {
732 TokStart = CurPtr;
733 // This always consumes at least one character.
734 int CurChar = getNextChar();
735
736 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
737 // If this starts with a '#', this may be a cpp
738 // hash directive and otherwise a line comment.
739 AsmToken TokenBuf[2];
740 MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
741 size_t num = peekTokens(Buf, ShouldSkipSpace: true);
742 // There cannot be a space preceding this
743 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(K: AsmToken::Integer) &&
744 TokenBuf[1].is(K: AsmToken::String)) {
745 CurPtr = TokStart; // reset curPtr;
746 StringRef s = LexUntilEndOfLine();
747 UnLex(Token: TokenBuf[1]);
748 UnLex(Token: TokenBuf[0]);
749 return AsmToken(AsmToken::HashDirective, s);
750 }
751
752 if (MAI.shouldAllowAdditionalComments())
753 return LexLineComment();
754 }
755
756 if (isAtStartOfComment(Ptr: TokStart))
757 return LexLineComment();
758
759 if (isAtStatementSeparator(Ptr: TokStart)) {
760 CurPtr += strlen(s: MAI.getSeparatorString()) - 1;
761 IsAtStartOfLine = true;
762 IsAtStartOfStatement = true;
763 return AsmToken(AsmToken::EndOfStatement,
764 StringRef(TokStart, strlen(s: MAI.getSeparatorString())));
765 }
766
767 // If we're missing a newline at EOF, make sure we still get an
768 // EndOfStatement token before the Eof token.
769 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
770 IsAtStartOfLine = true;
771 IsAtStartOfStatement = true;
772 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
773 }
774 IsAtStartOfLine = false;
775 bool OldIsAtStartOfStatement = IsAtStartOfStatement;
776 IsAtStartOfStatement = false;
777 switch (CurChar) {
778 default:
779 // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
780 // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
781 // an identifier is target-dependent. These characters are handled in the
782 // respective switch cases.
783 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
784 return LexIdentifier();
785
786 // Unknown character, emit an error.
787 return ReturnError(Loc: TokStart, Msg: "invalid character in input");
788 case EOF:
789 if (EndStatementAtEOF) {
790 IsAtStartOfLine = true;
791 IsAtStartOfStatement = true;
792 }
793 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
794 case 0:
795 case ' ':
796 case '\t':
797 IsAtStartOfStatement = OldIsAtStartOfStatement;
798 while (*CurPtr == ' ' || *CurPtr == '\t')
799 CurPtr++;
800 if (SkipSpace)
801 return LexToken(); // Ignore whitespace.
802 else
803 return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
804 case '\r': {
805 IsAtStartOfLine = true;
806 IsAtStartOfStatement = true;
807 // If this is a CR followed by LF, treat that as one token.
808 if (CurPtr != CurBuf.end() && *CurPtr == '\n')
809 ++CurPtr;
810 return AsmToken(AsmToken::EndOfStatement,
811 StringRef(TokStart, CurPtr - TokStart));
812 }
813 case '\n':
814 IsAtStartOfLine = true;
815 IsAtStartOfStatement = true;
816 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
817 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
818 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
819 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
820 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
821 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
822 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
823 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
824 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
825 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
826 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
827 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
828 case '$': {
829 if (LexMotorolaIntegers && isHexDigit(C: *CurPtr))
830 return LexDigit();
831 if (MAI.doesAllowDollarAtStartOfIdentifier())
832 return LexIdentifier();
833 return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
834 }
835 case '@':
836 if (MAI.doesAllowAtAtStartOfIdentifier())
837 return LexIdentifier();
838 return AsmToken(AsmToken::At, StringRef(TokStart, 1));
839 case '#':
840 if (MAI.doesAllowHashAtStartOfIdentifier())
841 return LexIdentifier();
842 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
843 case '?':
844 if (MAI.doesAllowQuestionAtStartOfIdentifier())
845 return LexIdentifier();
846 return AsmToken(AsmToken::Question, StringRef(TokStart, 1));
847 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
848 case '=':
849 if (*CurPtr == '=') {
850 ++CurPtr;
851 return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
852 }
853 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
854 case '-':
855 if (*CurPtr == '>') {
856 ++CurPtr;
857 return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
858 }
859 return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
860 case '|':
861 if (*CurPtr == '|') {
862 ++CurPtr;
863 return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
864 }
865 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
866 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
867 case '&':
868 if (*CurPtr == '&') {
869 ++CurPtr;
870 return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
871 }
872 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
873 case '!':
874 if (*CurPtr == '=') {
875 ++CurPtr;
876 return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
877 }
878 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
879 case '%':
880 if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
881 return LexDigit();
882 }
883
884 if (MAI.hasMipsExpressions()) {
885 AsmToken::TokenKind Operator;
886 unsigned OperatorLength;
887
888 std::tie(args&: Operator, args&: OperatorLength) =
889 StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>(
890 StringRef(CurPtr))
891 .StartsWith(S: "call16", Value: {AsmToken::PercentCall16, 7})
892 .StartsWith(S: "call_hi", Value: {AsmToken::PercentCall_Hi, 8})
893 .StartsWith(S: "call_lo", Value: {AsmToken::PercentCall_Lo, 8})
894 .StartsWith(S: "dtprel_hi", Value: {AsmToken::PercentDtprel_Hi, 10})
895 .StartsWith(S: "dtprel_lo", Value: {AsmToken::PercentDtprel_Lo, 10})
896 .StartsWith(S: "got_disp", Value: {AsmToken::PercentGot_Disp, 9})
897 .StartsWith(S: "got_hi", Value: {AsmToken::PercentGot_Hi, 7})
898 .StartsWith(S: "got_lo", Value: {AsmToken::PercentGot_Lo, 7})
899 .StartsWith(S: "got_ofst", Value: {AsmToken::PercentGot_Ofst, 9})
900 .StartsWith(S: "got_page", Value: {AsmToken::PercentGot_Page, 9})
901 .StartsWith(S: "gottprel", Value: {AsmToken::PercentGottprel, 9})
902 .StartsWith(S: "got", Value: {AsmToken::PercentGot, 4})
903 .StartsWith(S: "gp_rel", Value: {AsmToken::PercentGp_Rel, 7})
904 .StartsWith(S: "higher", Value: {AsmToken::PercentHigher, 7})
905 .StartsWith(S: "highest", Value: {AsmToken::PercentHighest, 8})
906 .StartsWith(S: "hi", Value: {AsmToken::PercentHi, 3})
907 .StartsWith(S: "lo", Value: {AsmToken::PercentLo, 3})
908 .StartsWith(S: "neg", Value: {AsmToken::PercentNeg, 4})
909 .StartsWith(S: "pcrel_hi", Value: {AsmToken::PercentPcrel_Hi, 9})
910 .StartsWith(S: "pcrel_lo", Value: {AsmToken::PercentPcrel_Lo, 9})
911 .StartsWith(S: "tlsgd", Value: {AsmToken::PercentTlsgd, 6})
912 .StartsWith(S: "tlsldm", Value: {AsmToken::PercentTlsldm, 7})
913 .StartsWith(S: "tprel_hi", Value: {AsmToken::PercentTprel_Hi, 9})
914 .StartsWith(S: "tprel_lo", Value: {AsmToken::PercentTprel_Lo, 9})
915 .Default(Value: {AsmToken::Percent, 1});
916
917 if (Operator != AsmToken::Percent) {
918 CurPtr += OperatorLength - 1;
919 return AsmToken(Operator, StringRef(TokStart, OperatorLength));
920 }
921 }
922 return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
923 case '/':
924 IsAtStartOfStatement = OldIsAtStartOfStatement;
925 return LexSlash();
926 case '\'': return LexSingleQuote();
927 case '"': return LexQuote();
928 case '0': case '1': case '2': case '3': case '4':
929 case '5': case '6': case '7': case '8': case '9':
930 return LexDigit();
931 case '<':
932 switch (*CurPtr) {
933 case '<':
934 ++CurPtr;
935 return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
936 case '=':
937 ++CurPtr;
938 return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
939 case '>':
940 ++CurPtr;
941 return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
942 default:
943 return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
944 }
945 case '>':
946 switch (*CurPtr) {
947 case '>':
948 ++CurPtr;
949 return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
950 case '=':
951 ++CurPtr;
952 return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
953 default:
954 return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
955 }
956
957 // TODO: Quoted identifiers (objc methods etc)
958 // local labels: [0-9][:]
959 // Forward/backward labels: [0-9][fb]
960 // Integers, fp constants, character constants.
961 }
962}
963

// Source: llvm/lib/MC/MCParser/AsmLexer.cpp