AsmLexer.cpp source code [llvm/lib/MC/MCParser/AsmLexer.cpp]

1	//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This class implements the lexer for assembly files.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "llvm/MC/MCParser/AsmLexer.h"
14	#include "llvm/ADT/APInt.h"
15	#include "llvm/ADT/ArrayRef.h"
16	#include "llvm/ADT/StringExtras.h"
17	#include "llvm/ADT/StringRef.h"
18	#include "llvm/ADT/StringSwitch.h"
19	#include "llvm/MC/MCAsmInfo.h"
20	#include "llvm/MC/MCParser/MCAsmLexer.h"
21	#include "llvm/Support/Compiler.h"
22	#include "llvm/Support/SMLoc.h"
23	#include "llvm/Support/SaveAndRestore.h"
24	#include <cassert>
25	#include <cctype>
26	#include <cstdio>
27	#include <cstring>
28	#include <string>
29	#include <tuple>
30	#include <utility>
31
32	using namespace llvm;
33
34	AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
35	AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with(Prefix: "@");
36	LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
37	}
38
39	AsmLexer::~AsmLexer() = default;
40
41	void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
42	bool EndStatementAtEOF) {
43	CurBuf = Buf;
44
45	if (ptr)
46	CurPtr = ptr;
47	else
48	CurPtr = CurBuf.begin();
49
50	TokStart = nullptr;
51	this->EndStatementAtEOF = EndStatementAtEOF;
52	}
53
54	/// ReturnError - Set the error to the specified string at the specified
55	/// location. This is defined to always return AsmToken::Error.
56	AsmToken AsmLexer::ReturnError(const char Loc, const* std::string &Msg) {
57	SetError(errLoc: SMLoc::getFromPointer(Ptr: Loc), err: Msg);
58
59	return AsmToken (AsmToken::Error, StringRef (Loc, CurPtr - Loc));
60	}
61
62	int AsmLexer::getNextChar() {
63	if (CurPtr == CurBuf.end())
64	return EOF;
65	return (unsigned char)*CurPtr++;
66	}
67
68	int AsmLexer::peekNextChar() {
69	if (CurPtr == CurBuf.end())
70	return EOF;
71	return (unsigned char)*CurPtr;
72	}
73
74	/// The leading integral digit sequence and dot should have already been
75	/// consumed, some or all of the fractional digit sequence can* have been*
76	/// consumed.
77	AsmToken AsmLexer::LexFloatLiteral() {
78	// Skip the fractional digit sequence.
79	while (isDigit(C: *CurPtr))
80	++CurPtr;
81
82	if (CurPtr == `'-'` \|\| CurPtr == `'+'`)
83	return ReturnError(Loc: CurPtr, Msg: "invalid sign in float literal");
84
85	// Check for exponent
86	if ((CurPtr == `'e'` \|\| CurPtr == `'E'`)) {
87	++CurPtr;
88
89	if (CurPtr == `'-'` \|\| CurPtr == `'+'`)
90	++CurPtr;
91
92	while (isDigit(C: *CurPtr))
93	++CurPtr;
94	}
95
96	return AsmToken (AsmToken::Real,
97	StringRef (TokStart, CurPtr - TokStart));
98	}
99
100	/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F])?[pP][+-]?[0-9a-fA-F]+*
101	/// while making sure there are enough actual digits around for the constant to
102	/// be valid.
103	///
104	/// The leading "0x[0-9a-fA-F]" (i.e. integer part) has already been consumed*
105	/// before we get here.
106	AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
107	assert((CurPtr == `'p'` \|\| CurPtr == `'P'` \|\| *CurPtr == `'.'`) &&
108	"unexpected parse state in floating hex");
109	bool NoFracDigits = true;
110
111	// Skip the fractional part if there is one
112	if (*CurPtr == `'.'`) {
113	++CurPtr;
114
115	const char *FracStart = CurPtr;
116	while (isHexDigit(C: *CurPtr))
117	++CurPtr;
118
119	NoFracDigits = CurPtr == FracStart;
120	}
121
122	if (NoIntDigits && NoFracDigits)
123	return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
124	"expected at least one significand digit");
125
126	// Make sure we do have some kind of proper exponent part
127	if (CurPtr != `'p'` && CurPtr != `'P'`)
128	return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
129	"expected exponent part 'p'");
130	++CurPtr;
131
132	if (CurPtr == `'+'` \|\| CurPtr == `'-'`)
133	++CurPtr;
134
135	// N.b. exponent digits are not* hex*
136	const char *ExpStart = CurPtr;
137	while (isDigit(C: *CurPtr))
138	++CurPtr;
139
140	if (CurPtr == ExpStart)
141	return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
142	"expected at least one exponent digit");
143
144	return AsmToken (AsmToken::Real, StringRef (TokStart, CurPtr - TokStart));
145	}
146
147	/// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
148	static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
149	return isAlnum(C) \|\| C == `'_'` \|\| C == `'$'` \|\| C == `'.'` \|\| C == `'?'` \|\|
150	(AllowAt && C == `'@'`) \|\| (AllowHash && C == `'#'`);
151	}
152
153	AsmToken AsmLexer::LexIdentifier() {
154	// Check for floating point literals.
155	if (CurPtr[-`1`] == `'.'` && isDigit(C: *CurPtr)) {
156	// Disambiguate a .1243foo identifier from a floating literal.
157	while (isDigit(C: *CurPtr))
158	++CurPtr;
159
160	if (!isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier,
161	AllowHash: AllowHashInIdentifier) \|\|
162	CurPtr == `'e'` \|\| CurPtr == `'E'`)
163	return LexFloatLiteral();
164	}
165
166	while (isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier, AllowHash: AllowHashInIdentifier))
167	++CurPtr;
168
169	// Handle . as a special case.
170	if (CurPtr == TokStart+`1` && TokStart[`0`] == `'.'`)
171	return AsmToken (AsmToken::Dot, StringRef (TokStart, `1`));
172
173	return AsmToken (AsmToken::Identifier, StringRef (TokStart, CurPtr - TokStart));
174	}
175
176	/// LexSlash: Slash: /
177	/// C-Style Comment: / ... /
178	/// C-style Comment: // ...
179	AsmToken AsmLexer::LexSlash() {
180	if (!MAI.shouldAllowAdditionalComments()) {
181	IsAtStartOfStatement = false;
182	return AsmToken (AsmToken::Slash, StringRef (TokStart, `1`));
183	}
184
185	switch (*CurPtr) {
186	case `'*'`:
187	IsAtStartOfStatement = false;
188	break; // C style comment.
189	case `'/'`:
190	++CurPtr;
191	return LexLineComment();
192	default:
193	IsAtStartOfStatement = false;
194	return AsmToken (AsmToken::Slash, StringRef (TokStart, `1`));
195	}
196
197	// C Style comment.
198	++CurPtr; // skip the star.
199	const char *CommentTextStart = CurPtr;
200	while (CurPtr != CurBuf.end()) {
201	switch (*CurPtr++) {
202	case `'*'`:
203	// End of the comment?
204	if (*CurPtr != `'/'`)
205	break;
206	// If we have a CommentConsumer, notify it about the comment.
207	if (CommentConsumer) {
208	CommentConsumer->HandleComment(
209	Loc: SMLoc::getFromPointer(Ptr: CommentTextStart),
210	CommentText: StringRef (CommentTextStart, CurPtr - `1` - CommentTextStart));
211	}
212	++CurPtr; // End the /.*
213	return AsmToken (AsmToken::Comment,
214	StringRef (TokStart, CurPtr - TokStart));
215	}
216	}
217	return ReturnError(Loc: TokStart, Msg: "unterminated comment");
218	}
219
220	/// LexLineComment: Comment: #[^\n]*
221	/// : //[^\n]*
222	AsmToken AsmLexer::LexLineComment() {
223	// Mark This as an end of statement with a body of the
224	// comment. While it would be nicer to leave this two tokens,
225	// backwards compatability with TargetParsers makes keeping this in this form
226	// better.
227	const char *CommentTextStart = CurPtr;
228	int CurChar = getNextChar();
229	while (CurChar != `'\n'` && CurChar != `'\r'` && CurChar != EOF)
230	CurChar = getNextChar();
231	const char *NewlinePtr = CurPtr;
232	if (CurChar == `'\r'` && CurPtr != CurBuf.end() && *CurPtr == `'\n'`)
233	++CurPtr;
234
235	// If we have a CommentConsumer, notify it about the comment.
236	if (CommentConsumer) {
237	CommentConsumer->HandleComment(
238	Loc: SMLoc::getFromPointer(Ptr: CommentTextStart),
239	CommentText: StringRef (CommentTextStart, NewlinePtr - `1` - CommentTextStart));
240	}
241
242	IsAtStartOfLine = true;
243	// This is a whole line comment. leave newline
244	if (IsAtStartOfStatement)
245	return AsmToken (AsmToken::EndOfStatement,
246	StringRef (TokStart, CurPtr - TokStart));
247	IsAtStartOfStatement = true;
248
249	return AsmToken (AsmToken::EndOfStatement,
250	StringRef (TokStart, CurPtr - `1` - TokStart));
251	}
252
/// Advance \p CurPtr past an optional integer type suffix: at most one 'U'
/// followed by at most two 'L's, each matched case-insensitively.
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  // Skip case-insensitive ULL, UL, U, L and LL suffixes.
  if (CurPtr[0] == 'U' || CurPtr[0] == 'u')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
}
262
263	// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
264	// integer as a hexadecimal, possibly with leading zeroes.
265	static unsigned doHexLookAhead(const char &CurPtr, unsigned* DefaultRadix,
266	bool LexHex) {
267	const char FirstNonDec = nullptr*;
268	const char *LookAhead = CurPtr;
269	while (true) {
270	if (isDigit(C: *LookAhead)) {
271	++LookAhead;
272	} else {
273	if (!FirstNonDec)
274	FirstNonDec = LookAhead;
275
276	// Keep going if we are looking for a 'h' suffix.
277	if (LexHex && isHexDigit(C: *LookAhead))
278	++LookAhead;
279	else
280	break;
281	}
282	}
283	bool isHex = LexHex && (LookAhead == `'h'` \|\| LookAhead == `'H'`);
284	CurPtr = isHex \|\| !FirstNonDec ? LookAhead : FirstNonDec;
285	if (isHex)
286	return `16`;
287	return DefaultRadix;
288	}
289
290	static const char findLastDigit(const* char CurPtr, unsigned* DefaultRadix) {
291	while (hexDigitValue(C: *CurPtr) < DefaultRadix) {
292	++CurPtr;
293	}
294	return CurPtr;
295	}
296
297	static AsmToken intToken(StringRef Ref, APInt &Value) {
298	if (Value.isIntN(N: `64`))
299	return AsmToken (AsmToken::Integer, Ref, Value);
300	return AsmToken (AsmToken::BigNum, Ref, Value);
301	}
302
/// Return a human-readable name for \p Radix for use in diagnostics:
/// "binary", "octal", "decimal", "hexadecimal", or "base-N" for any other N.
static std::string radixName(unsigned Radix) {
  switch (Radix) {
  case 2:
    return "binary";
  case 8:
    return "octal";
  case 10:
    return "decimal";
  case 16:
    return "hexadecimal";
  default:
    return "base-" + std::to_string(Radix);
  }
}
317
318	/// LexDigit: First character is [0-9].
319	/// Local Label: [0-9][:]
320	/// Forward/Backward Label: [0-9][fb]
321	/// Binary integer: 0b[01]+
322	/// Octal integer: 0[0-7]+
323	/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F][hH]*
324	/// Decimal integer: [1-9][0-9]*
325	AsmToken AsmLexer::LexDigit() {
326	// MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
327	// MASM-flavor octal integer: [0-7]+[oOqQ]
328	// MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
329	// MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F][hH]*
330	if (LexMasmIntegers && isdigit(CurPtr[-`1`])) {
331	const char *FirstNonBinary =
332	(CurPtr[-`1`] != `'0'` && CurPtr[-`1`] != `'1'`) ? CurPtr - `1` : nullptr;
333	const char *FirstNonDecimal =
334	(CurPtr[-`1`] < `'0'` \|\| CurPtr[-`1`] > `'9'`) ? CurPtr - `1` : nullptr;
335	const char *OldCurPtr = CurPtr;
336	while (isHexDigit(C: *CurPtr)) {
337	switch (*CurPtr) {
338	default:
339	if (!FirstNonDecimal) {
340	FirstNonDecimal = CurPtr;
341	}
342	[[fallthrough]];
343	case `'9'`:
344	case `'8'`:
345	case `'7'`:
346	case `'6'`:
347	case `'5'`:
348	case `'4'`:
349	case `'3'`:
350	case `'2'`:
351	if (!FirstNonBinary) {
352	FirstNonBinary = CurPtr;
353	}
354	break;
355	case `'1'`:
356	case `'0'`:
357	break;
358	}
359	++CurPtr;
360	}
361	if (*CurPtr == `'.'`) {
362	// MASM float literals (other than hex floats) always contain a ".", and
363	// are always written in decimal.
364	++CurPtr;
365	return LexFloatLiteral();
366	}
367
368	if (LexMasmHexFloats && (CurPtr == `'r'` \|\| CurPtr == `'R'`)) {
369	++CurPtr;
370	return AsmToken (AsmToken::Real, StringRef (TokStart, CurPtr - TokStart));
371	}
372
373	unsigned Radix = `0`;
374	if (CurPtr == `'h'` \|\| CurPtr == `'H'`) {
375	// hexadecimal number
376	++CurPtr;
377	Radix = `16`;
378	} else if (CurPtr == `'t'` \|\| CurPtr == `'T'`) {
379	// decimal number
380	++CurPtr;
381	Radix = `10`;
382	} else if (CurPtr == `'o'` \|\| CurPtr == `'O'` \|\| *CurPtr == `'q'` \|\|
383	*CurPtr == `'Q'`) {
384	// octal number
385	++CurPtr;
386	Radix = `8`;
387	} else if (CurPtr == `'y'` \|\| CurPtr == `'Y'`) {
388	// binary number
389	++CurPtr;
390	Radix = `2`;
391	} else if (FirstNonDecimal && FirstNonDecimal + `1` == CurPtr &&
392	DefaultRadix < `14` &&
393	(FirstNonDecimal == `'d'` \|\| FirstNonDecimal == `'D'`)) {
394	Radix = `10`;
395	} else if (FirstNonBinary && FirstNonBinary + `1` == CurPtr &&
396	DefaultRadix < `12` &&
397	(FirstNonBinary == `'b'` \|\| FirstNonBinary == `'B'`)) {
398	Radix = `2`;
399	}
400
401	if (Radix) {
402	StringRef Result(TokStart, CurPtr - TokStart);
403	APInt Value(`128`, `0`, true);
404
405	if (Result.drop_back().getAsInteger(Radix, Result&: Value))
406	return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
407
408	// MSVC accepts and ignores type suffices on integer literals.
409	SkipIgnoredIntegerSuffix(CurPtr);
410
411	return intToken(Ref: Result, Value);
412	}
413
414	// default-radix integers, or floating point numbers, fall through
415	CurPtr = OldCurPtr;
416	}
417
418	// MASM default-radix integers: [0-9a-fA-F]+
419	// (All other integer literals have a radix specifier.)
420	if (LexMasmIntegers && UseMasmDefaultRadix) {
421	CurPtr = findLastDigit(CurPtr, DefaultRadix: `16`);
422	StringRef Result(TokStart, CurPtr - TokStart);
423
424	APInt Value(`128`, `0`, true);
425	if (Result.getAsInteger(Radix: DefaultRadix, Result&: Value)) {
426	return ReturnError(Loc: TokStart,
427	Msg: "invalid " + radixName(Radix: DefaultRadix) + " number");
428	}
429
430	return intToken(Ref: Result, Value);
431	}
432
433	// Motorola hex integers: $[0-9a-fA-F]+
434	if (LexMotorolaIntegers && CurPtr[-`1`] == `'$'`) {
435	const char *NumStart = CurPtr;
436	while (isHexDigit(C: CurPtr[`0`]))
437	++CurPtr;
438
439	APInt Result(`128`, `0`);
440	if (StringRef (NumStart, CurPtr - NumStart).getAsInteger(Radix: `16`, Result))
441	return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number");
442
443	return intToken(Ref: StringRef (TokStart, CurPtr - TokStart), Value&: Result);
444	}
445
446	// Motorola binary integers: %[01]+
447	if (LexMotorolaIntegers && CurPtr[-`1`] == `'%'`) {
448	const char *NumStart = CurPtr;
449	while (CurPtr == `'0'` \|\| CurPtr == `'1'`)
450	++CurPtr;
451
452	APInt Result(`128`, `0`);
453	if (StringRef (NumStart, CurPtr - NumStart).getAsInteger(Radix: `2`, Result))
454	return ReturnError(Loc: TokStart, Msg: "invalid binary number");
455
456	return intToken(Ref: StringRef (TokStart, CurPtr - TokStart), Value&: Result);
457	}
458
459	// Decimal integer: [1-9][0-9]*
460	// HLASM-flavour decimal integer: [0-9][0-9]*
461	// FIXME: Later on, support for fb for HLASM has to be added in
462	// as they probably would be needed for asm goto
463	if (LexHLASMIntegers \|\| CurPtr[-`1`] != `'0'` \|\| CurPtr[`0`] == `'.'`) {
464	unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: `10`, LexHex: LexMasmIntegers);
465
466	if (!LexHLASMIntegers) {
467	bool IsHex = Radix == `16`;
468	// Check for floating point literals.
469	if (!IsHex && (CurPtr == `'.'` \|\| CurPtr == `'e'` \|\| *CurPtr == `'E'`)) {
470	if (*CurPtr == `'.'`)
471	++CurPtr;
472	return LexFloatLiteral();
473	}
474	}
475
476	StringRef Result(TokStart, CurPtr - TokStart);
477
478	APInt Value(`128`, `0`, true);
479	if (Result.getAsInteger(Radix, Result&: Value))
480	return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
481
482	if (!LexHLASMIntegers)
483	// The darwin/x86 (and x86-64) assembler accepts and ignores type
484	// suffices on integer literals.
485	SkipIgnoredIntegerSuffix(CurPtr);
486
487	return intToken(Ref: Result, Value);
488	}
489
490	if (!LexMasmIntegers && ((CurPtr == `'b'`) \|\| (CurPtr == `'B'`))) {
491	++CurPtr;
492	// See if we actually have "0b" as part of something like "jmp 0b\n"
493	if (!isDigit(C: CurPtr[`0`])) {
494	--CurPtr;
495	StringRef Result(TokStart, CurPtr - TokStart);
496	return AsmToken (AsmToken::Integer, Result, `0`);
497	}
498	const char *NumStart = CurPtr;
499	while (CurPtr[`0`] == `'0'` \|\| CurPtr[`0`] == `'1'`)
500	++CurPtr;
501
502	// Requires at least one binary digit.
503	if (CurPtr == NumStart)
504	return ReturnError(Loc: TokStart, Msg: "invalid binary number");
505
506	StringRef Result(TokStart, CurPtr - TokStart);
507
508	APInt Value(`128`, `0`, true);
509	if (Result.substr(Start: `2`).getAsInteger(Radix: `2`, Result&: Value))
510	return ReturnError(Loc: TokStart, Msg: "invalid binary number");
511
512	// The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
513	// suffixes on integer literals.
514	SkipIgnoredIntegerSuffix(CurPtr);
515
516	return intToken(Ref: Result, Value);
517	}
518
519	if ((CurPtr == `'x'`) \|\| (CurPtr == `'X'`)) {
520	++CurPtr;
521	const char *NumStart = CurPtr;
522	while (isHexDigit(C: CurPtr[`0`]))
523	++CurPtr;
524
525	// "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
526	// diagnosed by LexHexFloatLiteral).
527	if (CurPtr[`0`] == `'.'` \|\| CurPtr[`0`] == `'p'` \|\| CurPtr[`0`] == `'P'`)
528	return LexHexFloatLiteral(NoIntDigits: NumStart == CurPtr);
529
530	// Otherwise requires at least one hex digit.
531	if (CurPtr == NumStart)
532	return ReturnError(Loc: CurPtr-`2`, Msg: "invalid hexadecimal number");
533
534	APInt Result(`128`, `0`);
535	if (StringRef (TokStart, CurPtr - TokStart).getAsInteger(Radix: `0`, Result))
536	return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number");
537
538	// Consume the optional [hH].
539	if (LexMasmIntegers && (CurPtr == `'h'` \|\| CurPtr == `'H'`))
540	++CurPtr;
541
542	// The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
543	// suffixes on integer literals.
544	SkipIgnoredIntegerSuffix(CurPtr);
545
546	return intToken(Ref: StringRef (TokStart, CurPtr - TokStart), Value&: Result);
547	}
548
549	// Either octal or hexadecimal.
550	APInt Value(`128`, `0`, true);
551	unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: `8`, LexHex: LexMasmIntegers);
552	StringRef Result(TokStart, CurPtr - TokStart);
553	if (Result.getAsInteger(Radix, Result&: Value))
554	return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
555
556	// Consume the [hH].
557	if (Radix == `16`)
558	++CurPtr;
559
560	// The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
561	// suffixes on integer literals.
562	SkipIgnoredIntegerSuffix(CurPtr);
563
564	return intToken(Ref: Result, Value);
565	}
566
567	/// LexSingleQuote: Integer: 'b'
568	AsmToken AsmLexer::LexSingleQuote() {
569	int CurChar = getNextChar();
570
571	if (LexHLASMStrings)
572	return ReturnError(Loc: TokStart, Msg: "invalid usage of character literals");
573
574	if (LexMasmStrings) {
575	while (CurChar != EOF) {
576	if (CurChar != `'\''`) {
577	CurChar = getNextChar();
578	} else if (peekNextChar() == `'\''`) {
579	// In MASM single-quote strings, doubled single-quotes mean an escaped
580	// single quote, so should be lexed in.
581	(void)getNextChar();
582	CurChar = getNextChar();
583	} else {
584	break;
585	}
586	}
587	if (CurChar == EOF)
588	return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
589	return AsmToken (AsmToken::String, StringRef (TokStart, CurPtr - TokStart));
590	}
591
592	if (CurChar == `'\\'`)
593	CurChar = getNextChar();
594
595	if (CurChar == EOF)
596	return ReturnError(Loc: TokStart, Msg: "unterminated single quote");
597
598	CurChar = getNextChar();
599
600	if (CurChar != `'\''`)
601	return ReturnError(Loc: TokStart, Msg: "single quote way too long");
602
603	// The idea here being that 'c' is basically just an integral
604	// constant.
605	StringRef Res = StringRef (TokStart,CurPtr - TokStart);
606	long long Value;
607
608	if (Res.starts_with(Prefix: "\'\\")) {
609	char theChar = Res [`2`];
610	switch (theChar) {
611	default: Value = theChar; break;
612	case `'\''`: Value = `'\''`; break;
613	case `'t'`: Value = `'\t'`; break;
614	case `'n'`: Value = `'\n'`; break;
615	case `'b'`: Value = `'\b'`; break;
616	case `'f'`: Value = `'\f'`; break;
617	case `'r'`: Value = `'\r'`; break;
618	}
619	} else
620	Value = TokStart[`1`];
621
622	return AsmToken (AsmToken::Integer, Res, Value);
623	}
624
625	/// LexQuote: String: "..."
626	AsmToken AsmLexer::LexQuote() {
627	int CurChar = getNextChar();
628	if (LexHLASMStrings)
629	return ReturnError(Loc: TokStart, Msg: "invalid usage of string literals");
630
631	if (LexMasmStrings) {
632	while (CurChar != EOF) {
633	if (CurChar != `'"'`) {
634	CurChar = getNextChar();
635	} else if (peekNextChar() == `'"'`) {
636	// In MASM double-quoted strings, doubled double-quotes mean an escaped
637	// double quote, so should be lexed in.
638	(void)getNextChar();
639	CurChar = getNextChar();
640	} else {
641	break;
642	}
643	}
644	if (CurChar == EOF)
645	return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
646	return AsmToken (AsmToken::String, StringRef (TokStart, CurPtr - TokStart));
647	}
648
649	// TODO: does gas allow multiline string constants?
650	while (CurChar != `'"'`) {
651	if (CurChar == `'\\'`) {
652	// Allow \", etc.
653	CurChar = getNextChar();
654	}
655
656	if (CurChar == EOF)
657	return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
658
659	CurChar = getNextChar();
660	}
661
662	return AsmToken (AsmToken::String, StringRef (TokStart, CurPtr - TokStart));
663	}
664
665	StringRef AsmLexer::LexUntilEndOfStatement() {
666	TokStart = CurPtr;
667
668	while (!isAtStartOfComment(Ptr: CurPtr) && // Start of line comment.
669	!isAtStatementSeparator(Ptr: CurPtr) && // End of statement marker.
670	CurPtr != `'\n'` && CurPtr != `'\r'` && CurPtr != CurBuf.end()) {
671	++CurPtr;
672	}
673	return StringRef (TokStart, CurPtr-TokStart);
674	}
675
676	StringRef AsmLexer::LexUntilEndOfLine() {
677	TokStart = CurPtr;
678
679	while (CurPtr != `'\n'` && CurPtr != `'\r'` && CurPtr != CurBuf.end()) {
680	++CurPtr;
681	}
682	return StringRef (TokStart, CurPtr-TokStart);
683	}
684
685	size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
686	bool ShouldSkipSpace) {
687	SaveAndRestore SavedTokenStart(TokStart);
688	SaveAndRestore SavedCurPtr(CurPtr);
689	SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
690	SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
691	SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
692	SaveAndRestore SavedIsPeeking(IsPeeking, true);
693	std::string SavedErr = getErr();
694	SMLoc SavedErrLoc = getErrLoc();
695
696	size_t ReadCount;
697	for (ReadCount = `0`; ReadCount < Buf.size(); ++ReadCount) {
698	AsmToken Token = LexToken();
699
700	Buf [ReadCount] = Token;
701
702	if (Token.is(K: AsmToken::Eof))
703	break;
704	}
705
706	SetError(errLoc: SavedErrLoc, err: SavedErr);
707	return ReadCount;
708	}
709
710	bool AsmLexer::isAtStartOfComment(const char *Ptr) {
711	if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement)
712	return false;
713
714	StringRef CommentString = MAI.getCommentString();
715
716	if (CommentString.size() == `1`)
717	return CommentString [`0`] == Ptr[`0`];
718
719	// Allow # preprocessor comments also be counted as comments for "##" cases
720	if (CommentString [`1`] == `'#'`)
721	return CommentString [`0`] == Ptr[`0`];
722
723	return strncmp(s1: Ptr, s2: CommentString.data(), n: CommentString.size()) == `0`;
724	}
725
726	bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
727	return strncmp(s1: Ptr, s2: MAI.getSeparatorString(),
728	n: strlen(s: MAI.getSeparatorString())) == `0`;
729	}
730
731	AsmToken AsmLexer::LexToken() {
732	TokStart = CurPtr;
733	// This always consumes at least one character.
734	int CurChar = getNextChar();
735
736	if (!IsPeeking && CurChar == `'#'` && IsAtStartOfStatement) {
737	// If this starts with a '#', this may be a cpp
738	// hash directive and otherwise a line comment.
739	AsmToken TokenBuf[`2`];
740	MutableArrayRef<AsmToken> Buf(TokenBuf, `2`);
741	size_t num = peekTokens(Buf, ShouldSkipSpace: true);
742	// There cannot be a space preceding this
743	if (IsAtStartOfLine && num == `2` && TokenBuf[`0`].is(K: AsmToken::Integer) &&
744	TokenBuf[`1`].is(K: AsmToken::String)) {
745	CurPtr = TokStart; // reset curPtr;
746	StringRef s = LexUntilEndOfLine();
747	UnLex(Token: TokenBuf[`1`]);
748	UnLex(Token: TokenBuf[`0`]);
749	return AsmToken (AsmToken::HashDirective, s);
750	}
751
752	if (MAI.shouldAllowAdditionalComments())
753	return LexLineComment();
754	}
755
756	if (isAtStartOfComment(Ptr: TokStart))
757	return LexLineComment();
758
759	if (isAtStatementSeparator(Ptr: TokStart)) {
760	CurPtr += strlen(s: MAI.getSeparatorString()) - `1`;
761	IsAtStartOfLine = true;
762	IsAtStartOfStatement = true;
763	return AsmToken (AsmToken::EndOfStatement,
764	StringRef (TokStart, strlen(s: MAI.getSeparatorString())));
765	}
766
767	// If we're missing a newline at EOF, make sure we still get an
768	// EndOfStatement token before the Eof token.
769	if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
770	IsAtStartOfLine = true;
771	IsAtStartOfStatement = true;
772	return AsmToken (AsmToken::EndOfStatement, StringRef (TokStart, `0`));
773	}
774	IsAtStartOfLine = false;
775	bool OldIsAtStartOfStatement = IsAtStartOfStatement;
776	IsAtStartOfStatement = false;
777	switch (CurChar) {
778	default:
779	// Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
780	// Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
781	// an identifier is target-dependent. These characters are handled in the
782	// respective switch cases.
783	if (isalpha(CurChar) \|\| CurChar == `'_'` \|\| CurChar == `'.'`)
784	return LexIdentifier();
785
786	// Unknown character, emit an error.
787	return ReturnError(Loc: TokStart, Msg: "invalid character in input");
788	case EOF:
789	if (EndStatementAtEOF) {
790	IsAtStartOfLine = true;
791	IsAtStartOfStatement = true;
792	}
793	return AsmToken (AsmToken::Eof, StringRef (TokStart, `0`));
794	case `0`:
795	case `' '`:
796	case `'\t'`:
797	IsAtStartOfStatement = OldIsAtStartOfStatement;
798	while (CurPtr == `' '` \|\| CurPtr == `'\t'`)
799	CurPtr++;
800	if (SkipSpace)
801	return LexToken(); // Ignore whitespace.
802	else
803	return AsmToken (AsmToken::Space, StringRef (TokStart, CurPtr - TokStart));
804	case `'\r'`: {
805	IsAtStartOfLine = true;
806	IsAtStartOfStatement = true;
807	// If this is a CR followed by LF, treat that as one token.
808	if (CurPtr != CurBuf.end() && *CurPtr == `'\n'`)
809	++CurPtr;
810	return AsmToken (AsmToken::EndOfStatement,
811	StringRef (TokStart, CurPtr - TokStart));
812	}
813	case `'\n'`:
814	IsAtStartOfLine = true;
815	IsAtStartOfStatement = true;
816	return AsmToken (AsmToken::EndOfStatement, StringRef (TokStart, `1`));
817	case `':'`: return AsmToken (AsmToken::Colon, StringRef (TokStart, `1`));
818	case `'+'`: return AsmToken (AsmToken::Plus, StringRef (TokStart, `1`));
819	case `'~'`: return AsmToken (AsmToken::Tilde, StringRef (TokStart, `1`));
820	case `'('`: return AsmToken (AsmToken::LParen, StringRef (TokStart, `1`));
821	case `')'`: return AsmToken (AsmToken::RParen, StringRef (TokStart, `1`));
822	case `'['`: return AsmToken (AsmToken::LBrac, StringRef (TokStart, `1`));
823	case `']'`: return AsmToken (AsmToken::RBrac, StringRef (TokStart, `1`));
824	case `'{'`: return AsmToken (AsmToken::LCurly, StringRef (TokStart, `1`));
825	case `'}'`: return AsmToken (AsmToken::RCurly, StringRef (TokStart, `1`));
826	case `''`: return* AsmToken (AsmToken::Star, StringRef (TokStart, `1`));
827	case `','`: return AsmToken (AsmToken::Comma, StringRef (TokStart, `1`));
828	case `'$'`: {
829	if (LexMotorolaIntegers && isHexDigit(C: *CurPtr))
830	return LexDigit();
831	if (MAI.doesAllowDollarAtStartOfIdentifier())
832	return LexIdentifier();
833	return AsmToken (AsmToken::Dollar, StringRef (TokStart, `1`));
834	}
835	case `'@'`:
836	if (MAI.doesAllowAtAtStartOfIdentifier())
837	return LexIdentifier();
838	return AsmToken (AsmToken::At, StringRef (TokStart, `1`));
839	case `'#'`:
840	if (MAI.doesAllowHashAtStartOfIdentifier())
841	return LexIdentifier();
842	return AsmToken (AsmToken::Hash, StringRef (TokStart, `1`));
843	case `'?'`:
844	if (MAI.doesAllowQuestionAtStartOfIdentifier())
845	return LexIdentifier();
846	return AsmToken (AsmToken::Question, StringRef (TokStart, `1`));
847	case `'\\'`: return AsmToken (AsmToken::BackSlash, StringRef (TokStart, `1`));
848	case `'='`:
849	if (*CurPtr == `'='`) {
850	++CurPtr;
851	return AsmToken (AsmToken::EqualEqual, StringRef (TokStart, `2`));
852	}
853	return AsmToken (AsmToken::Equal, StringRef (TokStart, `1`));
854	case `'-'`:
855	if (*CurPtr == `'>'`) {
856	++CurPtr;
857	return AsmToken (AsmToken::MinusGreater, StringRef (TokStart, `2`));
858	}
859	return AsmToken (AsmToken::Minus, StringRef (TokStart, `1`));
860	case `'\|'`:
861	if (*CurPtr == `'\|'`) {
862	++CurPtr;
863	return AsmToken (AsmToken::PipePipe, StringRef (TokStart, `2`));
864	}
865	return AsmToken (AsmToken::Pipe, StringRef (TokStart, `1`));
866	case `'^'`: return AsmToken (AsmToken::Caret, StringRef (TokStart, `1`));
867	case `'&'`:
868	if (*CurPtr == `'&'`) {
869	++CurPtr;
870	return AsmToken (AsmToken::AmpAmp, StringRef (TokStart, `2`));
871	}
872	return AsmToken (AsmToken::Amp, StringRef (TokStart, `1`));
873	case `'!'`:
874	if (*CurPtr == `'='`) {
875	++CurPtr;
876	return AsmToken (AsmToken::ExclaimEqual, StringRef (TokStart, `2`));
877	}
878	return AsmToken (AsmToken::Exclaim, StringRef (TokStart, `1`));
879	case `'%'`:
880	if (LexMotorolaIntegers && (CurPtr == `'0'` \|\| CurPtr == `'1'`)) {
881	return LexDigit();
882	}
883
884	if (MAI.hasMipsExpressions()) {
885	AsmToken::TokenKind Operator;
886	unsigned OperatorLength;
887
888	std::tie(args&: Operator, args&: OperatorLength) =
889	StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>(
890	StringRef (CurPtr))
891	.StartsWith(S: "call16", Value: {AsmToken::PercentCall16, `7`})
892	.StartsWith(S: "call_hi", Value: {AsmToken::PercentCall_Hi, `8`})
893	.StartsWith(S: "call_lo", Value: {AsmToken::PercentCall_Lo, `8`})
894	.StartsWith(S: "dtprel_hi", Value: {AsmToken::PercentDtprel_Hi, `10`})
895	.StartsWith(S: "dtprel_lo", Value: {AsmToken::PercentDtprel_Lo, `10`})
896	.StartsWith(S: "got_disp", Value: {AsmToken::PercentGot_Disp, `9`})
897	.StartsWith(S: "got_hi", Value: {AsmToken::PercentGot_Hi, `7`})
898	.StartsWith(S: "got_lo", Value: {AsmToken::PercentGot_Lo, `7`})
899	.StartsWith(S: "got_ofst", Value: {AsmToken::PercentGot_Ofst, `9`})
900	.StartsWith(S: "got_page", Value: {AsmToken::PercentGot_Page, `9`})
901	.StartsWith(S: "gottprel", Value: {AsmToken::PercentGottprel, `9`})
902	.StartsWith(S: "got", Value: {AsmToken::PercentGot, `4`})
903	.StartsWith(S: "gp_rel", Value: {AsmToken::PercentGp_Rel, `7`})
904	.StartsWith(S: "higher", Value: {AsmToken::PercentHigher, `7`})
905	.StartsWith(S: "highest", Value: {AsmToken::PercentHighest, `8`})
906	.StartsWith(S: "hi", Value: {AsmToken::PercentHi, `3`})
907	.StartsWith(S: "lo", Value: {AsmToken::PercentLo, `3`})
908	.StartsWith(S: "neg", Value: {AsmToken::PercentNeg, `4`})
909	.StartsWith(S: "pcrel_hi", Value: {AsmToken::PercentPcrel_Hi, `9`})
910	.StartsWith(S: "pcrel_lo", Value: {AsmToken::PercentPcrel_Lo, `9`})
911	.StartsWith(S: "tlsgd", Value: {AsmToken::PercentTlsgd, `6`})
912	.StartsWith(S: "tlsldm", Value: {AsmToken::PercentTlsldm, `7`})
913	.StartsWith(S: "tprel_hi", Value: {AsmToken::PercentTprel_Hi, `9`})
914	.StartsWith(S: "tprel_lo", Value: {AsmToken::PercentTprel_Lo, `9`})
915	.Default(Value: {AsmToken::Percent, `1`});
916
917	if (Operator != AsmToken::Percent) {
918	CurPtr += OperatorLength - `1`;
919	return AsmToken (Operator, StringRef (TokStart, OperatorLength));
920	}
921	}
922	return AsmToken (AsmToken::Percent, StringRef (TokStart, `1`));
923	case `'/'`:
924	IsAtStartOfStatement = OldIsAtStartOfStatement;
925	return LexSlash();
926	case `'\''`: return LexSingleQuote();
927	case `'"'`: return LexQuote();
928	case `'0'`: case `'1'`: case `'2'`: case `'3'`: case `'4'`:
929	case `'5'`: case `'6'`: case `'7'`: case `'8'`: case `'9'`:
930	return LexDigit();
931	case `'<'`:
932	switch (*CurPtr) {
933	case `'<'`:
934	++CurPtr;
935	return AsmToken (AsmToken::LessLess, StringRef (TokStart, `2`));
936	case `'='`:
937	++CurPtr;
938	return AsmToken (AsmToken::LessEqual, StringRef (TokStart, `2`));
939	case `'>'`:
940	++CurPtr;
941	return AsmToken (AsmToken::LessGreater, StringRef (TokStart, `2`));
942	default:
943	return AsmToken (AsmToken::Less, StringRef (TokStart, `1`));
944	}
945	case `'>'`:
946	switch (*CurPtr) {
947	case `'>'`:
948	++CurPtr;
949	return AsmToken (AsmToken::GreaterGreater, StringRef (TokStart, `2`));
950	case `'='`:
951	++CurPtr;
952	return AsmToken (AsmToken::GreaterEqual, StringRef (TokStart, `2`));
953	default:
954	return AsmToken (AsmToken::Greater, StringRef (TokStart, `1`));
955	}
956
957	// TODO: Quoted identifiers (objc methods etc)
958	// local labels: [0-9][:]
959	// Forward/backward labels: [0-9][fb]
960	// Integers, fp constants, character constants.
961	}
962	}
963

source code of llvm/lib/MC/MCParser/AsmLexer.cpp