//===- TGLexer.cpp - Lexer for TableGen ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implement the Lexer for TableGen.
//
//===----------------------------------------------------------------------===//

#include "TGLexer.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Config/config.h" // for strtoull()/strtoll() define
#include "llvm/Support/Compiler.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/TableGen/Error.h"
#include <algorithm>
#include <cctype>
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

using namespace llvm;

namespace {
// A list of supported preprocessing directives with their
// internal token kinds and names.
struct {
  tgtok::TokKind Kind;
  const char *Word;
} PreprocessorDirs[] = {
  { tgtok::Ifdef, "ifdef" },
  { tgtok::Ifndef, "ifndef" },
  { tgtok::Else, "else" },
  { tgtok::Endif, "endif" },
  { tgtok::Define, "define" }
};
} // end anonymous namespace

TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) {
  CurBuffer = SrcMgr.getMainFileID();
  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  CurPtr = CurBuf.begin();
  TokStart = nullptr;

  // Pretend that we enter the "top-level" include file.
  PrepIncludeStack.push_back(
      std::make_unique<std::vector<PreprocessorControlDesc>>());

  // Put all macros defined in the command line into the DefinedMacros set.
  for (const std::string &MacroName : Macros)
    DefinedMacros.insert(MacroName);
}

SMLoc TGLexer::getLoc() const {
  return SMLoc::getFromPointer(TokStart);
}

SMRange TGLexer::getLocRange() const {
  return {getLoc(), SMLoc::getFromPointer(CurPtr)};
}

/// ReturnError - Set the error to the specified string at the specified
/// location. This is defined to always return tgtok::Error.
tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) {
  PrintError(Loc, Msg);
  return tgtok::Error;
}

tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
  return ReturnError(SMLoc::getFromPointer(Loc), Msg);
}

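/// processEOF - We have just hit the end of the current buffer. If we are
/// inside an include file, pop back to the parent buffer and continue lexing
/// there; return true so LexToken() keeps going. Return false once the
/// outermost buffer is exhausted.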
bool TGLexer::processEOF() {
  SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
  if (ParentIncludeLoc != SMLoc()) {
    // If prepExitInclude() detects a problem with the preprocessing
    // control stack, it will return false. Pretend that we reached
    // the final EOF and stop lexing more tokens by returning false
    // to LexToken().
    if (!prepExitInclude(false))
      return false;

    CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
    CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
    CurPtr = ParentIncludeLoc.getPointer();
    // Make sure TokStart points into the parent file's buffer.
    // LexToken() assigns to it before calling getNextChar(),
    // so it is pointing into the included file now.
    TokStart = CurPtr;
    return true;
  }

  // Pretend that we exit the "top-level" include file.
  // Note that in case of an error (e.g. control stack imbalance)
  // the routine will issue a fatal error.
  prepExitInclude(true);
  return false;
}

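/// getNextChar - Consume and return the next character from the buffer,
/// normalizing '\r', '\r\n' and '\n\r' sequences to a single '\n', turning a
/// stray NUL into a space (with an error), and returning EOF at the end of
/// the buffer.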
int TGLexer::getNextChar() {
  char CurChar = *CurPtr++;
  switch (CurChar) {
  default:
    return (unsigned char)CurChar;

  case 0: {
    // A NUL character in the stream is either the end of the current buffer or
    // a spurious NUL in the file. Disambiguate that here.
    if (CurPtr - 1 == CurBuf.end()) {
      --CurPtr; // Arrange for another call to return EOF again.
      return EOF;
    }
    PrintError(getLoc(),
               "NUL character is invalid in source; treated as space");
    return ' ';
  }

  case '\n':
  case '\r':
    // Handle the newline character by ignoring it and incrementing the line
    // count. However, be careful about 'dos style' files with \n\r in them.
    // Only treat a \n\r or \r\n as a single line.
    if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
        *CurPtr != CurChar)
      ++CurPtr; // Eat the two char newline sequence.
    return '\n';
  }
}

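/// peekNextChar - Look ahead Index characters from CurPtr without consuming
/// any input.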
int TGLexer::peekNextChar(int Index) const {
  return *(CurPtr + Index);
}

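/// LexToken - Read and return the next token. FileOrLineStart is true when
/// lexing starts at the beginning of a buffer or line, which is the only
/// position where a '#' may introduce a preprocessing directive.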
tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
  TokStart = CurPtr;
  // This always consumes at least one character.
  int CurChar = getNextChar();

  switch (CurChar) {
  default:
    // Handle letters: [a-zA-Z_]
    if (isalpha(CurChar) || CurChar == '_')
      return LexIdentifier();

    // Unknown character, emit an error.
    return ReturnError(TokStart, "Unexpected character");
  case EOF:
    // Lex next token, if we just left an include file.
    // Note that leaving an include file means that the next
    // symbol is located at the end of the 'include "..."'
    // construct, so LexToken() is called with default
    // false parameter.
    if (processEOF())
      return LexToken();

    // Return EOF denoting the end of lexing.
    return tgtok::Eof;

  case ':': return tgtok::colon;
  case ';': return tgtok::semi;
  case ',': return tgtok::comma;
  case '<': return tgtok::less;
  case '>': return tgtok::greater;
  case ']': return tgtok::r_square;
  case '{': return tgtok::l_brace;
  case '}': return tgtok::r_brace;
  case '(': return tgtok::l_paren;
  case ')': return tgtok::r_paren;
  case '=': return tgtok::equal;
  case '?': return tgtok::question;
  case '#':
    if (FileOrLineStart) {
      tgtok::TokKind Kind = prepIsDirective();
      if (Kind != tgtok::Error)
        return lexPreprocessor(Kind);
    }

    return tgtok::paste;

  // The period is a separate case so we can recognize the "..."
  // range punctuator.
  case '.':
    if (peekNextChar(0) == '.') {
      ++CurPtr; // Eat second dot.
      if (peekNextChar(0) == '.') {
        ++CurPtr; // Eat third dot.
        return tgtok::dotdotdot;
      }
      return ReturnError(TokStart, "Invalid '..' punctuation");
    }
    return tgtok::dot;

  case '\r':
    PrintFatalError("getNextChar() must never return '\r'");
    return tgtok::Error;

  case ' ':
  case '\t':
    // Ignore whitespace.
    return LexToken(FileOrLineStart);
  case '\n':
    // Ignore whitespace, and identify the new line.
    return LexToken(true);
  case '/':
    // If this is the start of a // comment, skip until the end of the line or
    // the end of the buffer.
    if (*CurPtr == '/')
      SkipBCPLComment();
    else if (*CurPtr == '*') {
      if (SkipCComment())
        return tgtok::Error;
    } else // Otherwise, this is an error.
      return ReturnError(TokStart, "Unexpected character");
    return LexToken(FileOrLineStart);
  case '-': case '+':
  case '0': case '1': case '2': case '3': case '4': case '5': case '6':
  case '7': case '8': case '9': {
    int NextChar = 0;
    if (isdigit(CurChar)) {
      // Allow identifiers to start with a number if it is followed by
      // an identifier. This can happen with paste operations like
      // foo#8i.
      int i = 0;
      do {
        NextChar = peekNextChar(i++);
      } while (isdigit(NextChar));

      if (NextChar == 'x' || NextChar == 'b') {
        // If this is [0-9]b[01] or [0-9]x[0-9A-Fa-f] this is most
        // likely a number.
        int NextNextChar = peekNextChar(i);
        switch (NextNextChar) {
        default:
          break;
        case '0': case '1':
          if (NextChar == 'b')
            return LexNumber();
          [[fallthrough]];
        case '2': case '3': case '4': case '5':
        case '6': case '7': case '8': case '9':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
          if (NextChar == 'x')
            return LexNumber();
          break;
        }
      }
    }

    if (isalpha(NextChar) || NextChar == '_')
      return LexIdentifier();

    return LexNumber();
  }
  case '"': return LexString();
  case '$': return LexVarName();
  case '[': return LexBracket();
  case '!': return LexExclaim();
  }
}

/// LexString - Lex "[^"]*"
tgtok::TokKind TGLexer::LexString() {
  const char *StrStart = CurPtr;

  CurStrVal = "";

  while (*CurPtr != '"') {
    // If we hit the end of the buffer, report an error.
    if (*CurPtr == 0 && CurPtr == CurBuf.end())
      return ReturnError(StrStart, "End of file in string literal");

    if (*CurPtr == '\n' || *CurPtr == '\r')
      return ReturnError(StrStart, "End of line in string literal");

    if (*CurPtr != '\\') {
      CurStrVal += *CurPtr++;
      continue;
    }

    ++CurPtr;

    switch (*CurPtr) {
    case '\\': case '\'': case '"':
      // These turn into their literal character.
      CurStrVal += *CurPtr++;
      break;
    case 't':
      CurStrVal += '\t';
      ++CurPtr;
      break;
    case 'n':
      CurStrVal += '\n';
      ++CurPtr;
      break;

    case '\n':
    case '\r':
      return ReturnError(CurPtr, "escaped newlines not supported in tblgen");

    // If we hit the end of the buffer, report an error.
    case '\0':
      if (CurPtr == CurBuf.end())
        return ReturnError(StrStart, "End of file in string literal");
      [[fallthrough]];
    default:
      return ReturnError(CurPtr, "invalid escape in string literal");
    }
  }

  ++CurPtr;
  return tgtok::StrVal;
}

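/// LexVarName - Lex a variable reference of the form '$'[a-zA-Z_][0-9a-zA-Z_]*.
/// The leading '$' has already been consumed by LexToken().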
tgtok::TokKind TGLexer::LexVarName() {
  if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
    return ReturnError(TokStart, "Invalid variable name");

  // Otherwise, we're ok, consume the rest of the characters.
  const char *VarNameStart = CurPtr++;

  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  CurStrVal.assign(VarNameStart, CurPtr);
  return tgtok::VarName;
}

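/// LexIdentifier - Lex an identifier or keyword. The first character, which
/// matched [a-zA-Z_], has already been consumed and TokStart points at it.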
tgtok::TokKind TGLexer::LexIdentifier() {
  // The first letter is [a-zA-Z_].
  const char *IdentStart = TokStart;

  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  // Check to see if this identifier is a reserved keyword.
  StringRef Str(IdentStart, CurPtr - IdentStart);

  tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
                            .Case("int", tgtok::Int)
                            .Case("bit", tgtok::Bit)
                            .Case("bits", tgtok::Bits)
                            .Case("string", tgtok::String)
                            .Case("list", tgtok::List)
                            .Case("code", tgtok::Code)
                            .Case("dag", tgtok::Dag)
                            .Case("class", tgtok::Class)
                            .Case("def", tgtok::Def)
                            .Case("true", tgtok::TrueVal)
                            .Case("false", tgtok::FalseVal)
                            .Case("foreach", tgtok::Foreach)
                            .Case("defm", tgtok::Defm)
                            .Case("defset", tgtok::Defset)
                            .Case("deftype", tgtok::Deftype)
                            .Case("multiclass", tgtok::MultiClass)
                            .Case("field", tgtok::Field)
                            .Case("let", tgtok::Let)
                            .Case("in", tgtok::In)
                            .Case("defvar", tgtok::Defvar)
                            .Case("include", tgtok::Include)
                            .Case("if", tgtok::If)
                            .Case("then", tgtok::Then)
                            .Case("else", tgtok::ElseKW)
                            .Case("assert", tgtok::Assert)
                            .Case("dump", tgtok::Dump)
                            .Default(tgtok::Id);

  // A couple of tokens require special processing.
  switch (Kind) {
  case tgtok::Include:
    if (LexInclude()) return tgtok::Error;
    return Lex();
  case tgtok::Id:
    CurStrVal.assign(Str.begin(), Str.end());
    break;
  default:
    break;
  }

  return Kind;
}

/// LexInclude - We just read the "include" token. Get the string token that
/// comes next and enter the include.
bool TGLexer::LexInclude() {
  // The token after the include must be a string.
  tgtok::TokKind Tok = LexToken();
  if (Tok == tgtok::Error) return true;
  if (Tok != tgtok::StrVal) {
    PrintError(getLoc(), "Expected filename after include");
    return true;
  }

  // Get the string.
  std::string Filename = CurStrVal;
  std::string IncludedFile;

  CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
                                    IncludedFile);
  if (!CurBuffer) {
    PrintError(getLoc(), "Could not find include file '" + Filename + "'");
    return true;
  }

  Dependencies.insert(IncludedFile);
  // Save the line number and lex buffer of the includer.
  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  CurPtr = CurBuf.begin();

  PrepIncludeStack.push_back(
      std::make_unique<std::vector<PreprocessorControlDesc>>());
  return false;
}

/// SkipBCPLComment - Skip over the comment by finding the next CR or LF.
/// Or we may end up at the end of the buffer.
void TGLexer::SkipBCPLComment() {
  ++CurPtr; // skip the second slash.
  auto EOLPos = CurBuf.find_first_of("\r\n", CurPtr - CurBuf.data());
  CurPtr = (EOLPos == StringRef::npos) ? CurBuf.end() : CurBuf.data() + EOLPos;
}

/// SkipCComment - This skips C-style /**/ comments. The only difference from C
/// is that we allow nesting.
bool TGLexer::SkipCComment() {
  ++CurPtr; // skip the star.
  unsigned CommentDepth = 1;

  while (true) {
    int CurChar = getNextChar();
    switch (CurChar) {
    case EOF:
      PrintError(TokStart, "Unterminated comment!");
      return true;
    case '*':
      // End of the comment?
      if (CurPtr[0] != '/') break;

      ++CurPtr; // End the */.
      if (--CommentDepth == 0)
        return false;
      break;
    case '/':
      // Start of a nested comment?
      if (CurPtr[0] != '*') break;
      ++CurPtr;
      ++CommentDepth;
      break;
    }
  }
}

/// LexNumber - Lex:
///    [-+]?[0-9]+
///    0x[0-9a-fA-F]+
///    0b[01]+
tgtok::TokKind TGLexer::LexNumber() {
  unsigned Base = 0;
  const char *NumStart;

  // Check if it's a hex or a binary value.
  if (CurPtr[-1] == '0') {
    NumStart = CurPtr + 1;
    if (CurPtr[0] == 'x') {
      Base = 16;
      do
        ++CurPtr;
      while (isxdigit(CurPtr[0]));
    } else if (CurPtr[0] == 'b') {
      Base = 2;
      do
        ++CurPtr;
      while (CurPtr[0] == '0' || CurPtr[0] == '1');
    }
  }

  // For a hex or binary value, we always convert it to an unsigned value.
  bool IsMinus = false;

  // Check if it's a decimal value.
  if (Base == 0) {
    // Check for a sign without a digit.
    if (!isdigit(CurPtr[0])) {
      if (CurPtr[-1] == '-')
        return tgtok::minus;
      else if (CurPtr[-1] == '+')
        return tgtok::plus;
    }

    Base = 10;
    NumStart = TokStart;
    IsMinus = CurPtr[-1] == '-';

    while (isdigit(CurPtr[0]))
      ++CurPtr;
  }

  // Requires at least one digit.
  if (CurPtr == NumStart)
    return ReturnError(TokStart, "Invalid number");

  errno = 0;
  if (IsMinus)
    CurIntVal = strtoll(NumStart, nullptr, Base);
  else
    CurIntVal = strtoull(NumStart, nullptr, Base);

  if (errno == EINVAL)
    return ReturnError(TokStart, "Invalid number");
  if (errno == ERANGE)
    return ReturnError(TokStart, "Number out of range");

  return Base == 2 ? tgtok::BinaryIntVal : tgtok::IntVal;
}

/// LexBracket - We just read '['. If this is a code block, return it,
/// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
tgtok::TokKind TGLexer::LexBracket() {
  if (CurPtr[0] != '{')
    return tgtok::l_square;
  ++CurPtr;
  const char *CodeStart = CurPtr;
  while (true) {
    int Char = getNextChar();
    if (Char == EOF) break;

    if (Char != '}') continue;

    Char = getNextChar();
    if (Char == EOF) break;
    if (Char == ']') {
      CurStrVal.assign(CodeStart, CurPtr - 2);
      return tgtok::CodeFragment;
    }
  }

  return ReturnError(CodeStart - 2, "Unterminated code block");
}

/// LexExclaim - Lex '!' and '![a-zA-Z]+'.
tgtok::TokKind TGLexer::LexExclaim() {
  if (!isalpha(*CurPtr))
    return ReturnError(CurPtr - 1, "Invalid \"!operator\"");

  const char *Start = CurPtr++;
  while (isalpha(*CurPtr))
    ++CurPtr;

  // Check to see which operator this is.
  tgtok::TokKind Kind =
      StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
          .Case("eq", tgtok::XEq)
          .Case("ne", tgtok::XNe)
          .Case("le", tgtok::XLe)
          .Case("lt", tgtok::XLt)
          .Case("ge", tgtok::XGe)
          .Case("gt", tgtok::XGt)
          .Case("if", tgtok::XIf)
          .Case("cond", tgtok::XCond)
          .Case("isa", tgtok::XIsA)
          .Case("head", tgtok::XHead)
          .Case("tail", tgtok::XTail)
          .Case("size", tgtok::XSize)
          .Case("con", tgtok::XConcat)
          .Case("dag", tgtok::XDag)
          .Case("add", tgtok::XADD)
          .Case("sub", tgtok::XSUB)
          .Case("mul", tgtok::XMUL)
          .Case("div", tgtok::XDIV)
          .Case("not", tgtok::XNOT)
          .Case("logtwo", tgtok::XLOG2)
          .Case("and", tgtok::XAND)
          .Case("or", tgtok::XOR)
          .Case("xor", tgtok::XXOR)
          .Case("shl", tgtok::XSHL)
          .Case("sra", tgtok::XSRA)
          .Case("srl", tgtok::XSRL)
          .Case("cast", tgtok::XCast)
          .Case("empty", tgtok::XEmpty)
          .Case("subst", tgtok::XSubst)
          .Case("foldl", tgtok::XFoldl)
          .Case("foreach", tgtok::XForEach)
          .Case("filter", tgtok::XFilter)
          .Case("listconcat", tgtok::XListConcat)
          .Case("listsplat", tgtok::XListSplat)
          .Case("listremove", tgtok::XListRemove)
          .Case("range", tgtok::XRange)
          .Case("strconcat", tgtok::XStrConcat)
          .Case("interleave", tgtok::XInterleave)
          .Case("substr", tgtok::XSubstr)
          .Case("find", tgtok::XFind)
          .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated.
          .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated.
          .Case("getdagarg", tgtok::XGetDagArg)
          .Case("getdagname", tgtok::XGetDagName)
          .Case("setdagarg", tgtok::XSetDagArg)
          .Case("setdagname", tgtok::XSetDagName)
          .Case("exists", tgtok::XExists)
          .Case("tolower", tgtok::XToLower)
          .Case("toupper", tgtok::XToUpper)
          .Case("repr", tgtok::XRepr)
          .Default(tgtok::Error);

  return Kind != tgtok::Error ? Kind
                              : ReturnError(Start - 1, "Unknown operator");
}

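/// prepExitInclude - Leave the current include file from the preprocessor's
/// point of view: check that its control stack is balanced and pop it from
/// PrepIncludeStack. If IncludeStackMustBeEmpty is true, verify that this was
/// the outermost file. Returns false if the control stack is imbalanced.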
bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) {
  // Report an error, if preprocessor control stack for the current
  // file is not empty.
  if (!PrepIncludeStack.back()->empty()) {
    prepReportPreprocessorStackError();

    return false;
  }

  // Pop the preprocessing controls from the include stack.
  if (PrepIncludeStack.empty()) {
    PrintFatalError("Preprocessor include stack is empty");
  }

  PrepIncludeStack.pop_back();

  if (IncludeStackMustBeEmpty) {
    if (!PrepIncludeStack.empty())
      PrintFatalError("Preprocessor include stack is not empty");
  } else {
    if (PrepIncludeStack.empty())
      PrintFatalError("Preprocessor include stack is empty");
  }

  return true;
}

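/// prepIsDirective - Check whether CurPtr, which points right after a '#',
/// starts a supported preprocessing directive followed by whitespace, a
/// comment, a new line, or EOF. Returns the directive's token kind, or
/// tgtok::Error if no directive is recognized. Does not advance CurPtr.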
tgtok::TokKind TGLexer::prepIsDirective() const {
  for (const auto &PD : PreprocessorDirs) {
    int NextChar = *CurPtr;
    bool Match = true;
    unsigned I = 0;
    for (; I < strlen(PD.Word); ++I) {
      if (NextChar != PD.Word[I]) {
        Match = false;
        break;
      }

      NextChar = peekNextChar(I + 1);
    }

    // Check for whitespace after the directive. If there is no whitespace,
    // then we do not recognize it as a preprocessing directive.
    if (Match) {
      tgtok::TokKind Kind = PD.Kind;

      // New line and EOF may follow only #else/#endif. It will be reported
      // as an error for #ifdef/#define after the call to prepLexMacroName().
      if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF ||
          NextChar == '\n' ||
          // It looks like TableGen does not support '\r' as the actual
          // carriage return, e.g. getNextChar() treats a single '\r'
          // as '\n'. So we do the same here.
          NextChar == '\r')
        return Kind;

      // Allow comments after some directives, e.g.:
      //     #else// OR #else/**/
      //     #endif// OR #endif/**/
      //
      // Note that we do allow comments after #ifdef/#define here, e.g.
      //     #ifdef/**/ AND #ifdef//
      //     #define/**/ AND #define//
      //
      // These cases will be reported as incorrect after calling
      // prepLexMacroName(). We could have supported C-style comments
      // after #ifdef/#define, but this would complicate the code
      // for little benefit.
      if (NextChar == '/') {
        NextChar = peekNextChar(I + 1);

        if (NextChar == '*' || NextChar == '/')
          return Kind;

        // Pretend that we do not recognize the directive.
      }
    }
  }

  return tgtok::Error;
}

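/// prepEatPreprocessorDirective - Set TokStart to the start of the directive
/// word and advance CurPtr past the word that prepIsDirective() recognized as
/// Kind. Reports a fatal error for an unsupported Kind.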
bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) {
  TokStart = CurPtr;

  for (const auto &PD : PreprocessorDirs)
    if (PD.Kind == Kind) {
      // Advance CurPtr to the end of the preprocessing word.
      CurPtr += strlen(PD.Word);
      return true;
    }

  PrintFatalError("Unsupported preprocessing token in "
                  "prepEatPreprocessorDirective()");
  return false;
}

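/// lexPreprocessor - Handle a preprocessing directive of the given Kind
/// (#ifdef, #ifndef, #else, #endif or #define), updating the preprocessing
/// control stack. If ReturnNextLiveToken is true, return the next live token
/// after the directive; otherwise return the directive's kind so the
/// line-skipping code in prepSkipRegion() can track the region structure.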
tgtok::TokKind TGLexer::lexPreprocessor(
    tgtok::TokKind Kind, bool ReturnNextLiveToken) {

  // We must be looking at a preprocessing directive. Eat it!
  if (!prepEatPreprocessorDirective(Kind))
    PrintFatalError("lexPreprocessor() called for unknown "
                    "preprocessor directive");

  if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) {
    StringRef MacroName = prepLexMacroName();
    StringRef IfTokName = Kind == tgtok::Ifdef ? "#ifdef" : "#ifndef";
    if (MacroName.empty())
      return ReturnError(TokStart, "Expected macro name after " + IfTokName);

    bool MacroIsDefined = DefinedMacros.count(MacroName) != 0;

    // Canonicalize ifndef's MacroIsDefined to its ifdef equivalent.
    if (Kind == tgtok::Ifndef)
      MacroIsDefined = !MacroIsDefined;

    // Regardless of whether we are processing tokens or not,
    // we put the #ifdef control on stack.
    // Note that MacroIsDefined has been canonicalized against ifdef.
    PrepIncludeStack.back()->push_back(
        {tgtok::Ifdef, MacroIsDefined, SMLoc::getFromPointer(TokStart)});

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "Only comments are supported after " +
                                     IfTokName + " NAME");

    // If we were not processing tokens before this #ifdef,
    // then just return back to the lines skipping code.
    if (!ReturnNextLiveToken)
      return Kind;

    // If we were processing tokens before this #ifdef,
    // and the macro is defined, then just return the next token.
    if (MacroIsDefined)
      return LexToken();

    // We were processing tokens before this #ifdef, and the macro
    // is not defined, so we have to start skipping the lines.
    // If the skipping is successful, it will return the token following
    // either #else or #endif corresponding to this #ifdef.
    if (prepSkipRegion(ReturnNextLiveToken))
      return LexToken();

    return tgtok::Error;
  } else if (Kind == tgtok::Else) {
    // Check if this #else is correct before calling prepSkipDirectiveEnd(),
    // which will move CurPtr away from the beginning of #else.
    if (PrepIncludeStack.back()->empty())
      return ReturnError(TokStart, "#else without #ifdef or #ifndef");

    PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back()->back();

    if (IfdefEntry.Kind != tgtok::Ifdef) {
      PrintError(TokStart, "double #else");
      return ReturnError(IfdefEntry.SrcPos, "Previous #else is here");
    }

    // Replace the corresponding #ifdef's control with its negation
    // on the control stack.
    PrepIncludeStack.back()->pop_back();
    PrepIncludeStack.back()->push_back(
        {Kind, !IfdefEntry.IsDefined, SMLoc::getFromPointer(TokStart)});

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "Only comments are supported after #else");

    // If we were processing tokens before this #else,
    // we have to start skipping lines until the matching #endif.
    if (ReturnNextLiveToken) {
      if (prepSkipRegion(ReturnNextLiveToken))
        return LexToken();

      return tgtok::Error;
    }

    // Return to the lines skipping code.
    return Kind;
  } else if (Kind == tgtok::Endif) {
    // Check if this #endif is correct before calling prepSkipDirectiveEnd(),
    // which will move CurPtr away from the beginning of #endif.
    if (PrepIncludeStack.back()->empty())
      return ReturnError(TokStart, "#endif without #ifdef");

    auto &IfdefOrElseEntry = PrepIncludeStack.back()->back();

    if (IfdefOrElseEntry.Kind != tgtok::Ifdef &&
        IfdefOrElseEntry.Kind != tgtok::Else) {
      PrintFatalError("Invalid preprocessor control on the stack");
      return tgtok::Error;
    }

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "Only comments are supported after #endif");

    PrepIncludeStack.back()->pop_back();

    // If we were processing tokens before this #endif, then
    // we should continue it.
    if (ReturnNextLiveToken) {
      return LexToken();
    }

    // Return to the lines skipping code.
    return Kind;
  } else if (Kind == tgtok::Define) {
    StringRef MacroName = prepLexMacroName();
    if (MacroName.empty())
      return ReturnError(TokStart, "Expected macro name after #define");

    if (!DefinedMacros.insert(MacroName).second)
      PrintWarning(getLoc(),
                   "Duplicate definition of macro: " + Twine(MacroName));

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr,
                         "Only comments are supported after #define NAME");

    if (!ReturnNextLiveToken) {
      PrintFatalError("#define must be ignored during the lines skipping");
      return tgtok::Error;
    }

    return LexToken();
  }

  PrintFatalError("Preprocessing directive is not supported");
  return tgtok::Error;
}

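/// prepSkipRegion - Skip lines until a preprocessing directive re-enables
/// token processing (a matching #else or #endif). MustNeverBeFalse exists
/// only to assert that the caller requested live tokens. Returns false if an
/// error was encountered or EOF was reached without a matching #endif.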
bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) {
  if (!MustNeverBeFalse)
    PrintFatalError("Invalid recursion.");

  do {
    // Skip all symbols to the line end.
    while (*CurPtr != '\n')
      ++CurPtr;

    // Find the first non-whitespace symbol in the next line(s).
    if (!prepSkipLineBegin())
      return false;

    // If the first non-blank/comment symbol on the line is '#',
    // it may be the start of a preprocessing directive.
    //
    // If it is not '#', just go to the next line.
    if (*CurPtr == '#')
      ++CurPtr;
    else
      continue;

    tgtok::TokKind Kind = prepIsDirective();

    // If we did not find a preprocessing directive or it is #define,
    // then just skip to the next line. We do not have to do anything
    // for #define in the line-skipping mode.
    if (Kind == tgtok::Error || Kind == tgtok::Define)
      continue;

    tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, false);

    // If lexPreprocessor() encountered an error during lexing this
    // preprocessor idiom, then return false to the calling lexPreprocessor().
    // This will force tgtok::Error to be returned to the tokens processing.
    if (ProcessedKind == tgtok::Error)
      return false;

    if (Kind != ProcessedKind)
      PrintFatalError("prepIsDirective() and lexPreprocessor() "
                      "returned different token kinds");

    // If this preprocessing directive enables tokens processing,
    // then return to the lexPreprocessor() and get to the next token.
    // We can move from line-skipping mode to processing tokens only
    // due to #else or #endif.
    if (prepIsProcessingEnabled()) {
      if (Kind != tgtok::Else && Kind != tgtok::Endif) {
        PrintFatalError("Tokens processing was enabled by an unexpected "
                        "preprocessing directive");
        return false;
      }

      return true;
    }
  } while (CurPtr != CurBuf.end());

  // We have reached the end of the file, but never left the lines-skipping
  // mode. This means there is no matching #endif.
  prepReportPreprocessorStackError();
  return false;
}

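/// prepLexMacroName - Skip any horizontal whitespace and lex a macro name
/// matching [a-zA-Z_][0-9a-zA-Z_]*. Returns an empty StringRef if no valid
/// macro name starts at the current position.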
StringRef TGLexer::prepLexMacroName() {
  // Skip whitespaces between the preprocessing directive and the macro name.
  while (*CurPtr == ' ' || *CurPtr == '\t')
    ++CurPtr;

  TokStart = CurPtr;
  // Macro names start with [a-zA-Z_].
  if (*CurPtr != '_' && !isalpha(*CurPtr))
    return "";

  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  return StringRef(TokStart, CurPtr - TokStart);
}

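/// prepSkipLineBegin - While in line-skipping mode, advance CurPtr past
/// whitespace, new lines and C-style comments to the first meaningful symbol
/// of the next line(s). Returns false if a comment error was reported.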
bool TGLexer::prepSkipLineBegin() {
  while (CurPtr != CurBuf.end()) {
    switch (*CurPtr) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      break;

    case '/': {
      int NextChar = peekNextChar(1);
      if (NextChar == '*') {
        // Skip C-style comment.
        // Note that we do not care about skipping the C++-style comments.
        // If the line contains "//", it may not contain any processable
        // preprocessing directive. Just return CurPtr pointing to
        // the first '/' in this case. We also do not care about
        // incorrect symbols after the first '/' - we are in lines-skipping
        // mode, so incorrect code is allowed to some extent.

        // Set TokStart to the beginning of the comment to enable proper
        // diagnostic printing in case of error in SkipCComment().
        TokStart = CurPtr;

        // CurPtr must point to '*' before call to SkipCComment().
        ++CurPtr;
        if (SkipCComment())
          return false;
      } else {
        // CurPtr points to the non-whitespace '/'.
        return true;
      }

      // We must not increment CurPtr after the comment was lexed.
      continue;
    }

    default:
      return true;
    }

    ++CurPtr;
  }

  // We have reached the end of the file. Return to the lines skipping
  // code, and allow it to handle the EOF as needed.
  return true;
}

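/// prepSkipDirectiveEnd - Skip to the end of the line after a preprocessing
/// directive, allowing only whitespace and comments. Returns false if any
/// other non-whitespace character follows the directive.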
bool TGLexer::prepSkipDirectiveEnd() {
  while (CurPtr != CurBuf.end()) {
    switch (*CurPtr) {
    case ' ':
    case '\t':
      break;

    case '\n':
    case '\r':
      return true;

    case '/': {
      int NextChar = peekNextChar(1);
      if (NextChar == '/') {
        // Skip C++-style comment.
        // We may just return true now, but let's skip to the line/buffer end
        // to simplify the method specification.
        ++CurPtr;
        SkipBCPLComment();
      } else if (NextChar == '*') {
        // When we are skipping C-style comment at the end of a preprocessing
        // directive, we can skip several lines. If any meaningful TD token
        // follows the end of the C-style comment on the same line, it will
        // be considered as an invalid usage of TD token.
        // For example, we want to forbid usages like this one:
        //     #define MACRO class Class {}
        // But with C-style comments we also disallow the following:
        //     #define MACRO /* This macro is used
        //                      to ... */ class Class {}
        // One can argue that this should be allowed, but it does not seem
        // to be worth the complication. Moreover, this matches
        // the C preprocessor behavior.

        // Set TokStart to the beginning of the comment to enable proper
        // diagnostic printing in case of error in SkipCComment().
        TokStart = CurPtr;
        ++CurPtr;
        if (SkipCComment())
          return false;
      } else {
        TokStart = CurPtr;
        PrintError(CurPtr, "Unexpected character");
        return false;
      }

      // We must not increment CurPtr after the comment was lexed.
      continue;
    }

    default:
      // Do not allow any non-whitespaces after the directive.
      TokStart = CurPtr;
      return false;
    }

    ++CurPtr;
  }

  return true;
}

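/// prepIsProcessingEnabled - Return true if every preprocessing control
/// currently on the stack for this file is live, i.e. no enclosing
/// #ifdef/#else region disables token processing.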
bool TGLexer::prepIsProcessingEnabled() {
  for (const PreprocessorControlDesc &I :
       llvm::reverse(*PrepIncludeStack.back()))
    if (!I.IsDefined)
      return false;

  return true;
}

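/// prepReportPreprocessorStackError - Report an error about the topmost
/// unterminated preprocessing control for the current file, e.g. an #ifdef
/// without a matching #endif at EOF.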
void TGLexer::prepReportPreprocessorStackError() {
  if (PrepIncludeStack.back()->empty())
    PrintFatalError("prepReportPreprocessorStackError() called with "
                    "empty control stack");

  auto &PrepControl = PrepIncludeStack.back()->back();
  PrintError(CurBuf.end(), "Reached EOF without matching #endif");
  PrintError(PrepControl.SrcPos, "The latest preprocessor control is here");

  TokStart = CurPtr;
}
