1//===- Lexer.h - Lexer for the Toy language -------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a simple Lexer for the Toy language.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef TOY_LEXER_H
14#define TOY_LEXER_H
15
#include "llvm/ADT/StringRef.h"

#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <string>
#include <utility>
20
21namespace toy {
22
/// Structure describing a location in a file.
///
/// `line` and `col` are default-initialized to zero so that a
/// default-constructed Location never carries indeterminate values. The struct
/// remains an aggregate (C++14 and later), so `{file, line, col}` brace
/// initialization keeps working.
struct Location {
  std::shared_ptr<std::string> file; ///< filename.
  int line = 0;                      ///< line number.
  int col = 0;                       ///< column number.
};
29
/// Kinds of token returned by the lexer.
///
/// Multi-character tokens and end-of-file use negative values; the
/// single-character punctuation tokens named here use the value of the
/// character itself.
enum Token : int {
  // End of the input stream.
  tok_eof = -1,

  // Keywords ("return", "var", "def").
  tok_return = -2,
  tok_var = -3,
  tok_def = -4,

  // Primary expressions.
  tok_identifier = -5,
  tok_number = -6,

  // Single-character punctuation.
  tok_semicolon = ';',
  tok_parenthese_open = '(',
  tok_parenthese_close = ')',
  tok_bracket_open = '{',
  tok_bracket_close = '}',
  tok_sbracket_open = '[',
  tok_sbracket_close = ']',
};
51
52/// The Lexer is an abstract base class providing all the facilities that the
53/// Parser expects. It goes through the stream one token at a time and keeps
54/// track of the location in the file for debugging purpose.
55/// It relies on a subclass to provide a `readNextLine()` method. The subclass
56/// can proceed by reading the next line from the standard input or from a
57/// memory mapped file.
58class Lexer {
59public:
60 /// Create a lexer for the given filename. The filename is kept only for
61 /// debugging purpose (attaching a location to a Token).
62 Lexer(std::string filename)
63 : lastLocation(
64 {.file: std::make_shared<std::string>(args: std::move(filename)), .line: 0, .col: 0}) {}
65 virtual ~Lexer() = default;
66
67 /// Look at the current token in the stream.
68 Token getCurToken() { return curTok; }
69
70 /// Move to the next token in the stream and return it.
71 Token getNextToken() { return curTok = getTok(); }
72
73 /// Move to the next token in the stream, asserting on the current token
74 /// matching the expectation.
75 void consume(Token tok) {
76 assert(tok == curTok && "consume Token mismatch expectation");
77 getNextToken();
78 }
79
80 /// Return the current identifier (prereq: getCurToken() == tok_identifier)
81 llvm::StringRef getId() {
82 assert(curTok == tok_identifier);
83 return identifierStr;
84 }
85
86 /// Return the current number (prereq: getCurToken() == tok_number)
87 double getValue() {
88 assert(curTok == tok_number);
89 return numVal;
90 }
91
92 /// Return the location for the beginning of the current token.
93 Location getLastLocation() { return lastLocation; }
94
95 // Return the current line in the file.
96 int getLine() { return curLineNum; }
97
98 // Return the current column in the file.
99 int getCol() { return curCol; }
100
101private:
102 /// Delegate to a derived class fetching the next line. Returns an empty
103 /// string to signal end of file (EOF). Lines are expected to always finish
104 /// with "\n"
105 virtual llvm::StringRef readNextLine() = 0;
106
107 /// Return the next character from the stream. This manages the buffer for the
108 /// current line and request the next line buffer to the derived class as
109 /// needed.
110 int getNextChar() {
111 // The current line buffer should not be empty unless it is the end of file.
112 if (curLineBuffer.empty())
113 return EOF;
114 ++curCol;
115 auto nextchar = curLineBuffer.front();
116 curLineBuffer = curLineBuffer.drop_front();
117 if (curLineBuffer.empty())
118 curLineBuffer = readNextLine();
119 if (nextchar == '\n') {
120 ++curLineNum;
121 curCol = 0;
122 }
123 return nextchar;
124 }
125
126 /// Return the next token from standard input.
127 Token getTok() {
128 // Skip any whitespace.
129 while (isspace(lastChar))
130 lastChar = Token(getNextChar());
131
132 // Save the current location before reading the token characters.
133 lastLocation.line = curLineNum;
134 lastLocation.col = curCol;
135
136 // Identifier: [a-zA-Z][a-zA-Z0-9_]*
137 if (isalpha(lastChar)) {
138 identifierStr = (char)lastChar;
139 while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_')
140 identifierStr += (char)lastChar;
141
142 if (identifierStr == "return")
143 return tok_return;
144 if (identifierStr == "def")
145 return tok_def;
146 if (identifierStr == "var")
147 return tok_var;
148 return tok_identifier;
149 }
150
151 // Number: [0-9.]+
152 if (isdigit(lastChar) || lastChar == '.') {
153 std::string numStr;
154 do {
155 numStr += lastChar;
156 lastChar = Token(getNextChar());
157 } while (isdigit(lastChar) || lastChar == '.');
158
159 numVal = strtod(nptr: numStr.c_str(), endptr: nullptr);
160 return tok_number;
161 }
162
163 if (lastChar == '#') {
164 // Comment until end of line.
165 do {
166 lastChar = Token(getNextChar());
167 } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r');
168
169 if (lastChar != EOF)
170 return getTok();
171 }
172
173 // Check for end of file. Don't eat the EOF.
174 if (lastChar == EOF)
175 return tok_eof;
176
177 // Otherwise, just return the character as its ascii value.
178 Token thisChar = Token(lastChar);
179 lastChar = Token(getNextChar());
180 return thisChar;
181 }
182
183 /// The last token read from the input.
184 Token curTok = tok_eof;
185
186 /// Location for `curTok`.
187 Location lastLocation;
188
189 /// If the current Token is an identifier, this string contains the value.
190 std::string identifierStr;
191
192 /// If the current Token is a number, this contains the value.
193 double numVal = 0;
194
195 /// The last value returned by getNextChar(). We need to keep it around as we
196 /// always need to read ahead one character to decide when to end a token and
197 /// we can't put it back in the stream after reading from it.
198 Token lastChar = Token(' ');
199
200 /// Keep track of the current line number in the input stream
201 int curLineNum = 0;
202
203 /// Keep track of the current column number in the input stream
204 int curCol = 0;
205
206 /// Buffer supplied by the derived class on calls to `readNextLine()`
207 llvm::StringRef curLineBuffer = "\n";
208};
209
210/// A lexer implementation operating on a buffer in memory.
211class LexerBuffer final : public Lexer {
212public:
213 LexerBuffer(const char *begin, const char *end, std::string filename)
214 : Lexer(std::move(filename)), current(begin), end(end) {}
215
216private:
217 /// Provide one line at a time to the Lexer, return an empty string when
218 /// reaching the end of the buffer.
219 llvm::StringRef readNextLine() override {
220 auto *begin = current;
221 while (current <= end && *current && *current != '\n')
222 ++current;
223 if (current <= end && *current)
224 ++current;
225 llvm::StringRef result{begin, static_cast<size_t>(current - begin)};
226 return result;
227 }
228 const char *current, *end;
229};
230} // namespace toy
231
232#endif // TOY_LEXER_H
233

// Source: mlir/examples/toy/Ch4/include/toy/Lexer.h