// Copyright (c) 2001-2010 Hartmut Kaiser
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

// This example shows how to create a simple lexer recognizing a couple of
// different tokens aimed at a simple language and how to use this lexer with
// a grammar. It shows how to associate attributes to tokens and how to access
// the token attributes from inside the grammar.
//
// Additionally, this example demonstrates how to define a token set usable
// as the skip parser during parsing, allowing several tokens to be ignored.
//
// The main purpose of this example is to show how inheritance can be used to
// overload parts of a base grammar and add token definitions to a base lexer.
//
// Further, it shows how you can use the 'omit' attribute type specifier
// for token definitions to force the token to have no attribute (expose an
// unused attribute).
//
// This example recognizes a very simple programming language having
// assignment statements and if and while control structures. Look at the file
// example5.input for an example.
#include <fstream>
#include <iostream>
#include <string>

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/phoenix/operator.hpp>

#include "example.hpp"

using namespace boost::spirit;
using boost::phoenix::val;
38
39///////////////////////////////////////////////////////////////////////////////
40// Token definition base, defines all tokens for the base grammar below
41///////////////////////////////////////////////////////////////////////////////
42template <typename Lexer>
43struct example5_base_tokens : lex::lexer<Lexer>
44{
45protected:
46 // this lexer is supposed to be used as a base type only
47 example5_base_tokens() {}
48
49public:
50 void init_token_definitions()
51 {
52 // define the tokens to match
53 identifier = "[a-zA-Z_][a-zA-Z0-9_]*";
54 constant = "[0-9]+";
55 if_ = "if";
56 while_ = "while";
57
58 // associate the tokens and the token set with the lexer
59 this->self += lex::token_def<>('(') | ')' | '{' | '}' | '=' | ';' | constant;
60 this->self += if_ | while_ | identifier;
61
62 // define the whitespace to ignore (spaces, tabs, newlines and C-style
63 // comments)
64 this->self("WS")
65 = lex::token_def<>("[ \\t\\n]+")
66 | "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"
67 ;
68 }
69
70 // these tokens have no attribute
71 lex::token_def<lex::omit> if_, while_;
72
73 // The following two tokens have an associated attribute type, 'identifier'
74 // carries a string (the identifier name) and 'constant' carries the
75 // matched integer value.
76 //
77 // Note: any token attribute type explicitly specified in a token_def<>
78 // declaration needs to be listed during token type definition as
79 // well (see the typedef for the token_type below).
80 //
81 // The conversion of the matched input to an instance of this type occurs
82 // once (on first access), which makes token attributes as efficient as
83 // possible. Moreover, token instances are constructed once by the lexer
84 // library. From this point on tokens are passed by reference only,
85 // avoiding them being copied around.
86 lex::token_def<std::string> identifier;
87 lex::token_def<unsigned int> constant;
88};
89
90///////////////////////////////////////////////////////////////////////////////
91// Grammar definition base, defines a basic language
92///////////////////////////////////////////////////////////////////////////////
93template <typename Iterator, typename Lexer>
94struct example5_base_grammar
95 : qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
96{
97 template <typename TokenDef>
98 example5_base_grammar(TokenDef const& tok)
99 : example5_base_grammar::base_type(program)
100 {
101 using boost::spirit::_val;
102
103 program
104 = +block
105 ;
106
107 block
108 = '{' >> *statement >> '}'
109 ;
110
111 statement
112 = assignment
113 | if_stmt
114 | while_stmt
115 ;
116
117 assignment
118 = (tok.identifier >> '=' >> expression >> ';')
119 [
120 std::cout << val(t: "assignment statement to: ") << _1 << "\n"
121 ]
122 ;
123
124 if_stmt
125 = (tok.if_ >> '(' >> expression >> ')' >> block)
126 [
127 std::cout << val(t: "if expression: ") << _1 << "\n"
128 ]
129 ;
130
131 while_stmt
132 = (tok.while_ >> '(' >> expression >> ')' >> block)
133 [
134 std::cout << val(t: "while expression: ") << _1 << "\n"
135 ]
136 ;
137
138 // since expression has a variant return type accommodating for
139 // std::string and unsigned integer, both possible values may be
140 // returned to the calling rule
141 expression
142 = tok.identifier [ _val = _1 ]
143 | tok.constant [ _val = _1 ]
144 ;
145 }
146
147 typedef qi::in_state_skipper<Lexer> skipper_type;
148
149 qi::rule<Iterator, skipper_type> program, block, statement;
150 qi::rule<Iterator, skipper_type> assignment, if_stmt;
151 qi::rule<Iterator, skipper_type> while_stmt;
152
153 // the expression is the only rule having a return value
154 typedef boost::variant<unsigned int, std::string> expression_type;
155 qi::rule<Iterator, expression_type(), skipper_type> expression;
156};
157
158///////////////////////////////////////////////////////////////////////////////
159// Token definition for derived lexer, defines additional tokens
160///////////////////////////////////////////////////////////////////////////////
161template <typename Lexer>
162struct example5_tokens : example5_base_tokens<Lexer>
163{
164 typedef example5_base_tokens<Lexer> base_type;
165
166 example5_tokens()
167 {
168 // define the additional token to match
169 else_ = "else";
170
171 // associate the new token with the lexer, note we add 'else' before
172 // anything else to add it to the token set before the identifier
173 // token, otherwise "else" would be matched as an identifier
174 this->self = else_;
175
176 // now add the token definitions from the base class
177 this->base_type::init_token_definitions();
178 }
179
180 // this token has no attribute
181 lex::token_def<lex::omit> else_;
182};
183
184///////////////////////////////////////////////////////////////////////////////
185// Derived grammar definition, defines a language extension
186///////////////////////////////////////////////////////////////////////////////
187template <typename Iterator, typename Lexer>
188struct example5_grammar : example5_base_grammar<Iterator, Lexer>
189{
190 template <typename TokenDef>
191 example5_grammar(TokenDef const& tok)
192 : example5_base_grammar<Iterator, Lexer>(tok)
193 {
194 // we alter the if_stmt only
195 this->if_stmt
196 = this->if_stmt.copy() >> -(tok.else_ >> this->block)
197 ;
198 }
199};
200
201///////////////////////////////////////////////////////////////////////////////
202int main()
203{
204 // iterator type used to expose the underlying input stream
205 typedef std::string::iterator base_iterator_type;
206
207 // This is the lexer token type to use. The second template parameter lists
208 // all attribute types used for token_def's during token definition (see
209 // example5_base_tokens<> above). Here we use the predefined lexertl token
210 // type, but any compatible token type may be used instead.
211 //
212 // If you don't list any token attribute types in the following declaration
213 // (or just use the default token type: lexertl_token<base_iterator_type>)
214 // it will compile and work just fine, just a bit less efficient. This is
215 // because the token attribute will be generated from the matched input
216 // sequence every time it is requested. But as soon as you specify at
217 // least one token attribute type you'll have to list all attribute types
218 // used for token_def<> declarations in the token definition class above,
219 // otherwise compilation errors will occur.
220 typedef lex::lexertl::token<
221 base_iterator_type, boost::mpl::vector<unsigned int, std::string>
222 > token_type;
223
224 // Here we use the lexertl based lexer engine.
225 typedef lex::lexertl::lexer<token_type> lexer_type;
226
227 // This is the token definition type (derived from the given lexer type).
228 typedef example5_tokens<lexer_type> example5_tokens;
229
230 // this is the iterator type exposed by the lexer
231 typedef example5_tokens::iterator_type iterator_type;
232
233 // this is the type of the grammar to parse
234 typedef example5_grammar<iterator_type, example5_tokens::lexer_def> example5_grammar;
235
236 // now we use the types defined above to create the lexer and grammar
237 // object instances needed to invoke the parsing process
238 example5_tokens tokens; // Our lexer
239 example5_grammar calc(tokens); // Our parser
240
241 std::string str (read_from_file(infile: "example5.input"));
242
243 // At this point we generate the iterator pair used to expose the
244 // tokenized input stream.
245 std::string::iterator it = str.begin();
246 iterator_type iter = tokens.begin(first&: it, last: str.end());
247 iterator_type end = tokens.end();
248
249 // Parsing is done based on the token stream, not the character
250 // stream read from the input.
251 // Note how we use the lexer defined above as the skip parser. It must
252 // be explicitly wrapped inside a state directive, switching the lexer
253 // state for the duration of skipping whitespace.
254 std::string ws("WS");
255 bool r = qi::phrase_parse(first&: iter, last: end, expr&: calc, skipper: qi::in_state(ws)[tokens.self]);
256
257 if (r && iter == end)
258 {
259 std::cout << "-------------------------\n";
260 std::cout << "Parsing succeeded\n";
261 std::cout << "-------------------------\n";
262 }
263 else
264 {
265 std::cout << "-------------------------\n";
266 std::cout << "Parsing failed\n";
267 std::cout << "-------------------------\n";
268 }
269
270 std::cout << "Bye... :-) \n\n";
271 return 0;
272}
// source code of boost/libs/spirit/example/lex/example5.cpp