1/*
2 *
3 * Copyright (c) 1998-2002
4 * John Maddock
5 *
6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9 *
10 */
11
12 /*
13 * LOCATION: see http://www.boost.org for most recent version.
 * FILE states.hpp
15 * VERSION see <boost/version.hpp>
16 * DESCRIPTION: Declares internal state machine structures.
17 */
18
19#ifndef BOOST_REGEX_V4_STATES_HPP
20#define BOOST_REGEX_V4_STATES_HPP
21
22#ifdef BOOST_MSVC
23#pragma warning(push)
24#pragma warning(disable: 4103)
25#endif
26#ifdef BOOST_HAS_ABI_HEADERS
27# include BOOST_ABI_PREFIX
28#endif
29#ifdef BOOST_MSVC
30#pragma warning(pop)
31#endif
32
33namespace boost{
34namespace BOOST_REGEX_DETAIL_NS{
35
/*** mask_type *******************************************************
Bit flags used in the per-character byte arrays that accompany every
two-way choice in the machine.  For a given input character:
   mask_take set  =>  the next state may be taken;
   mask_skip set  =>  the alternative may be taken.
***********************************************************************/
enum mask_type
{
   mask_take = 0x01,
   mask_skip = 0x02,
   // NOTE(review): the role of mask_init is not visible in this header.
   mask_init = 0x04,
   // either branch is possible:
   mask_any  = mask_take | mask_skip,
   // synonym for mask_any:
   mask_all  = mask_any
};
50
/*** helpers **********************************************************
Tag types plus a trait that maps a character type onto one of them, so
that function overload resolution can distinguish narrow from wide
character strings:
***********************************************************************/
struct _narrow_type{};
struct _wide_type{};

// General case: any character type not listed below is tagged as wide.
template <class charT>
struct is_byte
{
   typedef _wide_type width_type;
};

// char, unsigned char and signed char are tagged as narrow.
template <>
struct is_byte<char>
{
   typedef _narrow_type width_type;
};
template <>
struct is_byte<unsigned char>
{
   typedef _narrow_type width_type;
};
template <>
struct is_byte<signed char>
{
   typedef _narrow_type width_type;
};
62
/*** enum syntax_element_type ******************************************
Every record in the state machine is tagged with one of these types.
The enumerators are numbered consecutively from zero in the order in
which they are listed.
***********************************************************************/
enum syntax_element_type
{
   syntax_element_startmark = 0,      // start of a marked sub-expression, or perl-style (?...) extension
   syntax_element_endmark,            // end of a marked sub-expression, or perl-style (?...) extension
   syntax_element_literal,            // any sequence of literal characters
   syntax_element_start_line,         // start of line assertion: ^
   syntax_element_end_line,           // end of line assertion: $
   syntax_element_wild,               // match any character: .
   syntax_element_match,              // end of expression: we have a match when we get here
   syntax_element_word_boundary,      // perl style word boundary: \b
   syntax_element_within_word,        // perl style within word boundary: \B
   syntax_element_word_start,         // start of word assertion: \<
   syntax_element_word_end,           // end of word assertion: \>
   syntax_element_buffer_start,       // start of buffer assertion: \`
   syntax_element_buffer_end,         // end of buffer assertion: \'
   syntax_element_backref,            // backreference to previously matched sub-expression
   syntax_element_long_set,           // either a wide character set [..] or one with multicharacter collating elements
   syntax_element_set,                // narrow character set: [...]
   syntax_element_jump,               // jump to a new state in the machine
   syntax_element_alt,                // choose between two production states
   syntax_element_rep,                // a repeat
   syntax_element_combining,          // match a combining character sequence
   syntax_element_soft_buffer_end,    // perl style soft buffer end: \z
   syntax_element_restart_continue,   // perl style continuation: \G

   // single character repeats:
   syntax_element_dot_rep,
   syntax_element_char_rep,
   syntax_element_short_set_rep,
   syntax_element_long_set_rep,

   syntax_element_backstep,           // a backstep for lookbehind repeats
   syntax_element_assert_backref,     // an assertion that a mark was matched
   syntax_element_toggle_case,
   syntax_element_recurse,            // a recursive expression

   // Verbs:
   syntax_element_fail,
   syntax_element_accept,
   syntax_element_commit,
   syntax_element_then
};
130
#ifdef BOOST_REGEX_DEBUG
// Stream insertion for syntax_element_type; only declared when
// BOOST_REGEX_DEBUG is defined, and not defined in this header.
// dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion
std::ostream& operator<<(std::ostream& os, syntax_element_type type);
#endif
135
// Defined further down; offset_type only needs a pointer to it.
struct re_syntax_base;

/*** union offset_type ************************************************
A link to another state in the machine.  While the machine is being
constructed the link is held as an integral offset (i); the offsets are
converted to pointers (p) before the machine is executed.
***********************************************************************/
union offset_type
{
   re_syntax_base*   p;   // pointer form of the link
   std::ptrdiff_t    i;   // integral-offset form of the link
};
148
149/*** struct re_syntax_base ********************************************
150Base class for all states in the machine.
151***********************************************************************/
152struct re_syntax_base
153{
154 syntax_element_type type; // what kind of state this is
155 offset_type next; // next state in the machine
156};
157
158/*** struct re_brace **************************************************
159A marked parenthesis.
160***********************************************************************/
161struct re_brace : public re_syntax_base
162{
163 // The index to match, can be zero (don't mark the sub-expression)
164 // or negative (for perl style (?...) extentions):
165 int index;
166 bool icase;
167};
168
/*** re_dot mask constants ********************************************
Constants accompanying struct re_dot (match anything).
NOTE(review): the names suggest that dont_care / force_* are the values
kept in re_dot::mask and that test_* are the masks it is checked
against - confirm against the matcher.  force_newline and
test_not_newline deliberately share the value 2.
***********************************************************************/
enum
{
   force_not_newline = 0,
   dont_care         = 1,
   force_newline     = 2,

   test_not_newline  = 2,
   test_newline      = 3
};
181struct re_dot : public re_syntax_base
182{
183 unsigned char mask;
184};
185
186/*** struct re_literal ************************************************
187A string of literals, following this structure will be an
188array of characters: charT[length]
189***********************************************************************/
190struct re_literal : public re_syntax_base
191{
192 unsigned int length;
193};
194
195/*** struct re_case ************************************************
196Indicates whether we are moving to a case insensive block or not
197***********************************************************************/
198struct re_case : public re_syntax_base
199{
200 bool icase;
201};
202
203/*** struct re_set_long ***********************************************
204A wide character set of characters, following this structure will be
205an array of type charT:
206First csingles null-terminated strings
207Then 2 * cranges NULL terminated strings
208Then cequivalents NULL terminated strings
209***********************************************************************/
210template <class mask_type>
211struct re_set_long : public re_syntax_base
212{
213 unsigned int csingles, cranges, cequivalents;
214 mask_type cclasses;
215 mask_type cnclasses;
216 bool isnot;
217 bool singleton;
218};
219
220/*** struct re_set ****************************************************
221A set of narrow-characters, matches any of _map which is none-zero
222***********************************************************************/
223struct re_set : public re_syntax_base
224{
225 unsigned char _map[1 << CHAR_BIT];
226};
227
228/*** struct re_jump ***************************************************
229Jump to a new location in the machine (not next).
230***********************************************************************/
231struct re_jump : public re_syntax_base
232{
233 offset_type alt; // location to jump to
234};
235
236/*** struct re_alt ***************************************************
237Jump to a new location in the machine (possibly next).
238***********************************************************************/
239struct re_alt : public re_jump
240{
241 unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump
242 unsigned int can_be_null; // true if we match a NULL string
243};
244
245/*** struct re_repeat *************************************************
246Repeat a section of the machine
247***********************************************************************/
248struct re_repeat : public re_alt
249{
250 std::size_t min, max; // min and max allowable repeats
251 int state_id; // Unique identifier for this repeat
252 bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches)
253 bool greedy; // True if this is a greedy repeat
254};
255
256/*** struct re_recurse ************************************************
257Recurse to a particular subexpression.
258**********************************************************************/
259struct re_recurse : public re_jump
260{
261 int state_id; // identifier of first nested repeat within the recursion.
262};
263
/*** enum commit_type *************************************************
Selects between the PRUNE, SKIP and COMMIT verbs (see struct re_commit),
which basically differ only in what happens if no match is found and we
start searching forward.
**********************************************************************/
enum commit_type
{
   commit_prune  = 0,
   commit_skip   = 1,
   commit_commit = 2
};
274struct re_commit : public re_syntax_base
275{
276 commit_type action;
277};
278
279/*** enum re_jump_size_type *******************************************
280Provides compiled size of re_jump structure (allowing for trailing alignment).
281We provide this so we know how manybytes to insert when constructing the machine
282(The value of padding_mask is defined in regex_raw_buffer.hpp).
283***********************************************************************/
284enum re_jump_size_type
285{
286 re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),
287 re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask),
288 re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)
289};
290
291/*** proc re_is_set_member *********************************************
292Forward declaration: we'll need this one later...
293***********************************************************************/
294
295template<class charT, class traits>
296struct regex_data;
297
298template <class iterator, class charT, class traits_type, class char_classT>
299iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
300 iterator last,
301 const re_set_long<char_classT>* set_,
302 const regex_data<charT, traits_type>& e, bool icase);
303
304} // namespace BOOST_REGEX_DETAIL_NS
305
306} // namespace boost
307
308#ifdef BOOST_MSVC
309#pragma warning(push)
310#pragma warning(disable: 4103)
311#endif
312#ifdef BOOST_HAS_ABI_HEADERS
313# include BOOST_ABI_SUFFIX
314#endif
315#ifdef BOOST_MSVC
316#pragma warning(pop)
317#endif
318
319#endif
320
321
322

source code of boost/boost/regex/v4/states.hpp