/*
 *
 * Copyright (c) 2002
 * John Maddock
 *
 * Use, modification and distribution are subject to the
 * Boost Software License, Version 1.0. (See accompanying file
 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 *
 */
11
12#ifndef BOOST_REGEX_MATCHER_HPP
13#define BOOST_REGEX_MATCHER_HPP
14
15#include <boost/regex/v4/iterator_category.hpp>
16
17#ifdef BOOST_MSVC
18#pragma warning(push)
19#pragma warning(disable: 4103)
20#endif
21#ifdef BOOST_HAS_ABI_HEADERS
22# include BOOST_ABI_PREFIX
23#endif
24#ifdef BOOST_MSVC
25#pragma warning(pop)
26#endif
27
28#ifdef BOOST_MSVC
29# pragma warning(push)
30# pragma warning(disable: 4800)
31#endif
32
33namespace boost{
34namespace BOOST_REGEX_DETAIL_NS{
35
36//
37// error checking API:
38//
39BOOST_REGEX_DECL void BOOST_REGEX_CALL verify_options(boost::regex_constants::syntax_option_type ef, match_flag_type mf);
//
// function can_start:
//
// Generic version, used for character types that have no dedicated overload.
// Returns true without consulting the table when c lies outside the range
// [0, 1 << CHAR_BIT); otherwise returns whether the table entry map[c] has
// at least one of the bits in mask set.
//
template <class charT>
inline bool can_start(charT c, const unsigned char* map, unsigned char mask)
{
   // no table entry exists for negative values:
   if(c < static_cast<charT>(0))
      return true;
   // ...nor for values beyond the last table index:
   if(c >= static_cast<charT>(1 << CHAR_BIT))
      return true;
   return (map[c] & mask) != 0;
}
// Overload for plain char: the value is converted to unsigned char before
// indexing so that a negative char still selects a valid table slot.
// Returns whether map[(unsigned char)c] has any bit of mask set.
inline bool can_start(char c, const unsigned char* map, unsigned char mask)
{
   return (map[static_cast<unsigned char>(c)] & mask) != 0;
}
// Overload for signed char: the value is converted to unsigned char before
// indexing so that negative values still select a valid table slot.
// Returns whether map[(unsigned char)c] has any bit of mask set.
inline bool can_start(signed char c, const unsigned char* map, unsigned char mask)
{
   return (map[static_cast<unsigned char>(c)] & mask) != 0;
}
// Overload for unsigned char: every value is a valid table index, so this
// is a plain lookup. Returns whether map[c] has any bit of mask set.
inline bool can_start(unsigned char c, const unsigned char* map, unsigned char mask)
{
   return (map[c] & mask) != 0;
}
// Overload for unsigned short: values of (1 << CHAR_BIT) or more have no
// table entry and always yield true; smaller values are looked up in map
// and tested against mask.
inline bool can_start(unsigned short c, const unsigned char* map, unsigned char mask)
{
   if(c >= (1 << CHAR_BIT))
      return true;
   return (map[c] & mask) != 0;
}
64#if !defined(__hpux) && !defined(__WINSCW__)// WCHAR_MIN not usable in pp-directives.
65#if defined(WCHAR_MIN) && (WCHAR_MIN == 0) && !defined(BOOST_NO_INTRINSIC_WCHAR_T)
// Overload for wchar_t, compiled only when the enclosing #if has established
// that WCHAR_MIN == 0 (so no check for negative values is made here).
// Values of (1u << CHAR_BIT) or more have no table entry and always yield
// true; smaller values are looked up in map and tested against mask.
inline bool can_start(wchar_t c, const unsigned char* map, unsigned char mask)
{
   if(c >= static_cast<wchar_t>(1u << CHAR_BIT))
      return true;
   return (map[c] & mask) != 0;
}
70#endif
71#endif
72#if !defined(BOOST_NO_INTRINSIC_WCHAR_T)
// Overload for unsigned int: values of (1u << CHAR_BIT) or more have no
// table entry and always yield true; smaller values are looked up in map
// and tested against mask.
inline bool can_start(unsigned int c, const unsigned char* map, unsigned char mask)
{
   if(c >= static_cast<unsigned int>(1u << CHAR_BIT))
      return true;
   return (map[c] & mask) != 0;
}
77#endif
78
79
//
// Unfortunately Rogue Wave's standard library appears to have a bug
// in std::basic_string::compare that results in erroneous answers
// in some cases (tested with Borland C++ 5.1, Rogue Wave lib version
// 0x020101) the test case was:
// {39135,0} < {0xff,0}
// which succeeds when it should not.
//
88#ifndef _RWSTD_VER
// Compares the string s with the null-terminated sequence p (version used
// when _RWSTD_VER is not defined).
// Special case: when p is the empty string, s counts as equal to it both
// when s is empty and when s consists of a single null character.
// In every other case the result is that of s.compare(p).
template <class C, class T, class A>
inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
{
   if(0 == *p)
   {
      if(s.empty() || ((s.size() == 1) && (s[0] == 0)))
         return 0;
   }
   return s.compare(p);
}
99#else
// Compares the string s with the null-terminated sequence p (generic version
// for the branch taken when _RWSTD_VER is defined; the overloads that follow
// in that branch handle std::string and std::wstring without calling
// basic_string::compare).
// Special case: when p is the empty string, s counts as equal to it both
// when s is empty and when s consists of a single null character.
// In every other case the result is that of s.compare(p).
template <class C, class T, class A>
inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
{
   if(0 == *p)
   {
      if(s.empty() || ((s.size() == 1) && (s[0] == 0)))
         return 0;
   }
   return s.compare(p);
}
// Narrow-string overload for the _RWSTD_VER branch: compares via std::strcmp
// on s.c_str() instead of basic_string::compare, so the comparison stops at
// the first null character in either argument.
inline int string_compare(const std::string& s, const char* p)
{
   return std::strcmp(s.c_str(), p);
}
112# ifndef BOOST_NO_WREGEX
// Wide-string overload for the _RWSTD_VER branch: compares via std::wcscmp
// on s.c_str() instead of basic_string::compare, so the comparison stops at
// the first null character in either argument.
inline int string_compare(const std::wstring& s, const wchar_t* p)
{
   return std::wcscmp(s.c_str(), p);
}
115#endif
116#endif
// Fallback comparison for any sequence type that offers size() and
// operator[], against the null-terminated sequence p.
// Elements are compared pairwise until they differ or s is exhausted:
//  - if all of s matched, the result is the negated next element of p
//    (zero when p ends at the same place, i.e. the two are equal);
//  - otherwise the result is the difference of the first differing pair.
// The arithmetic is deliberately left in the element types' own promoted
// type, exactly as before.
template <class Seq, class C>
inline int string_compare(const Seq& s, const C* p)
{
   std::size_t i = 0;
   while((i < s.size()) && (p[i] == s[i]))
   {
      ++i;
   }
   return (i == s.size()) ? -p[i] : s[i] - p[i];
}
127# define STR_COMP(s,p) string_compare(s,p)
128
// Advances p to the first null character at or after p, and returns a
// pointer to the element immediately following that null.
// The caller must guarantee that a null terminator exists.
template<class charT>
inline const charT* re_skip_past_null(const charT* p)
{
   while (*p != static_cast<charT>(0)) ++p;
   return ++p;
}
135
136template <class iterator, class charT, class traits_type, class char_classT>
137iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
138 iterator last,
139 const re_set_long<char_classT>* set_,
140 const regex_data<charT, traits_type>& e, bool icase)
141{
142 const charT* p = reinterpret_cast<const charT*>(set_+1);
143 iterator ptr;
144 unsigned int i;
145 //bool icase = e.m_flags & regex_constants::icase;
146
147 if(next == last) return next;
148
149 typedef typename traits_type::string_type traits_string_type;
150 const ::boost::regex_traits_wrapper<traits_type>& traits_inst = *(e.m_ptraits);
151
152 // dwa 9/13/00 suppress incorrect MSVC warning - it claims this is never
153 // referenced
154 (void)traits_inst;
155
156 // try and match a single character, could be a multi-character
157 // collating element...
158 for(i = 0; i < set_->csingles; ++i)
159 {
160 ptr = next;
161 if(*p == static_cast<charT>(0))
162 {
163 // treat null string as special case:
164 if(traits_inst.translate(*ptr, icase) != *p)
165 {
166 while(*p == static_cast<charT>(0))++p;
167 continue;
168 }
169 return set_->isnot ? next : (ptr == next) ? ++next : ptr;
170 }
171 else
172 {
173 while(*p && (ptr != last))
174 {
175 if(traits_inst.translate(*ptr, icase) != *p)
176 break;
177 ++p;
178 ++ptr;
179 }
180
181 if(*p == static_cast<charT>(0)) // if null we've matched
182 return set_->isnot ? next : (ptr == next) ? ++next : ptr;
183
184 p = re_skip_past_null(p); // skip null
185 }
186 }
187
188 charT col = traits_inst.translate(*next, icase);
189
190
191 if(set_->cranges || set_->cequivalents)
192 {
193 traits_string_type s1;
194 //
195 // try and match a range, NB only a single character can match
196 if(set_->cranges)
197 {
198 if((e.m_flags & regex_constants::collate) == 0)
199 s1.assign(1, col);
200 else
201 {
202 charT a[2] = { col, charT(0), };
203 s1 = traits_inst.transform(a, a + 1);
204 }
205 for(i = 0; i < set_->cranges; ++i)
206 {
207 if(STR_COMP(s1, p) >= 0)
208 {
209 do{ ++p; }while(*p);
210 ++p;
211 if(STR_COMP(s1, p) <= 0)
212 return set_->isnot ? next : ++next;
213 }
214 else
215 {
216 // skip first string
217 do{ ++p; }while(*p);
218 ++p;
219 }
220 // skip second string
221 do{ ++p; }while(*p);
222 ++p;
223 }
224 }
225 //
226 // try and match an equivalence class, NB only a single character can match
227 if(set_->cequivalents)
228 {
229 charT a[2] = { col, charT(0), };
230 s1 = traits_inst.transform_primary(a, a +1);
231 for(i = 0; i < set_->cequivalents; ++i)
232 {
233 if(STR_COMP(s1, p) == 0)
234 return set_->isnot ? next : ++next;
235 // skip string
236 do{ ++p; }while(*p);
237 ++p;
238 }
239 }
240 }
241 if(traits_inst.isctype(col, set_->cclasses) == true)
242 return set_->isnot ? next : ++next;
243 if((set_->cnclasses != 0) && (traits_inst.isctype(col, set_->cnclasses) == false))
244 return set_->isnot ? next : ++next;
245 return set_->isnot ? ++next : next;
246}
247
//
// class repeater_count:
// One entry in an intrusive, singly linked stack of repeat counters.
// *stack always points at the most recently constructed live entry; the
// state-id constructor links the new object in front of the current top and
// the destructor restores the previous top. Each entry records how many
// iterations have been counted so far and the position at which the most
// recent iteration began.
//
template <class BidiIterator>
class repeater_count
{
   repeater_count** stack;      // address of the "top of stack" pointer
   repeater_count* next;        // the entry that was on top before this one
   int state_id;
   std::size_t count;           // the number of iterations so far
   BidiIterator start_pos;      // where the last repeat started

   //
   // Walks the chain starting at p looking for an entry whose state_id is n.
   // Returns that entry, or 0 if the chain runs out or an entry whose
   // state_id equals (-2 - current_recursion_id) is reached first.
   // Entries with a negative state_id that are met while stepping along the
   // chain are passed over: the nested call is made with n equal to that
   // entry's own id, so it returns the entry itself, which is then skipped.
   //
   repeater_count* unwind_until(int n, repeater_count* p, int current_recursion_id)
   {
      while(p && (p->state_id != n))
      {
         if(-2 - current_recursion_id == p->state_id)
            return 0;
         p = p->next;
         if(p && (p->state_id < 0))
         {
            p = unwind_until(p->state_id, p, current_recursion_id);
            if(!p)
               return p;
            p = p->next;
         }
      }
      return p;
   }
public:
   // Constructs the bottom-of-stack entry: state_id is -1, there is no
   // predecessor, and *s is left untouched.
   repeater_count(repeater_count** s) : stack(s), next(0), state_id(-1), count(0), start_pos() {}

   //
   // Constructs an entry for state i and makes it the new top of *s
   // (*s must be non-null on entry).
   // The count starts from zero when i is greater than the non-negative id
   // of the previous top. Otherwise the chain is searched for an earlier
   // entry with the same id, and that entry's count and start position are
   // taken over if one is found (zero count if not).
   //
   repeater_count(int i, repeater_count** s, BidiIterator start, int current_recursion_id)
      : start_pos(start)
   {
      state_id = i;
      stack = s;
      next = *stack;
      *stack = this;
      if((state_id > next->state_id) && (next->state_id >= 0))
         count = 0;
      else
      {
         repeater_count* p = next;
         p = unwind_until(state_id, p, current_recursion_id);
         if(p)
         {
            count = p->count;
            start_pos = p->start_pos;
         }
         else
            count = 0;
      }
   }
   // Restores the previous top of stack (the bottom entry has none to restore).
   ~repeater_count()
   {
      if(next)
         *stack = next;
   }
   std::size_t get_count() const { return count; }
   int get_id() const { return state_id; }
   // Counts one more iteration and returns the new total.
   std::size_t operator++() { return ++count; }
   bool check_null_repeat(const BidiIterator& pos, std::size_t max)
   {
      // this is called when we are about to start a new repeat,
      // if the last one was NULL move our count to max,
      // otherwise save the current position.
      bool result = (count == 0) ? false : (pos == start_pos);
      if(result)
         count = max;
      else
         start_pos = pos;
      return result;
   }
};
320
// forward declaration only, the definition is not part of this header:
struct saved_state;

//
// Identifiers for the different kinds of saved state.
// The values are consecutive from zero; saved_state_count is one more than
// the largest identifier and so equals the number of kinds.
// NOTE(review): presumably these index a table of unwind handlers in the
// non-recursive implementation - confirm there before renumbering.
//
enum saved_state_type
{
   saved_type_end = 0,
   saved_type_paren = 1,
   saved_type_recurse = 2,
   saved_type_assertion = 3,
   saved_state_alt = 4,
   saved_state_repeater_count = 5,
   saved_state_extra_block = 6,
   saved_state_greedy_single_repeat = 7,
   saved_state_rep_slow_dot = 8,
   saved_state_rep_fast_dot = 9,
   saved_state_rep_char = 10,
   saved_state_rep_short_set = 11,
   saved_state_rep_long_set = 12,
   saved_state_non_greedy_long_repeat = 13,
   saved_state_count = 14
};
341
342template <class Results>
343struct recursion_info
344{
345 typedef typename Results::value_type value_type;
346 typedef typename value_type::iterator iterator;
347 int idx;
348 const re_syntax_base* preturn_address;
349 Results results;
350 repeater_count<iterator>* repeater_stack;
351};
352
353#ifdef BOOST_MSVC
354#pragma warning(push)
355#pragma warning(disable : 4251 4231)
356# if BOOST_MSVC < 1600
357# pragma warning(disable : 4660)
358# endif
359#endif
360
361template <class BidiIterator, class Allocator, class traits>
362class perl_matcher
363{
364public:
365 typedef typename traits::char_type char_type;
366 typedef perl_matcher<BidiIterator, Allocator, traits> self_type;
367 typedef bool (self_type::*matcher_proc_type)(void);
368 typedef std::size_t traits_size_type;
369 typedef typename is_byte<char_type>::width_type width_type;
370 typedef typename regex_iterator_traits<BidiIterator>::difference_type difference_type;
371 typedef match_results<BidiIterator, Allocator> results_type;
372
373 perl_matcher(BidiIterator first, BidiIterator end,
374 match_results<BidiIterator, Allocator>& what,
375 const basic_regex<char_type, traits>& e,
376 match_flag_type f,
377 BidiIterator l_base)
378 : m_result(what), base(first), last(end),
379 position(first), backstop(l_base), re(e), traits_inst(e.get_traits()),
380 m_independent(false), next_count(&rep_obj), rep_obj(&next_count)
381 {
382 construct_init(e, f);
383 }
384
385 bool match();
386 bool find();
387
388 void setf(match_flag_type f)
389 { m_match_flags |= f; }
390 void unsetf(match_flag_type f)
391 { m_match_flags &= ~f; }
392
393private:
394 void construct_init(const basic_regex<char_type, traits>& e, match_flag_type f);
395
396 bool find_imp();
397 bool match_imp();
398#ifdef BOOST_REGEX_HAS_MS_STACK_GUARD
399 typedef bool (perl_matcher::*protected_proc_type)();
400 bool protected_call(protected_proc_type);
401#endif
402 void estimate_max_state_count(std::random_access_iterator_tag*);
403 void estimate_max_state_count(void*);
404 bool match_prefix();
405 bool match_all_states();
406
407 // match procs, stored in s_match_vtable:
408 bool match_startmark();
409 bool match_endmark();
410 bool match_literal();
411 bool match_start_line();
412 bool match_end_line();
413 bool match_wild();
414 bool match_match();
415 bool match_word_boundary();
416 bool match_within_word();
417 bool match_word_start();
418 bool match_word_end();
419 bool match_buffer_start();
420 bool match_buffer_end();
421 bool match_backref();
422 bool match_long_set();
423 bool match_set();
424 bool match_jump();
425 bool match_alt();
426 bool match_rep();
427 bool match_combining();
428 bool match_soft_buffer_end();
429 bool match_restart_continue();
430 bool match_long_set_repeat();
431 bool match_set_repeat();
432 bool match_char_repeat();
433 bool match_dot_repeat_fast();
434 bool match_dot_repeat_slow();
435 bool match_dot_repeat_dispatch()
436 {
437 return ::boost::is_random_access_iterator<BidiIterator>::value ? match_dot_repeat_fast() : match_dot_repeat_slow();
438 }
439 bool match_backstep();
440 bool match_assert_backref();
441 bool match_toggle_case();
442#ifdef BOOST_REGEX_RECURSIVE
443 bool backtrack_till_match(std::size_t count);
444#endif
445 bool match_recursion();
446 bool match_fail();
447 bool match_accept();
448 bool match_commit();
449 bool match_then();
450 bool skip_until_paren(int index, bool match = true);
451
452 // find procs stored in s_find_vtable:
453 bool find_restart_any();
454 bool find_restart_word();
455 bool find_restart_line();
456 bool find_restart_buf();
457 bool find_restart_lit();
458
459private:
460 // final result structure to be filled in:
461 match_results<BidiIterator, Allocator>& m_result;
462 // temporary result for POSIX matches:
463 scoped_ptr<match_results<BidiIterator, Allocator> > m_temp_match;
464 // pointer to actual result structure to fill in:
465 match_results<BidiIterator, Allocator>* m_presult;
466 // start of sequence being searched:
467 BidiIterator base;
468 // end of sequence being searched:
469 BidiIterator last;
470 // current character being examined:
471 BidiIterator position;
472 // where to restart next search after failed match attempt:
473 BidiIterator restart;
474 // where the current search started from, acts as base for $` during grep:
475 BidiIterator search_base;
476 // how far we can go back when matching lookbehind:
477 BidiIterator backstop;
478 // the expression being examined:
479 const basic_regex<char_type, traits>& re;
480 // the expression's traits class:
481 const ::boost::regex_traits_wrapper<traits>& traits_inst;
482 // the next state in the machine being matched:
483 const re_syntax_base* pstate;
484 // matching flags in use:
485 match_flag_type m_match_flags;
486 // how many states we have examined so far:
487 std::ptrdiff_t state_count;
488 // max number of states to examine before giving up:
489 std::ptrdiff_t max_state_count;
490 // whether we should ignore case or not:
491 bool icase;
492 // set to true when (position == last), indicates that we may have a partial match:
493 bool m_has_partial_match;
494 // set to true whenever we get a match:
495 bool m_has_found_match;
496 // set to true whenever we're inside an independent sub-expression:
497 bool m_independent;
498 // the current repeat being examined:
499 repeater_count<BidiIterator>* next_count;
500 // the first repeat being examined (top of linked list):
501 repeater_count<BidiIterator> rep_obj;
502 // the mask to pass when matching word boundaries:
503 typename traits::char_class_type m_word_mask;
504 // the bitmask to use when determining whether a match_any matches a newline or not:
505 unsigned char match_any_mask;
506 // recursion information:
507 std::vector<recursion_info<results_type> > recursion_stack;
508#ifdef BOOST_REGEX_RECURSIVE
509 // Set to false by a (*COMMIT):
510 bool m_can_backtrack;
511 bool m_have_accept;
512 bool m_have_then;
513#endif
514#ifdef BOOST_REGEX_NON_RECURSIVE
515 //
516 // additional members for non-recursive version:
517 //
518 typedef bool (self_type::*unwind_proc_type)(bool);
519
520 void extend_stack();
521 bool unwind(bool);
522 bool unwind_end(bool);
523 bool unwind_paren(bool);
524 bool unwind_recursion_stopper(bool);
525 bool unwind_assertion(bool);
526 bool unwind_alt(bool);
527 bool unwind_repeater_counter(bool);
528 bool unwind_extra_block(bool);
529 bool unwind_greedy_single_repeat(bool);
530 bool unwind_slow_dot_repeat(bool);
531 bool unwind_fast_dot_repeat(bool);
532 bool unwind_char_repeat(bool);
533 bool unwind_short_set_repeat(bool);
534 bool unwind_long_set_repeat(bool);
535 bool unwind_non_greedy_repeat(bool);
536 bool unwind_recursion(bool);
537 bool unwind_recursion_pop(bool);
538 bool unwind_commit(bool);
539 bool unwind_then(bool);
540 void destroy_single_repeat();
541 void push_matched_paren(int index, const sub_match<BidiIterator>& sub);
542 void push_recursion_stopper();
543 void push_assertion(const re_syntax_base* ps, bool positive);
544 void push_alt(const re_syntax_base* ps);
545 void push_repeater_count(int i, repeater_count<BidiIterator>** s);
546 void push_single_repeat(std::size_t c, const re_repeat* r, BidiIterator last_position, int state_id);
547 void push_non_greedy_repeat(const re_syntax_base* ps);
548 void push_recursion(int idx, const re_syntax_base* p, results_type* presults);
549 void push_recursion_pop();
550
551 // pointer to base of stack:
552 saved_state* m_stack_base;
553 // pointer to current stack position:
554 saved_state* m_backup_state;
555 // how many memory blocks have we used up?:
556 unsigned used_block_count;
557 // determines what value to return when unwinding from recursion,
558 // allows for mixed recursive/non-recursive algorithm:
559 bool m_recursive_result;
560 // We have unwound to a lookahead/lookbehind, used by COMMIT/PRUNE/SKIP:
561 bool m_unwound_lookahead;
562 // We have unwound to an alternative, used by THEN:
563 bool m_unwound_alt;
564 // We are unwinding a commit - used by independent subs to determine whether to stop there or carry on unwinding:
565 //bool m_unwind_commit;
566#endif
567
568 // these operations aren't allowed, so are declared private,
569 // bodies are provided to keep explicit-instantiation requests happy:
570 perl_matcher& operator=(const perl_matcher&)
571 {
572 return *this;
573 }
574 perl_matcher(const perl_matcher& that)
575 : m_result(that.m_result), re(that.re), traits_inst(that.traits_inst), rep_obj(0) {}
576};
577
578#ifdef BOOST_MSVC
579#pragma warning(pop)
580#endif
581
582} // namespace BOOST_REGEX_DETAIL_NS
583
584#ifdef BOOST_MSVC
585#pragma warning(push)
586#pragma warning(disable: 4103)
587#endif
588#ifdef BOOST_HAS_ABI_HEADERS
589# include BOOST_ABI_SUFFIX
590#endif
591#ifdef BOOST_MSVC
592#pragma warning(pop)
593#endif
594
595} // namespace boost
596
597#ifdef BOOST_MSVC
598# pragma warning(pop)
599#endif
600
601//
602// include the implementation of perl_matcher:
603//
604#ifdef BOOST_REGEX_RECURSIVE
605#include <boost/regex/v4/perl_matcher_recursive.hpp>
606#else
607#include <boost/regex/v4/perl_matcher_non_recursive.hpp>
608#endif
609// this one has to be last:
610#include <boost/regex/v4/perl_matcher_common.hpp>
611
612#endif
613
614

// source code of boost/boost/regex/v4/perl_matcher.hpp