1 | // Boost token_functions.hpp ------------------------------------------------// |
2 | |
3 | // Copyright John R. Bandela 2001. |
4 | |
5 | // Distributed under the Boost Software License, Version 1.0. (See |
6 | // accompanying file LICENSE_1_0.txt or copy at |
7 | // http://www.boost.org/LICENSE_1_0.txt) |
8 | |
9 | // See http://www.boost.org/libs/tokenizer/ for documentation. |
10 | |
11 | // Revision History: |
12 | // 01 Oct 2004 Joaquin M Lopez Munoz |
13 | // Workaround for a problem with string::assign in msvc-stlport |
14 | // 06 Apr 2004 John Bandela |
15 | // Fixed a bug involving using char_delimiter with a true input iterator |
16 | // 28 Nov 2003 Robert Zeh and John Bandela |
17 | // Converted into "fast" functions that avoid using += when |
18 | // the supplied iterator isn't an input_iterator; based on |
19 | // some work done at Archelon and a version that was checked into |
20 | // the boost CVS for a short period of time. |
21 | // 20 Feb 2002 John Maddock |
22 | // Removed using namespace std declarations and added |
23 | // workaround for BOOST_NO_STDC_NAMESPACE (the library |
24 | // can be safely mixed with regex). |
25 | // 06 Feb 2002 Jeremy Siek |
26 | // Added char_separator. |
27 | // 02 Feb 2002 Jeremy Siek |
28 | // Removed tabs and a little cleanup. |
29 | |
30 | |
31 | #ifndef BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_ |
32 | #define BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_ |
33 | |
34 | #include <vector> |
35 | #include <stdexcept> |
36 | #include <string> |
37 | #include <cctype> |
38 | #include <algorithm> // for find_if |
39 | #include <boost/config.hpp> |
40 | #include <boost/assert.hpp> |
41 | #include <boost/detail/workaround.hpp> |
42 | #include <boost/mpl/if.hpp> |
43 | #include <boost/throw_exception.hpp> |
44 | #if !defined(BOOST_NO_CWCTYPE) |
45 | #include <cwctype> |
46 | #endif |
47 | |
48 | // |
49 | // the following must not be macros if we are to prefix them |
50 | // with std:: (they shouldn't be macros anyway...) |
51 | // |
52 | #ifdef ispunct |
53 | # undef ispunct |
54 | #endif |
55 | #ifdef iswpunct |
56 | # undef iswpunct |
57 | #endif |
58 | #ifdef isspace |
59 | # undef isspace |
60 | #endif |
61 | #ifdef iswspace |
62 | # undef iswspace |
63 | #endif |
64 | // |
65 | // fix namespace problems: |
66 | // |
67 | #ifdef BOOST_NO_STDC_NAMESPACE |
68 | namespace std{ |
69 | using ::ispunct; |
70 | using ::isspace; |
71 | #if !defined(BOOST_NO_CWCTYPE) |
72 | using ::iswpunct; |
73 | using ::iswspace; |
74 | #endif |
75 | } |
76 | #endif |
77 | |
78 | namespace boost{ |
79 | //=========================================================================== |
// The escaped_list_separator class, which is a model of TokenizerFunction.
// An escaped list is a super-set of what is commonly known as a comma
// separated value (csv) list. It is separated into fields by a comma or
// other character. If the delimiting character is inside quotes, then it is
// counted as a regular character. To allow for embedded quotes in a field,
// there can be escape sequences using the \ much like C.
// The role of the comma, the quotation mark, and the escape
// character (backslash \), can be assigned to other characters.
88 | |
// Exception type raised by escaped_list_separator for malformed escape
// sequences. The message is forwarded unchanged to std::runtime_error and
// is available through what().
struct escaped_list_error : public std::runtime_error {
  escaped_list_error(const std::string& what_arg)
    : std::runtime_error(what_arg)
  {
  }
};
92 | |
93 | |
94 | // The out of the box GCC 2.95 on cygwin does not have a char_traits class. |
95 | // MSVC does not like the following typename |
96 | template <class Char, |
97 | class Traits = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type > |
98 | class escaped_list_separator { |
99 | |
100 | private: |
101 | typedef std::basic_string<Char,Traits> string_type; |
102 | struct char_eq { |
103 | Char e_; |
104 | char_eq(Char e):e_(e) { } |
105 | bool operator()(Char c) { |
106 | return Traits::eq(e_,c); |
107 | } |
108 | }; |
109 | string_type escape_; |
110 | string_type c_; |
111 | string_type quote_; |
112 | bool last_; |
113 | |
114 | bool is_escape(Char e) { |
115 | char_eq f(e); |
116 | return std::find_if(escape_.begin(),escape_.end(),f)!=escape_.end(); |
117 | } |
118 | bool is_c(Char e) { |
119 | char_eq f(e); |
120 | return std::find_if(c_.begin(),c_.end(),f)!=c_.end(); |
121 | } |
122 | bool is_quote(Char e) { |
123 | char_eq f(e); |
124 | return std::find_if(quote_.begin(),quote_.end(),f)!=quote_.end(); |
125 | } |
126 | template <typename iterator, typename Token> |
127 | void do_escape(iterator& next,iterator end,Token& tok) { |
128 | if (++next == end) |
129 | BOOST_THROW_EXCEPTION(escaped_list_error(std::string("cannot end with escape" ))); |
130 | if (Traits::eq(*next,'n')) { |
131 | tok+='\n'; |
132 | return; |
133 | } |
134 | else if (is_quote(e: *next)) { |
135 | tok+=*next; |
136 | return; |
137 | } |
138 | else if (is_c(e: *next)) { |
139 | tok+=*next; |
140 | return; |
141 | } |
142 | else if (is_escape(e: *next)) { |
143 | tok+=*next; |
144 | return; |
145 | } |
146 | else |
147 | BOOST_THROW_EXCEPTION(escaped_list_error(std::string("unknown escape sequence" ))); |
148 | } |
149 | |
150 | public: |
151 | |
152 | explicit escaped_list_separator(Char e = '\\', |
153 | Char c = ',',Char q = '\"') |
154 | : escape_(1,e), c_(1,c), quote_(1,q), last_(false) { } |
155 | |
156 | escaped_list_separator(string_type e, string_type c, string_type q) |
157 | : escape_(e), c_(c), quote_(q), last_(false) { } |
158 | |
159 | void reset() {last_=false;} |
160 | |
161 | template <typename InputIterator, typename Token> |
162 | bool operator()(InputIterator& next,InputIterator end,Token& tok) { |
163 | bool bInQuote = false; |
164 | tok = Token(); |
165 | |
166 | if (next == end) { |
167 | if (last_) { |
168 | last_ = false; |
169 | return true; |
170 | } |
171 | else |
172 | return false; |
173 | } |
174 | last_ = false; |
175 | for (;next != end;++next) { |
176 | if (is_escape(e: *next)) { |
177 | do_escape(next,end,tok); |
178 | } |
179 | else if (is_c(e: *next)) { |
180 | if (!bInQuote) { |
181 | // If we are not in quote, then we are done |
182 | ++next; |
183 | // The last character was a c, that means there is |
184 | // 1 more blank field |
185 | last_ = true; |
186 | return true; |
187 | } |
188 | else tok+=*next; |
189 | } |
190 | else if (is_quote(e: *next)) { |
191 | bInQuote=!bInQuote; |
192 | } |
193 | else { |
194 | tok += *next; |
195 | } |
196 | } |
197 | return true; |
198 | } |
199 | }; |
200 | |
201 | //=========================================================================== |
202 | // The classes here are used by offset_separator and char_separator to implement |
203 | // faster assigning of tokens using assign instead of += |
204 | |
205 | namespace tokenizer_detail { |
206 | //=========================================================================== |
207 | // Tokenizer was broken for wide character separators, at least on Windows, since |
208 | // CRT functions isspace etc only expect values in [0, 0xFF]. Debug build asserts |
209 | // if higher values are passed in. The traits extension class should take care of this. |
210 | // Assuming that the conditional will always get optimized out in the function |
211 | // implementations, argument types are not a problem since both forms of character classifiers |
212 | // expect an int. |
213 | |
214 | #if !defined(BOOST_NO_CWCTYPE) |
// Character classification selected by the size of the character type.
//
// Primary template: used for character types wider than one byte; the
// checks are forwarded to the wide-character classifiers.
template<typename traits, int N>
struct traits_extension_details : public traits {
  typedef typename traits::char_type char_type;
  static bool isspace(char_type c)
  {
    return std::iswspace(c) != 0;
  }
  static bool ispunct(char_type c)
  {
    return std::iswpunct(c) != 0;
  }
};

// Specialization for one-byte character types: uses the narrow classifiers.
// The argument is converted to unsigned char first because std::isspace and
// std::ispunct have undefined behaviour for values that are neither EOF nor
// representable as unsigned char, which is what a negative plain char is.
template<typename traits>
struct traits_extension_details<traits, 1> : public traits {
  typedef typename traits::char_type char_type;
  static bool isspace(char_type c)
  {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  }
  static bool ispunct(char_type c)
  {
    return std::ispunct(static_cast<unsigned char>(c)) != 0;
  }
};
240 | #endif |
241 | |
242 | |
243 | // In case there is no cwctype header, we implement the checks manually. |
244 | // We make use of the fact that the tested categories should fit in ASCII. |
245 | template<typename traits> |
246 | struct traits_extension : public traits { |
247 | typedef typename traits::char_type char_type; |
248 | static bool isspace(char_type c) |
249 | { |
250 | #if !defined(BOOST_NO_CWCTYPE) |
251 | return traits_extension_details<traits, sizeof(char_type)>::isspace(c); |
252 | #else |
253 | return static_cast< unsigned >(c) <= 255 && std::isspace(c) != 0; |
254 | #endif |
255 | } |
256 | |
257 | static bool ispunct(char_type c) |
258 | { |
259 | #if !defined(BOOST_NO_CWCTYPE) |
260 | return traits_extension_details<traits, sizeof(char_type)>::ispunct(c); |
261 | #else |
262 | return static_cast< unsigned >(c) <= 255 && std::ispunct(c) != 0; |
263 | #endif |
264 | } |
265 | }; |
266 | |
267 | // The assign_or_plus_equal struct contains functions that implement |
268 | // assign, +=, and clearing based on the iterator type. The |
269 | // generic case does nothing for plus_equal and clearing, while |
270 | // passing through the call for assign. |
271 | // |
272 | // When an input iterator is being used, the situation is reversed. |
273 | // The assign method does nothing, plus_equal invokes operator +=, |
274 | // and the clearing method sets the supplied token to the default |
275 | // token constructor's result. |
276 | // |
277 | |
// Token-building policy selected by iterator category.
//
// General case: the token is produced in one step by assign() from the
// range that was scanned, so plus_equal() and clear() are deliberately
// empty.
template<class IteratorTag>
struct assign_or_plus_equal {
  // Replaces the token's contents with the range [first, last).
  template<class Iterator, class Token>
  static void assign(Iterator first, Iterator last, Token &token) {
    token.assign(first, last);
  }

  // No-op: characters are not accumulated one at a time in this mode.
  template<class Token, class Value>
  static void plus_equal(Token &, const Value &) { }

  // No-op: assign() overwrites the token, so there is no need to clear it.
  template<class Token>
  static void clear(Token &) { }
};

// Input iterators: the roles are reversed. The token is reset by clear()
// and grown one value at a time by plus_equal(); assign() does nothing.
template <>
struct assign_or_plus_equal<std::input_iterator_tag> {
  // No-op: the token has already been built by plus_equal().
  template<class Iterator, class Token>
  static void assign(Iterator , Iterator , Token &) { }

  // Appends one value to the token with operator +=.
  template<class Token, class Value>
  static void plus_equal(Token &token, const Value &value) {
    token += value;
  }

  // Resets the token to a default-constructed Token.
  template<class Token>
  static void clear(Token &token) {
    token = Token();
  }
};
308 | |
309 | |
// Category provider for the pointer branch of get_iterator_category:
// always yields std::random_access_iterator_tag, whatever the argument.
template<class Pointer>
struct pointer_iterator_category {
  typedef std::random_access_iterator_tag type;
};
314 | |
315 | |
// Category provider for the class-type branch of get_iterator_category:
// forwards the iterator's own nested iterator_category typedef.
template<class ClassIterator>
struct class_iterator_category {
  typedef typename ClassIterator::iterator_category type;
};
320 | |
321 | |
322 | |
323 | // This portably gets the iterator_tag without partial template specialization |
324 | template<class Iterator> |
325 | struct get_iterator_category{ |
326 | typedef typename mpl::if_<is_pointer<Iterator>, |
327 | pointer_iterator_category<Iterator>, |
328 | class_iterator_category<Iterator> |
329 | >::type cat; |
330 | |
331 | typedef typename cat::type iterator_category; |
332 | }; |
333 | |
334 | |
335 | } // namespace tokenizer_detail |
336 | |
337 | |
338 | //=========================================================================== |
339 | // The offset_separator class, which is a model of TokenizerFunction. |
340 | // Offset breaks a string into tokens based on a range of offsets |
341 | |
342 | class offset_separator { |
343 | private: |
344 | |
345 | std::vector<int> offsets_; |
346 | unsigned int current_offset_; |
347 | bool wrap_offsets_; |
348 | bool return_partial_last_; |
349 | |
350 | public: |
351 | template <typename Iter> |
352 | offset_separator(Iter begin, Iter end, bool wrap_offsets = true, |
353 | bool return_partial_last = true) |
354 | : offsets_(begin,end), current_offset_(0), |
355 | wrap_offsets_(wrap_offsets), |
356 | return_partial_last_(return_partial_last) { } |
357 | |
358 | offset_separator() |
359 | : offsets_(1,1), current_offset_(), |
360 | wrap_offsets_(true), return_partial_last_(true) { } |
361 | |
362 | void reset() { |
363 | current_offset_ = 0; |
364 | } |
365 | |
366 | template <typename InputIterator, typename Token> |
367 | bool operator()(InputIterator& next, InputIterator end, Token& tok) |
368 | { |
369 | typedef tokenizer_detail::assign_or_plus_equal< |
370 | BOOST_DEDUCED_TYPENAME tokenizer_detail::get_iterator_category< |
371 | InputIterator |
372 | >::iterator_category |
373 | > assigner; |
374 | |
375 | BOOST_ASSERT(!offsets_.empty()); |
376 | |
377 | assigner::clear(tok); |
378 | InputIterator start(next); |
379 | |
380 | if (next == end) |
381 | return false; |
382 | |
383 | if (current_offset_ == offsets_.size()) |
384 | { |
385 | if (wrap_offsets_) |
386 | current_offset_=0; |
387 | else |
388 | return false; |
389 | } |
390 | |
391 | int c = offsets_[current_offset_]; |
392 | int i = 0; |
393 | for (; i < c; ++i) { |
394 | if (next == end)break; |
395 | assigner::plus_equal(tok,*next++); |
396 | } |
397 | assigner::assign(start,next,tok); |
398 | |
399 | if (!return_partial_last_) |
400 | if (i < (c-1) ) |
401 | return false; |
402 | |
403 | ++current_offset_; |
404 | return true; |
405 | } |
406 | }; |
407 | |
408 | |
409 | //=========================================================================== |
410 | // The char_separator class breaks a sequence of characters into |
411 | // tokens based on the character delimiters (very much like bad old |
412 | // strtok). A delimiter character can either be kept or dropped. A |
413 | // kept delimiter shows up as an output token, whereas a dropped |
414 | // delimiter does not. |
415 | |
416 | // This class replaces the char_delimiters_separator class. The |
417 | // constructor for the char_delimiters_separator class was too |
418 | // confusing and needed to be deprecated. However, because of the |
419 | // default arguments to the constructor, adding the new constructor |
420 | // would cause ambiguity, so instead I deprecated the whole class. |
421 | // The implementation of the class was also simplified considerably. |
422 | |
423 | enum empty_token_policy { drop_empty_tokens, keep_empty_tokens }; |
424 | |
425 | // The out of the box GCC 2.95 on cygwin does not have a char_traits class. |
426 | template <typename Char, |
427 | typename Tr = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type > |
428 | class char_separator |
429 | { |
430 | typedef tokenizer_detail::traits_extension<Tr> Traits; |
431 | typedef std::basic_string<Char,Tr> string_type; |
432 | public: |
433 | explicit |
434 | char_separator(const Char* dropped_delims, |
435 | const Char* kept_delims = 0, |
436 | empty_token_policy empty_tokens = drop_empty_tokens) |
437 | : m_dropped_delims(dropped_delims), |
438 | m_use_ispunct(false), |
439 | m_use_isspace(false), |
440 | m_empty_tokens(empty_tokens), |
441 | m_output_done(false) |
442 | { |
443 | // Borland workaround |
444 | if (kept_delims) |
445 | m_kept_delims = kept_delims; |
446 | } |
447 | |
448 | // use ispunct() for kept delimiters and isspace for dropped. |
449 | explicit |
450 | char_separator() |
451 | : m_use_ispunct(true), |
452 | m_use_isspace(true), |
453 | m_empty_tokens(drop_empty_tokens) { } |
454 | |
455 | void reset() { } |
456 | |
457 | template <typename InputIterator, typename Token> |
458 | bool operator()(InputIterator& next, InputIterator end, Token& tok) |
459 | { |
460 | typedef tokenizer_detail::assign_or_plus_equal< |
461 | BOOST_DEDUCED_TYPENAME tokenizer_detail::get_iterator_category< |
462 | InputIterator |
463 | >::iterator_category |
464 | > assigner; |
465 | |
466 | assigner::clear(tok); |
467 | |
468 | // skip past all dropped_delims |
469 | if (m_empty_tokens == drop_empty_tokens) |
470 | for (; next != end && is_dropped(E: *next); ++next) |
471 | { } |
472 | |
473 | InputIterator start(next); |
474 | |
475 | if (m_empty_tokens == drop_empty_tokens) { |
476 | |
477 | if (next == end) |
478 | return false; |
479 | |
480 | |
481 | // if we are on a kept_delims move past it and stop |
482 | if (is_kept(E: *next)) { |
483 | assigner::plus_equal(tok,*next); |
484 | ++next; |
485 | } else |
486 | // append all the non delim characters |
487 | for (; next != end && !is_dropped(E: *next) && !is_kept(E: *next); ++next) |
488 | assigner::plus_equal(tok,*next); |
489 | } |
490 | else { // m_empty_tokens == keep_empty_tokens |
491 | |
492 | // Handle empty token at the end |
493 | if (next == end) |
494 | { |
495 | if (m_output_done == false) |
496 | { |
497 | m_output_done = true; |
498 | assigner::assign(start,next,tok); |
499 | return true; |
500 | } |
501 | else |
502 | return false; |
503 | } |
504 | |
505 | if (is_kept(E: *next)) { |
506 | if (m_output_done == false) |
507 | m_output_done = true; |
508 | else { |
509 | assigner::plus_equal(tok,*next); |
510 | ++next; |
511 | m_output_done = false; |
512 | } |
513 | } |
514 | else if (m_output_done == false && is_dropped(E: *next)) { |
515 | m_output_done = true; |
516 | } |
517 | else { |
518 | if (is_dropped(E: *next)) |
519 | start=++next; |
520 | for (; next != end && !is_dropped(E: *next) && !is_kept(E: *next); ++next) |
521 | assigner::plus_equal(tok,*next); |
522 | m_output_done = true; |
523 | } |
524 | } |
525 | assigner::assign(start,next,tok); |
526 | return true; |
527 | } |
528 | |
529 | private: |
530 | string_type m_kept_delims; |
531 | string_type m_dropped_delims; |
532 | bool m_use_ispunct; |
533 | bool m_use_isspace; |
534 | empty_token_policy m_empty_tokens; |
535 | bool m_output_done; |
536 | |
537 | bool is_kept(Char E) const |
538 | { |
539 | if (m_kept_delims.length()) |
540 | return m_kept_delims.find(E) != string_type::npos; |
541 | else if (m_use_ispunct) { |
542 | return Traits::ispunct(E) != 0; |
543 | } else |
544 | return false; |
545 | } |
546 | bool is_dropped(Char E) const |
547 | { |
548 | if (m_dropped_delims.length()) |
549 | return m_dropped_delims.find(E) != string_type::npos; |
550 | else if (m_use_isspace) { |
551 | return Traits::isspace(E) != 0; |
552 | } else |
553 | return false; |
554 | } |
555 | }; |
556 | |
557 | //=========================================================================== |
// The following class is DEPRECATED, use class char_separator instead.
559 | // |
560 | // The char_delimiters_separator class, which is a model of |
561 | // TokenizerFunction. char_delimiters_separator breaks a string |
562 | // into tokens based on character delimiters. There are 2 types of |
563 | // delimiters. returnable delimiters can be returned as |
564 | // tokens. These are often punctuation. nonreturnable delimiters |
565 | // cannot be returned as tokens. These are often whitespace |
566 | |
567 | // The out of the box GCC 2.95 on cygwin does not have a char_traits class. |
568 | template <class Char, |
569 | class Tr = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type > |
570 | class char_delimiters_separator { |
571 | private: |
572 | |
573 | typedef tokenizer_detail::traits_extension<Tr> Traits; |
574 | typedef std::basic_string<Char,Tr> string_type; |
575 | string_type returnable_; |
576 | string_type nonreturnable_; |
577 | bool return_delims_; |
578 | bool no_ispunct_; |
579 | bool no_isspace_; |
580 | |
581 | bool is_ret(Char E)const |
582 | { |
583 | if (returnable_.length()) |
584 | return returnable_.find(E) != string_type::npos; |
585 | else{ |
586 | if (no_ispunct_) {return false;} |
587 | else{ |
588 | int r = Traits::ispunct(E); |
589 | return r != 0; |
590 | } |
591 | } |
592 | } |
593 | bool is_nonret(Char E)const |
594 | { |
595 | if (nonreturnable_.length()) |
596 | return nonreturnable_.find(E) != string_type::npos; |
597 | else{ |
598 | if (no_isspace_) {return false;} |
599 | else{ |
600 | int r = Traits::isspace(E); |
601 | return r != 0; |
602 | } |
603 | } |
604 | } |
605 | |
606 | public: |
607 | explicit char_delimiters_separator(bool return_delims = false, |
608 | const Char* returnable = 0, |
609 | const Char* nonreturnable = 0) |
610 | : returnable_(returnable ? returnable : string_type().c_str()), |
611 | nonreturnable_(nonreturnable ? nonreturnable:string_type().c_str()), |
612 | return_delims_(return_delims), no_ispunct_(returnable!=0), |
613 | no_isspace_(nonreturnable!=0) { } |
614 | |
615 | void reset() { } |
616 | |
617 | public: |
618 | |
619 | template <typename InputIterator, typename Token> |
620 | bool operator()(InputIterator& next, InputIterator end,Token& tok) { |
621 | tok = Token(); |
622 | |
623 | // skip past all nonreturnable delims |
624 | // skip past the returnable only if we are not returning delims |
625 | for (;next!=end && ( is_nonret(E: *next) || (is_ret(E: *next) |
626 | && !return_delims_ ) );++next) { } |
627 | |
628 | if (next == end) { |
629 | return false; |
630 | } |
631 | |
632 | // if we are to return delims and we are one a returnable one |
633 | // move past it and stop |
634 | if (is_ret(E: *next) && return_delims_) { |
635 | tok+=*next; |
636 | ++next; |
637 | } |
638 | else |
639 | // append all the non delim characters |
640 | for (;next!=end && !is_nonret(E: *next) && !is_ret(E: *next);++next) |
641 | tok+=*next; |
642 | |
643 | |
644 | return true; |
645 | } |
646 | }; |
647 | |
648 | |
649 | } //namespace boost |
650 | |
651 | #endif |
652 | |