token_functions.hpp source code [boost/boost/token_functions.hpp]

1	// Boost token_functions.hpp ------------------------------------------------//
2
3	// Copyright John R. Bandela 2001.
4
5	// Distributed under the Boost Software License, Version 1.0. (See
6	// accompanying file LICENSE_1_0.txt or copy at
7	// http://www.boost.org/LICENSE_1_0.txt)
8
9	// See http://www.boost.org/libs/tokenizer/ for documentation.
10
11	// Revision History:
12	// 01 Oct 2004 Joaquin M Lopez Munoz
13	// Workaround for a problem with string::assign in msvc-stlport
14	// 06 Apr 2004 John Bandela
15	// Fixed a bug involving using char_delimiter with a true input iterator
16	// 28 Nov 2003 Robert Zeh and John Bandela
17	// Converted into "fast" functions that avoid using += when
18	// the supplied iterator isn't an input_iterator; based on
19	// some work done at Archelon and a version that was checked into
20	// the boost CVS for a short period of time.
21	// 20 Feb 2002 John Maddock
22	// Removed using namespace std declarations and added
23	// workaround for BOOST_NO_STDC_NAMESPACE (the library
24	// can be safely mixed with regex).
25	// 06 Feb 2002 Jeremy Siek
26	// Added char_separator.
27	// 02 Feb 2002 Jeremy Siek
28	// Removed tabs and a little cleanup.
29
30
31	#ifndef BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_
32	#define BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_
33
#include <vector>
#include <stdexcept>
#include <string>
#include <cctype>
#include <algorithm> // for find_if
#include <iterator>  // for the iterator category tags
#include <boost/config.hpp>
#include <boost/assert.hpp>
#include <boost/detail/workaround.hpp>
#include <boost/mpl/if.hpp>
#include <boost/throw_exception.hpp>
#include <boost/type_traits/is_pointer.hpp>
#if !defined(BOOST_NO_CWCTYPE)
#include <cwctype>
#endif
47
48	//
49	// the following must not be macros if we are to prefix them
50	// with std:: (they shouldn't be macros anyway...)
51	//
52	#ifdef ispunct
53	# undef ispunct
54	#endif
55	#ifdef iswpunct
56	# undef iswpunct
57	#endif
58	#ifdef isspace
59	# undef isspace
60	#endif
61	#ifdef iswspace
62	# undef iswspace
63	#endif
64	//
65	// fix namespace problems:
66	//
67	#ifdef BOOST_NO_STDC_NAMESPACE
68	namespace std{
69	using ::ispunct;
70	using ::isspace;
71	#if !defined(BOOST_NO_CWCTYPE)
72	using ::iswpunct;
73	using ::iswspace;
74	#endif
75	}
76	#endif
77
78	namespace boost{
79	//===========================================================================
// The escaped_list_separator class, which is a model of TokenizerFunction.
// An escaped list is a super-set of what is commonly known as a comma
// separated value (csv) list. It is separated into fields by a comma or
// other character. If the delimiting character is inside quotes, then it is
// counted as a regular character. To allow for embedded quotes in a field,
// there can be escape sequences using the \ much like C.
// The role of the comma, the quotation mark, and the escape
// character (backslash \), can be assigned to other characters.
88
// Exception type raised by escaped_list_separator when it meets a malformed
// escape sequence. It adds nothing to std::runtime_error beyond a distinct
// type that callers can catch.
struct escaped_list_error : public std::runtime_error {
  // what_arg: message later returned by what().
  escaped_list_error(const std::string& what_arg)
    : std::runtime_error(what_arg) { }
};
92
93
94	// The out of the box GCC 2.95 on cygwin does not have a char_traits class.
95	// MSVC does not like the following typename
96	template <class Char,
97	class Traits = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type >
98	class escaped_list_separator {
99
100	private:
101	typedef std::basic_string<Char,Traits> string_type;
102	struct char_eq {
103	Char e_;
104	char_eq(Char e):e_(e) { }
105	bool operator()(Char c) {
106	return Traits::eq(e_,c);
107	}
108	};
109	string_type escape_;
110	string_type c_;
111	string_type quote_;
112	bool last_;
113
114	bool is_escape(Char e) {
115	char_eq f(e);
116	return std::find_if(escape_.begin(),escape_.end(),f)!=escape_.end();
117	}
118	bool is_c(Char e) {
119	char_eq f(e);
120	return std::find_if(c_.begin(),c_.end(),f)!=c_.end();
121	}
122	bool is_quote(Char e) {
123	char_eq f(e);
124	return std::find_if(quote_.begin(),quote_.end(),f)!=quote_.end();
125	}
126	template <typename iterator, typename Token>
127	void do_escape(iterator& next,iterator end,Token& tok) {
128	if (++next == end)
129	BOOST_THROW_EXCEPTION(escaped_list_error (std::string ("cannot end with escape")));
130	if (Traits::eq(*next,`'n'`)) {
131	tok+=`'\n'`;
132	return;
133	}
134	else if (is_quote(e: *next)) {
135	tok+=*next;
136	return;
137	}
138	else if (is_c(e: *next)) {
139	tok+=*next;
140	return;
141	}
142	else if (is_escape(e: *next)) {
143	tok+=*next;
144	return;
145	}
146	else
147	BOOST_THROW_EXCEPTION(escaped_list_error (std::string ("unknown escape sequence")));
148	}
149
150	public:
151
152	explicit escaped_list_separator(Char e = `'\\'`,
153	Char c = `','`,Char q = `'\"'`)
154	: escape_(`1`,e), c_(`1`,c), quote_(`1`,q), last_(false) { }
155
156	escaped_list_separator(string_type e, string_type c, string_type q)
157	: escape_(e), c_(c), quote_(q), last_(false) { }
158
159	void reset() {last_=false;}
160
161	template <typename InputIterator, typename Token>
162	bool operator()(InputIterator& next,InputIterator end,Token& tok) {
163	bool bInQuote = false;
164	tok = Token();
165
166	if (next == end) {
167	if (last_) {
168	last_ = false;
169	return true;
170	}
171	else
172	return false;
173	}
174	last_ = false;
175	for (;next != end;++next) {
176	if (is_escape(e: *next)) {
177	do_escape(next,end,tok);
178	}
179	else if (is_c(e: *next)) {
180	if (!bInQuote) {
181	// If we are not in quote, then we are done
182	++next;
183	// The last character was a c, that means there is
184	// 1 more blank field
185	last_ = true;
186	return true;
187	}
188	else tok+=*next;
189	}
190	else if (is_quote(e: *next)) {
191	bInQuote=!bInQuote;
192	}
193	else {
194	tok += *next;
195	}
196	}
197	return true;
198	}
199	};
200
201	//===========================================================================
202	// The classes here are used by offset_separator and char_separator to implement
203	// faster assigning of tokens using assign instead of +=
204
205	namespace tokenizer_detail {
206	//===========================================================================
207	// Tokenizer was broken for wide character separators, at least on Windows, since
208	// CRT functions isspace etc only expect values in [0, 0xFF]. Debug build asserts
209	// if higher values are passed in. The traits extension class should take care of this.
210	// Assuming that the conditional will always get optimized out in the function
211	// implementations, argument types are not a problem since both forms of character classifiers
212	// expect an int.
213
#if !defined(BOOST_NO_CWCTYPE)
// Character classification selected by the size N of the character type.
// Primary template: used for every size other than 1; forwards to the
// wide-character classifiers.
template<typename traits, int N>
struct traits_extension_details : public traits {
  typedef typename traits::char_type char_type;
  static bool isspace(char_type c)
  {
    return std::iswspace(c) != 0;
  }
  static bool ispunct(char_type c)
  {
    return std::iswpunct(c) != 0;
  }
};

// Specialization for one-byte character types; forwards to the narrow
// classifiers from <cctype>.
template<typename traits>
struct traits_extension_details<traits, 1> : public traits {
  typedef typename traits::char_type char_type;
  static bool isspace(char_type c)
  {
    // The <cctype> functions require a value representable as unsigned char
    // (or EOF). A plain char may be negative, so convert first.
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  }
  static bool ispunct(char_type c)
  {
    // Same unsigned char conversion as in isspace above.
    return std::ispunct(static_cast<unsigned char>(c)) != 0;
  }
};
#endif
241
242
243	// In case there is no cwctype header, we implement the checks manually.
244	// We make use of the fact that the tested categories should fit in ASCII.
245	template<typename traits>
246	struct traits_extension : public traits {
247	typedef typename traits::char_type char_type;
248	static bool isspace(char_type c)
249	{
250	#if !defined(BOOST_NO_CWCTYPE)
251	return traits_extension_details<traits, sizeof(char_type)>::isspace(c);
252	#else
253	return static_cast< unsigned >(c) <= `255` && std::isspace(c) != `0`;
254	#endif
255	}
256
257	static bool ispunct(char_type c)
258	{
259	#if !defined(BOOST_NO_CWCTYPE)
260	return traits_extension_details<traits, sizeof(char_type)>::ispunct(c);
261	#else
262	return static_cast< unsigned >(c) <= `255` && std::ispunct(c) != `0`;
263	#endif
264	}
265	};
266
// The assign_or_plus_equal struct contains functions that implement
// assign, +=, and clearing based on the iterator type.  The
// generic case does nothing for plus_equal and clearing, while
// passing through the call for assign.
//
// When an input iterator is being used, the situation is reversed.
// The assign method does nothing, plus_equal invokes operator +=,
// and the clearing method sets the supplied token to the default
// token constructor's result.
//

// Generic case: the token is produced in one step from the range [b, e).
template<class IteratorTag>
struct assign_or_plus_equal {
  // Replaces the contents of t with the range [b, e).
  template<class Iterator, class Token>
  static void assign(Iterator b, Iterator e, Token &t) {
    t.assign(b, e);
  }

  // Intentionally a no-op: characters are not appended one by one here.
  template<class Token, class Value>
  static void plus_equal(Token &, const Value &) { }

  // If we are doing an assign, there is no need for the clear.
  //
  template<class Token>
  static void clear(Token &) { }
};

// Input-iterator case: the token is built up one value at a time.
template <>
struct assign_or_plus_equal<std::input_iterator_tag> {
  // Intentionally a no-op: the token has already been built by plus_equal.
  template<class Iterator, class Token>
  static void assign(Iterator , Iterator , Token &) { }

  // Appends v to t.
  template<class Token, class Value>
  static void plus_equal(Token &t, const Value &v) {
    t += v;
  }

  // Resets t to a default-constructed token.
  template<class Token>
  static void clear(Token &t) {
    t = Token();
  }
};
308
309
// Category selector for raw pointers: always reports
// std::random_access_iterator_tag, whatever Iterator is.
template<class Iterator>
struct pointer_iterator_category{
  typedef std::random_access_iterator_tag type;
};
314
315
// Category selector for class-type iterators: reports the iterator's own
// nested iterator_category typedef.
template<class Iterator>
struct class_iterator_category{
  typedef typename Iterator::iterator_category type;
};
320
321
322
323	// This portably gets the iterator_tag without partial template specialization
324	template<class Iterator>
325	struct get_iterator_category{
326	typedef typename mpl::if_<is_pointer<Iterator>,
327	pointer_iterator_category<Iterator>,
328	class_iterator_category<Iterator>
329	>::type cat;
330
331	typedef typename cat::type iterator_category;
332	};
333
334
335	} // namespace tokenizer_detail
336
337
338	//===========================================================================
339	// The offset_separator class, which is a model of TokenizerFunction.
340	// Offset breaks a string into tokens based on a range of offsets
341
342	class offset_separator {
343	private:
344
345	std::vector<int> offsets_;
346	unsigned int current_offset_;
347	bool wrap_offsets_;
348	bool return_partial_last_;
349
350	public:
351	template <typename Iter>
352	offset_separator(Iter begin, Iter end, bool wrap_offsets = true,
353	bool return_partial_last = true)
354	: offsets_(begin,end), current_offset_(`0`),
355	wrap_offsets_(wrap_offsets),
356	return_partial_last_(return_partial_last) { }
357
358	offset_separator()
359	: offsets_(`1`,`1`), current_offset_(),
360	wrap_offsets_(true), return_partial_last_(true) { }
361
362	void reset() {
363	current_offset_ = `0`;
364	}
365
366	template <typename InputIterator, typename Token>
367	bool operator()(InputIterator& next, InputIterator end, Token& tok)
368	{
369	typedef tokenizer_detail::assign_or_plus_equal<
370	BOOST_DEDUCED_TYPENAME tokenizer_detail::get_iterator_category<
371	InputIterator
372	>::iterator_category
373	> assigner;
374
375	BOOST_ASSERT(!offsets_.empty());
376
377	assigner::clear(tok);
378	InputIterator start(next);
379
380	if (next == end)
381	return false;
382
383	if (current_offset_ == offsets_.size())
384	{
385	if (wrap_offsets_)
386	current_offset_=`0`;
387	else
388	return false;
389	}
390
391	int c = offsets_[current_offset_];
392	int i = `0`;
393	for (; i < c; ++i) {
394	if (next == end)break;
395	assigner::plus_equal(tok,*next++);
396	}
397	assigner::assign(start,next,tok);
398
399	if (!return_partial_last_)
400	if (i < (c-`1`) )
401	return false;
402
403	++current_offset_;
404	return true;
405	}
406	};
407
408
409	//===========================================================================
410	// The char_separator class breaks a sequence of characters into
411	// tokens based on the character delimiters (very much like bad old
412	// strtok). A delimiter character can either be kept or dropped. A
413	// kept delimiter shows up as an output token, whereas a dropped
414	// delimiter does not.
415
416	// This class replaces the char_delimiters_separator class. The
417	// constructor for the char_delimiters_separator class was too
418	// confusing and needed to be deprecated. However, because of the
419	// default arguments to the constructor, adding the new constructor
420	// would cause ambiguity, so instead I deprecated the whole class.
421	// The implementation of the class was also simplified considerably.
422
// Selects whether char_separator reports or skips empty tokens.
enum empty_token_policy { drop_empty_tokens, keep_empty_tokens };
424
425	// The out of the box GCC 2.95 on cygwin does not have a char_traits class.
426	template <typename Char,
427	typename Tr = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type >
428	class char_separator
429	{
430	typedef tokenizer_detail::traits_extension<Tr> Traits;
431	typedef std::basic_string<Char,Tr> string_type;
432	public:
433	explicit
434	char_separator(const Char* dropped_delims,
435	const Char* kept_delims = `0`,
436	empty_token_policy empty_tokens = drop_empty_tokens)
437	: m_dropped_delims(dropped_delims),
438	m_use_ispunct(false),
439	m_use_isspace(false),
440	m_empty_tokens(empty_tokens),
441	m_output_done(false)
442	{
443	// Borland workaround
444	if (kept_delims)
445	m_kept_delims = kept_delims;
446	}
447
448	// use ispunct() for kept delimiters and isspace for dropped.
449	explicit
450	char_separator()
451	: m_use_ispunct(true),
452	m_use_isspace(true),
453	m_empty_tokens(drop_empty_tokens) { }
454
455	void reset() { }
456
457	template <typename InputIterator, typename Token>
458	bool operator()(InputIterator& next, InputIterator end, Token& tok)
459	{
460	typedef tokenizer_detail::assign_or_plus_equal<
461	BOOST_DEDUCED_TYPENAME tokenizer_detail::get_iterator_category<
462	InputIterator
463	>::iterator_category
464	> assigner;
465
466	assigner::clear(tok);
467
468	// skip past all dropped_delims
469	if (m_empty_tokens == drop_empty_tokens)
470	for (; next != end && is_dropped(E: *next); ++next)
471	{ }
472
473	InputIterator start(next);
474
475	if (m_empty_tokens == drop_empty_tokens) {
476
477	if (next == end)
478	return false;
479
480
481	// if we are on a kept_delims move past it and stop
482	if (is_kept(E: *next)) {
483	assigner::plus_equal(tok,*next);
484	++next;
485	} else
486	// append all the non delim characters
487	for (; next != end && !is_dropped(E: next) && !is_kept(E: next); ++next)
488	assigner::plus_equal(tok,*next);
489	}
490	else { // m_empty_tokens == keep_empty_tokens
491
492	// Handle empty token at the end
493	if (next == end)
494	{
495	if (m_output_done == false)
496	{
497	m_output_done = true;
498	assigner::assign(start,next,tok);
499	return true;
500	}
501	else
502	return false;
503	}
504
505	if (is_kept(E: *next)) {
506	if (m_output_done == false)
507	m_output_done = true;
508	else {
509	assigner::plus_equal(tok,*next);
510	++next;
511	m_output_done = false;
512	}
513	}
514	else if (m_output_done == false && is_dropped(E: *next)) {
515	m_output_done = true;
516	}
517	else {
518	if (is_dropped(E: *next))
519	start=++next;
520	for (; next != end && !is_dropped(E: next) && !is_kept(E: next); ++next)
521	assigner::plus_equal(tok,*next);
522	m_output_done = true;
523	}
524	}
525	assigner::assign(start,next,tok);
526	return true;
527	}
528
529	private:
530	string_type m_kept_delims;
531	string_type m_dropped_delims;
532	bool m_use_ispunct;
533	bool m_use_isspace;
534	empty_token_policy m_empty_tokens;
535	bool m_output_done;
536
537	bool is_kept(Char E) const
538	{
539	if (m_kept_delims.length())
540	return m_kept_delims.find(E) != string_type::npos;
541	else if (m_use_ispunct) {
542	return Traits::ispunct(E) != `0`;
543	} else
544	return false;
545	}
546	bool is_dropped(Char E) const
547	{
548	if (m_dropped_delims.length())
549	return m_dropped_delims.find(E) != string_type::npos;
550	else if (m_use_isspace) {
551	return Traits::isspace(E) != `0`;
552	} else
553	return false;
554	}
555	};
556
557	//===========================================================================
// The following class is DEPRECATED, use class char_separator instead.
//
// The char_delimiters_separator class, which is a model of
// TokenizerFunction. char_delimiters_separator breaks a string
// into tokens based on character delimiters. There are 2 types of
// delimiters. Returnable delimiters can be returned as
// tokens. These are often punctuation. Nonreturnable delimiters
// cannot be returned as tokens. These are often whitespace.
566
567	// The out of the box GCC 2.95 on cygwin does not have a char_traits class.
568	template <class Char,
569	class Tr = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type >
570	class char_delimiters_separator {
571	private:
572
573	typedef tokenizer_detail::traits_extension<Tr> Traits;
574	typedef std::basic_string<Char,Tr> string_type;
575	string_type returnable_;
576	string_type nonreturnable_;
577	bool return_delims_;
578	bool no_ispunct_;
579	bool no_isspace_;
580
581	bool is_ret(Char E)const
582	{
583	if (returnable_.length())
584	return returnable_.find(E) != string_type::npos;
585	else{
586	if (no_ispunct_) {return false;}
587	else{
588	int r = Traits::ispunct(E);
589	return r != `0`;
590	}
591	}
592	}
593	bool is_nonret(Char E)const
594	{
595	if (nonreturnable_.length())
596	return nonreturnable_.find(E) != string_type::npos;
597	else{
598	if (no_isspace_) {return false;}
599	else{
600	int r = Traits::isspace(E);
601	return r != `0`;
602	}
603	}
604	}
605
606	public:
607	explicit char_delimiters_separator(bool return_delims = false,
608	const Char* returnable = `0`,
609	const Char* nonreturnable = `0`)
610	: returnable_(returnable ? returnable : string_type().c_str()),
611	nonreturnable_(nonreturnable ? nonreturnable:string_type().c_str()),
612	return_delims_(return_delims), no_ispunct_(returnable!=`0`),
613	no_isspace_(nonreturnable!=`0`) { }
614
615	void reset() { }
616
617	public:
618
619	template <typename InputIterator, typename Token>
620	bool operator()(InputIterator& next, InputIterator end,Token& tok) {
621	tok = Token();
622
623	// skip past all nonreturnable delims
624	// skip past the returnable only if we are not returning delims
625	for (;next!=end && ( is_nonret(E: next) \|\| (is_ret(E: next)
626	&& !return_delims_ ) );++next) { }
627
628	if (next == end) {
629	return false;
630	}
631
632	// if we are to return delims and we are one a returnable one
633	// move past it and stop
634	if (is_ret(E: *next) && return_delims_) {
635	tok+=*next;
636	++next;
637	}
638	else
639	// append all the non delim characters
640	for (;next!=end && !is_nonret(E: next) && !is_ret(E: next);++next)
641	tok+=*next;
642
643
644	return true;
645	}
646	};
647
648
649	} //namespace boost
650
651	#endif
652

source code of boost/boost/token_functions.hpp