kmime_header_parsing.cpp [kdepimlibs/kmime/kmime_header_parsing.cpp]

/*  -*- c++ -*-
2	kmime_header_parsing.cpp
3
4	KMime, the KDE Internet mail/usenet news message library.
5	Copyright (c) 2001-2002 Marc Mutz <mutz@kde.org>
6
7	This library is free software; you can redistribute it and/or
8	modify it under the terms of the GNU Library General Public
9	License as published by the Free Software Foundation; either
10	version 2 of the License, or (at your option) any later version.
11
12	This library is distributed in the hope that it will be useful,
13	but WITHOUT ANY WARRANTY; without even the implied warranty of
14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15	Library General Public License for more details.
16
17	You should have received a copy of the GNU Library General Public License
18	along with this library; see the file COPYING.LIB. If not, write to
19	the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20	Boston, MA 02110-1301, USA.
21	*/
22
23	#include "kmime_header_parsing.h"
24
25	#include "kmime_codecs.h"
26	#include "kmime_headerfactory_p.h"
27	#include "kmime_headers.h"
28	#include "kmime_util.h"
29	#include "kmime_util_p.h"
30	#include "kmime_dateformatter.h"
31	#include "kmime_warning.h"
32
33	#include <kglobal.h>
34	#include <kcharsets.h>
35
36	#include <QtCore/QTextCodec>
37	#include <QtCore/QMap>
38	#include <QtCore/QStringList>
39	#include <QtCore/QUrl>
40
41	#include <ctype.h> // for isdigit
42	#include <cassert>
43
44	using namespace KMime;
45	using namespace KMime::Types;
46
47	namespace KMime {
48
49	namespace Types {
50
51	// QUrl::fromAce is extremely expensive, so only use it when necessary.
52	// Fortunately, the presence of IDNA is readily detected with a substring match...
53	static inline QString QUrl_fromAce_wrapper( const QString & domain )
54	{
55	if ( domain.contains( QLatin1String ( "xn--" ) ) ) {
56	return QUrl::fromAce( domain.toLatin1() );
57	} else {
58	return domain;
59	}
60	}
61
62	static QString addr_spec_as_string( const AddrSpec & as, bool pretty )
63	{
64	if ( as.isEmpty() ) {
65	return QString ();
66	}
67
68	static QChar dotChar = QLatin1Char ( '.' );
69	static QChar backslashChar = QLatin1Char ( '\\' );
70	static QChar quoteChar = QLatin1Char ( '"' );
71
72	bool needsQuotes = false;
73	QString result;
74	result.reserve( as.localPart.length() + as.domain.length() + `1` );
75	for ( int i = `0` ; i < as.localPart.length() ; ++i ) {
76	const QChar ch = as.localPart.at( i );
77	if ( ch == dotChar \|\| isAText( ch.toLatin1() ) ) {
78	result += ch;
79	} else {
80	needsQuotes = true;
81	if ( ch == backslashChar \|\| ch == quoteChar ) {
82	result += backslashChar;
83	}
84	result += ch;
85	}
86	}
87	const QString dom = pretty ? QUrl_fromAce_wrapper( as.domain ) : as.domain ;
88	if ( needsQuotes ) {
89	result = quoteChar + result + quoteChar;
90	}
91	if ( dom.isEmpty() ) {
92	return result;
93	} else {
94	result += QLatin1Char ( '@' );
95	result += dom;
96	return result;
97	}
98	}
99
100	QString AddrSpec::asString() const
101	{
102	return addr_spec_as_string( *this, false );
103	}
104
105	QString AddrSpec::asPrettyString() const
106	{
107	return addr_spec_as_string( *this, true );
108	}
109
110	bool AddrSpec::isEmpty() const
111	{
112	return localPart.isEmpty() && domain.isEmpty();
113	}
114
115	QByteArray Mailbox::address() const
116	{
117	QByteArray result;
118	const QString asString = addr_spec_as_string( mAddrSpec, false );
119	if ( !asString.isEmpty() ) {
120	result = asString.toLatin1();
121	}
122	return result;
123	//return mAddrSpec.asString().toLatin1();
124	}
125
126	AddrSpec Mailbox::addrSpec() const
127	{
128	return mAddrSpec;
129	}
130
131	QString Mailbox::name() const
132	{
133	return mDisplayName;
134	}
135
136	void Mailbox::setAddress( const AddrSpec &addr )
137	{
138	mAddrSpec = addr;
139	}
140
141	void Mailbox::setAddress( const QByteArray &addr )
142	{
143	const char *cursor = addr.constData();
144	if ( !HeaderParsing::parseAngleAddr( cursor,
145	cursor + addr.length(), mAddrSpec ) ) {
146	if ( !HeaderParsing::parseAddrSpec( cursor, cursor + addr.length(),
147	mAddrSpec ) ) {
148	kWarning () << "Invalid address";
149	return;
150	}
151	}
152	}
153
154	void Mailbox::setName( const QString &name )
155	{
156	mDisplayName = removeBidiControlChars( name );
157	}
158
159	void Mailbox::setNameFrom7Bit( const QByteArray &name,
160	const QByteArray &defaultCharset )
161	{
162	QByteArray cs;
163	setName( decodeRFC2047String( name, cs, defaultCharset, false ) );
164	}
165
166	bool Mailbox::hasAddress() const
167	{
168	return !mAddrSpec.isEmpty();
169	}
170
171	bool Mailbox::hasName() const
172	{
173	return !mDisplayName.isEmpty();
174	}
175
176	QString Mailbox::prettyAddress() const
177	{
178	return prettyAddress( QuoteNever );
179	}
180
181	QString Mailbox::prettyAddress( Quoting quoting ) const
182	{
183	if ( !hasName() ) {
184	return QLatin1String ( address() );
185	}
186	QString s = name();
187	if ( quoting != QuoteNever ) {
188	addQuotes( s, quoting == QuoteAlways /bool force/ );
189	}
190
191	if ( hasAddress() ) {
192	s += QLatin1String ( " <" ) + QLatin1String ( address() ) + QLatin1Char ( '>' );
193	}
194	return s;
195	}
196
197	void Mailbox::fromUnicodeString( const QString &s )
198	{
199	from7BitString( encodeRFC2047Sentence( s, "utf-8" ) );
200	}
201
202	void Mailbox::from7BitString( const QByteArray &s )
203	{
204	const char *cursor = s.constData();
205	HeaderParsing::parseMailbox( cursor, cursor + s.length(), *this );
206	}
207
208	QByteArray KMime::Types::Mailbox::as7BitString( const QByteArray &encCharset ) const
209	{
210	if ( !hasName() ) {
211	return address();
212	}
213	QByteArray rv;
214	if ( isUsAscii( name() ) ) {
215	QByteArray tmp = name().toLatin1();
216	addQuotes( tmp, false );
217	rv += tmp;
218	} else {
219	rv += encodeRFC2047String( name(), encCharset, true );
220	}
221	if ( hasAddress() ) {
222	rv += " <" + address() + '>';
223	}
224	return rv;
225	}
226
227	} // namespace Types
228
229	namespace HeaderParsing {
230
231	// parse the encoded-word (scursor points to after the initial '=')
232	bool parseEncodedWord( const char* &scursor, const char * const send,
233	QString &result, QByteArray &language,
234	QByteArray &usedCS, const QByteArray &defaultCS,
235	bool forceCS )
236	{
237	// make sure the caller already did a bit of the work.
238	assert( *( scursor - `1` ) == '=' );
239
240	//
241	// STEP 1:
242	// scan for the charset/language portion of the encoded-word
243	//
244
245	char ch = *scursor++;
246
247	if ( ch != '?' ) {
248	// kDebug() << "first";
249	//KMIME_WARN_PREMATURE_END_OF( EncodedWord );
250	return false;
251	}
252
253	// remember start of charset (ie. just after the initial "=?") and
254	// language (just after the first '') fields:*
255	const char * charsetStart = scursor;
256	const char * languageStart = `0`;
257
258	// find delimiting '?' (and the '' separating charset and language*
259	// tags, if any):
260	for ( ; scursor != send ; scursor++ ) {
261	if ( *scursor == '?' ) {
262	break;
263	} else if ( scursor == '' && languageStart == `0` ) {
264	languageStart = scursor + `1`;
265	}
266	}
267
268	// not found? can't be an encoded-word!
269	if ( scursor == send \|\| *scursor != '?' ) {
270	// kDebug() << "second";
271	KMIME_WARN_PREMATURE_END_OF( EncodedWord );
272	return false;
273	}
274
275	// extract the language information, if any (if languageStart is 0,
276	// language will be null, too):
277	QByteArray maybeLanguage( languageStart, scursor - languageStart );
278	// extract charset information (keep in mind: the size given to the
279	// ctor is one off due to the \0 terminator):
280	QByteArray maybeCharset( charsetStart,
281	( languageStart ? languageStart - `1` : scursor ) - charsetStart );
282
283	//
284	// STEP 2:
285	// scan for the encoding portion of the encoded-word
286	//
287
288	// remember start of encoding (just _after_ the second '?'):
289	scursor++;
290	const char * encodingStart = scursor;
291
292	// find next '?' (ending the encoding tag):
293	for ( ; scursor != send ; scursor++ ) {
294	if ( *scursor == '?' ) {
295	break;
296	}
297	}
298
299	// not found? Can't be an encoded-word!
300	if ( scursor == send \|\| *scursor != '?' ) {
301	// kDebug() << "third";
302	KMIME_WARN_PREMATURE_END_OF( EncodedWord );
303	return false;
304	}
305
306	// extract the encoding information:
307	QByteArray maybeEncoding( encodingStart, scursor - encodingStart );
308
309	// kDebug() << "parseEncodedWord: found charset == \"" << maybeCharset
310	// << "\"; language == \"" << maybeLanguage
311	// << "\"; encoding == \"" << maybeEncoding << "\"";
312
313	//
314	// STEP 3:
315	// scan for encoded-text portion of encoded-word
316	//
317
318	// remember start of encoded-text (just after the third '?'):
319	scursor++;
320	const char * encodedTextStart = scursor;
321
322	// find the '?=' sequence (ending the encoded-text):
323	for ( ; scursor != send ; scursor++ ) {
324	if ( *scursor == '?' ) {
325	if ( scursor + `1` != send ) {
326	if ( ( scursor + `1` ) != '=' ) { // We expect a '=' after the '?', but we got something else; ignore*
327	KMIME_WARN << "Stray '?' in q-encoded word, ignoring this.";
328	continue;
329	}
330	else { // yep, found a '?=' sequence
331	scursor += `2`;
332	break;
333	}
334	}
335	else { // The '?' is the last char, but we need a '=' after it!
336	KMIME_WARN_PREMATURE_END_OF( EncodedWord );
337	return false;
338	}
339	}
340	}
341
342	if ( ( scursor - `2` ) != '?' \|\| ( scursor - `1` ) != '=' \|\|
343	scursor < encodedTextStart + `2` ) {
344	KMIME_WARN_PREMATURE_END_OF( EncodedWord );
345	return false;
346	}
347
348	// set end sentinel for encoded-text:
349	const char * const encodedTextEnd = scursor - `2`;
350
351	//
352	// STEP 4:
353	// setup decoders for the transfer encoding and the charset
354	//
355
356	// try if there's a codec for the encoding found:
357	Codec * codec = Codec::codecForName( maybeEncoding );
358	if ( !codec ) {
359	KMIME_WARN_UNKNOWN( Encoding, maybeEncoding );
360	return false;
361	}
362
363	// get an instance of a corresponding decoder:
364	Decoder * dec = codec->makeDecoder();
365	assert( dec );
366
367	// try if there's a (text)codec for the charset found:
368	bool matchOK = false;
369	QTextCodec *textCodec = `0`;
370	if ( forceCS \|\| maybeCharset.isEmpty() ) {
371	textCodec = KGlobal::charsets()->codecForName( QLatin1String ( defaultCS ), matchOK );
372	usedCS = cachedCharset( defaultCS );
373	} else {
374	textCodec = KGlobal::charsets()->codecForName( QLatin1String ( maybeCharset ), matchOK );
375	if ( !matchOK ) { //no suitable codec found => use default charset
376	textCodec = KGlobal::charsets()->codecForName( QLatin1String ( defaultCS ), matchOK );
377	usedCS = cachedCharset( defaultCS );
378	} else {
379	usedCS = cachedCharset( maybeCharset );
380	}
381	}
382
383	if ( !matchOK \|\| !textCodec ) {
384	KMIME_WARN_UNKNOWN( Charset, maybeCharset );
385	delete dec;
386	return false;
387	};
388
389	// kDebug() << "mimeName(): \"" << textCodec->name() << "\"";
390
391	// allocate a temporary buffer to store the 8bit text:
392	int encodedTextLength = encodedTextEnd - encodedTextStart;
393	QByteArray buffer;
394	buffer.resize( codec->maxDecodedSizeFor( encodedTextLength ) );
395	char *bbegin = buffer.data();
396	char *bend = bbegin + buffer.length();
397
398	//
399	// STEP 5:
400	// do the actual decoding
401	//
402
403	if ( !dec->decode( encodedTextStart, encodedTextEnd, bbegin, bend ) ) {
404	KMIME_WARN << codec->name() << "codec lies about its maxDecodedSizeFor("
405	<< encodedTextLength << ")\nresult may be truncated";
406	}
407
408	result = textCodec->toUnicode( buffer.data(), bbegin - buffer.data() );
409
410	// kDebug() << "result now: \"" << result << "\"";
411	// cleanup:
412	delete dec;
413	language = maybeLanguage;
414
415	return true;
416	}
417
// Advances scursor past any run of space, tab, CR and LF characters,
// stopping at the first other character or at send.
// The characters themselves are inspected (*scursor); comparing the pointer
// to a character literal, as the previous text did, is not valid C++.
static inline void eatWhiteSpace( const char* &scursor, const char * const send )
{
  while ( scursor != send &&
          ( *scursor == ' ' || *scursor == '\n' ||
            *scursor == '\t' || *scursor == '\r' ) ) {
    scursor++;
  }
}
425
426	bool parseAtom( const char * &scursor, const char * const send,
427	QString &result, bool allow8Bit )
428	{
429	QPair<const char, int*> maybeResult;
430
431	if ( parseAtom( scursor, send, maybeResult, allow8Bit ) ) {
432	result += QString::fromLatin1( maybeResult.first, maybeResult.second );
433	return true;
434	}
435
436	return false;
437	}
438
439	bool parseAtom( const char * &scursor, const char * const send,
440	QPair<const char,int> &result, bool* allow8Bit )
441	{
442	bool success = false;
443	const char *start = scursor;
444
445	while ( scursor != send ) {
446	signed char ch = *scursor++;
447	if ( ch > `0` && isAText( ch ) ) {
448	// AText: OK
449	success = true;
450	} else if ( allow8Bit && ch < `0` ) {
451	// 8bit char: not OK, but be tolerant.
452	KMIME_WARN_8BIT( ch );
453	success = true;
454	} else {
455	// CTL or special - marking the end of the atom:
456	// re-set sursor to point to the offending
457	// char and return:
458	scursor--;
459	break;
460	}
461	}
462	result.first = start;
463	result.second = scursor - start;
464	return success;
465	}
466
467	// FIXME: Remove this and the other parseToken() method. add a new one where "result" is a
468	// QByteArray.
469	bool parseToken( const char * &scursor, const char * const send,
470	QString &result, bool allow8Bit )
471	{
472	QPair<const char, int*> maybeResult;
473
474	if ( parseToken( scursor, send, maybeResult, allow8Bit ) ) {
475	result += QString::fromLatin1( maybeResult.first, maybeResult.second );
476	return true;
477	}
478
479	return false;
480	}
481
482	bool parseToken( const char * &scursor, const char * const send,
483	QPair<const char,int> &result, bool* allow8Bit )
484	{
485	bool success = false;
486	const char * start = scursor;
487
488	while ( scursor != send ) {
489	signed char ch = *scursor++;
490	if ( ch > `0` && isTText( ch ) ) {
491	// TText: OK
492	success = true;
493	} else if ( allow8Bit && ch < `0` ) {
494	// 8bit char: not OK, but be tolerant.
495	KMIME_WARN_8BIT( ch );
496	success = true;
497	} else {
498	// CTL or tspecial - marking the end of the atom:
499	// re-set sursor to point to the offending
500	// char and return:
501	scursor--;
502	break;
503	}
504	}
505	result.first = start;
506	result.second = scursor - start;
507	return success;
508	}
509
// Reads the next input character into the local variable "ch" and advances
// "scursor"; if the input is exhausted ("scursor == send") it warns and
// makes the enclosing function return false.
// Expects "ch", "scursor" and "send" to be in scope at the point of use.
// Wrapped in do { } while ( 0 ) so that "READ_ch_OR_FAIL;" behaves as one
// statement wherever it is placed (e.g. in an unbraced if/else).
#define READ_ch_OR_FAIL do { \
    if ( scursor == send ) { \
      KMIME_WARN_PREMATURE_END_OF( GenericQuotedString ); \
      return false; \
    } \
    ch = *scursor++; \
  } while ( 0 )
516
517	// known issues:
518	//
519	// - doesn't handle quoted CRLF
520
521	// FIXME: Why is result a QString? This should be a QByteArray, since at this level, we don't
522	// know about encodings yet!
523	bool parseGenericQuotedString( const char* &scursor, const char * const send,
524	QString &result, bool isCRLF,
525	const char openChar, const char closeChar )
526	{
527	char ch;
528	// We are in a quoted-string or domain-literal or comment and the
529	// cursor points to the first char after the openChar.
530	// We will apply unfolding and quoted-pair removal.
531	// We return when we either encounter the end or unescaped openChar
532	// or closeChar.
533
534	assert( ( scursor - `1` ) == openChar \|\| ( scursor - `1` ) == closeChar );
535
536	while ( scursor != send ) {
537	ch = *scursor++;
538
539	if ( ch == closeChar \|\| ch == openChar ) {
540	// end of quoted-string or another opening char:
541	// let caller decide what to do.
542	return true;
543	}
544
545	switch ( ch ) {
546	case '\\': // quoted-pair
547	// misses "\" CRLF LWSP-char handling, see rfc822, 3.4.5
548	READ_ch_OR_FAIL;
549	KMIME_WARN_IF_8BIT( ch );
550	result += QLatin1Char ( ch );
551	break;
552	case '\r':
553	// ###
554	// The case of lonely '\r' is easy to solve, as they're
555	// not part of Unix Line-ending conventions.
556	// But I see a problem if we are given Unix-native
557	// line-ending-mails, where we cannot determine anymore
558	// whether a given '\n' was part of a CRLF or was occurring
559	// on it's own.
560	READ_ch_OR_FAIL;
561	if ( ch != '\n' ) {
562	// CR on it's own...
563	KMIME_WARN_LONE( CR );
564	result += QLatin1Char ( '\r' );
565	scursor--; // points to after the '\r' again
566	} else {
567	// CRLF encountered.
568	// lookahead: check for folding
569	READ_ch_OR_FAIL;
570	if ( ch == ' ' \|\| ch == '\t' ) {
571	// correct folding;
572	// position cursor behind the CRLF WSP (unfolding)
573	// and add the WSP to the result
574	result += QLatin1Char ( ch );
575	} else {
576	// this is the "shouldn't happen"-case. There is a CRLF
577	// inside a quoted-string without it being part of FWS.
578	// We take it verbatim.
579	KMIME_WARN_NON_FOLDING( CRLF );
580	result += QLatin1String ( "\r\n" );
581	// the cursor is decremented again, so's we need not
582	// duplicate the whole switch here. "ch" could've been
583	// everything (incl. openChar or closeChar).
584	scursor--;
585	}
586	}
587	break;
588	case '\n':
589	// Note: CRLF has been handled above already!
590	// ### LF needs special treatment, depending on whether isCRLF
591	// is true (we can be sure a lonely '\n' was meant this way) or
592	// false ('\n' alone could have meant LF or CRLF in the original
593	// message. This parser assumes CRLF iff the LF is followed by
594	// either WSP (folding) or NULL (premature end of quoted-string;
595	// Should be fixed, since NULL is allowed as per rfc822).
596	READ_ch_OR_FAIL;
597	if ( !isCRLF && ( ch == ' ' \|\| ch == '\t' ) ) {
598	// folding
599	// correct folding
600	result += QLatin1Char ( ch );
601	} else {
602	// non-folding
603	KMIME_WARN_LONE( LF );
604	result += QLatin1Char ( '\n' );
605	// pos is decremented, so's we need not duplicate the whole
606	// switch here. ch could've been everything (incl. <">, "\").
607	scursor--;
608	}
609	break;
610	case '=':
611	{
612	// ### Work around broken clients that send encoded words in quoted-strings
613	// For example, older KMail versions.
614	if ( scursor == send ) {
615	break;
616	}
617
618	const char *oldscursor = scursor;
619	QString tmp;
620	QByteArray lang, charset;
621	if ( *scursor++ == '?' ) {
622	--scursor;
623	if ( parseEncodedWord( scursor, send, tmp, lang, charset ) ) {
624	result += tmp;
625	break;
626	} else {
627	scursor = oldscursor;
628	}
629	} else {
630	scursor = oldscursor;
631	}
632	// fall through
633	}
634	default:
635	KMIME_WARN_IF_8BIT( ch );
636	result += QLatin1Char ( ch );
637	}
638	}
639
640	return false;
641	}
642
643	// known issues:
644	//
645	// - doesn't handle encoded-word inside comments.
646
647	bool parseComment( const char* &scursor, const char * const send,
648	QString &result, bool isCRLF, bool reallySave )
649	{
650	int commentNestingDepth = `1`;
651	const char *afterLastClosingParenPos = `0`;
652	QString maybeCmnt;
653	const char *oldscursor = scursor;
654
655	assert( *( scursor - `1` ) == '(' );
656
657	while ( commentNestingDepth ) {
658	QString cmntPart;
659	if ( parseGenericQuotedString( scursor, send, cmntPart, isCRLF, '(', ')' ) ) {
660	assert( ( scursor - `1` ) == ')' \|\| ( scursor - `1` ) == '(' );
661	// see the kdoc for above function for the possible conditions
662	// we have to check:
663	switch ( *( scursor - `1` ) ) {
664	case ')':
665	if ( reallySave ) {
666	// add the chunk that's now surely inside the comment.
667	result += maybeCmnt;
668	result += cmntPart;
669	if ( commentNestingDepth > `1` ) {
670	// don't add the outermost ')'...
671	result += QLatin1Char ( ')' );
672	}
673	maybeCmnt.clear();
674	}
675	afterLastClosingParenPos = scursor;
676	--commentNestingDepth;
677	break;
678	case '(':
679	if ( reallySave ) {
680	// don't add to "result" yet, because we might find that we
681	// are already outside the (broken) comment...
682	maybeCmnt += cmntPart;
683	maybeCmnt += QLatin1Char ( '(' );
684	}
685	++commentNestingDepth;
686	break;
687	default: assert( `0` );
688	} // switch
689	} else {
690	// !parseGenericQuotedString, ie. premature end
691	if ( afterLastClosingParenPos ) {
692	scursor = afterLastClosingParenPos;
693	} else {
694	scursor = oldscursor;
695	}
696	return false;
697	}
698	} // while
699
700	return true;
701	}
702
703	// known issues: none.
704
705	bool parsePhrase( const char* &scursor, const char * const send,
706	QString &result, bool isCRLF )
707	{
708	enum {
709	None, Phrase, Atom, EncodedWord, QuotedString
710	} found = None;
711
712	QString tmp;
713	QByteArray lang, charset;
714	const char *successfullyParsed = `0`;
715	// only used by the encoded-word branch
716	const char *oldscursor;
717	// used to suppress whitespace between adjacent encoded-words
718	// (rfc2047, 6.2):
719	bool lastWasEncodedWord = false;
720
721	while ( scursor != send ) {
722	char ch = *scursor++;
723	switch ( ch ) {
724	case '.': // broken, but allow for intorop's sake
725	if ( found == None ) {
726	--scursor;
727	return false;
728	} else {
729	if ( scursor != send && ( scursor == ' ' \|\| scursor == '\t' ) ) {
730	result += QLatin1String ( ". " );
731	} else {
732	result += QLatin1Char ( '.' );
733	}
734	successfullyParsed = scursor;
735	}
736	break;
737	case '"': // quoted-string
738	tmp.clear();
739	if ( parseGenericQuotedString( scursor, send, tmp, isCRLF, '"', '"' ) ) {
740	successfullyParsed = scursor;
741	assert( *( scursor - `1` ) == '"' );
742	switch ( found ) {
743	case None:
744	found = QuotedString;
745	break;
746	case Phrase:
747	case Atom:
748	case EncodedWord:
749	case QuotedString:
750	found = Phrase;
751	result += QLatin1Char ( ' ' ); // rfc822, 3.4.4
752	break;
753	default:
754	assert( `0` );
755	}
756	lastWasEncodedWord = false;
757	result += tmp;
758	} else {
759	// premature end of quoted string.
760	// What to do? Return leading '"' as special? Return as quoted-string?
761	// We do the latter if we already found something, else signal failure.
762	if ( found == None ) {
763	return false;
764	} else {
765	result += QLatin1Char ( ' ' ); // rfc822, 3.4.4
766	result += tmp;
767	return true;
768	}
769	}
770	break;
771	case '(': // comment
772	// parse it, but ignore content:
773	tmp.clear();
774	if ( parseComment( scursor, send, tmp, isCRLF,
775	false /don't bother with the content/ ) ) {
776	successfullyParsed = scursor;
777	lastWasEncodedWord = false; // strictly interpreting rfc2047, 6.2
778	} else {
779	if ( found == None ) {
780	return false;
781	} else {
782	scursor = successfullyParsed;
783	return true;
784	}
785	}
786	break;
787	case '=': // encoded-word
788	tmp.clear();
789	oldscursor = scursor;
790	lang.clear();
791	charset.clear();
792	if ( parseEncodedWord( scursor, send, tmp, lang, charset ) ) {
793	successfullyParsed = scursor;
794	switch ( found ) {
795	case None:
796	found = EncodedWord;
797	break;
798	case Phrase:
799	case EncodedWord:
800	case Atom:
801	case QuotedString:
802	if ( !lastWasEncodedWord ) {
803	result += QLatin1Char ( ' ' ); // rfc822, 3.4.4
804	}
805	found = Phrase;
806	break;
807	default: assert( `0` );
808	}
809	lastWasEncodedWord = true;
810	result += tmp;
811	break;
812	} else {
813	// parse as atom:
814	scursor = oldscursor;
815	}
816	// fall though...
817
818	default: //atom
819	tmp.clear();
820	scursor--;
821	if ( parseAtom( scursor, send, tmp, true / allow 8bit / ) ) {
822	successfullyParsed = scursor;
823	switch ( found ) {
824	case None:
825	found = Atom;
826	break;
827	case Phrase:
828	case Atom:
829	case EncodedWord:
830	case QuotedString:
831	found = Phrase;
832	result += QLatin1Char ( ' ' ); // rfc822, 3.4.4
833	break;
834	default:
835	assert( `0` );
836	}
837	lastWasEncodedWord = false;
838	result += tmp;
839	} else {
840	if ( found == None ) {
841	return false;
842	} else {
843	scursor = successfullyParsed;
844	return true;
845	}
846	}
847	}
848	eatWhiteSpace( scursor, send );
849	}
850
851	return found != None;
852	}
853
854	// FIXME: This should probably by QByteArray &result instead?
855	bool parseDotAtom( const char* &scursor, const char * const send,
856	QString &result, bool isCRLF )
857	{
858	eatCFWS( scursor, send, isCRLF );
859
860	// always points to just after the last atom parsed:
861	const char *successfullyParsed;
862
863	QString tmp;
864	if ( !parseAtom( scursor, send, tmp, false / no 8bit / ) ) {
865	return false;
866	}
867	result += tmp;
868	successfullyParsed = scursor;
869
870	while ( scursor != send ) {
871
872	// end of header or no '.' -> return
873	if ( scursor == send \|\| *scursor != '.' ) {
874	return true;
875	}
876	scursor++; // eat '.'
877
878	if ( scursor == send \|\| !isAText( *scursor ) ) {
879	// end of header or no AText, but this time following a '.'!:
880	// reset cursor to just after last successfully parsed char and
881	// return:
882	scursor = successfullyParsed;
883	return true;
884	}
885
886	// try to parse the next atom:
887	QString maybeAtom;
888	if ( !parseAtom( scursor, send, maybeAtom, false /no 8bit/ ) ) {
889	scursor = successfullyParsed;
890	return true;
891	}
892
893	result += QLatin1Char ( '.' );
894	result += maybeAtom;
895	successfullyParsed = scursor;
896	}
897
898	scursor = successfullyParsed;
899	return true;
900	}
901
902	void eatCFWS( const char* &scursor, const char * const send, bool isCRLF )
903	{
904	QString dummy;
905
906	while ( scursor != send ) {
907	const char *oldscursor = scursor;
908
909	char ch = *scursor++;
910
911	switch ( ch ) {
912	case ' ':
913	case '\t': // whitespace
914	case '\r':
915	case '\n': // folding
916	continue;
917
918	case '(': // comment
919	if ( parseComment( scursor, send, dummy, isCRLF, false /don't save/ ) ) {
920	continue;
921	}
922	scursor = oldscursor;
923	return;
924
925	default:
926	scursor = oldscursor;
927	return;
928	}
929	}
930	}
931
932	bool parseDomain( const char* &scursor, const char * const send,
933	QString &result, bool isCRLF )
934	{
935	eatCFWS( scursor, send, isCRLF );
936	if ( scursor == send ) {
937	return false;
938	}
939
940	// domain := dot-atom / domain-literal / atom ("." atom)*
941	//
942	// equivalent to:
943	// domain = dot-atom / domain-literal,
944	// since parseDotAtom does allow CFWS between atoms and dots
945
946	if ( *scursor == '[' ) {
947	// domain-literal:
948	QString maybeDomainLiteral;
949	// eat '[':
950	scursor++;
951	while ( parseGenericQuotedString( scursor, send, maybeDomainLiteral,
952	isCRLF, '[', ']' ) ) {
953	if ( scursor == send ) {
954	// end of header: check for closing ']':
955	if ( *( scursor - `1` ) == ']' ) {
956	// OK, last char was ']':
957	result = maybeDomainLiteral;
958	return true;
959	} else {
960	// not OK, domain-literal wasn't closed:
961	return false;
962	}
963	}
964	// we hit openChar in parseGenericQuotedString.
965	// include it in maybeDomainLiteral and keep on parsing:
966	if ( *( scursor - `1` ) == '[' ) {
967	maybeDomainLiteral += QLatin1Char ( '[' );
968	continue;
969	}
970	// OK, real end of domain-literal:
971	result = maybeDomainLiteral;
972	return true;
973	}
974	} else {
975	// dot-atom:
976	QString maybeDotAtom;
977	if ( parseDotAtom( scursor, send, maybeDotAtom, isCRLF ) ) {
978	result = maybeDotAtom;
979	// Domain may end with '.', if so preserve it'
980	if ( scursor != send && *scursor == '.' ) {
981	result += QLatin1Char ( '.' );
982	scursor++;
983	}
984	return true;
985	}
986	}
987	return false;
988	}
989
990	bool parseObsRoute( const char* &scursor, const char* const send,
991	QStringList &result, bool isCRLF, bool save )
992	{
993	while ( scursor != send ) {
994	eatCFWS( scursor, send, isCRLF );
995	if ( scursor == send ) {
996	return false;
997	}
998
999	// empty entry:
1000	if ( *scursor == ',' ) {
1001	scursor++;
1002	if ( save ) {
1003	result.append( QString () );
1004	}
1005	continue;
1006	}
1007
1008	// empty entry ending the list:
1009	if ( *scursor == ':' ) {
1010	scursor++;
1011	if ( save ) {
1012	result.append( QString () );
1013	}
1014	return true;
1015	}
1016
1017	// each non-empty entry must begin with '@':
1018	if ( *scursor != '@' ) {
1019	return false;
1020	} else {
1021	scursor++;
1022	}
1023
1024	QString maybeDomain;
1025	if ( !parseDomain( scursor, send, maybeDomain, isCRLF ) ) {
1026	return false;
1027	}
1028	if ( save ) {
1029	result.append( maybeDomain );
1030	}
1031
1032	// eat the following (optional) comma:
1033	eatCFWS( scursor, send, isCRLF );
1034	if ( scursor == send ) {
1035	return false;
1036	}
1037	if ( *scursor == ':' ) {
1038	scursor++;
1039	return true;
1040	}
1041	if ( *scursor == ',' ) {
1042	scursor++;
1043	}
1044	}
1045
1046	return false;
1047	}
1048
1049	bool parseAddrSpec( const char* &scursor, const char * const send,
1050	AddrSpec &result, bool isCRLF )
1051	{
1052	//
1053	// STEP 1:
1054	// local-part := dot-atom / quoted-string / word ("." word)*
1055	//
1056	// this is equivalent to:
1057	// local-part := word ("." word)*
1058
1059	QString maybeLocalPart;
1060	QString tmp;
1061
1062	while ( scursor != send ) {
1063	// first, eat any whitespace
1064	eatCFWS( scursor, send, isCRLF );
1065
1066	char ch = *scursor++;
1067	switch ( ch ) {
1068	case '.': // dot
1069	maybeLocalPart += QLatin1Char ( '.' );
1070	break;
1071
1072	case '@':
1073	goto SAW_AT_SIGN;
1074	break;
1075
1076	case '"': // quoted-string
1077	tmp.clear();
1078	if ( parseGenericQuotedString( scursor, send, tmp, isCRLF, '"', '"' ) ) {
1079	maybeLocalPart += tmp;
1080	} else {
1081	return false;
1082	}
1083	break;
1084
1085	default: // atom
1086	scursor--; // re-set scursor to point to ch again
1087	tmp.clear();
1088	if ( parseAtom( scursor, send, tmp, false / no 8bit / ) ) {
1089	maybeLocalPart += tmp;
1090	} else {
1091	return false; // parseAtom can only fail if the first char is non-atext.
1092	}
1093	break;
1094	}
1095	}
1096
1097	return false;
1098
1099	//
1100	// STEP 2:
1101	// domain
1102	//
1103
1104	SAW_AT_SIGN:
1105
1106	assert( *( scursor - `1` ) == '@' );
1107
1108	QString maybeDomain;
1109	if ( !parseDomain( scursor, send, maybeDomain, isCRLF ) ) {
1110	return false;
1111	}
1112
1113	result.localPart = maybeLocalPart;
1114	result.domain = maybeDomain;
1115
1116	return true;
1117	}
1118
1119	bool parseAngleAddr( const char* &scursor, const char * const send,
1120	AddrSpec &result, bool isCRLF )
1121	{
1122	// first, we need an opening angle bracket:
1123	eatCFWS( scursor, send, isCRLF );
1124	if ( scursor == send \|\| *scursor != '<' ) {
1125	return false;
1126	}
1127	scursor++; // eat '<'
1128
1129	eatCFWS( scursor, send, isCRLF );
1130	if ( scursor == send ) {
1131	return false;
1132	}
1133
1134	if ( scursor == '@' \|\| scursor == ',' ) {
1135	// obs-route: parse, but ignore:
1136	KMIME_WARN << "obsolete source route found! ignoring.";
1137	QStringList dummy;
1138	if ( !parseObsRoute( scursor, send, dummy,
1139	isCRLF, false / don't save / ) ) {
1140	return false;
1141	}
1142	// angle-addr isn't complete until after the '>':
1143	if ( scursor == send ) {
1144	return false;
1145	}
1146	}
1147
1148	// parse addr-spec:
1149	AddrSpec maybeAddrSpec;
1150	if ( !parseAddrSpec( scursor, send, maybeAddrSpec, isCRLF ) ) {
1151	return false;
1152	}
1153
1154	eatCFWS( scursor, send, isCRLF );
1155	if ( scursor == send \|\| *scursor != '>' ) {
1156	return false;
1157	}
1158	scursor++;
1159
1160	result = maybeAddrSpec;
1161	return true;
1162
1163	}
1164
1165	static QString stripQuotes( const QString &input )
1166	{
1167	const QLatin1Char quotes( '"' );
1168	if ( input.startsWith( quotes ) && input.endsWith( quotes ) ) {
1169	QString stripped( input.mid( `1`, input.size() - `2` ) );
1170	return stripped;
1171	} else {
1172	return input;
1173	}
1174	}
1175
1176	bool parseMailbox( const char* &scursor, const char * const send,
1177	Mailbox &result, bool isCRLF )
1178	{
1179	eatCFWS( scursor, send, isCRLF );
1180	if ( scursor == send ) {
1181	return false;
1182	}
1183
1184	AddrSpec maybeAddrSpec;
1185	QString maybeDisplayName;
1186
1187	// first, try if it's a vanilla addr-spec:
1188	const char * oldscursor = scursor;
1189	if ( parseAddrSpec( scursor, send, maybeAddrSpec, isCRLF ) ) {
1190	result.setAddress( maybeAddrSpec );
1191	// check for the obsolete form of display-name (as comment):
1192	eatWhiteSpace( scursor, send );
1193	if ( scursor != send && *scursor == '(' ) {
1194	scursor++;
1195	if ( !parseComment( scursor, send, maybeDisplayName, isCRLF, true /keep/ ) ) {
1196	return false;
1197	}
1198	}
1199	result.setName( stripQuotes( maybeDisplayName ) );
1200	return true;
1201	}
1202	scursor = oldscursor;
1203
1204	// second, see if there's a display-name:
1205	if ( !parsePhrase( scursor, send, maybeDisplayName, isCRLF ) ) {
1206	// failed: reset cursor, note absent display-name
1207	maybeDisplayName.clear();
1208	scursor = oldscursor;
1209	} else {
1210	// succeeded: eat CFWS
1211	eatCFWS( scursor, send, isCRLF );
1212	if ( scursor == send ) {
1213	return false;
1214	}
1215	}
1216
1217	// third, parse the angle-addr:
1218	if ( !parseAngleAddr( scursor, send, maybeAddrSpec, isCRLF ) ) {
1219	return false;
1220	}
1221
1222	if ( maybeDisplayName.isNull() ) {
1223	// check for the obsolete form of display-name (as comment):
1224	eatWhiteSpace( scursor, send );
1225	if ( scursor != send && *scursor == '(' ) {
1226	scursor++;
1227	if ( !parseComment( scursor, send, maybeDisplayName, isCRLF, true /keep/ ) ) {
1228	return false;
1229	}
1230	}
1231	}
1232
1233	result.setName( stripQuotes( maybeDisplayName ) );
1234	result.setAddress( maybeAddrSpec );
1235	return true;
1236	}
1237
1238	bool parseGroup( const char* &scursor, const char * const send,
1239	Address &result, bool isCRLF )
1240	{
1241	// group := display-name ":" [ mailbox-list / CFWS ] ";" [CFWS]
1242	//
1243	// equivalent to:
1244	// group := display-name ":" [ obs-mbox-list ] ";"
1245
1246	eatCFWS( scursor, send, isCRLF );
1247	if ( scursor == send ) {
1248	return false;
1249	}
1250
1251	// get display-name:
1252	QString maybeDisplayName;
1253	if ( !parsePhrase( scursor, send, maybeDisplayName, isCRLF ) ) {
1254	return false;
1255	}
1256
1257	// get ":":
1258	eatCFWS( scursor, send, isCRLF );
1259	if ( scursor == send \|\| *scursor != ':' ) {
1260	return false;
1261	}
1262
1263	// KDE5 TODO: Don't expose displayName as public, but rather add setter for it that
1264	// automatically calls removeBidiControlChars
1265	result.displayName = removeBidiControlChars( maybeDisplayName );
1266
1267	// get obs-mbox-list (may contain empty entries):
1268	scursor++;
1269	while ( scursor != send ) {
1270	eatCFWS( scursor, send, isCRLF );
1271	if ( scursor == send ) {
1272	return false;
1273	}
1274
1275	// empty entry:
1276	if ( *scursor == ',' ) {
1277	scursor++;
1278	continue;
1279	}
1280
1281	// empty entry ending the list:
1282	if ( *scursor == ';' ) {
1283	scursor++;
1284	return true;
1285	}
1286
1287	Mailbox maybeMailbox;
1288	if ( !parseMailbox( scursor, send, maybeMailbox, isCRLF ) ) {
1289	return false;
1290	}
1291	result.mailboxList.append( maybeMailbox );
1292
1293	eatCFWS( scursor, send, isCRLF );
1294	// premature end:
1295	if ( scursor == send ) {
1296	return false;
1297	}
1298	// regular end of the list:
1299	if ( *scursor == ';' ) {
1300	scursor++;
1301	return true;
1302	}
1303	// eat regular list entry separator:
1304	if ( *scursor == ',' ) {
1305	scursor++;
1306	}
1307	}
1308	return false;
1309	}
1310
1311	bool parseAddress( const char* &scursor, const char * const send,
1312	Address &result, bool isCRLF )
1313	{
1314	// address := mailbox / group
1315
1316	eatCFWS( scursor, send, isCRLF );
1317	if ( scursor == send ) {
1318	return false;
1319	}
1320
1321	// first try if it's a single mailbox:
1322	Mailbox maybeMailbox;
1323	const char * oldscursor = scursor;
1324	if ( parseMailbox( scursor, send, maybeMailbox, isCRLF ) ) {
1325	// yes, it is:
1326	result.displayName.clear();
1327	result.mailboxList.append( maybeMailbox );
1328	return true;
1329	}
1330	scursor = oldscursor;
1331
1332	Address maybeAddress;
1333
1334	// no, it's not a single mailbox. Try if it's a group:
1335	if ( !parseGroup( scursor, send, maybeAddress, isCRLF ) ) {
1336	return false;
1337	}
1338
1339	result = maybeAddress;
1340	return true;
1341	}
1342
1343	bool parseAddressList( const char* &scursor, const char * const send,
1344	AddressList &result, bool isCRLF )
1345	{
1346	while ( scursor != send ) {
1347	eatCFWS( scursor, send, isCRLF );
1348	// end of header: this is OK.
1349	if ( scursor == send ) {
1350	return true;
1351	}
1352	// empty entry: ignore:
1353	if ( *scursor == ',' ) {
1354	scursor++;
1355	continue;
1356	}
1357	// broken clients might use ';' as list delimiter, accept that as well
1358	if ( *scursor == ';' ) {
1359	scursor++;
1360	continue;
1361	}
1362
1363	// parse one entry
1364	Address maybeAddress;
1365	if ( !parseAddress( scursor, send, maybeAddress, isCRLF ) ) {
1366	return false;
1367	}
1368	result.append( maybeAddress );
1369
1370	eatCFWS( scursor, send, isCRLF );
1371	// end of header: this is OK.
1372	if ( scursor == send ) {
1373	return true;
1374	}
1375	// comma separating entries: eat it.
1376	if ( *scursor == ',' ) {
1377	scursor++;
1378	}
1379	}
1380	return true;
1381	}
1382
1383	static QString asterisk = QString::fromLatin1( "0", `1` );
1384	static QString asteriskZero = QString::fromLatin1( "0", `2` );
1385	//static QString asteriskZeroAsterisk = QString::fromLatin1( "0", 3 );
1386
1387	// FIXME: Get rid of the very ugly "QStringOrQPair" thing. At this level, we are supposed to work
1388	// on byte arrays, not strings! The result parameter should be a simple
1389	// QPair<QByteArray,QByteArray>, which is the attribute name and the value.
1390	bool parseParameter( const char* &scursor, const char * const send,
1391	QPair<QString,QStringOrQPair> &result, bool isCRLF )
1392	{
1393	// parameter = regular-parameter / extended-parameter
1394	// regular-parameter = regular-parameter-name "=" value
1395	// extended-parameter =
1396	// value = token / quoted-string
1397	//
1398	// note that rfc2231 handling is out of the scope of this function.
1399	// Therefore we return the attribute as QString and the value as
1400	// (start,length) tupel if we see that the value is encoded
1401	// (trailing asterisk), for parseParameterList to decode...
1402
1403	eatCFWS( scursor, send, isCRLF );
1404	if ( scursor == send ) {
1405	return false;
1406	}
1407
1408	//
1409	// parse the parameter name:
1410	//
1411	// FIXME: maybeAttribute should be a QByteArray
1412	QString maybeAttribute;
1413	if ( !parseToken( scursor, send, maybeAttribute, false / no 8bit / ) ) {
1414	return false;
1415	}
1416
1417	eatCFWS( scursor, send, isCRLF );
1418	// premature end: not OK (haven't seen '=' yet).
1419	if ( scursor == send \|\| *scursor != '=' ) {
1420	return false;
1421	}
1422	scursor++; // eat '='
1423
1424	eatCFWS( scursor, send, isCRLF );
1425	if ( scursor == send ) {
1426	// don't choke on attribute=, meaning the value was omitted:
1427	if ( maybeAttribute.endsWith( asterisk ) ) {
1428	KMIME_WARN << "attribute ends with \"*\", but value is empty!"
1429	"Chopping away \"*\".";
1430	maybeAttribute.truncate( maybeAttribute.length() - `1` );
1431	}
1432	result = qMakePair( maybeAttribute.toLower(), QStringOrQPair () );
1433	return true;
1434	}
1435
1436	const char * oldscursor = scursor;
1437
1438	//
1439	// parse the parameter value:
1440	//
1441	QStringOrQPair maybeValue;
1442	if ( *scursor == '"' ) {
1443	// value is a quoted-string:
1444	scursor++;
1445	if ( maybeAttribute.endsWith( asterisk ) ) {
1446	// attributes ending with "" designate extended-parameters,*
1447	// which cannot have quoted-strings as values. So we remove the
1448	// trailing "" to not confuse upper layers.*
1449	KMIME_WARN << "attribute ends with \"*\", but value is a quoted-string!"
1450	"Chopping away \"*\".";
1451	maybeAttribute.truncate( maybeAttribute.length() - `1` );
1452	}
1453
1454	if ( !parseGenericQuotedString( scursor, send, maybeValue.qstring, isCRLF ) ) {
1455	scursor = oldscursor;
1456	result = qMakePair( maybeAttribute.toLower(), QStringOrQPair () );
1457	return false; // this case needs further processing by upper layers!!
1458	}
1459	} else {
1460	// value is a token:
1461	if ( !parseToken( scursor, send, maybeValue.qpair, false / no 8bit / ) ) {
1462	scursor = oldscursor;
1463	result = qMakePair( maybeAttribute.toLower(), QStringOrQPair () );
1464	return false; // this case needs further processing by upper layers!!
1465	}
1466	}
1467
1468	result = qMakePair( maybeAttribute.toLower(), maybeValue );
1469	return true;
1470	}
1471
1472	// FIXME: Get rid of QStringOrQPair: Use a simply QMap<QByteArray, QByteArray> for "result"
1473	// instead!
1474	bool parseRawParameterList( const char* &scursor, const char * const send,
1475	QMap<QString,QStringOrQPair> &result,
1476	bool isCRLF )
1477	{
1478	// we use parseParameter() consecutively to obtain a map of raw
1479	// attributes to raw values. "Raw" here means that we don't do
1480	// rfc2231 decoding and concatenation. This is left to
1481	// parseParameterList(), which will call this function.
1482	//
1483	// The main reason for making this chunk of code a separate
1484	// (private) method is that we can deal with broken parameters
1485	// _here_ and leave the rfc2231 handling solely to
1486	// parseParameterList(), which will still be enough work.
1487
1488	while ( scursor != send ) {
1489	eatCFWS( scursor, send, isCRLF );
1490	// empty entry ending the list: OK.
1491	if ( scursor == send ) {
1492	return true;
1493	}
1494	// empty list entry: ignore.
1495	if ( *scursor == ';' ) {
1496	scursor++;
1497	continue;
1498	}
1499
1500	QPair<QString, QStringOrQPair> maybeParameter;
1501	if ( !parseParameter( scursor, send, maybeParameter, isCRLF ) ) {
1502	// we need to do a bit of work if the attribute is not
1503	// NULL. These are the cases marked with "needs further
1504	// processing" in parseParameter(). Specifically, parsing of the
1505	// token or the quoted-string, which should represent the value,
1506	// failed. We take the easy way out and simply search for the
1507	// next ';' to start parsing again. (Another option would be to
1508	// take the text between '=' and ';' as value)
1509	if ( maybeParameter.first.isNull() ) {
1510	return false;
1511	}
1512	while ( scursor != send ) {
1513	if ( *scursor++ == ';' ) {
1514	goto IS_SEMICOLON;
1515	}
1516	}
1517	// scursor == send case: end of list.
1518	return true;
1519	IS_SEMICOLON:
1520	// scursor == ';' case: parse next entry.*
1521	continue;
1522	}
1523	// successful parsing brings us here:
1524	result.insert( maybeParameter.first, maybeParameter.second );
1525
1526	eatCFWS( scursor, send, isCRLF );
1527	// end of header: ends list.
1528	if ( scursor == send ) {
1529	return true;
1530	}
1531	// regular separator: eat it.
1532	if ( *scursor == ';' ) {
1533	scursor++;
1534	}
1535	}
1536	return true;
1537	}
1538
1539	static void decodeRFC2231Value( Codec* &rfc2231Codec,
1540	QTextCodec* &textcodec,
1541	bool isContinuation, QString &value,
1542	QPair<const char,int*> &source, QByteArray& charset )
1543	{
1544	//
1545	// parse the raw value into (charset,language,text):
1546	//
1547
1548	const char * decBegin = source.first;
1549	const char * decCursor = decBegin;
1550	const char * decEnd = decCursor + source.second;
1551
1552	if ( !isContinuation ) {
1553	// find the first single quote
1554	while ( decCursor != decEnd ) {
1555	if ( *decCursor == '\'' ) {
1556	break;
1557	} else {
1558	decCursor++;
1559	}
1560	}
1561
1562	if ( decCursor == decEnd ) {
1563	// there wasn't a single single quote at all!
1564	// take the whole value to be in latin-1:
1565	KMIME_WARN << "No charset in extended-initial-value."
1566	"Assuming \"iso-8859-1\".";
1567	value += QString::fromLatin1( decBegin, source.second );
1568	return;
1569	}
1570
1571	charset = QByteArray ( decBegin, decCursor - decBegin );
1572
1573	const char * oldDecCursor = ++decCursor;
1574	// find the second single quote (we ignore the language tag):
1575	while ( decCursor != decEnd ) {
1576	if ( *decCursor == '\'' ) {
1577	break;
1578	} else {
1579	decCursor++;
1580	}
1581	}
1582	if ( decCursor == decEnd ) {
1583	KMIME_WARN << "No language in extended-initial-value."
1584	"Trying to recover.";
1585	decCursor = oldDecCursor;
1586	} else {
1587	decCursor++;
1588	}
1589
1590	// decCursor now points to the start of the
1591	// "extended-other-values":
1592
1593	//
1594	// get the decoders:
1595	//
1596
1597	bool matchOK = false;
1598	textcodec = KGlobal::charsets()->codecForName( QLatin1String ( charset ), matchOK );
1599	if ( !matchOK ) {
1600	textcodec = `0`;
1601	KMIME_WARN_UNKNOWN( Charset, charset );
1602	}
1603	}
1604
1605	if ( !rfc2231Codec ) {
1606	rfc2231Codec = Codec::codecForName( "x-kmime-rfc2231" );
1607	assert( rfc2231Codec );
1608	}
1609
1610	if ( !textcodec ) {
1611	value += QString::fromLatin1( decCursor, decEnd - decCursor );
1612	return;
1613	}
1614
1615	Decoder * dec = rfc2231Codec->makeDecoder();
1616	assert( dec );
1617
1618	//
1619	// do the decoding:
1620	//
1621
1622	QByteArray buffer;
1623	buffer.resize( rfc2231Codec->maxDecodedSizeFor( decEnd - decCursor ) );
1624	QByteArray::Iterator bit = buffer.begin();
1625	QByteArray::ConstIterator bend = buffer.end();
1626
1627	if ( !dec->decode( decCursor, decEnd, bit, bend ) ) {
1628	KMIME_WARN << rfc2231Codec->name()
1629	<< "codec lies about its maxDecodedSizeFor()" << endl
1630	<< "result may be truncated";
1631	}
1632
1633	value += textcodec->toUnicode( buffer.begin(), bit - buffer.begin() );
1634
1635	// kDebug() << "value now: \"" << value << "\"";
1636	// cleanup:
1637	delete dec;
1638	}
1639
1640	// known issues:
1641	// - permutes rfc2231 continuations when the total number of parts
1642	// exceeds 10 (other-sections then becomes xy, ie. two digits)*
1643
1644	bool parseParameterListWithCharset( const char* &scursor,
1645	const char * const send,
1646	QMap<QString,QString> &result,
1647	QByteArray& charset, bool isCRLF )
1648	{
1649	// parse the list into raw attribute-value pairs:
1650	QMap<QString, QStringOrQPair> rawParameterList;
1651	if ( !parseRawParameterList( scursor, send, rawParameterList, isCRLF ) ) {
1652	return false;
1653	}
1654
1655	if ( rawParameterList.isEmpty() ) {
1656	return true;
1657	}
1658
1659	// decode rfc 2231 continuations and alternate charset encoding:
1660
1661	// NOTE: this code assumes that what QMapIterator delivers is sorted
1662	// by the key!
1663
1664	Codec * rfc2231Codec = `0`;
1665	QTextCodec * textcodec = `0`;
1666	QString attribute;
1667	QString value;
1668	enum Mode {
1669	NoMode = `0x0`, Continued = `0x1`, Encoded = `0x2`
1670	};
1671
1672	enum EncodingMode {
1673	NoEncoding,
1674	RFC2047,
1675	RFC2231
1676	};
1677
1678	QMap<QString, QStringOrQPair>::Iterator it, end = rawParameterList.end();
1679
1680	for ( it = rawParameterList.begin() ; it != end ; ++it ) {
1681	if ( attribute.isNull() \|\| !it.key().startsWith( attribute ) ) {
1682	//
1683	// new attribute:
1684	//
1685
1686	// store the last attribute/value pair in the result map now:
1687	if ( !attribute.isNull() ) {
1688	result.insert( attribute, value );
1689	}
1690	// and extract the information from the new raw attribute:
1691	value.clear();
1692	attribute = it.key();
1693	int mode = NoMode;
1694	EncodingMode encodingMode = NoEncoding;
1695
1696	// is the value rfc2331-encoded?
1697	if ( attribute.endsWith( asterisk ) ) {
1698	attribute.truncate( attribute.length() - `1` );
1699	mode \|= Encoded;
1700	encodingMode = RFC2231;
1701	}
1702	// is the value rfc2047-encoded?
1703	if ( !( it ).qstring.isNull() && ( it ).qstring.contains( QLatin1String ( "=?" ) ) ) {
1704	mode \|= Encoded;
1705	encodingMode = RFC2047;
1706	}
1707	// is the value continued?
1708	if ( attribute.endsWith( asteriskZero ) ) {
1709	attribute.truncate( attribute.length() - `2` );
1710	mode \|= Continued;
1711	}
1712	//
1713	// decode if necessary:
1714	//
1715	if ( mode & Encoded ) {
1716	if ( encodingMode == RFC2231 ) {
1717	decodeRFC2231Value( rfc2231Codec, textcodec,
1718	false, / isn't continuation /
1719	value, ( *it ).qpair, charset );
1720	}
1721	else if ( encodingMode == RFC2047 ) {
1722	value += decodeRFC2047String( ( *it ).qstring.toLatin1(), charset );
1723	}
1724	} else {
1725	// not encoded.
1726	if ( ( *it ).qpair.first ) {
1727	value += QString::fromLatin1( ( it ).qpair.first, ( it ).qpair.second );
1728	} else {
1729	value += ( *it ).qstring;
1730	}
1731	}
1732
1733	//
1734	// shortcut-processing when the value isn't encoded:
1735	//
1736
1737	if ( !( mode & Continued ) ) {
1738	// save result already:
1739	result.insert( attribute, value );
1740	// force begin of a new attribute:
1741	attribute.clear();
1742	}
1743	} else { // it.key().startsWith( attribute )
1744	//
1745	// continuation
1746	//
1747
1748	// ignore the section and trust QMap to have sorted the keys:
1749	if ( it.key().endsWith( asterisk ) ) {
1750	// encoded
1751	decodeRFC2231Value( rfc2231Codec, textcodec,
1752	true, / is continuation /
1753	value, ( *it ).qpair, charset );
1754	} else {
1755	// not encoded
1756	if ( ( *it ).qpair.first ) {
1757	value += QString::fromLatin1( ( it ).qpair.first, ( it ).qpair.second );
1758	} else {
1759	value += ( *it ).qstring;
1760	}
1761	}
1762	}
1763	}
1764
1765	// write last attr/value pair:
1766	if ( !attribute.isNull() ) {
1767	result.insert( attribute, value );
1768	}
1769
1770	return true;
1771	}
1772
1773
1774	bool parseParameterList( const char* &scursor, const char * const send,
1775	QMap<QString,QString> &result, bool isCRLF )
1776	{
1777	QByteArray charset;
1778	return parseParameterListWithCharset( scursor, send, result, charset, isCRLF );
1779	}
1780
// Three-letter day names, starting with Sunday.
static const char * const stdDayNames[] = {
  "Sun",
  "Mon",
  "Tue",
  "Wed",
  "Thu",
  "Fri",
  "Sat"
};
// number of entries in stdDayNames:
static const int stdDayNamesLen = sizeof( stdDayNames ) / sizeof( stdDayNames[0] );
1785
1786	static bool parseDayName( const char* &scursor, const char * const send )
1787	{
1788	// check bounds:
1789	if ( send - scursor < `3` ) {
1790	return false;
1791	}
1792
1793	for ( int i = `0` ; i < stdDayNamesLen ; ++i ) {
1794	if ( qstrnicmp( scursor, stdDayNames[i], `3` ) == `0` ) {
1795	scursor += `3`;
1796	// kDebug() << "found" << stdDayNames[i];
1797	return true;
1798	}
1799	}
1800
1801	return false;
1802	}
1803
// Three-letter month names, January first.
static const char * const stdMonthNames[] = {
  "Jan", "Feb", "Mar",
  "Apr", "May", "Jun",
  "Jul", "Aug", "Sep",
  "Oct", "Nov", "Dec"
};
// number of entries in stdMonthNames:
static const int stdMonthNamesLen =
  sizeof( stdMonthNames ) / sizeof( stdMonthNames[0] );
1810
1811	static bool parseMonthName( const char* &scursor, const char * const send,
1812	int &result )
1813	{
1814	// check bounds:
1815	if ( send - scursor < `3` ) {
1816	return false;
1817	}
1818
1819	for ( result = `0` ; result < stdMonthNamesLen ; ++result ) {
1820	if ( qstrnicmp( scursor, stdMonthNames[result], `3` ) == `0` ) {
1821	scursor += `3`;
1822	return true;
1823	}
1824	}
1825
1826	// not found:
1827	return false;
1828	}
1829
// Known alphabetic time zone names and their offsets in seconds east of GMT.
// The table is searched front to back and the first match wins, so every
// name must occur only once.
static const struct {
  const char * tzName;
  long int secsEastOfGMT;
} timeZones[] = {
  // rfc 822 timezones:
  { "GMT", 0 },
  { "UT", 0 },
  { "EDT", -4*3600 },
  { "EST", -5*3600 },
  { "CDT", -5*3600 }, // central daylight time; "MST" belongs to -7h only
  { "CST", -6*3600 },
  { "MDT", -6*3600 },
  { "MST", -7*3600 },
  { "PDT", -7*3600 },
  { "PST", -8*3600 },
  // common, non-rfc-822 zones:
  { "CET", 1*3600 },
  { "MET", 1*3600 },
  { "UTC", 0 },
  { "CEST", 2*3600 },
  { "BST", 1*3600 },
  // rfc 822 military timezones:
  { "Z", 0 },
  { "A", -1*3600 },
  { "B", -2*3600 },
  { "C", -3*3600 },
  { "D", -4*3600 },
  { "E", -5*3600 },
  { "F", -6*3600 },
  { "G", -7*3600 },
  { "H", -8*3600 },
  { "I", -9*3600 },
  // J is not used!
  { "K", -10*3600 },
  { "L", -11*3600 },
  { "M", -12*3600 },
  { "N", 1*3600 },
  { "O", 2*3600 },
  { "P", 3*3600 },
  { "Q", 4*3600 },
  { "R", 5*3600 },
  { "S", 6*3600 },
  { "T", 7*3600 },
  { "U", 8*3600 },
  { "V", 9*3600 },
  { "W", 10*3600 },
  { "X", 11*3600 },
  { "Y", 12*3600 },
};
// number of entries in timeZones:
static const int timeZonesLen = sizeof timeZones / sizeof *timeZones;
1880
1881	static bool parseAlphaNumericTimeZone( const char* &scursor,
1882	const char * const send,
1883	long int &secsEastOfGMT,
1884	bool &timeZoneKnown )
1885	{
1886	// allow the timezone to be wrapped in quotes; bug 260761
1887	if ( *scursor == '"' ) {
1888	scursor++;
1889
1890	if ( scursor == send ) {
1891	return false;
1892	}
1893	}
1894
1895	QPair<const char, int*> maybeTimeZone( `0`, `0` );
1896	if ( !parseToken( scursor, send, maybeTimeZone, false /no 8bit/ ) ) {
1897	return false;
1898	}
1899	for ( int i = `0` ; i < timeZonesLen ; ++i ) {
1900	if ( qstrnicmp( timeZones[i].tzName,
1901	maybeTimeZone.first, maybeTimeZone.second ) == `0` ) {
1902	scursor += maybeTimeZone.second;
1903	secsEastOfGMT = timeZones[i].secsEastOfGMT;
1904	timeZoneKnown = true;
1905
1906	if ( *scursor == '"' ) {
1907	scursor++;
1908	}
1909
1910	return true;
1911	}
1912	}
1913
1914	// don't choke just because we don't happen to know the time zone
1915	KMIME_WARN_UNKNOWN( time zone,
1916	QByteArray ( maybeTimeZone.first, maybeTimeZone.second ) );
1917	secsEastOfGMT = `0`;
1918	timeZoneKnown = false;
1919	return true;
1920	}
1921
// parse a number and return the number of digits parsed:
//
// scursor - in/out read position; moved behind the last digit consumed.
// send    - end of the input (one past the last byte).
// result  - receives the value of the digits read, 0 if there were none.
//
// Bytes are converted to unsigned char before being classified, as isdigit()
// is not defined for negative values, which 8-bit input produces where char
// is signed. The value is accumulated in an unsigned int so that an overlong
// digit sequence wraps around instead of overflowing a signed integer.
int parseDigits( const char* &scursor, const char * const send, int &result )
{
  unsigned int accumulated = 0;
  int digits = 0;
  while ( scursor != send && isdigit( static_cast<unsigned char>( *scursor ) ) ) {
    accumulated = accumulated * 10u + static_cast<unsigned int>( *scursor - '0' );
    ++scursor;
    ++digits;
  }
  result = static_cast<int>( accumulated );
  return digits;
}
1933
1934	static bool parseTimeOfDay( const char* &scursor, const char * const send,
1935	int &hour, int &min, int &sec, bool isCRLF=false )
1936	{
1937	// time-of-day := 2DIGIT [CFWS] ":" [CFWS] 2DIGIT [ [CFWS] ":" 2DIGIT ]
1938
1939	//
1940	// 2DIGIT representing "hour":
1941	//
1942	if ( !parseDigits( scursor, send, hour ) ) {
1943	return false;
1944	}
1945
1946	eatCFWS( scursor, send, isCRLF );
1947	if ( scursor == send \|\| *scursor != ':' ) {
1948	return false;
1949	}
1950	scursor++; // eat ':'
1951
1952	eatCFWS( scursor, send, isCRLF );
1953	if ( scursor == send ) {
1954	return false;
1955	}
1956
1957	//
1958	// 2DIGIT representing "minute":
1959	//
1960	if ( !parseDigits( scursor, send, min ) ) {
1961	return false;
1962	}
1963
1964	eatCFWS( scursor, send, isCRLF );
1965	if ( scursor == send ) {
1966	return true; // seconds are optional
1967	}
1968
1969	//
1970	// let's see if we have a 2DIGIT representing "second":
1971	//
1972	if ( *scursor == ':' ) {
1973	// yepp, there are seconds:
1974	scursor++; // eat ':'
1975	eatCFWS( scursor, send, isCRLF );
1976	if ( scursor == send ) {
1977	return false;
1978	}
1979
1980	if ( !parseDigits( scursor, send, sec ) ) {
1981	return false;
1982	}
1983	} else {
1984	sec = `0`;
1985	}
1986
1987	return true;
1988	}
1989
1990	bool parseTime( const char* &scursor, const char * send,
1991	int &hour, int &min, int &sec, long int &secsEastOfGMT,
1992	bool &timeZoneKnown, bool isCRLF )
1993	{
1994	// time := time-of-day CFWS ( zone / obs-zone )
1995	//
1996	// obs-zone := "UT" / "GMT" /
1997	// "EST" / "EDT" / ; -0500 / -0400
1998	// "CST" / "CDT" / ; -0600 / -0500
1999	// "MST" / "MDT" / ; -0700 / -0600
2000	// "PST" / "PDT" / ; -0800 / -0700
2001	// "A"-"I" / "a"-"i" /
2002	// "K"-"Z" / "k"-"z"
2003
2004	eatCFWS( scursor, send, isCRLF );
2005	if ( scursor == send ) {
2006	return false;
2007	}
2008
2009	if ( !parseTimeOfDay( scursor, send, hour, min, sec, isCRLF ) ) {
2010	return false;
2011	}
2012
2013	eatCFWS( scursor, send, isCRLF );
2014	// there might be no timezone but a year following
2015	if ( ( scursor == send ) \|\| isdigit( *scursor ) ) {
2016	timeZoneKnown = false;
2017	secsEastOfGMT = `0`;
2018	return true; // allow missing timezone
2019	}
2020
2021	timeZoneKnown = true;
2022	if ( scursor == '+' \|\| scursor == '-' ) {
2023	// remember and eat '-'/'+':
2024	const char sign = *scursor++;
2025	// numerical timezone:
2026	int maybeTimeZone;
2027	if ( parseDigits( scursor, send, maybeTimeZone ) != `4` ) {
2028	return false;
2029	}
2030	secsEastOfGMT = `60` * ( maybeTimeZone / `100` * `60` + maybeTimeZone % `100` );
2031	if ( sign == '-' ) {
2032	secsEastOfGMT *= -`1`;
2033	if ( secsEastOfGMT == `0` ) {
2034	timeZoneKnown = false; // -0000 means indetermined tz
2035	}
2036	}
2037	} else {
2038	// maybe alphanumeric timezone:
2039	if ( !parseAlphaNumericTimeZone( scursor, send, secsEastOfGMT, timeZoneKnown ) ) {
2040	return false;
2041	}
2042	}
2043	return true;
2044	}
2045
2046	bool parseDateTime( const char* &scursor, const char * const send,
2047	KDateTime &result, bool isCRLF )
2048	{
2049	// Parsing date-time; strict mode:
2050	//
2051	// date-time := [ [CFWS] day-name [CFWS] "," ] ; wday
2052	// (expanded) [CFWS] 12DIGIT CFWS month-name CFWS 2DIGIT [CFWS] ; date
2053	// time
2054	//
2055	// day-name := "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun"
2056	// month-name := "Jan" / "Feb" / "Mar" / "Apr" / "May" / "Jun" /
2057	// "Jul" / "Aug" / "Sep" / "Oct" / "Nov" / "Dec"
2058
2059	result = KDateTime ();
2060	QDateTime maybeDateTime;
2061
2062	eatCFWS( scursor, send, isCRLF );
2063	if ( scursor == send ) {
2064	return false;
2065	}
2066
2067	//
2068	// let's see if there's a day-of-week:
2069	//
2070	if ( parseDayName( scursor, send ) ) {
2071	eatCFWS( scursor, send, isCRLF );
2072	if ( scursor == send ) {
2073	return false;
2074	}
2075	// day-name should be followed by ',' but we treat it as optional:
2076	if ( *scursor == ',' ) {
2077	scursor++; // eat ','
2078	eatCFWS( scursor, send, isCRLF );
2079	}
2080	}
2081
2082	int maybeMonth = -`1`;
2083	bool asctimeFormat = false;
2084
2085	// ANSI-C asctime() format is: Wed Jun 30 21:49:08 1993
2086	if ( !isdigit( *scursor ) && parseMonthName( scursor, send, maybeMonth ) ) {
2087	asctimeFormat = true;
2088	eatCFWS( scursor, send, isCRLF );
2089	}
2090
2091	//
2092	// 12DIGIT representing "day" (of month):*
2093	//
2094	int maybeDay;
2095	if ( !parseDigits( scursor, send, maybeDay ) ) {
2096	return false;
2097	}
2098
2099	eatCFWS( scursor, send, isCRLF );
2100	if ( scursor == send ) {
2101	return false;
2102	}
2103
2104	// ignore ","; bug 54098
2105	if ( *scursor == ',' ) {
2106	scursor++;
2107	}
2108
2109	//
2110	// month-name:
2111	//
2112	if ( !asctimeFormat && !parseMonthName( scursor, send, maybeMonth ) ) {
2113	return false;
2114	}
2115	if ( scursor == send ) {
2116	return false;
2117	}
2118	assert( maybeMonth >= `0` ); assert( maybeMonth <= `11` );
2119	++maybeMonth; // 0-11 -> 1-12
2120
2121	eatCFWS( scursor, send, isCRLF );
2122	if ( scursor == send ) {
2123	return false;
2124	}
2125
2126	// check for "year HH:MM:SS" or only "HH:MM:SS" (or "H:MM:SS")
2127	bool timeAfterYear = true;
2128	if ( ( send - scursor > `3` ) && ( ( scursor[`1`] == ':' ) \|\| ( scursor[`2`] == ':' ) ) ) {
2129	timeAfterYear = false; // first read time, then year
2130	}
2131
2132	//
2133	// 2DIGIT representing "year":*
2134	//
2135	int maybeYear = `0`;
2136
2137	if ( timeAfterYear && !parseDigits( scursor, send, maybeYear ) ) {
2138	return false;
2139	}
2140
2141	eatCFWS( scursor, send, isCRLF );
2142	if ( scursor == send ) {
2143	return false;
2144	}
2145
2146	//
2147	// time
2148	//
2149	int maybeHour, maybeMinute, maybeSecond;
2150	long int secsEastOfGMT;
2151	bool timeZoneKnown = true;
2152
2153	if ( !parseTime( scursor, send,
2154	maybeHour, maybeMinute, maybeSecond,
2155	secsEastOfGMT, timeZoneKnown, isCRLF ) ) {
2156	return false;
2157	}
2158
2159	// in asctime() the year follows the time
2160	if ( !timeAfterYear ) {
2161	eatCFWS( scursor, send, isCRLF );
2162	if ( scursor == send ) {
2163	return false;
2164	}
2165
2166	if ( !parseDigits( scursor, send, maybeYear ) ) {
2167	return false;
2168	}
2169	}
2170
2171	// RFC 2822 4.3 processing:
2172	if ( maybeYear < `50` ) {
2173	maybeYear += `2000`;
2174	} else if ( maybeYear < `1000` ) {
2175	maybeYear += `1900`;
2176	}
2177	// else keep as is
2178	if ( maybeYear < `1900` ) {
2179	return false; // rfc2822, 3.3
2180	}
2181
2182	maybeDateTime.setDate( QDate ( maybeYear, maybeMonth, maybeDay ) );
2183	maybeDateTime.setTime( QTime ( maybeHour, maybeMinute, maybeSecond ) );
2184
2185	if ( !maybeDateTime.isValid() ) {
2186	return false;
2187	}
2188
2189	result = KDateTime ( maybeDateTime, KDateTime::Spec ( KDateTime::OffsetFromUTC, secsEastOfGMT ) );
2190	if ( !result.isValid() ) {
2191	return false;
2192	}
2193	return true;
2194	}
2195
2196	Headers::Base *extractFirstHeader( QByteArray &head )
2197	{
2198	int endOfFieldBody = `0`;
2199	bool folded = false;
2200	Headers::Base *header = `0`;
2201
2202	int startOfFieldBody = head.indexOf( ':' );
2203	const int endOfFieldHeader = startOfFieldBody;
2204
2205	if ( startOfFieldBody > -`1` ) { //there is another header
2206	startOfFieldBody++; //skip the ':'
2207	if ( head [startOfFieldBody] == ' ' ) { // skip the space after the ':', if there
2208	startOfFieldBody++;
2209	}
2210	endOfFieldBody = findHeaderLineEnd( head, startOfFieldBody, &folded );
2211
2212	QByteArray rawType = head.left( endOfFieldHeader );
2213	QByteArray rawFieldBody = head.mid( startOfFieldBody, endOfFieldBody - startOfFieldBody );
2214	if ( folded ) {
2215	rawFieldBody = unfoldHeader( rawFieldBody );
2216	}
2217	// We might get an invalid mail without a field name, don't crash on that.
2218	if ( !rawType.isEmpty() ) {
2219	header = HeaderFactory::self()->createHeader( rawType );
2220	}
2221	if ( !header ) {
2222	//kWarning() << "Returning Generic header of type" << rawType;
2223	header = new Headers::Generic ( rawType.constData() );
2224	}
2225	header->from7BitString( rawFieldBody );
2226
2227	head.remove( `0`, endOfFieldBody + `1` );
2228	} else {
2229	head.clear();
2230	}
2231
2232	return header;
2233	}
2234
2235	void extractHeaderAndBody( const QByteArray &content, QByteArray &header, QByteArray &body )
2236	{
2237	header.clear();
2238	body.clear();
2239
2240	// empty header
2241	if ( content.startsWith( '\n' ) ) {
2242	body = content.right( content.length() - `1` );
2243	return;
2244	}
2245
2246	int pos = content.indexOf( "\n\n", `0` );
2247	if ( pos > -`1` ) {
2248	header = content.left( ++pos ); //header must* end with "\n" !!*
2249	body = content.mid( pos + `1`, content.length() - pos - `1` );
2250	} else {
2251	header = content;
2252	}
2253	}
2254
2255	Headers::Base::List parseHeaders( const QByteArray &head )
2256	{
2257	Headers::Base::List ret;
2258	Headers::Base *h;
2259
2260	QByteArray copy = head;
2261	while ( ( h = extractFirstHeader( copy ) ) ) {
2262	ret << h;
2263	}
2264
2265	return ret;
2266	}
2267
2268	} // namespace HeaderParsing
2269
2270	} // namespace KMime
2271