1/* -*- c++ -*-
2 kmime_header_parsing.cpp
3
4 KMime, the KDE Internet mail/usenet news message library.
5 Copyright (c) 2001-2002 Marc Mutz <mutz@kde.org>
6
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Library General Public
9 License as published by the Free Software Foundation; either
10 version 2 of the License, or (at your option) any later version.
11
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
16
17 You should have received a copy of the GNU Library General Public License
18 along with this library; see the file COPYING.LIB. If not, write to
19 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 Boston, MA 02110-1301, USA.
21*/
22
23#include "kmime_header_parsing.h"
24
25#include "kmime_codecs.h"
26#include "kmime_headerfactory_p.h"
27#include "kmime_headers.h"
28#include "kmime_util.h"
29#include "kmime_util_p.h"
30#include "kmime_dateformatter.h"
31#include "kmime_warning.h"
32
33#include <kglobal.h>
34#include <kcharsets.h>
35
36#include <QtCore/QTextCodec>
37#include <QtCore/QMap>
38#include <QtCore/QStringList>
39#include <QtCore/QUrl>
40
41#include <ctype.h> // for isdigit
42#include <cassert>
43
44using namespace KMime;
45using namespace KMime::Types;
46
47namespace KMime {
48
49namespace Types {
50
51// QUrl::fromAce is extremely expensive, so only use it when necessary.
52// Fortunately, the presence of IDNA is readily detected with a substring match...
53static inline QString QUrl_fromAce_wrapper( const QString & domain )
54{
55 if ( domain.contains( QLatin1String( "xn--" ) ) ) {
56 return QUrl::fromAce( domain.toLatin1() );
57 } else {
58 return domain;
59 }
60}
61
62static QString addr_spec_as_string( const AddrSpec & as, bool pretty )
63{
64 if ( as.isEmpty() ) {
65 return QString();
66 }
67
68 static QChar dotChar = QLatin1Char( '.' );
69 static QChar backslashChar = QLatin1Char( '\\' );
70 static QChar quoteChar = QLatin1Char( '"' );
71
72 bool needsQuotes = false;
73 QString result;
74 result.reserve( as.localPart.length() + as.domain.length() + 1 );
75 for ( int i = 0 ; i < as.localPart.length() ; ++i ) {
76 const QChar ch = as.localPart.at( i );
77 if ( ch == dotChar || isAText( ch.toLatin1() ) ) {
78 result += ch;
79 } else {
80 needsQuotes = true;
81 if ( ch == backslashChar || ch == quoteChar ) {
82 result += backslashChar;
83 }
84 result += ch;
85 }
86 }
87 const QString dom = pretty ? QUrl_fromAce_wrapper( as.domain ) : as.domain ;
88 if ( needsQuotes ) {
89 result = quoteChar + result + quoteChar;
90 }
91 if ( dom.isEmpty() ) {
92 return result;
93 } else {
94 result += QLatin1Char( '@' );
95 result += dom;
96 return result;
97 }
98}
99
100QString AddrSpec::asString() const
101{
102 return addr_spec_as_string( *this, false );
103}
104
105QString AddrSpec::asPrettyString() const
106{
107 return addr_spec_as_string( *this, true );
108}
109
110bool AddrSpec::isEmpty() const
111{
112 return localPart.isEmpty() && domain.isEmpty();
113}
114
115QByteArray Mailbox::address() const
116{
117 QByteArray result;
118 const QString asString = addr_spec_as_string( mAddrSpec, false );
119 if ( !asString.isEmpty() ) {
120 result = asString.toLatin1();
121 }
122 return result;
123 //return mAddrSpec.asString().toLatin1();
124}
125
126AddrSpec Mailbox::addrSpec() const
127{
128 return mAddrSpec;
129}
130
131QString Mailbox::name() const
132{
133 return mDisplayName;
134}
135
136void Mailbox::setAddress( const AddrSpec &addr )
137{
138 mAddrSpec = addr;
139}
140
141void Mailbox::setAddress( const QByteArray &addr )
142{
143 const char *cursor = addr.constData();
144 if ( !HeaderParsing::parseAngleAddr( cursor,
145 cursor + addr.length(), mAddrSpec ) ) {
146 if ( !HeaderParsing::parseAddrSpec( cursor, cursor + addr.length(),
147 mAddrSpec ) ) {
148 kWarning() << "Invalid address";
149 return;
150 }
151 }
152}
153
154void Mailbox::setName( const QString &name )
155{
156 mDisplayName = removeBidiControlChars( name );
157}
158
159void Mailbox::setNameFrom7Bit( const QByteArray &name,
160 const QByteArray &defaultCharset )
161{
162 QByteArray cs;
163 setName( decodeRFC2047String( name, cs, defaultCharset, false ) );
164}
165
166bool Mailbox::hasAddress() const
167{
168 return !mAddrSpec.isEmpty();
169}
170
171bool Mailbox::hasName() const
172{
173 return !mDisplayName.isEmpty();
174}
175
176QString Mailbox::prettyAddress() const
177{
178 return prettyAddress( QuoteNever );
179}
180
181QString Mailbox::prettyAddress( Quoting quoting ) const
182{
183 if ( !hasName() ) {
184 return QLatin1String( address() );
185 }
186 QString s = name();
187 if ( quoting != QuoteNever ) {
188 addQuotes( s, quoting == QuoteAlways /*bool force*/ );
189 }
190
191 if ( hasAddress() ) {
192 s += QLatin1String( " <" ) + QLatin1String( address() ) + QLatin1Char( '>' );
193 }
194 return s;
195}
196
197void Mailbox::fromUnicodeString( const QString &s )
198{
199 from7BitString( encodeRFC2047Sentence( s, "utf-8" ) );
200}
201
202void Mailbox::from7BitString( const QByteArray &s )
203{
204 const char *cursor = s.constData();
205 HeaderParsing::parseMailbox( cursor, cursor + s.length(), *this );
206}
207
208QByteArray KMime::Types::Mailbox::as7BitString( const QByteArray &encCharset ) const
209{
210 if ( !hasName() ) {
211 return address();
212 }
213 QByteArray rv;
214 if ( isUsAscii( name() ) ) {
215 QByteArray tmp = name().toLatin1();
216 addQuotes( tmp, false );
217 rv += tmp;
218 } else {
219 rv += encodeRFC2047String( name(), encCharset, true );
220 }
221 if ( hasAddress() ) {
222 rv += " <" + address() + '>';
223 }
224 return rv;
225}
226
227} // namespace Types
228
229namespace HeaderParsing {
230
231// parse the encoded-word (scursor points to after the initial '=')
232bool parseEncodedWord( const char* &scursor, const char * const send,
233 QString &result, QByteArray &language,
234 QByteArray &usedCS, const QByteArray &defaultCS,
235 bool forceCS )
236{
237 // make sure the caller already did a bit of the work.
238 assert( *( scursor - 1 ) == '=' );
239
240 //
241 // STEP 1:
242 // scan for the charset/language portion of the encoded-word
243 //
244
245 char ch = *scursor++;
246
247 if ( ch != '?' ) {
248 // kDebug() << "first";
249 //KMIME_WARN_PREMATURE_END_OF( EncodedWord );
250 return false;
251 }
252
253 // remember start of charset (ie. just after the initial "=?") and
254 // language (just after the first '*') fields:
255 const char * charsetStart = scursor;
256 const char * languageStart = 0;
257
258 // find delimiting '?' (and the '*' separating charset and language
259 // tags, if any):
260 for ( ; scursor != send ; scursor++ ) {
261 if ( *scursor == '?' ) {
262 break;
263 } else if ( *scursor == '*' && languageStart == 0 ) {
264 languageStart = scursor + 1;
265 }
266 }
267
268 // not found? can't be an encoded-word!
269 if ( scursor == send || *scursor != '?' ) {
270 // kDebug() << "second";
271 KMIME_WARN_PREMATURE_END_OF( EncodedWord );
272 return false;
273 }
274
275 // extract the language information, if any (if languageStart is 0,
276 // language will be null, too):
277 QByteArray maybeLanguage( languageStart, scursor - languageStart );
278 // extract charset information (keep in mind: the size given to the
279 // ctor is one off due to the \0 terminator):
280 QByteArray maybeCharset( charsetStart,
281 ( languageStart ? languageStart - 1 : scursor ) - charsetStart );
282
283 //
284 // STEP 2:
285 // scan for the encoding portion of the encoded-word
286 //
287
288 // remember start of encoding (just _after_ the second '?'):
289 scursor++;
290 const char * encodingStart = scursor;
291
292 // find next '?' (ending the encoding tag):
293 for ( ; scursor != send ; scursor++ ) {
294 if ( *scursor == '?' ) {
295 break;
296 }
297 }
298
299 // not found? Can't be an encoded-word!
300 if ( scursor == send || *scursor != '?' ) {
301 // kDebug() << "third";
302 KMIME_WARN_PREMATURE_END_OF( EncodedWord );
303 return false;
304 }
305
306 // extract the encoding information:
307 QByteArray maybeEncoding( encodingStart, scursor - encodingStart );
308
309 // kDebug() << "parseEncodedWord: found charset == \"" << maybeCharset
310 // << "\"; language == \"" << maybeLanguage
311 // << "\"; encoding == \"" << maybeEncoding << "\"";
312
313 //
314 // STEP 3:
315 // scan for encoded-text portion of encoded-word
316 //
317
318 // remember start of encoded-text (just after the third '?'):
319 scursor++;
320 const char * encodedTextStart = scursor;
321
322 // find the '?=' sequence (ending the encoded-text):
323 for ( ; scursor != send ; scursor++ ) {
324 if ( *scursor == '?' ) {
325 if ( scursor + 1 != send ) {
326 if ( *( scursor + 1 ) != '=' ) { // We expect a '=' after the '?', but we got something else; ignore
327 KMIME_WARN << "Stray '?' in q-encoded word, ignoring this.";
328 continue;
329 }
330 else { // yep, found a '?=' sequence
331 scursor += 2;
332 break;
333 }
334 }
335 else { // The '?' is the last char, but we need a '=' after it!
336 KMIME_WARN_PREMATURE_END_OF( EncodedWord );
337 return false;
338 }
339 }
340 }
341
342 if ( *( scursor - 2 ) != '?' || *( scursor - 1 ) != '=' ||
343 scursor < encodedTextStart + 2 ) {
344 KMIME_WARN_PREMATURE_END_OF( EncodedWord );
345 return false;
346 }
347
348 // set end sentinel for encoded-text:
349 const char * const encodedTextEnd = scursor - 2;
350
351 //
352 // STEP 4:
353 // setup decoders for the transfer encoding and the charset
354 //
355
356 // try if there's a codec for the encoding found:
357 Codec * codec = Codec::codecForName( maybeEncoding );
358 if ( !codec ) {
359 KMIME_WARN_UNKNOWN( Encoding, maybeEncoding );
360 return false;
361 }
362
363 // get an instance of a corresponding decoder:
364 Decoder * dec = codec->makeDecoder();
365 assert( dec );
366
367 // try if there's a (text)codec for the charset found:
368 bool matchOK = false;
369 QTextCodec *textCodec = 0;
370 if ( forceCS || maybeCharset.isEmpty() ) {
371 textCodec = KGlobal::charsets()->codecForName( QLatin1String( defaultCS ), matchOK );
372 usedCS = cachedCharset( defaultCS );
373 } else {
374 textCodec = KGlobal::charsets()->codecForName( QLatin1String( maybeCharset ), matchOK );
375 if ( !matchOK ) { //no suitable codec found => use default charset
376 textCodec = KGlobal::charsets()->codecForName( QLatin1String( defaultCS ), matchOK );
377 usedCS = cachedCharset( defaultCS );
378 } else {
379 usedCS = cachedCharset( maybeCharset );
380 }
381 }
382
383 if ( !matchOK || !textCodec ) {
384 KMIME_WARN_UNKNOWN( Charset, maybeCharset );
385 delete dec;
386 return false;
387 };
388
389 // kDebug() << "mimeName(): \"" << textCodec->name() << "\"";
390
391 // allocate a temporary buffer to store the 8bit text:
392 int encodedTextLength = encodedTextEnd - encodedTextStart;
393 QByteArray buffer;
394 buffer.resize( codec->maxDecodedSizeFor( encodedTextLength ) );
395 char *bbegin = buffer.data();
396 char *bend = bbegin + buffer.length();
397
398 //
399 // STEP 5:
400 // do the actual decoding
401 //
402
403 if ( !dec->decode( encodedTextStart, encodedTextEnd, bbegin, bend ) ) {
404 KMIME_WARN << codec->name() << "codec lies about its maxDecodedSizeFor("
405 << encodedTextLength << ")\nresult may be truncated";
406 }
407
408 result = textCodec->toUnicode( buffer.data(), bbegin - buffer.data() );
409
410 // kDebug() << "result now: \"" << result << "\"";
411 // cleanup:
412 delete dec;
413 language = maybeLanguage;
414
415 return true;
416}
417
// Advances scursor past any run of spaces, tabs, carriage returns and line
// feeds, stopping at the first other character or at send.
static inline void eatWhiteSpace( const char* &scursor, const char * const send )
{
  for ( ; scursor != send ; ++scursor ) {
    switch ( *scursor ) {
    case ' ':
    case '\n':
    case '\t':
    case '\r':
      continue; // still whitespace: move on to the next character
    default:
      return;
    }
  }
}
425
426bool parseAtom( const char * &scursor, const char * const send,
427 QString &result, bool allow8Bit )
428{
429 QPair<const char*, int> maybeResult;
430
431 if ( parseAtom( scursor, send, maybeResult, allow8Bit ) ) {
432 result += QString::fromLatin1( maybeResult.first, maybeResult.second );
433 return true;
434 }
435
436 return false;
437}
438
439bool parseAtom( const char * &scursor, const char * const send,
440 QPair<const char*,int> &result, bool allow8Bit )
441{
442 bool success = false;
443 const char *start = scursor;
444
445 while ( scursor != send ) {
446 signed char ch = *scursor++;
447 if ( ch > 0 && isAText( ch ) ) {
448 // AText: OK
449 success = true;
450 } else if ( allow8Bit && ch < 0 ) {
451 // 8bit char: not OK, but be tolerant.
452 KMIME_WARN_8BIT( ch );
453 success = true;
454 } else {
455 // CTL or special - marking the end of the atom:
456 // re-set sursor to point to the offending
457 // char and return:
458 scursor--;
459 break;
460 }
461 }
462 result.first = start;
463 result.second = scursor - start;
464 return success;
465}
466
467// FIXME: Remove this and the other parseToken() method. add a new one where "result" is a
468// QByteArray.
469bool parseToken( const char * &scursor, const char * const send,
470 QString &result, bool allow8Bit )
471{
472 QPair<const char*, int> maybeResult;
473
474 if ( parseToken( scursor, send, maybeResult, allow8Bit ) ) {
475 result += QString::fromLatin1( maybeResult.first, maybeResult.second );
476 return true;
477 }
478
479 return false;
480}
481
482bool parseToken( const char * &scursor, const char * const send,
483 QPair<const char*,int> &result, bool allow8Bit )
484{
485 bool success = false;
486 const char * start = scursor;
487
488 while ( scursor != send ) {
489 signed char ch = *scursor++;
490 if ( ch > 0 && isTText( ch ) ) {
491 // TText: OK
492 success = true;
493 } else if ( allow8Bit && ch < 0 ) {
494 // 8bit char: not OK, but be tolerant.
495 KMIME_WARN_8BIT( ch );
496 success = true;
497 } else {
498 // CTL or tspecial - marking the end of the atom:
499 // re-set sursor to point to the offending
500 // char and return:
501 scursor--;
502 break;
503 }
504 }
505 result.first = start;
506 result.second = scursor - start;
507 return success;
508}
509
// Reads the next input character into "ch" and advances "scursor". If the
// input is exhausted it warns about a premature end and makes the enclosing
// function return false. Requires "scursor", "send" and "ch" to be in scope
// where it is used.
#define READ_ch_OR_FAIL if ( scursor != send ) {            \
    ch = *scursor++;                                        \
  } else {                                                  \
    KMIME_WARN_PREMATURE_END_OF( GenericQuotedString );     \
    return false;                                           \
  }
516
517// known issues:
518//
519// - doesn't handle quoted CRLF
520
521// FIXME: Why is result a QString? This should be a QByteArray, since at this level, we don't
522// know about encodings yet!
523bool parseGenericQuotedString( const char* &scursor, const char * const send,
524 QString &result, bool isCRLF,
525 const char openChar, const char closeChar )
526{
527 char ch;
528 // We are in a quoted-string or domain-literal or comment and the
529 // cursor points to the first char after the openChar.
530 // We will apply unfolding and quoted-pair removal.
531 // We return when we either encounter the end or unescaped openChar
532 // or closeChar.
533
534 assert( *( scursor - 1 ) == openChar || *( scursor - 1 ) == closeChar );
535
536 while ( scursor != send ) {
537 ch = *scursor++;
538
539 if ( ch == closeChar || ch == openChar ) {
540 // end of quoted-string or another opening char:
541 // let caller decide what to do.
542 return true;
543 }
544
545 switch ( ch ) {
546 case '\\': // quoted-pair
547 // misses "\" CRLF LWSP-char handling, see rfc822, 3.4.5
548 READ_ch_OR_FAIL;
549 KMIME_WARN_IF_8BIT( ch );
550 result += QLatin1Char( ch );
551 break;
552 case '\r':
553 // ###
554 // The case of lonely '\r' is easy to solve, as they're
555 // not part of Unix Line-ending conventions.
556 // But I see a problem if we are given Unix-native
557 // line-ending-mails, where we cannot determine anymore
558 // whether a given '\n' was part of a CRLF or was occurring
559 // on it's own.
560 READ_ch_OR_FAIL;
561 if ( ch != '\n' ) {
562 // CR on it's own...
563 KMIME_WARN_LONE( CR );
564 result += QLatin1Char( '\r' );
565 scursor--; // points to after the '\r' again
566 } else {
567 // CRLF encountered.
568 // lookahead: check for folding
569 READ_ch_OR_FAIL;
570 if ( ch == ' ' || ch == '\t' ) {
571 // correct folding;
572 // position cursor behind the CRLF WSP (unfolding)
573 // and add the WSP to the result
574 result += QLatin1Char( ch );
575 } else {
576 // this is the "shouldn't happen"-case. There is a CRLF
577 // inside a quoted-string without it being part of FWS.
578 // We take it verbatim.
579 KMIME_WARN_NON_FOLDING( CRLF );
580 result += QLatin1String( "\r\n" );
581 // the cursor is decremented again, so's we need not
582 // duplicate the whole switch here. "ch" could've been
583 // everything (incl. openChar or closeChar).
584 scursor--;
585 }
586 }
587 break;
588 case '\n':
589 // Note: CRLF has been handled above already!
590 // ### LF needs special treatment, depending on whether isCRLF
591 // is true (we can be sure a lonely '\n' was meant this way) or
592 // false ('\n' alone could have meant LF or CRLF in the original
593 // message. This parser assumes CRLF iff the LF is followed by
594 // either WSP (folding) or NULL (premature end of quoted-string;
595 // Should be fixed, since NULL is allowed as per rfc822).
596 READ_ch_OR_FAIL;
597 if ( !isCRLF && ( ch == ' ' || ch == '\t' ) ) {
598 // folding
599 // correct folding
600 result += QLatin1Char( ch );
601 } else {
602 // non-folding
603 KMIME_WARN_LONE( LF );
604 result += QLatin1Char( '\n' );
605 // pos is decremented, so's we need not duplicate the whole
606 // switch here. ch could've been everything (incl. <">, "\").
607 scursor--;
608 }
609 break;
610 case '=':
611 {
612 // ### Work around broken clients that send encoded words in quoted-strings
613 // For example, older KMail versions.
614 if ( scursor == send ) {
615 break;
616 }
617
618 const char *oldscursor = scursor;
619 QString tmp;
620 QByteArray lang, charset;
621 if ( *scursor++ == '?' ) {
622 --scursor;
623 if ( parseEncodedWord( scursor, send, tmp, lang, charset ) ) {
624 result += tmp;
625 break;
626 } else {
627 scursor = oldscursor;
628 }
629 } else {
630 scursor = oldscursor;
631 }
632 // fall through
633 }
634 default:
635 KMIME_WARN_IF_8BIT( ch );
636 result += QLatin1Char( ch );
637 }
638 }
639
640 return false;
641}
642
643// known issues:
644//
645// - doesn't handle encoded-word inside comments.
646
647bool parseComment( const char* &scursor, const char * const send,
648 QString &result, bool isCRLF, bool reallySave )
649{
650 int commentNestingDepth = 1;
651 const char *afterLastClosingParenPos = 0;
652 QString maybeCmnt;
653 const char *oldscursor = scursor;
654
655 assert( *( scursor - 1 ) == '(' );
656
657 while ( commentNestingDepth ) {
658 QString cmntPart;
659 if ( parseGenericQuotedString( scursor, send, cmntPart, isCRLF, '(', ')' ) ) {
660 assert( *( scursor - 1 ) == ')' || *( scursor - 1 ) == '(' );
661 // see the kdoc for above function for the possible conditions
662 // we have to check:
663 switch ( *( scursor - 1 ) ) {
664 case ')':
665 if ( reallySave ) {
666 // add the chunk that's now surely inside the comment.
667 result += maybeCmnt;
668 result += cmntPart;
669 if ( commentNestingDepth > 1 ) {
670 // don't add the outermost ')'...
671 result += QLatin1Char( ')' );
672 }
673 maybeCmnt.clear();
674 }
675 afterLastClosingParenPos = scursor;
676 --commentNestingDepth;
677 break;
678 case '(':
679 if ( reallySave ) {
680 // don't add to "result" yet, because we might find that we
681 // are already outside the (broken) comment...
682 maybeCmnt += cmntPart;
683 maybeCmnt += QLatin1Char( '(' );
684 }
685 ++commentNestingDepth;
686 break;
687 default: assert( 0 );
688 } // switch
689 } else {
690 // !parseGenericQuotedString, ie. premature end
691 if ( afterLastClosingParenPos ) {
692 scursor = afterLastClosingParenPos;
693 } else {
694 scursor = oldscursor;
695 }
696 return false;
697 }
698 } // while
699
700 return true;
701}
702
703// known issues: none.
704
705bool parsePhrase( const char* &scursor, const char * const send,
706 QString &result, bool isCRLF )
707{
708 enum {
709 None, Phrase, Atom, EncodedWord, QuotedString
710 } found = None;
711
712 QString tmp;
713 QByteArray lang, charset;
714 const char *successfullyParsed = 0;
715 // only used by the encoded-word branch
716 const char *oldscursor;
717 // used to suppress whitespace between adjacent encoded-words
718 // (rfc2047, 6.2):
719 bool lastWasEncodedWord = false;
720
721 while ( scursor != send ) {
722 char ch = *scursor++;
723 switch ( ch ) {
724 case '.': // broken, but allow for intorop's sake
725 if ( found == None ) {
726 --scursor;
727 return false;
728 } else {
729 if ( scursor != send && ( *scursor == ' ' || *scursor == '\t' ) ) {
730 result += QLatin1String( ". " );
731 } else {
732 result += QLatin1Char( '.' );
733 }
734 successfullyParsed = scursor;
735 }
736 break;
737 case '"': // quoted-string
738 tmp.clear();
739 if ( parseGenericQuotedString( scursor, send, tmp, isCRLF, '"', '"' ) ) {
740 successfullyParsed = scursor;
741 assert( *( scursor - 1 ) == '"' );
742 switch ( found ) {
743 case None:
744 found = QuotedString;
745 break;
746 case Phrase:
747 case Atom:
748 case EncodedWord:
749 case QuotedString:
750 found = Phrase;
751 result += QLatin1Char( ' ' ); // rfc822, 3.4.4
752 break;
753 default:
754 assert( 0 );
755 }
756 lastWasEncodedWord = false;
757 result += tmp;
758 } else {
759 // premature end of quoted string.
760 // What to do? Return leading '"' as special? Return as quoted-string?
761 // We do the latter if we already found something, else signal failure.
762 if ( found == None ) {
763 return false;
764 } else {
765 result += QLatin1Char( ' ' ); // rfc822, 3.4.4
766 result += tmp;
767 return true;
768 }
769 }
770 break;
771 case '(': // comment
772 // parse it, but ignore content:
773 tmp.clear();
774 if ( parseComment( scursor, send, tmp, isCRLF,
775 false /*don't bother with the content*/ ) ) {
776 successfullyParsed = scursor;
777 lastWasEncodedWord = false; // strictly interpreting rfc2047, 6.2
778 } else {
779 if ( found == None ) {
780 return false;
781 } else {
782 scursor = successfullyParsed;
783 return true;
784 }
785 }
786 break;
787 case '=': // encoded-word
788 tmp.clear();
789 oldscursor = scursor;
790 lang.clear();
791 charset.clear();
792 if ( parseEncodedWord( scursor, send, tmp, lang, charset ) ) {
793 successfullyParsed = scursor;
794 switch ( found ) {
795 case None:
796 found = EncodedWord;
797 break;
798 case Phrase:
799 case EncodedWord:
800 case Atom:
801 case QuotedString:
802 if ( !lastWasEncodedWord ) {
803 result += QLatin1Char( ' ' ); // rfc822, 3.4.4
804 }
805 found = Phrase;
806 break;
807 default: assert( 0 );
808 }
809 lastWasEncodedWord = true;
810 result += tmp;
811 break;
812 } else {
813 // parse as atom:
814 scursor = oldscursor;
815 }
816 // fall though...
817
818 default: //atom
819 tmp.clear();
820 scursor--;
821 if ( parseAtom( scursor, send, tmp, true /* allow 8bit */ ) ) {
822 successfullyParsed = scursor;
823 switch ( found ) {
824 case None:
825 found = Atom;
826 break;
827 case Phrase:
828 case Atom:
829 case EncodedWord:
830 case QuotedString:
831 found = Phrase;
832 result += QLatin1Char( ' ' ); // rfc822, 3.4.4
833 break;
834 default:
835 assert( 0 );
836 }
837 lastWasEncodedWord = false;
838 result += tmp;
839 } else {
840 if ( found == None ) {
841 return false;
842 } else {
843 scursor = successfullyParsed;
844 return true;
845 }
846 }
847 }
848 eatWhiteSpace( scursor, send );
849 }
850
851 return found != None;
852}
853
854// FIXME: This should probably by QByteArray &result instead?
855bool parseDotAtom( const char* &scursor, const char * const send,
856 QString &result, bool isCRLF )
857{
858 eatCFWS( scursor, send, isCRLF );
859
860 // always points to just after the last atom parsed:
861 const char *successfullyParsed;
862
863 QString tmp;
864 if ( !parseAtom( scursor, send, tmp, false /* no 8bit */ ) ) {
865 return false;
866 }
867 result += tmp;
868 successfullyParsed = scursor;
869
870 while ( scursor != send ) {
871
872 // end of header or no '.' -> return
873 if ( scursor == send || *scursor != '.' ) {
874 return true;
875 }
876 scursor++; // eat '.'
877
878 if ( scursor == send || !isAText( *scursor ) ) {
879 // end of header or no AText, but this time following a '.'!:
880 // reset cursor to just after last successfully parsed char and
881 // return:
882 scursor = successfullyParsed;
883 return true;
884 }
885
886 // try to parse the next atom:
887 QString maybeAtom;
888 if ( !parseAtom( scursor, send, maybeAtom, false /*no 8bit*/ ) ) {
889 scursor = successfullyParsed;
890 return true;
891 }
892
893 result += QLatin1Char( '.' );
894 result += maybeAtom;
895 successfullyParsed = scursor;
896 }
897
898 scursor = successfullyParsed;
899 return true;
900}
901
902void eatCFWS( const char* &scursor, const char * const send, bool isCRLF )
903{
904 QString dummy;
905
906 while ( scursor != send ) {
907 const char *oldscursor = scursor;
908
909 char ch = *scursor++;
910
911 switch ( ch ) {
912 case ' ':
913 case '\t': // whitespace
914 case '\r':
915 case '\n': // folding
916 continue;
917
918 case '(': // comment
919 if ( parseComment( scursor, send, dummy, isCRLF, false /*don't save*/ ) ) {
920 continue;
921 }
922 scursor = oldscursor;
923 return;
924
925 default:
926 scursor = oldscursor;
927 return;
928 }
929 }
930}
931
932bool parseDomain( const char* &scursor, const char * const send,
933 QString &result, bool isCRLF )
934{
935 eatCFWS( scursor, send, isCRLF );
936 if ( scursor == send ) {
937 return false;
938 }
939
940 // domain := dot-atom / domain-literal / atom *("." atom)
941 //
942 // equivalent to:
943 // domain = dot-atom / domain-literal,
944 // since parseDotAtom does allow CFWS between atoms and dots
945
946 if ( *scursor == '[' ) {
947 // domain-literal:
948 QString maybeDomainLiteral;
949 // eat '[':
950 scursor++;
951 while ( parseGenericQuotedString( scursor, send, maybeDomainLiteral,
952 isCRLF, '[', ']' ) ) {
953 if ( scursor == send ) {
954 // end of header: check for closing ']':
955 if ( *( scursor - 1 ) == ']' ) {
956 // OK, last char was ']':
957 result = maybeDomainLiteral;
958 return true;
959 } else {
960 // not OK, domain-literal wasn't closed:
961 return false;
962 }
963 }
964 // we hit openChar in parseGenericQuotedString.
965 // include it in maybeDomainLiteral and keep on parsing:
966 if ( *( scursor - 1 ) == '[' ) {
967 maybeDomainLiteral += QLatin1Char( '[' );
968 continue;
969 }
970 // OK, real end of domain-literal:
971 result = maybeDomainLiteral;
972 return true;
973 }
974 } else {
975 // dot-atom:
976 QString maybeDotAtom;
977 if ( parseDotAtom( scursor, send, maybeDotAtom, isCRLF ) ) {
978 result = maybeDotAtom;
979 // Domain may end with '.', if so preserve it'
980 if ( scursor != send && *scursor == '.' ) {
981 result += QLatin1Char( '.' );
982 scursor++;
983 }
984 return true;
985 }
986 }
987 return false;
988}
989
990bool parseObsRoute( const char* &scursor, const char* const send,
991 QStringList &result, bool isCRLF, bool save )
992{
993 while ( scursor != send ) {
994 eatCFWS( scursor, send, isCRLF );
995 if ( scursor == send ) {
996 return false;
997 }
998
999 // empty entry:
1000 if ( *scursor == ',' ) {
1001 scursor++;
1002 if ( save ) {
1003 result.append( QString() );
1004 }
1005 continue;
1006 }
1007
1008 // empty entry ending the list:
1009 if ( *scursor == ':' ) {
1010 scursor++;
1011 if ( save ) {
1012 result.append( QString() );
1013 }
1014 return true;
1015 }
1016
1017 // each non-empty entry must begin with '@':
1018 if ( *scursor != '@' ) {
1019 return false;
1020 } else {
1021 scursor++;
1022 }
1023
1024 QString maybeDomain;
1025 if ( !parseDomain( scursor, send, maybeDomain, isCRLF ) ) {
1026 return false;
1027 }
1028 if ( save ) {
1029 result.append( maybeDomain );
1030 }
1031
1032 // eat the following (optional) comma:
1033 eatCFWS( scursor, send, isCRLF );
1034 if ( scursor == send ) {
1035 return false;
1036 }
1037 if ( *scursor == ':' ) {
1038 scursor++;
1039 return true;
1040 }
1041 if ( *scursor == ',' ) {
1042 scursor++;
1043 }
1044 }
1045
1046 return false;
1047}
1048
1049bool parseAddrSpec( const char* &scursor, const char * const send,
1050 AddrSpec &result, bool isCRLF )
1051{
1052 //
1053 // STEP 1:
1054 // local-part := dot-atom / quoted-string / word *("." word)
1055 //
1056 // this is equivalent to:
1057 // local-part := word *("." word)
1058
1059 QString maybeLocalPart;
1060 QString tmp;
1061
1062 while ( scursor != send ) {
1063 // first, eat any whitespace
1064 eatCFWS( scursor, send, isCRLF );
1065
1066 char ch = *scursor++;
1067 switch ( ch ) {
1068 case '.': // dot
1069 maybeLocalPart += QLatin1Char( '.' );
1070 break;
1071
1072 case '@':
1073 goto SAW_AT_SIGN;
1074 break;
1075
1076 case '"': // quoted-string
1077 tmp.clear();
1078 if ( parseGenericQuotedString( scursor, send, tmp, isCRLF, '"', '"' ) ) {
1079 maybeLocalPart += tmp;
1080 } else {
1081 return false;
1082 }
1083 break;
1084
1085 default: // atom
1086 scursor--; // re-set scursor to point to ch again
1087 tmp.clear();
1088 if ( parseAtom( scursor, send, tmp, false /* no 8bit */ ) ) {
1089 maybeLocalPart += tmp;
1090 } else {
1091 return false; // parseAtom can only fail if the first char is non-atext.
1092 }
1093 break;
1094 }
1095 }
1096
1097 return false;
1098
1099 //
1100 // STEP 2:
1101 // domain
1102 //
1103
1104SAW_AT_SIGN:
1105
1106 assert( *( scursor - 1 ) == '@' );
1107
1108 QString maybeDomain;
1109 if ( !parseDomain( scursor, send, maybeDomain, isCRLF ) ) {
1110 return false;
1111 }
1112
1113 result.localPart = maybeLocalPart;
1114 result.domain = maybeDomain;
1115
1116 return true;
1117}
1118
1119bool parseAngleAddr( const char* &scursor, const char * const send,
1120 AddrSpec &result, bool isCRLF )
1121{
1122 // first, we need an opening angle bracket:
1123 eatCFWS( scursor, send, isCRLF );
1124 if ( scursor == send || *scursor != '<' ) {
1125 return false;
1126 }
1127 scursor++; // eat '<'
1128
1129 eatCFWS( scursor, send, isCRLF );
1130 if ( scursor == send ) {
1131 return false;
1132 }
1133
1134 if ( *scursor == '@' || *scursor == ',' ) {
1135 // obs-route: parse, but ignore:
1136 KMIME_WARN << "obsolete source route found! ignoring.";
1137 QStringList dummy;
1138 if ( !parseObsRoute( scursor, send, dummy,
1139 isCRLF, false /* don't save */ ) ) {
1140 return false;
1141 }
1142 // angle-addr isn't complete until after the '>':
1143 if ( scursor == send ) {
1144 return false;
1145 }
1146 }
1147
1148 // parse addr-spec:
1149 AddrSpec maybeAddrSpec;
1150 if ( !parseAddrSpec( scursor, send, maybeAddrSpec, isCRLF ) ) {
1151 return false;
1152 }
1153
1154 eatCFWS( scursor, send, isCRLF );
1155 if ( scursor == send || *scursor != '>' ) {
1156 return false;
1157 }
1158 scursor++;
1159
1160 result = maybeAddrSpec;
1161 return true;
1162
1163}
1164
1165static QString stripQuotes( const QString &input )
1166{
1167 const QLatin1Char quotes( '"' );
1168 if ( input.startsWith( quotes ) && input.endsWith( quotes ) ) {
1169 QString stripped( input.mid( 1, input.size() - 2 ) );
1170 return stripped;
1171 } else {
1172 return input;
1173 }
1174}
1175
1176bool parseMailbox( const char* &scursor, const char * const send,
1177 Mailbox &result, bool isCRLF )
1178{
1179 eatCFWS( scursor, send, isCRLF );
1180 if ( scursor == send ) {
1181 return false;
1182 }
1183
1184 AddrSpec maybeAddrSpec;
1185 QString maybeDisplayName;
1186
1187 // first, try if it's a vanilla addr-spec:
1188 const char * oldscursor = scursor;
1189 if ( parseAddrSpec( scursor, send, maybeAddrSpec, isCRLF ) ) {
1190 result.setAddress( maybeAddrSpec );
1191 // check for the obsolete form of display-name (as comment):
1192 eatWhiteSpace( scursor, send );
1193 if ( scursor != send && *scursor == '(' ) {
1194 scursor++;
1195 if ( !parseComment( scursor, send, maybeDisplayName, isCRLF, true /*keep*/ ) ) {
1196 return false;
1197 }
1198 }
1199 result.setName( stripQuotes( maybeDisplayName ) );
1200 return true;
1201 }
1202 scursor = oldscursor;
1203
1204 // second, see if there's a display-name:
1205 if ( !parsePhrase( scursor, send, maybeDisplayName, isCRLF ) ) {
1206 // failed: reset cursor, note absent display-name
1207 maybeDisplayName.clear();
1208 scursor = oldscursor;
1209 } else {
1210 // succeeded: eat CFWS
1211 eatCFWS( scursor, send, isCRLF );
1212 if ( scursor == send ) {
1213 return false;
1214 }
1215 }
1216
1217 // third, parse the angle-addr:
1218 if ( !parseAngleAddr( scursor, send, maybeAddrSpec, isCRLF ) ) {
1219 return false;
1220 }
1221
1222 if ( maybeDisplayName.isNull() ) {
1223 // check for the obsolete form of display-name (as comment):
1224 eatWhiteSpace( scursor, send );
1225 if ( scursor != send && *scursor == '(' ) {
1226 scursor++;
1227 if ( !parseComment( scursor, send, maybeDisplayName, isCRLF, true /*keep*/ ) ) {
1228 return false;
1229 }
1230 }
1231 }
1232
1233 result.setName( stripQuotes( maybeDisplayName ) );
1234 result.setAddress( maybeAddrSpec );
1235 return true;
1236}
1237
1238bool parseGroup( const char* &scursor, const char * const send,
1239 Address &result, bool isCRLF )
1240{
1241 // group := display-name ":" [ mailbox-list / CFWS ] ";" [CFWS]
1242 //
1243 // equivalent to:
1244 // group := display-name ":" [ obs-mbox-list ] ";"
1245
1246 eatCFWS( scursor, send, isCRLF );
1247 if ( scursor == send ) {
1248 return false;
1249 }
1250
1251 // get display-name:
1252 QString maybeDisplayName;
1253 if ( !parsePhrase( scursor, send, maybeDisplayName, isCRLF ) ) {
1254 return false;
1255 }
1256
1257 // get ":":
1258 eatCFWS( scursor, send, isCRLF );
1259 if ( scursor == send || *scursor != ':' ) {
1260 return false;
1261 }
1262
1263 // KDE5 TODO: Don't expose displayName as public, but rather add setter for it that
1264 // automatically calls removeBidiControlChars
1265 result.displayName = removeBidiControlChars( maybeDisplayName );
1266
1267 // get obs-mbox-list (may contain empty entries):
1268 scursor++;
1269 while ( scursor != send ) {
1270 eatCFWS( scursor, send, isCRLF );
1271 if ( scursor == send ) {
1272 return false;
1273 }
1274
1275 // empty entry:
1276 if ( *scursor == ',' ) {
1277 scursor++;
1278 continue;
1279 }
1280
1281 // empty entry ending the list:
1282 if ( *scursor == ';' ) {
1283 scursor++;
1284 return true;
1285 }
1286
1287 Mailbox maybeMailbox;
1288 if ( !parseMailbox( scursor, send, maybeMailbox, isCRLF ) ) {
1289 return false;
1290 }
1291 result.mailboxList.append( maybeMailbox );
1292
1293 eatCFWS( scursor, send, isCRLF );
1294 // premature end:
1295 if ( scursor == send ) {
1296 return false;
1297 }
1298 // regular end of the list:
1299 if ( *scursor == ';' ) {
1300 scursor++;
1301 return true;
1302 }
1303 // eat regular list entry separator:
1304 if ( *scursor == ',' ) {
1305 scursor++;
1306 }
1307 }
1308 return false;
1309}
1310
1311bool parseAddress( const char* &scursor, const char * const send,
1312 Address &result, bool isCRLF )
1313{
1314 // address := mailbox / group
1315
1316 eatCFWS( scursor, send, isCRLF );
1317 if ( scursor == send ) {
1318 return false;
1319 }
1320
1321 // first try if it's a single mailbox:
1322 Mailbox maybeMailbox;
1323 const char * oldscursor = scursor;
1324 if ( parseMailbox( scursor, send, maybeMailbox, isCRLF ) ) {
1325 // yes, it is:
1326 result.displayName.clear();
1327 result.mailboxList.append( maybeMailbox );
1328 return true;
1329 }
1330 scursor = oldscursor;
1331
1332 Address maybeAddress;
1333
1334 // no, it's not a single mailbox. Try if it's a group:
1335 if ( !parseGroup( scursor, send, maybeAddress, isCRLF ) ) {
1336 return false;
1337 }
1338
1339 result = maybeAddress;
1340 return true;
1341}
1342
1343bool parseAddressList( const char* &scursor, const char * const send,
1344 AddressList &result, bool isCRLF )
1345{
1346 while ( scursor != send ) {
1347 eatCFWS( scursor, send, isCRLF );
1348 // end of header: this is OK.
1349 if ( scursor == send ) {
1350 return true;
1351 }
1352 // empty entry: ignore:
1353 if ( *scursor == ',' ) {
1354 scursor++;
1355 continue;
1356 }
1357 // broken clients might use ';' as list delimiter, accept that as well
1358 if ( *scursor == ';' ) {
1359 scursor++;
1360 continue;
1361 }
1362
1363 // parse one entry
1364 Address maybeAddress;
1365 if ( !parseAddress( scursor, send, maybeAddress, isCRLF ) ) {
1366 return false;
1367 }
1368 result.append( maybeAddress );
1369
1370 eatCFWS( scursor, send, isCRLF );
1371 // end of header: this is OK.
1372 if ( scursor == send ) {
1373 return true;
1374 }
1375 // comma separating entries: eat it.
1376 if ( *scursor == ',' ) {
1377 scursor++;
1378 }
1379 }
1380 return true;
1381}
1382
1383static QString asterisk = QString::fromLatin1( "*0*", 1 );
1384static QString asteriskZero = QString::fromLatin1( "*0*", 2 );
1385//static QString asteriskZeroAsterisk = QString::fromLatin1( "*0*", 3 );
1386
1387// FIXME: Get rid of the very ugly "QStringOrQPair" thing. At this level, we are supposed to work
1388// on byte arrays, not strings! The result parameter should be a simple
1389// QPair<QByteArray,QByteArray>, which is the attribute name and the value.
1390bool parseParameter( const char* &scursor, const char * const send,
1391 QPair<QString,QStringOrQPair> &result, bool isCRLF )
1392{
1393 // parameter = regular-parameter / extended-parameter
1394 // regular-parameter = regular-parameter-name "=" value
1395 // extended-parameter =
1396 // value = token / quoted-string
1397 //
1398 // note that rfc2231 handling is out of the scope of this function.
1399 // Therefore we return the attribute as QString and the value as
1400 // (start,length) tupel if we see that the value is encoded
1401 // (trailing asterisk), for parseParameterList to decode...
1402
1403 eatCFWS( scursor, send, isCRLF );
1404 if ( scursor == send ) {
1405 return false;
1406 }
1407
1408 //
1409 // parse the parameter name:
1410 //
1411 // FIXME: maybeAttribute should be a QByteArray
1412 QString maybeAttribute;
1413 if ( !parseToken( scursor, send, maybeAttribute, false /* no 8bit */ ) ) {
1414 return false;
1415 }
1416
1417 eatCFWS( scursor, send, isCRLF );
1418 // premature end: not OK (haven't seen '=' yet).
1419 if ( scursor == send || *scursor != '=' ) {
1420 return false;
1421 }
1422 scursor++; // eat '='
1423
1424 eatCFWS( scursor, send, isCRLF );
1425 if ( scursor == send ) {
1426 // don't choke on attribute=, meaning the value was omitted:
1427 if ( maybeAttribute.endsWith( asterisk ) ) {
1428 KMIME_WARN << "attribute ends with \"*\", but value is empty!"
1429 "Chopping away \"*\".";
1430 maybeAttribute.truncate( maybeAttribute.length() - 1 );
1431 }
1432 result = qMakePair( maybeAttribute.toLower(), QStringOrQPair() );
1433 return true;
1434 }
1435
1436 const char * oldscursor = scursor;
1437
1438 //
1439 // parse the parameter value:
1440 //
1441 QStringOrQPair maybeValue;
1442 if ( *scursor == '"' ) {
1443 // value is a quoted-string:
1444 scursor++;
1445 if ( maybeAttribute.endsWith( asterisk ) ) {
1446 // attributes ending with "*" designate extended-parameters,
1447 // which cannot have quoted-strings as values. So we remove the
1448 // trailing "*" to not confuse upper layers.
1449 KMIME_WARN << "attribute ends with \"*\", but value is a quoted-string!"
1450 "Chopping away \"*\".";
1451 maybeAttribute.truncate( maybeAttribute.length() - 1 );
1452 }
1453
1454 if ( !parseGenericQuotedString( scursor, send, maybeValue.qstring, isCRLF ) ) {
1455 scursor = oldscursor;
1456 result = qMakePair( maybeAttribute.toLower(), QStringOrQPair() );
1457 return false; // this case needs further processing by upper layers!!
1458 }
1459 } else {
1460 // value is a token:
1461 if ( !parseToken( scursor, send, maybeValue.qpair, false /* no 8bit */ ) ) {
1462 scursor = oldscursor;
1463 result = qMakePair( maybeAttribute.toLower(), QStringOrQPair() );
1464 return false; // this case needs further processing by upper layers!!
1465 }
1466 }
1467
1468 result = qMakePair( maybeAttribute.toLower(), maybeValue );
1469 return true;
1470}
1471
1472// FIXME: Get rid of QStringOrQPair: Use a simply QMap<QByteArray, QByteArray> for "result"
1473// instead!
1474bool parseRawParameterList( const char* &scursor, const char * const send,
1475 QMap<QString,QStringOrQPair> &result,
1476 bool isCRLF )
1477{
1478 // we use parseParameter() consecutively to obtain a map of raw
1479 // attributes to raw values. "Raw" here means that we don't do
1480 // rfc2231 decoding and concatenation. This is left to
1481 // parseParameterList(), which will call this function.
1482 //
1483 // The main reason for making this chunk of code a separate
1484 // (private) method is that we can deal with broken parameters
1485 // _here_ and leave the rfc2231 handling solely to
1486 // parseParameterList(), which will still be enough work.
1487
1488 while ( scursor != send ) {
1489 eatCFWS( scursor, send, isCRLF );
1490 // empty entry ending the list: OK.
1491 if ( scursor == send ) {
1492 return true;
1493 }
1494 // empty list entry: ignore.
1495 if ( *scursor == ';' ) {
1496 scursor++;
1497 continue;
1498 }
1499
1500 QPair<QString, QStringOrQPair> maybeParameter;
1501 if ( !parseParameter( scursor, send, maybeParameter, isCRLF ) ) {
1502 // we need to do a bit of work if the attribute is not
1503 // NULL. These are the cases marked with "needs further
1504 // processing" in parseParameter(). Specifically, parsing of the
1505 // token or the quoted-string, which should represent the value,
1506 // failed. We take the easy way out and simply search for the
1507 // next ';' to start parsing again. (Another option would be to
1508 // take the text between '=' and ';' as value)
1509 if ( maybeParameter.first.isNull() ) {
1510 return false;
1511 }
1512 while ( scursor != send ) {
1513 if ( *scursor++ == ';' ) {
1514 goto IS_SEMICOLON;
1515 }
1516 }
1517 // scursor == send case: end of list.
1518 return true;
1519 IS_SEMICOLON:
1520 // *scursor == ';' case: parse next entry.
1521 continue;
1522 }
1523 // successful parsing brings us here:
1524 result.insert( maybeParameter.first, maybeParameter.second );
1525
1526 eatCFWS( scursor, send, isCRLF );
1527 // end of header: ends list.
1528 if ( scursor == send ) {
1529 return true;
1530 }
1531 // regular separator: eat it.
1532 if ( *scursor == ';' ) {
1533 scursor++;
1534 }
1535 }
1536 return true;
1537}
1538
1539static void decodeRFC2231Value( Codec* &rfc2231Codec,
1540 QTextCodec* &textcodec,
1541 bool isContinuation, QString &value,
1542 QPair<const char*,int> &source, QByteArray& charset )
1543{
1544 //
1545 // parse the raw value into (charset,language,text):
1546 //
1547
1548 const char * decBegin = source.first;
1549 const char * decCursor = decBegin;
1550 const char * decEnd = decCursor + source.second;
1551
1552 if ( !isContinuation ) {
1553 // find the first single quote
1554 while ( decCursor != decEnd ) {
1555 if ( *decCursor == '\'' ) {
1556 break;
1557 } else {
1558 decCursor++;
1559 }
1560 }
1561
1562 if ( decCursor == decEnd ) {
1563 // there wasn't a single single quote at all!
1564 // take the whole value to be in latin-1:
1565 KMIME_WARN << "No charset in extended-initial-value."
1566 "Assuming \"iso-8859-1\".";
1567 value += QString::fromLatin1( decBegin, source.second );
1568 return;
1569 }
1570
1571 charset = QByteArray( decBegin, decCursor - decBegin );
1572
1573 const char * oldDecCursor = ++decCursor;
1574 // find the second single quote (we ignore the language tag):
1575 while ( decCursor != decEnd ) {
1576 if ( *decCursor == '\'' ) {
1577 break;
1578 } else {
1579 decCursor++;
1580 }
1581 }
1582 if ( decCursor == decEnd ) {
1583 KMIME_WARN << "No language in extended-initial-value."
1584 "Trying to recover.";
1585 decCursor = oldDecCursor;
1586 } else {
1587 decCursor++;
1588 }
1589
1590 // decCursor now points to the start of the
1591 // "extended-other-values":
1592
1593 //
1594 // get the decoders:
1595 //
1596
1597 bool matchOK = false;
1598 textcodec = KGlobal::charsets()->codecForName( QLatin1String( charset ), matchOK );
1599 if ( !matchOK ) {
1600 textcodec = 0;
1601 KMIME_WARN_UNKNOWN( Charset, charset );
1602 }
1603 }
1604
1605 if ( !rfc2231Codec ) {
1606 rfc2231Codec = Codec::codecForName( "x-kmime-rfc2231" );
1607 assert( rfc2231Codec );
1608 }
1609
1610 if ( !textcodec ) {
1611 value += QString::fromLatin1( decCursor, decEnd - decCursor );
1612 return;
1613 }
1614
1615 Decoder * dec = rfc2231Codec->makeDecoder();
1616 assert( dec );
1617
1618 //
1619 // do the decoding:
1620 //
1621
1622 QByteArray buffer;
1623 buffer.resize( rfc2231Codec->maxDecodedSizeFor( decEnd - decCursor ) );
1624 QByteArray::Iterator bit = buffer.begin();
1625 QByteArray::ConstIterator bend = buffer.end();
1626
1627 if ( !dec->decode( decCursor, decEnd, bit, bend ) ) {
1628 KMIME_WARN << rfc2231Codec->name()
1629 << "codec lies about its maxDecodedSizeFor()" << endl
1630 << "result may be truncated";
1631 }
1632
1633 value += textcodec->toUnicode( buffer.begin(), bit - buffer.begin() );
1634
1635 // kDebug() << "value now: \"" << value << "\"";
1636 // cleanup:
1637 delete dec;
1638}
1639
1640// known issues:
1641// - permutes rfc2231 continuations when the total number of parts
1642// exceeds 10 (other-sections then becomes *xy, ie. two digits)
1643
1644bool parseParameterListWithCharset( const char* &scursor,
1645 const char * const send,
1646 QMap<QString,QString> &result,
1647 QByteArray& charset, bool isCRLF )
1648{
1649// parse the list into raw attribute-value pairs:
1650 QMap<QString, QStringOrQPair> rawParameterList;
1651 if ( !parseRawParameterList( scursor, send, rawParameterList, isCRLF ) ) {
1652 return false;
1653 }
1654
1655 if ( rawParameterList.isEmpty() ) {
1656 return true;
1657 }
1658
1659 // decode rfc 2231 continuations and alternate charset encoding:
1660
1661 // NOTE: this code assumes that what QMapIterator delivers is sorted
1662 // by the key!
1663
1664 Codec * rfc2231Codec = 0;
1665 QTextCodec * textcodec = 0;
1666 QString attribute;
1667 QString value;
1668 enum Mode {
1669 NoMode = 0x0, Continued = 0x1, Encoded = 0x2
1670 };
1671
1672 enum EncodingMode {
1673 NoEncoding,
1674 RFC2047,
1675 RFC2231
1676 };
1677
1678 QMap<QString, QStringOrQPair>::Iterator it, end = rawParameterList.end();
1679
1680 for ( it = rawParameterList.begin() ; it != end ; ++it ) {
1681 if ( attribute.isNull() || !it.key().startsWith( attribute ) ) {
1682 //
1683 // new attribute:
1684 //
1685
1686 // store the last attribute/value pair in the result map now:
1687 if ( !attribute.isNull() ) {
1688 result.insert( attribute, value );
1689 }
1690 // and extract the information from the new raw attribute:
1691 value.clear();
1692 attribute = it.key();
1693 int mode = NoMode;
1694 EncodingMode encodingMode = NoEncoding;
1695
1696 // is the value rfc2331-encoded?
1697 if ( attribute.endsWith( asterisk ) ) {
1698 attribute.truncate( attribute.length() - 1 );
1699 mode |= Encoded;
1700 encodingMode = RFC2231;
1701 }
1702 // is the value rfc2047-encoded?
1703 if ( !( *it ).qstring.isNull() && ( *it ).qstring.contains( QLatin1String( "=?" ) ) ) {
1704 mode |= Encoded;
1705 encodingMode = RFC2047;
1706 }
1707 // is the value continued?
1708 if ( attribute.endsWith( asteriskZero ) ) {
1709 attribute.truncate( attribute.length() - 2 );
1710 mode |= Continued;
1711 }
1712 //
1713 // decode if necessary:
1714 //
1715 if ( mode & Encoded ) {
1716 if ( encodingMode == RFC2231 ) {
1717 decodeRFC2231Value( rfc2231Codec, textcodec,
1718 false, /* isn't continuation */
1719 value, ( *it ).qpair, charset );
1720 }
1721 else if ( encodingMode == RFC2047 ) {
1722 value += decodeRFC2047String( ( *it ).qstring.toLatin1(), charset );
1723 }
1724 } else {
1725 // not encoded.
1726 if ( ( *it ).qpair.first ) {
1727 value += QString::fromLatin1( ( *it ).qpair.first, ( *it ).qpair.second );
1728 } else {
1729 value += ( *it ).qstring;
1730 }
1731 }
1732
1733 //
1734 // shortcut-processing when the value isn't encoded:
1735 //
1736
1737 if ( !( mode & Continued ) ) {
1738 // save result already:
1739 result.insert( attribute, value );
1740 // force begin of a new attribute:
1741 attribute.clear();
1742 }
1743 } else { // it.key().startsWith( attribute )
1744 //
1745 // continuation
1746 //
1747
1748 // ignore the section and trust QMap to have sorted the keys:
1749 if ( it.key().endsWith( asterisk ) ) {
1750 // encoded
1751 decodeRFC2231Value( rfc2231Codec, textcodec,
1752 true, /* is continuation */
1753 value, ( *it ).qpair, charset );
1754 } else {
1755 // not encoded
1756 if ( ( *it ).qpair.first ) {
1757 value += QString::fromLatin1( ( *it ).qpair.first, ( *it ).qpair.second );
1758 } else {
1759 value += ( *it ).qstring;
1760 }
1761 }
1762 }
1763 }
1764
1765 // write last attr/value pair:
1766 if ( !attribute.isNull() ) {
1767 result.insert( attribute, value );
1768 }
1769
1770 return true;
1771}
1772
1773
1774bool parseParameterList( const char* &scursor, const char * const send,
1775 QMap<QString,QString> &result, bool isCRLF )
1776{
1777 QByteArray charset;
1778 return parseParameterListWithCharset( scursor, send, result, charset, isCRLF );
1779}
1780
// Three-letter English day-of-week abbreviations, Sunday first.
static const char * const stdDayNames[] = {
  "Sun",
  "Mon",
  "Tue",
  "Wed",
  "Thu",
  "Fri",
  "Sat"
};
// number of entries in stdDayNames
static const int stdDayNamesLen = sizeof( stdDayNames ) / sizeof( stdDayNames[0] );
1785
1786static bool parseDayName( const char* &scursor, const char * const send )
1787{
1788 // check bounds:
1789 if ( send - scursor < 3 ) {
1790 return false;
1791 }
1792
1793 for ( int i = 0 ; i < stdDayNamesLen ; ++i ) {
1794 if ( qstrnicmp( scursor, stdDayNames[i], 3 ) == 0 ) {
1795 scursor += 3;
1796 // kDebug() << "found" << stdDayNames[i];
1797 return true;
1798 }
1799 }
1800
1801 return false;
1802}
1803
// Three-letter English month abbreviations, January first.
static const char * const stdMonthNames[] = {
  "Jan", "Feb", "Mar",
  "Apr", "May", "Jun",
  "Jul", "Aug", "Sep",
  "Oct", "Nov", "Dec"
};
// number of entries in stdMonthNames
static const int stdMonthNamesLen = sizeof( stdMonthNames ) / sizeof( stdMonthNames[0] );
1810
1811static bool parseMonthName( const char* &scursor, const char * const send,
1812 int &result )
1813{
1814 // check bounds:
1815 if ( send - scursor < 3 ) {
1816 return false;
1817 }
1818
1819 for ( result = 0 ; result < stdMonthNamesLen ; ++result ) {
1820 if ( qstrnicmp( scursor, stdMonthNames[result], 3 ) == 0 ) {
1821 scursor += 3;
1822 return true;
1823 }
1824 }
1825
1826 // not found:
1827 return false;
1828}
1829
// Table of alphabetic time zone names and their offsets from GMT in
// seconds (positive values are east of GMT). The table is searched from
// the top, so for a name listed more than once the first entry wins.
//
// NOTE(review): RFC 2822, section 4.3 states that the military zones were
// defined in a non-standard way in RFC 822 and should be treated as
// "-0000"; the offsets below are kept as they were.
static const struct {
  const char * tzName;
  long int secsEastOfGMT;
} timeZones[] = {
  // rfc 822 timezones:
  { "GMT", 0 },
  { "UT", 0 },
  { "EDT", -4*3600 },
  { "EST", -5*3600 },
  { "CDT", -5*3600 }, // was a duplicate "MST" entry, which shadowed the real MST below
  { "CST", -6*3600 },
  { "MDT", -6*3600 },
  { "MST", -7*3600 },
  { "PDT", -7*3600 },
  { "PST", -8*3600 },
  // common, non-rfc-822 zones:
  { "CET", 1*3600 },
  { "MET", 1*3600 },
  { "UTC", 0 },
  { "CEST", 2*3600 },
  { "BST", 1*3600 },
  // rfc 822 military timezones:
  { "Z", 0 },
  { "A", -1*3600 },
  { "B", -2*3600 },
  { "C", -3*3600 },
  { "D", -4*3600 },
  { "E", -5*3600 },
  { "F", -6*3600 },
  { "G", -7*3600 },
  { "H", -8*3600 },
  { "I", -9*3600 },
  // J is not used!
  { "K", -10*3600 },
  { "L", -11*3600 },
  { "M", -12*3600 },
  { "N", 1*3600 },
  { "O", 2*3600 },
  { "P", 3*3600 },
  { "Q", 4*3600 },
  { "R", 5*3600 },
  { "S", 6*3600 },
  { "T", 7*3600 },
  { "U", 8*3600 },
  { "V", 9*3600 },
  { "W", 10*3600 },
  { "X", 11*3600 },
  { "Y", 12*3600 },
};
// number of entries in timeZones
static const int timeZonesLen = sizeof timeZones / sizeof *timeZones;
1880
1881static bool parseAlphaNumericTimeZone( const char* &scursor,
1882 const char * const send,
1883 long int &secsEastOfGMT,
1884 bool &timeZoneKnown )
1885{
1886 // allow the timezone to be wrapped in quotes; bug 260761
1887 if ( *scursor == '"' ) {
1888 scursor++;
1889
1890 if ( scursor == send ) {
1891 return false;
1892 }
1893 }
1894
1895 QPair<const char*, int> maybeTimeZone( 0, 0 );
1896 if ( !parseToken( scursor, send, maybeTimeZone, false /*no 8bit*/ ) ) {
1897 return false;
1898 }
1899 for ( int i = 0 ; i < timeZonesLen ; ++i ) {
1900 if ( qstrnicmp( timeZones[i].tzName,
1901 maybeTimeZone.first, maybeTimeZone.second ) == 0 ) {
1902 scursor += maybeTimeZone.second;
1903 secsEastOfGMT = timeZones[i].secsEastOfGMT;
1904 timeZoneKnown = true;
1905
1906 if ( *scursor == '"' ) {
1907 scursor++;
1908 }
1909
1910 return true;
1911 }
1912 }
1913
1914 // don't choke just because we don't happen to know the time zone
1915 KMIME_WARN_UNKNOWN( time zone,
1916 QByteArray( maybeTimeZone.first, maybeTimeZone.second ) );
1917 secsEastOfGMT = 0;
1918 timeZoneKnown = false;
1919 return true;
1920}
1921
// Parses a run of decimal digits starting at scursor.
// result receives the parsed number (0 if there are no digits), scursor is
// left on the first non-digit character, and the number of digits consumed
// is returned.
int parseDigits( const char* &scursor, const char * const send, int &result )
{
  result = 0;
  int digits = 0;
  // isdigit() is only defined for values representable as unsigned char
  // (and EOF), so 8bit input must not be passed as a plain, possibly
  // negative, char.
  for ( ; scursor != send && isdigit( static_cast<unsigned char>( *scursor ) ) ;
        scursor++, digits++ ) {
    result *= 10;
    result += int( *scursor - '0' );
  }
  return digits;
}
1933
1934static bool parseTimeOfDay( const char* &scursor, const char * const send,
1935 int &hour, int &min, int &sec, bool isCRLF=false )
1936{
1937 // time-of-day := 2DIGIT [CFWS] ":" [CFWS] 2DIGIT [ [CFWS] ":" 2DIGIT ]
1938
1939 //
1940 // 2DIGIT representing "hour":
1941 //
1942 if ( !parseDigits( scursor, send, hour ) ) {
1943 return false;
1944 }
1945
1946 eatCFWS( scursor, send, isCRLF );
1947 if ( scursor == send || *scursor != ':' ) {
1948 return false;
1949 }
1950 scursor++; // eat ':'
1951
1952 eatCFWS( scursor, send, isCRLF );
1953 if ( scursor == send ) {
1954 return false;
1955 }
1956
1957 //
1958 // 2DIGIT representing "minute":
1959 //
1960 if ( !parseDigits( scursor, send, min ) ) {
1961 return false;
1962 }
1963
1964 eatCFWS( scursor, send, isCRLF );
1965 if ( scursor == send ) {
1966 return true; // seconds are optional
1967 }
1968
1969 //
1970 // let's see if we have a 2DIGIT representing "second":
1971 //
1972 if ( *scursor == ':' ) {
1973 // yepp, there are seconds:
1974 scursor++; // eat ':'
1975 eatCFWS( scursor, send, isCRLF );
1976 if ( scursor == send ) {
1977 return false;
1978 }
1979
1980 if ( !parseDigits( scursor, send, sec ) ) {
1981 return false;
1982 }
1983 } else {
1984 sec = 0;
1985 }
1986
1987 return true;
1988}
1989
1990bool parseTime( const char* &scursor, const char * send,
1991 int &hour, int &min, int &sec, long int &secsEastOfGMT,
1992 bool &timeZoneKnown, bool isCRLF )
1993{
1994 // time := time-of-day CFWS ( zone / obs-zone )
1995 //
1996 // obs-zone := "UT" / "GMT" /
1997 // "EST" / "EDT" / ; -0500 / -0400
1998 // "CST" / "CDT" / ; -0600 / -0500
1999 // "MST" / "MDT" / ; -0700 / -0600
2000 // "PST" / "PDT" / ; -0800 / -0700
2001 // "A"-"I" / "a"-"i" /
2002 // "K"-"Z" / "k"-"z"
2003
2004 eatCFWS( scursor, send, isCRLF );
2005 if ( scursor == send ) {
2006 return false;
2007 }
2008
2009 if ( !parseTimeOfDay( scursor, send, hour, min, sec, isCRLF ) ) {
2010 return false;
2011 }
2012
2013 eatCFWS( scursor, send, isCRLF );
2014 // there might be no timezone but a year following
2015 if ( ( scursor == send ) || isdigit( *scursor ) ) {
2016 timeZoneKnown = false;
2017 secsEastOfGMT = 0;
2018 return true; // allow missing timezone
2019 }
2020
2021 timeZoneKnown = true;
2022 if ( *scursor == '+' || *scursor == '-' ) {
2023 // remember and eat '-'/'+':
2024 const char sign = *scursor++;
2025 // numerical timezone:
2026 int maybeTimeZone;
2027 if ( parseDigits( scursor, send, maybeTimeZone ) != 4 ) {
2028 return false;
2029 }
2030 secsEastOfGMT = 60 * ( maybeTimeZone / 100 * 60 + maybeTimeZone % 100 );
2031 if ( sign == '-' ) {
2032 secsEastOfGMT *= -1;
2033 if ( secsEastOfGMT == 0 ) {
2034 timeZoneKnown = false; // -0000 means indetermined tz
2035 }
2036 }
2037 } else {
2038 // maybe alphanumeric timezone:
2039 if ( !parseAlphaNumericTimeZone( scursor, send, secsEastOfGMT, timeZoneKnown ) ) {
2040 return false;
2041 }
2042 }
2043 return true;
2044}
2045
2046bool parseDateTime( const char* &scursor, const char * const send,
2047 KDateTime &result, bool isCRLF )
2048{
2049 // Parsing date-time; strict mode:
2050 //
2051 // date-time := [ [CFWS] day-name [CFWS] "," ] ; wday
2052 // (expanded) [CFWS] 1*2DIGIT CFWS month-name CFWS 2*DIGIT [CFWS] ; date
2053 // time
2054 //
2055 // day-name := "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun"
2056 // month-name := "Jan" / "Feb" / "Mar" / "Apr" / "May" / "Jun" /
2057 // "Jul" / "Aug" / "Sep" / "Oct" / "Nov" / "Dec"
2058
2059 result = KDateTime();
2060 QDateTime maybeDateTime;
2061
2062 eatCFWS( scursor, send, isCRLF );
2063 if ( scursor == send ) {
2064 return false;
2065 }
2066
2067 //
2068 // let's see if there's a day-of-week:
2069 //
2070 if ( parseDayName( scursor, send ) ) {
2071 eatCFWS( scursor, send, isCRLF );
2072 if ( scursor == send ) {
2073 return false;
2074 }
2075 // day-name should be followed by ',' but we treat it as optional:
2076 if ( *scursor == ',' ) {
2077 scursor++; // eat ','
2078 eatCFWS( scursor, send, isCRLF );
2079 }
2080 }
2081
2082 int maybeMonth = -1;
2083 bool asctimeFormat = false;
2084
2085 // ANSI-C asctime() format is: Wed Jun 30 21:49:08 1993
2086 if ( !isdigit( *scursor ) && parseMonthName( scursor, send, maybeMonth ) ) {
2087 asctimeFormat = true;
2088 eatCFWS( scursor, send, isCRLF );
2089 }
2090
2091 //
2092 // 1*2DIGIT representing "day" (of month):
2093 //
2094 int maybeDay;
2095 if ( !parseDigits( scursor, send, maybeDay ) ) {
2096 return false;
2097 }
2098
2099 eatCFWS( scursor, send, isCRLF );
2100 if ( scursor == send ) {
2101 return false;
2102 }
2103
2104 // ignore ","; bug 54098
2105 if ( *scursor == ',' ) {
2106 scursor++;
2107 }
2108
2109 //
2110 // month-name:
2111 //
2112 if ( !asctimeFormat && !parseMonthName( scursor, send, maybeMonth ) ) {
2113 return false;
2114 }
2115 if ( scursor == send ) {
2116 return false;
2117 }
2118 assert( maybeMonth >= 0 ); assert( maybeMonth <= 11 );
2119 ++maybeMonth; // 0-11 -> 1-12
2120
2121 eatCFWS( scursor, send, isCRLF );
2122 if ( scursor == send ) {
2123 return false;
2124 }
2125
2126 // check for "year HH:MM:SS" or only "HH:MM:SS" (or "H:MM:SS")
2127 bool timeAfterYear = true;
2128 if ( ( send - scursor > 3 ) && ( ( scursor[1] == ':' ) || ( scursor[2] == ':' ) ) ) {
2129 timeAfterYear = false; // first read time, then year
2130 }
2131
2132 //
2133 // 2*DIGIT representing "year":
2134 //
2135 int maybeYear = 0;
2136
2137 if ( timeAfterYear && !parseDigits( scursor, send, maybeYear ) ) {
2138 return false;
2139 }
2140
2141 eatCFWS( scursor, send, isCRLF );
2142 if ( scursor == send ) {
2143 return false;
2144 }
2145
2146 //
2147 // time
2148 //
2149 int maybeHour, maybeMinute, maybeSecond;
2150 long int secsEastOfGMT;
2151 bool timeZoneKnown = true;
2152
2153 if ( !parseTime( scursor, send,
2154 maybeHour, maybeMinute, maybeSecond,
2155 secsEastOfGMT, timeZoneKnown, isCRLF ) ) {
2156 return false;
2157 }
2158
2159 // in asctime() the year follows the time
2160 if ( !timeAfterYear ) {
2161 eatCFWS( scursor, send, isCRLF );
2162 if ( scursor == send ) {
2163 return false;
2164 }
2165
2166 if ( !parseDigits( scursor, send, maybeYear ) ) {
2167 return false;
2168 }
2169 }
2170
2171 // RFC 2822 4.3 processing:
2172 if ( maybeYear < 50 ) {
2173 maybeYear += 2000;
2174 } else if ( maybeYear < 1000 ) {
2175 maybeYear += 1900;
2176 }
2177 // else keep as is
2178 if ( maybeYear < 1900 ) {
2179 return false; // rfc2822, 3.3
2180 }
2181
2182 maybeDateTime.setDate( QDate( maybeYear, maybeMonth, maybeDay ) );
2183 maybeDateTime.setTime( QTime( maybeHour, maybeMinute, maybeSecond ) );
2184
2185 if ( !maybeDateTime.isValid() ) {
2186 return false;
2187 }
2188
2189 result = KDateTime( maybeDateTime, KDateTime::Spec( KDateTime::OffsetFromUTC, secsEastOfGMT ) );
2190 if ( !result.isValid() ) {
2191 return false;
2192 }
2193 return true;
2194}
2195
2196Headers::Base *extractFirstHeader( QByteArray &head )
2197{
2198 int endOfFieldBody = 0;
2199 bool folded = false;
2200 Headers::Base *header = 0;
2201
2202 int startOfFieldBody = head.indexOf( ':' );
2203 const int endOfFieldHeader = startOfFieldBody;
2204
2205 if ( startOfFieldBody > -1 ) { //there is another header
2206 startOfFieldBody++; //skip the ':'
2207 if ( head[startOfFieldBody] == ' ' ) { // skip the space after the ':', if there
2208 startOfFieldBody++;
2209 }
2210 endOfFieldBody = findHeaderLineEnd( head, startOfFieldBody, &folded );
2211
2212 QByteArray rawType = head.left( endOfFieldHeader );
2213 QByteArray rawFieldBody = head.mid( startOfFieldBody, endOfFieldBody - startOfFieldBody );
2214 if ( folded ) {
2215 rawFieldBody = unfoldHeader( rawFieldBody );
2216 }
2217 // We might get an invalid mail without a field name, don't crash on that.
2218 if ( !rawType.isEmpty() ) {
2219 header = HeaderFactory::self()->createHeader( rawType );
2220 }
2221 if ( !header ) {
2222 //kWarning() << "Returning Generic header of type" << rawType;
2223 header = new Headers::Generic( rawType.constData() );
2224 }
2225 header->from7BitString( rawFieldBody );
2226
2227 head.remove( 0, endOfFieldBody + 1 );
2228 } else {
2229 head.clear();
2230 }
2231
2232 return header;
2233}
2234
2235void extractHeaderAndBody( const QByteArray &content, QByteArray &header, QByteArray &body )
2236{
2237 header.clear();
2238 body.clear();
2239
2240 // empty header
2241 if ( content.startsWith( '\n' ) ) {
2242 body = content.right( content.length() - 1 );
2243 return;
2244 }
2245
2246 int pos = content.indexOf( "\n\n", 0 );
2247 if ( pos > -1 ) {
2248 header = content.left( ++pos ); //header *must* end with "\n" !!
2249 body = content.mid( pos + 1, content.length() - pos - 1 );
2250 } else {
2251 header = content;
2252 }
2253}
2254
2255Headers::Base::List parseHeaders( const QByteArray &head )
2256{
2257 Headers::Base::List ret;
2258 Headers::Base *h;
2259
2260 QByteArray copy = head;
2261 while ( ( h = extractFirstHeader( copy ) ) ) {
2262 ret << h;
2263 }
2264
2265 return ret;
2266}
2267
2268} // namespace HeaderParsing
2269
2270} // namespace KMime
2271