1/* -*- c++ -*-
2 kmime_codec_qp.cpp
3
4 KMime, the KDE Internet mail/usenet news message library.
5 Copyright (c) 2002 Marc Mutz <mutz@kde.org>
6
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Library General Public
9 License as published by the Free Software Foundation; either
10 version 2 of the License, or (at your option) any later version.
11
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
16
17 You should have received a copy of the GNU Library General Public License
18 along with this library; see the file COPYING.LIB. If not, write to
19 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 Boston, MA 02110-1301, USA.
21*/
22/**
23 @file
24 This file is part of the API for handling @ref MIME data and
25 defines the @ref QuotedPrintable, @ref RFC2047Q, and
26 @ref RFC2231 @ref Codec classes.
27
28 @brief
29 Defines the classes QuotedPrintableCodec, Rfc2047QEncodingCodec, and
30 Rfc2231EncodingCodec.
31
32 @authors Marc Mutz \<mutz@kde.org\>
33*/
34
35#include "kmime_codec_qp.h"
36#include "kmime_util.h"
37
38#include <kdebug.h>
39
40#include <cassert>
41
42using namespace KMime;
43
44namespace KMime {
45
46// some helpful functions:
47
48/**
49 Converts a 4-bit @p value into its hexadecimal characater representation.
50 So input of value [0,15] returns ['0','1',... 'F']. Input values
51 greater than 15 will produce undesired results.
52 @param value is an unsigned character containing the 4-bit input value.
53*/
54static inline char binToHex( uchar value )
55{
56 if ( value > 9 ) {
57 return value + 'A' - 10;
58 } else {
59 return value + '0';
60 }
61}
62
63/**
64 Returns the high-order 4 bits of an 8-bit value in another 8-bit value.
65 @param ch is an unsigned character containing the 8-bit input value.
66*/
67static inline uchar highNibble( uchar ch )
68{
69 return ch >> 4;
70}
71
72/**
73 Returns the low-order 4 bits of an 8-bit value in another 8-bit value.
74 @param ch is an unsigned character containing the 8-bit input value.
75*/
76static inline uchar lowNibble( uchar ch )
77{
78 return ch & 0xF;
79}
80
81/**
82 Returns true if the specified value is a not Control character or
83 question mark; else true.
84 @param ch is an unsigned character containing the 8-bit input value.
85*/
86static inline bool keep( uchar ch )
87{
88 // no CTLs, except HT and not '?'
89 return !( ( ch < ' ' && ch != '\t' ) || ch == '?' );
90}
91
92//
93// QuotedPrintableCodec
94//
95
96class QuotedPrintableEncoder : public Encoder
97{
98 char mInputBuffer[16];
99 uchar mCurrentLineLength; // 0..76
100 uchar mAccu;
101 uint mInputBufferReadCursor : 4; // 0..15
102 uint mInputBufferWriteCursor : 4; // 0..15
103 enum {
104 Never, AtBOL, Definitely
105 } mAccuNeedsEncoding : 2;
106 bool mSawLineEnd : 1;
107 bool mSawCR : 1;
108 bool mFinishing : 1;
109 bool mFinished : 1;
110 protected:
111 friend class QuotedPrintableCodec;
112 QuotedPrintableEncoder( bool withCRLF=false )
113 : Encoder( withCRLF ), mCurrentLineLength( 0 ), mAccu( 0 ),
114 mInputBufferReadCursor( 0 ), mInputBufferWriteCursor( 0 ),
115 mAccuNeedsEncoding( Never ),
116 mSawLineEnd( false ), mSawCR( false ), mFinishing( false ),
117 mFinished( false ) {}
118
119 bool needsEncoding( uchar ch )
120 { return ch > '~' || ( ch < ' ' && ch != '\t' ) || ch == '='; }
121 bool needsEncodingAtEOL( uchar ch )
122 { return ch == ' ' || ch == '\t'; }
123 bool needsEncodingAtBOL( uchar ch )
124 { return ch == 'F' || ch == '.' || ch == '-'; }
125 bool fillInputBuffer( const char* &scursor, const char * const send );
126 bool processNextChar();
127 void createOutputBuffer( char* &dcursor, const char * const dend );
128 public:
129 virtual ~QuotedPrintableEncoder() {}
130
131 bool encode( const char* &scursor, const char * const send,
132 char* &dcursor, const char * const dend );
133
134 bool finish( char* &dcursor, const char * const dend );
135};
136
137class QuotedPrintableDecoder : public Decoder
138{
139 const char mEscapeChar;
140 char mBadChar;
141 /** @p accu holds the msb nibble of the hexchar or zero. */
142 uchar mAccu;
143 /** @p insideHexChar is true iff we're inside an hexchar (=XY).
144 Together with @ref mAccu, we can build this states:
145 @li @p insideHexChar == @p false:
146 normal text
147 @li @p insideHexChar == @p true, @p mAccu == 0:
148 saw the leading '='
149 @li @p insideHexChar == @p true, @p mAccu != 0:
150 saw the first nibble '=X'
151 */
152 const bool mQEncoding;
153 bool mInsideHexChar;
154 bool mFlushing;
155 bool mExpectLF;
156 bool mHaveAccu;
157 /** @p mLastChar holds the first char of an encoded char, so that
158 we are able to keep the first char if the second char is invalid. */
159 char mLastChar;
160 protected:
161 friend class QuotedPrintableCodec;
162 friend class Rfc2047QEncodingCodec;
163 friend class Rfc2231EncodingCodec;
164 QuotedPrintableDecoder( bool withCRLF=false,
165 bool aQEncoding=false, char aEscapeChar='=' )
166 : Decoder( withCRLF ),
167 mEscapeChar( aEscapeChar ),
168 mBadChar( 0 ),
169 mAccu( 0 ),
170 mQEncoding( aQEncoding ),
171 mInsideHexChar( false ),
172 mFlushing( false ),
173 mExpectLF( false ),
174 mHaveAccu( false ),
175 mLastChar( 0 ) {}
176 public:
177 virtual ~QuotedPrintableDecoder() {}
178
179 bool decode( const char* &scursor, const char * const send,
180 char* &dcursor, const char * const dend );
181 bool finish( char* & dcursor, const char * const dend );
182};
183
184class Rfc2047QEncodingEncoder : public Encoder
185{
186 uchar mAccu;
187 uchar mStepNo;
188 const char mEscapeChar;
189 bool mInsideFinishing : 1;
190 protected:
191 friend class Rfc2047QEncodingCodec;
192 friend class Rfc2231EncodingCodec;
193 Rfc2047QEncodingEncoder( bool withCRLF=false, char aEscapeChar='=' )
194 : Encoder( withCRLF ),
195 mAccu( 0 ), mStepNo( 0 ), mEscapeChar( aEscapeChar ),
196 mInsideFinishing( false )
197 {
198 // else an optimization in ::encode might break.
199 assert( aEscapeChar == '=' || aEscapeChar == '%' );
200 }
201
202 // this code assumes that isEText( mEscapeChar ) == false!
203 bool needsEncoding( uchar ch )
204 {
205 if ( ch > 'z' ) {
206 return true; // {|}~ DEL and 8bit chars need
207 }
208 if ( !isEText( ch ) ) {
209 return true; // all but a-zA-Z0-9!/*+- need, too
210 }
211 if ( mEscapeChar == '%' && ( ch == '*' || ch == '/' ) ) {
212 return true; // not allowed in rfc2231 encoding
213 }
214 return false;
215 }
216
217 public:
218 virtual ~Rfc2047QEncodingEncoder() {}
219
220 bool encode( const char* & scursor, const char * const send,
221 char* & dcursor, const char * const dend );
222 bool finish( char* & dcursor, const char * const dend );
223};
224
// This doesn't access any member variables, so it is a free static
// function that the maxDecodedSizeFor() methods of all three codecs
// forward to.
/**
  Computes an upper bound for the size of the decoded output.
  @param insize is the number of encoded input bytes.
  @param withCRLF if true, room for turning every input byte (worst
         case: all of them are \n) into \r\n is added.
  @return @p insize (doubled if @p withCRLF) plus two bytes for a
          pending escape char and accumulator.
*/
static int QuotedPrintableDecoder_maxDecodedSizeFor( int insize, bool withCRLF )
{
  // worst case for line endings: every single char is a \n that
  // has to become \r\n :-o
  const int lineEndingGrowth = withCRLF ? insize : 0;

  // all chars unencoded, plus line ending growth, plus a possible
  // accu and escape char
  return insize + lineEndingGrowth + 2;
}
240
241Encoder *QuotedPrintableCodec::makeEncoder( bool withCRLF ) const
242{
243 return new QuotedPrintableEncoder( withCRLF );
244}
245
246Decoder *QuotedPrintableCodec::makeDecoder( bool withCRLF ) const
247{
248 return new QuotedPrintableDecoder( withCRLF );
249}
250
251int QuotedPrintableCodec::maxDecodedSizeFor( int insize, bool withCRLF ) const
252{
253 return QuotedPrintableDecoder_maxDecodedSizeFor(insize, withCRLF);
254}
255
256Encoder *Rfc2047QEncodingCodec::makeEncoder( bool withCRLF ) const
257{
258 return new Rfc2047QEncodingEncoder( withCRLF );
259}
260
261Decoder *Rfc2047QEncodingCodec::makeDecoder( bool withCRLF ) const
262{
263 return new QuotedPrintableDecoder( withCRLF, true );
264}
265
266int Rfc2047QEncodingCodec::maxDecodedSizeFor( int insize, bool withCRLF ) const
267{
268 return QuotedPrintableDecoder_maxDecodedSizeFor(insize, withCRLF);
269}
270
271Encoder *Rfc2231EncodingCodec::makeEncoder( bool withCRLF ) const
272{
273 return new Rfc2047QEncodingEncoder( withCRLF, '%' );
274}
275
276Decoder *Rfc2231EncodingCodec::makeDecoder( bool withCRLF ) const
277{
278 return new QuotedPrintableDecoder( withCRLF, true, '%' );
279}
280
281int Rfc2231EncodingCodec::maxDecodedSizeFor( int insize, bool withCRLF ) const
282{
283 return QuotedPrintableDecoder_maxDecodedSizeFor(insize, withCRLF);
284}
285
286/********************************************************/
287/********************************************************/
288/********************************************************/
289
290bool QuotedPrintableDecoder::decode( const char* &scursor,
291 const char * const send,
292 char* &dcursor, const char * const dend )
293{
294 if ( mWithCRLF ) {
295 kWarning() << "CRLF output for decoders isn't yet supported!";
296 }
297
298 while ( scursor != send && dcursor != dend ) {
299 if ( mFlushing ) {
300 // we have to flush chars in the aftermath of an decoding
301 // error. The way to request a flush is to
302 // - store the offending character in mBadChar and
303 // - set mFlushing to true.
304 // The supported cases are (H: hexchar, X: bad char):
305 // =X, =HX, CR
306 // mBadChar is only written out if it is not by itself illegal in
307 // quoted-printable (e.g. CTLs, 8Bits).
308 // A fast way to suppress mBadChar output is to set it to NUL.
309 if ( mInsideHexChar ) {
310 // output '='
311 *dcursor++ = mEscapeChar;
312 mInsideHexChar = false;
313 } else if ( mHaveAccu ) {
314 // output the high nibble of the accumulator:
315 *dcursor++ = mLastChar;
316 mHaveAccu = false;
317 mAccu = 0;
318 } else {
319 // output mBadChar
320 assert( mAccu == 0 );
321 if ( mBadChar ) {
322 if ( mBadChar == '=' ) {
323 mInsideHexChar = true;
324 } else {
325 *dcursor++ = mBadChar;
326 }
327 mBadChar = 0;
328 }
329 mFlushing = false;
330 }
331 continue;
332 }
333 assert( mBadChar == 0 );
334
335 uchar ch = *scursor++;
336 uchar value = 255;
337
338 if ( mExpectLF && ch != '\n' ) {
339 kWarning() << "QuotedPrintableDecoder:"
340 "illegally formed soft linebreak or lonely CR!";
341 mInsideHexChar = false;
342 mExpectLF = false;
343 assert( mAccu == 0 );
344 }
345
346 if ( mInsideHexChar ) {
347 // next char(s) represent nibble instead of itself:
348 if ( ch <= '9' ) {
349 if ( ch >= '0' ) {
350 value = ch - '0';
351 } else {
352 switch ( ch ) {
353 case '\r':
354 mExpectLF = true;
355 break;
356 case '\n':
357 // soft line break, but only if mAccu is NUL.
358 if ( !mHaveAccu ) {
359 mExpectLF = false;
360 mInsideHexChar = false;
361 break;
362 }
363 // else fall through
364 default:
365 kWarning() << "QuotedPrintableDecoder:"
366 "illegally formed hex char! Outputting verbatim.";
367 mBadChar = ch;
368 mFlushing = true;
369 }
370 continue;
371 }
372 } else { // ch > '9'
373 if ( ch <= 'F' ) {
374 if ( ch >= 'A' ) {
375 value = 10 + ch - 'A';
376 } else { // [:-@]
377 mBadChar = ch;
378 mFlushing = true;
379 continue;
380 }
381 } else { // ch > 'F'
382 if ( ch <= 'f' && ch >= 'a' ) {
383 value = 10 + ch - 'a';
384 } else {
385 mBadChar = ch;
386 mFlushing = true;
387 continue;
388 }
389 }
390 }
391
392 assert( value < 16 );
393 assert( mBadChar == 0 );
394 assert( !mExpectLF );
395
396 if ( mHaveAccu ) {
397 *dcursor++ = char( mAccu | value );
398 mAccu = 0;
399 mHaveAccu = false;
400 mInsideHexChar = false;
401 } else {
402 mHaveAccu = true;
403 mAccu = value << 4;
404 mLastChar = ch;
405 }
406 } else { // not mInsideHexChar
407 if ( ( ch <= '~' && ch >= ' ' ) || ch == '\t' ) {
408 if ( ch == mEscapeChar ) {
409 mInsideHexChar = true;
410 } else if ( mQEncoding && ch == '_' ) {
411 *dcursor++ = char( 0x20 );
412 } else {
413 *dcursor++ = char( ch );
414 }
415 } else if ( ch == '\n' ) {
416 *dcursor++ = '\n';
417 mExpectLF = false;
418 } else if ( ch == '\r' ) {
419 mExpectLF = true;
420 } else {
421 //kWarning() << "QuotedPrintableDecoder:" << ch <<
422 // "illegal character in input stream!";
423 *dcursor++ = char( ch );
424 }
425 }
426 }
427
428 return scursor == send;
429}
430
431bool QuotedPrintableDecoder::finish( char* &dcursor, const char * const dend )
432{
433 while ( ( mInsideHexChar || mHaveAccu || mFlushing ) && dcursor != dend ) {
434 // we have to flush chars
435 if ( mInsideHexChar ) {
436 // output '='
437 *dcursor++ = mEscapeChar;
438 mInsideHexChar = false;
439 }
440 else if ( mHaveAccu ) {
441 // output the high nibble of the accumulator:
442 *dcursor++ = mLastChar;
443 mHaveAccu = false;
444 mAccu = 0;
445 } else {
446 // output mBadChar
447 assert( mAccu == 0 );
448 if ( mBadChar ) {
449 *dcursor++ = mBadChar;
450 mBadChar = 0;
451 }
452 mFlushing = false;
453 }
454 }
455
456 // return false if we are not finished yet; note that mInsideHexChar is always false
457 return !( mHaveAccu || mFlushing );
458}
459
460bool QuotedPrintableEncoder::fillInputBuffer( const char* &scursor,
461 const char * const send ) {
462 // Don't read more if there's still a tail of a line in the buffer:
463 if ( mSawLineEnd ) {
464 return true;
465 }
466
467 // Read until the buffer is full or we have found CRLF or LF (which
468 // don't end up in the input buffer):
469 for ( ; ( mInputBufferWriteCursor + 1 ) % 16 != mInputBufferReadCursor &&
470 scursor != send ; mInputBufferWriteCursor++ ) {
471 char ch = *scursor++;
472 if ( ch == '\r' ) {
473 mSawCR = true;
474 } else if ( ch == '\n' ) {
475 // remove the CR from the input buffer (if any) and return that
476 // we found a line ending:
477 if ( mSawCR ) {
478 mSawCR = false;
479 assert( mInputBufferWriteCursor != mInputBufferReadCursor );
480 mInputBufferWriteCursor--;
481 }
482 mSawLineEnd = true;
483 return true; // saw CRLF or LF
484 } else {
485 mSawCR = false;
486 }
487 mInputBuffer[ mInputBufferWriteCursor ] = ch;
488 }
489 mSawLineEnd = false;
490 return false; // didn't see a line ending...
491}
492
493bool QuotedPrintableEncoder::processNextChar()
494{
495
496 // If we process a buffer which doesn't end in a line break, we
497 // can't process all of it, since the next chars that will be read
498 // could be a line break. So we empty the buffer only until a fixed
499 // number of chars is left (except when mFinishing, which means that
500 // the data doesn't end in newline):
501 const int minBufferFillWithoutLineEnd = 4;
502
503 assert( mOutputBufferCursor == 0 );
504
505 int bufferFill =
506 int( mInputBufferWriteCursor ) - int( mInputBufferReadCursor ) ;
507 if ( bufferFill < 0 ) {
508 bufferFill += 16;
509 }
510
511 assert( bufferFill >=0 && bufferFill <= 15 );
512
513 if ( !mFinishing && !mSawLineEnd &&
514 bufferFill < minBufferFillWithoutLineEnd ) {
515 return false;
516 }
517
518 // buffer is empty, return false:
519 if ( mInputBufferReadCursor == mInputBufferWriteCursor ) {
520 return false;
521 }
522
523 // Real processing goes here:
524 mAccu = mInputBuffer[ mInputBufferReadCursor++ ];
525 if ( needsEncoding( mAccu ) ) { // always needs encoding or
526 mAccuNeedsEncoding = Definitely;
527 } else if ( ( mSawLineEnd || mFinishing ) && // needs encoding at end of line
528 bufferFill == 1 && // or end of buffer
529 needsEncodingAtEOL( mAccu ) ) {
530 mAccuNeedsEncoding = Definitely;
531 } else if ( needsEncodingAtBOL( mAccu ) ) {
532 mAccuNeedsEncoding = AtBOL;
533 } else {
534 // never needs encoding
535 mAccuNeedsEncoding = Never;
536 }
537
538 return true;
539}
540
541// Outputs processed (verbatim or hex-encoded) chars and inserts soft
542// line breaks as necessary. Depends on processNextChar's directions
543// on whether or not to encode the current char, and whether or not
544// the current char is the last one in it's input line:
545void QuotedPrintableEncoder::createOutputBuffer( char* &dcursor,
546 const char * const dend )
547{
548 const int maxLineLength = 76; // rfc 2045
549
550 assert( mOutputBufferCursor == 0 );
551
552 bool lastOneOnThisLine = mSawLineEnd
553 && mInputBufferReadCursor == mInputBufferWriteCursor;
554
555 int neededSpace = 1;
556 if ( mAccuNeedsEncoding == Definitely ) {
557 neededSpace = 3;
558 }
559
560 // reserve space for the soft hyphen (=)
561 if ( !lastOneOnThisLine ) {
562 neededSpace++;
563 }
564
565 if ( mCurrentLineLength > maxLineLength - neededSpace ) {
566 // current line too short, insert soft line break:
567 write( '=', dcursor, dend );
568 writeCRLF( dcursor, dend );
569 mCurrentLineLength = 0;
570 }
571
572 if ( Never == mAccuNeedsEncoding ||
573 ( AtBOL == mAccuNeedsEncoding && mCurrentLineLength != 0 ) ) {
574 write( mAccu, dcursor, dend );
575 mCurrentLineLength++;
576 } else {
577 write( '=', dcursor, dend );
578 write( binToHex( highNibble( mAccu ) ), dcursor, dend );
579 write( binToHex( lowNibble( mAccu ) ), dcursor, dend );
580 mCurrentLineLength += 3;
581 }
582}
583
584bool QuotedPrintableEncoder::encode( const char* &scursor,
585 const char * const send,
586 char* &dcursor, const char * const dend )
587{
588 // support probing by the caller:
589 if ( mFinishing ) {
590 return true;
591 }
592
593 while ( scursor != send && dcursor != dend ) {
594 if ( mOutputBufferCursor && !flushOutputBuffer( dcursor, dend ) ) {
595 return scursor == send;
596 }
597
598 assert( mOutputBufferCursor == 0 );
599
600 // fill input buffer until eol has been reached or until the
601 // buffer is full, whatever comes first:
602 fillInputBuffer( scursor, send );
603
604 if ( processNextChar() ) {
605 // there was one...
606 createOutputBuffer( dcursor, dend );
607 } else if ( mSawLineEnd &&
608 mInputBufferWriteCursor == mInputBufferReadCursor ) {
609 // load a hard line break into output buffer:
610 writeCRLF( dcursor, dend );
611 // signal fillInputBuffer() we are ready for the next line:
612 mSawLineEnd = false;
613 mCurrentLineLength = 0;
614 } else {
615 // we are supposedly finished with this input block:
616 break;
617 }
618 }
619
620 // make sure we write as much as possible and don't stop _writing_
621 // just because we have no more _input_:
622 if ( mOutputBufferCursor ) {
623 flushOutputBuffer( dcursor, dend );
624 }
625
626 return scursor == send;
627
628} // encode
629
630bool QuotedPrintableEncoder::finish( char* &dcursor, const char * const dend )
631{
632 mFinishing = true;
633
634 if ( mFinished ) {
635 return flushOutputBuffer( dcursor, dend );
636 }
637
638 while ( dcursor != dend ) {
639 if ( mOutputBufferCursor && !flushOutputBuffer( dcursor, dend ) ) {
640 return false;
641 }
642
643 assert( mOutputBufferCursor == 0 );
644
645 if ( processNextChar() ) {
646 // there was one...
647 createOutputBuffer( dcursor, dend );
648 } else if ( mSawLineEnd &&
649 mInputBufferWriteCursor == mInputBufferReadCursor ) {
650 // load a hard line break into output buffer:
651 writeCRLF( dcursor, dend );
652 mSawLineEnd = false;
653 mCurrentLineLength = 0;
654 } else {
655 mFinished = true;
656 return flushOutputBuffer( dcursor, dend );
657 }
658 }
659
660 return mFinished && !mOutputBufferCursor;
661
662} // finish
663
664bool Rfc2047QEncodingEncoder::encode( const char* &scursor,
665 const char * const send,
666 char* &dcursor, const char * const dend )
667{
668 if ( mInsideFinishing ) {
669 return true;
670 }
671
672 while ( scursor != send && dcursor != dend ) {
673 uchar value = 0;
674 switch ( mStepNo ) {
675 case 0:
676 // read the next char and decide if and how do encode:
677 mAccu = *scursor++;
678 if ( !needsEncoding( mAccu ) ) {
679 *dcursor++ = char( mAccu );
680 } else if ( mEscapeChar == '=' && mAccu == 0x20 ) {
681 // shortcut encoding for 0x20 (latin-1/us-ascii SPACE)
682 // (not for rfc2231 encoding)
683 *dcursor++ = '_';
684 } else {
685 // needs =XY encoding - write escape char:
686 *dcursor++ = mEscapeChar;
687 mStepNo = 1;
688 }
689 continue;
690 case 1:
691 // extract hi-nibble:
692 value = highNibble( mAccu );
693 mStepNo = 2;
694 break;
695 case 2:
696 // extract lo-nibble:
697 value = lowNibble( mAccu );
698 mStepNo = 0;
699 break;
700 default: assert( 0 );
701 }
702
703 // and write:
704 *dcursor++ = binToHex( value );
705 }
706
707 return scursor == send;
708} // encode
709
710#include <QtCore/QString>
711
712bool Rfc2047QEncodingEncoder::finish( char* &dcursor, const char * const dend )
713{
714 mInsideFinishing = true;
715
716 // write the last bits of mAccu, if any:
717 while ( mStepNo != 0 && dcursor != dend ) {
718 uchar value = 0;
719 switch ( mStepNo ) {
720 case 1:
721 // extract hi-nibble:
722 value = highNibble( mAccu );
723 mStepNo = 2;
724 break;
725 case 2:
726 // extract lo-nibble:
727 value = lowNibble( mAccu );
728 mStepNo = 0;
729 break;
730 default: assert( 0 );
731 }
732
733 // and write:
734 *dcursor++ = binToHex( value );
735 }
736
737 return mStepNo == 0;
738}
739
740} // namespace KMime
741