1/* -*- c++ -*-
2
3 KMime, the KDE Internet mail/usenet news message library.
4 Copyright (c) 2001-2002 Marc Mutz <mutz@kde.org>
5
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version.
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
15
16 You should have received a copy of the GNU Library General Public License
17 along with this library; see the file COPYING.LIB. If not, write to
18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 Boston, MA 02110-1301, USA.
20*/
21/**
22 @file
23 This file is part of the API for handling @ref MIME data and
24 defines the Codec class.
25
 @brief
 Defines the Codec, Decoder, and Encoder classes.
28
29 @authors Marc Mutz \<mutz@kde.org\>
30
31 @glossary @anchor MIME @anchor mime @b MIME:
32 <b>Multipurpose Internet Mail Extensions</b> or @acronym MIME is an
33 Internet Standard that extends the format of e-mail to support text in
34 character sets other than US-ASCII, non-text attachments, multi-part message
35 bodies, and header information in non-ASCII character sets. Virtually all
36 human-written Internet e-mail and a fairly large proportion of automated
37 e-mail is transmitted via @acronym SMTP in MIME format. Internet e-mail is
38 so closely associated with the SMTP and MIME standards that it is sometimes
39 called SMTP/MIME e-mail. The content types defined by MIME standards are
40 also of growing importance outside of e-mail, such as in communication
41 protocols like @acronym HTTP for the World Wide Web. MIME is also a
42 fundamental component of communication protocols such as HTTP, which
43 requires that data be transmitted in the context of e-mail-like messages,
44 even though the data may not actually be e-mail.
45
46 @glossary @anchor codec @anchor codecs @anchor Codec @anchor Codecs @b codec:
47 a program capable of performing encoding and decoding on a digital data
48 stream. Codecs encode data for storage or encryption and decode it for
49 viewing or editing.
50
51 @glossary @anchor CRLF @b CRLF: a "Carriage Return (0x0D)" followed by a
52 "Line Feed (0x0A)", two ASCII control characters used to represent a
53 newline on some operating systems, notably DOS and Microsoft Windows.
54
55 @glossary @anchor LF @b LF: a "Line Feed (0x0A)" ASCII control character used
56 to represent a newline on some operating systems, notably Unix, Unix-like,
57 and Linux.
58*/
59
60#ifndef __KMIME_CODECS__
61#define __KMIME_CODECS__
62
63#include <QtCore/QByteArray>
64
65#include <kdebug.h> // for kFatal()
66
67#include "kmime_export.h"
68
69namespace KMime {
70
71template <class Key, class T> class KAutoDeleteHash;
72
73class Encoder;
74class Decoder;
75
76/**
77 @brief
78 An abstract base class of @ref codecs for common mail transfer encodings.
79
80 Provides an abstract base class of @ref codecs like base64 and quoted-printable.
81 Implemented as a singleton.
82*/
83class KMIME_EXPORT Codec
84{
85 protected:
86 //@cond PRIVATE
87 static KAutoDeleteHash<QByteArray, Codec> *all;
88 static void cleanupCodec();
89 //@endcond
90 /**
91 Contructs the codec.
92 */
93 Codec() {}
94
95 public:
96 /**
97 Returns a codec associated with the specified @p name.
98
99 @param name points to a character string containing a valid codec name.
100 */
101 static Codec *codecForName( const char *name );
102
103 /**
104 Returns a codec associated with the specified @p name.
105
106 @param name is a QByteArray containing a valid codec name.
107 */
108 static Codec *codecForName( const QByteArray &name );
109
110 /**
111 Computes the maximum size, in characters, needed for the encoding.
112
113 @param insize is the number of input characters to be encoded.
114 @param withCRLF if true, make the newlines @ref CRLF; else use @ref LF.
115
116 @return the maximum number of characters in the encoding.
117 */
118 virtual int maxEncodedSizeFor( int insize, bool withCRLF=false ) const = 0;
119
120 /**
121 Computes the maximum size, in characters, needed for the deccoding.
122
123 @param insize is the number of input characters to be decoded.
124 @param withCRLF if true, make the newlines @ref CRLF; else use @ref LF.
125
126 @return the maximum number of characters in the decoding.
127 */
128 virtual int maxDecodedSizeFor( int insize, bool withCRLF=false ) const = 0;
129
130 /**
131 Creates the encoder for the codec.
132
133 @param withCRLF if true, make the newlines @ref CRLF; else use @ref LF.
134
135 @return a pointer to an instance of the codec's encoder.
136 */
137 virtual Encoder *makeEncoder( bool withCRLF=false ) const = 0;
138
139 /**
140 Creates the decoder for the codec.
141
142 @param withCRLF if true, make the newlines @ref CRLF; else use @ref LF.
143
144 @return a pointer to an instance of the codec's decoder.
145 */
146 virtual Decoder *makeDecoder( bool withCRLF=false ) const = 0;
147
148 /**
149 Convenience wrapper that can be used for small chunks of data
150 when you can provide a large enough buffer. The default
151 implementation creates an Encoder and uses it.
152
153 Encodes a chunk of bytes starting at @p scursor and extending to
154 @p send into the buffer described by @p dcursor and @p dend.
155
156 This function doesn't support chaining of blocks. The returned
157 block cannot be added to, but you don't need to finalize it, too.
158
159 Example usage (@p in contains the input data):
160 <pre>
161 KMime::Codec *codec = KMime::Codec::codecForName( "base64" );
162 kFatal( !codec ) << "no base64 codec found!?";
163 QByteArray out( in.size()*1.4 ); // crude maximal size of b64 encoding
164 QByteArray::Iterator iit = in.begin();
165 QByteArray::Iterator oit = out.begin();
166 if ( !codec->encode( iit, in.end(), oit, out.end() ) ) {
167 kDebug() << "output buffer too small";
168 return;
169 }
170 kDebug() << "Size of encoded data:" << oit - out.begin();
171 </pre>
172
173 @param scursor is a pointer to the start of the input buffer.
174 @param send is a pointer to the end of the input buffer.
175 @param dcursor is a pointer to the start of the output buffer.
176 @param dend is a pointer to the end of the output buffer.
177 @param withCRLF if true, make the newlines @ref CRLF; else use @ref LF.
178
179 @return false if the encoded data didn't fit into the output buffer;
180 true otherwise.
181 */
182 virtual bool encode( const char* &scursor, const char * const send,
183 char* &dcursor, const char * const dend,
184 bool withCRLF=false ) const;
185
186 /**
187 Convenience wrapper that can be used for small chunks of data
188 when you can provide a large enough buffer. The default
189 implementation creates a Decoder and uses it.
190
191 Decodes a chunk of bytes starting at @p scursor and extending to
192 @p send into the buffer described by @p dcursor and @p dend.
193
194 This function doesn't support chaining of blocks. The returned
195 block cannot be added to, but you don't need to finalize it, too.
196
197 Example usage (@p in contains the input data):
198 <pre>
199 KMime::Codec *codec = KMime::Codec::codecForName( "base64" );
200 kFatal( !codec ) << "no base64 codec found!?";
201 QByteArray out( in.size() ); // good guess for any encoding...
202 QByteArray::Iterator iit = in.begin();
203 QByteArray::Iterator oit = out.begin();
204 if ( !codec->decode( iit, in.end(), oit, out.end() ) ) {
205 kDebug() << "output buffer too small";
206 return;
207 }
208 kDebug() << "Size of decoded data:" << oit - out.begin();
209 </pre>
210
211 @param scursor is a pointer to the start of the input buffer.
212 @param send is a pointer to the end of the input buffer.
213 @param dcursor is a pointer to the start of the output buffer.
214 @param dend is a pointer to the end of the output buffer.
215 @param withCRLF if true, make the newlines @ref CRLF; else use @ref LF.
216
217 @return false if the decoded data didn't fit into the output buffer;
218 true otherwise.
219 */
220 virtual bool decode( const char* &scursor, const char * const send,
221 char* &dcursor, const char * const dend,
222 bool withCRLF=false ) const;
223
224 /**
225 Even more convenient, but also a bit slower and more memory
226 intensive, since it allocates storage for the worst case and then
227 shrinks the result QByteArray to the actual size again.
228
229 For use with small @p src.
230
231 @param src is a QByteArray containing the data to encode.
232 @param withCRLF if true, make the newlines @ref CRLF; else use @ref LF.
233 */
234 virtual QByteArray encode( const QByteArray &src, bool withCRLF=false ) const;
235
236 /**
237 Even more convenient, but also a bit slower and more memory
238 intensive, since it allocates storage for the worst case and then
239 shrinks the result QByteArray to the actual size again.
240
241 For use with small @p src.
242
243 @param src is a QByteArray containing the data to decode.
244 @param withCRLF if true, make the newlines @ref CRLF; else use @ref LF.
245 */
246 virtual QByteArray decode( const QByteArray &src, bool withCRLF=false ) const;
247
248 /**
249 Returns the name of the encoding. Guaranteed to be lowercase.
250 */
251 virtual const char *name() const = 0;
252
253 /**
254 Destroys the codec.
255 */
256 virtual ~Codec() {}
257
258 private:
259 /**
260 Fills the KAutoDeleteHash with all the supported codecs.
261 */
262 static void fillDictionary();
263};
264
265/**
266 @brief Stateful CTE decoder class
267
268 Stateful decoder class, modelled after QTextDecoder.
269
270 @section Overview
271
272 KMime decoders are designed to be able to process encoded data in
273 chunks of arbitrary size and to work with output buffers of also
274 arbitrary size. They maintain any state necessary to go on where
275 the previous call left off.
276
 The class consists of only two methods of interest: decode(),
 which decodes an input block, and finish(), which flushes any
 remaining data to the output stream.

 Typically, you will create a decoder instance, call decode() as
 often as necessary, then call finish() (most often a single
 call suffices, but the output buffer might fill up during that
 call, so you should be prepared to call finish() as often as
 necessary, i.e. until it returns @p true).
286
287 @section Return Values
288
289 Both methods return @p true to indicate that they've finished their
290 job. For decode, a return value of @p true means that the
291 current input block has been finished (@p false most often means
292 that the output buffer is full, but that isn't required
293 behavior. The decode call is free to return at arbitrary
294 times during processing).
295
 For finish(), a return value of @p true means that all data
 implicitly or explicitly stored in the decoder instance has been
 flushed to the output buffer. A @p false return value should be
 interpreted as "check if the output buffer is full and call me
 again", just as with decode().
301
302 @section Usage Pattern
303
304 Since the decoder maintains state, you can only use it once. After
305 a sequence of input blocks has been processed, you finalize
306 the output and then delete the decoder instance. If you want to
307 process another input block sequence, you create a new instance.
308
309 Typical usage (@p in contains the (base64-encoded) input data),
310 taking into account all the conventions detailed above:
311
312 <pre>
313 KMime::Codec *codec = KMime::Codec::codecForName( "base64" );
314 kFatal( !codec ) << "No codec found for base64!";
315 KMime::Decoder *dec = codec->makeDecoder();
316 assert( dec ); // should not happen
 QByteArray out( 256, '\0' ); // small buffer is enough ;-)
 const char *iit = in.constData(); // decode() takes a const char* reference
319 QByteArray::Iterator oit = out.begin();
320 // decode the chunk
321 while ( !dec->decode( iit, in.end(), oit, out.end() ) )
322 if ( oit == out.end() ) { // output buffer full, process contents
323 do_something_with( out );
324 oit = out.begin();
325 }
326 // repeat while loop for each input block
327 // ...
328 // finish (flush remaining data from decoder):
329 while ( !dec->finish( oit, out.end() ) )
330 if ( oit == out.end() ) { // output buffer full, process contents
331 do_something_with( out );
332 oit = out.begin();
333 }
334 // now process last chunk:
335 out.resize( oit - out.begin() );
336 do_something_with( out );
337 // _delete_ the decoder, but not the codec:
338 delete dec;
339 </pre>
340*/
class Decoder
{
  protected:
    friend class Codec;
    /**
      Protected constructor. Use KMime::Codec::makeDecoder() to create an
      instance.

      The constructor is explicit, matching the Encoder constructor, so
      that a bool is never silently treated as a decoder.

      @param withCRLF if true, make the newlines @ref CRLF; else use @ref LF.
    */
    explicit Decoder( bool withCRLF = false )
      : mWithCRLF( withCRLF ) {}

  public:
    /**
      Destroys the decoder.
    */
    virtual ~Decoder() {}

    /**
      Decodes a chunk of data, maintaining state information between
      calls. See the class documentation for calling conventions.

      Both cursors are passed by reference so that the implementation can
      advance them as input is consumed and output is produced.

      @param scursor is a pointer to the start of the input buffer.
      @param send is a pointer to the end of the input buffer.
      @param dcursor is a pointer to the start of the output buffer.
      @param dend is a pointer to the end of the output buffer.

      @return true if the current input block has been finished; false
      otherwise (most often because the output buffer is full).
    */
    virtual bool decode( const char *&scursor, const char *const send,
                         char *&dcursor, const char *const dend ) = 0;

    /**
      Call this method to finalize the output stream. Writes all
      remaining data and resets the decoder. See the class documentation
      for calling conventions.

      @param dcursor is a pointer to the start of the output buffer.
      @param dend is a pointer to the end of the output buffer.

      @return true if all data stored in the decoder has been flushed to
      the output buffer; false if the method needs to be called again
      (check whether the output buffer is full first).
    */
    virtual bool finish( char *&dcursor, const char *const dend ) = 0;

  protected:
    //@cond PRIVATE
    // Newline convention selected at construction time (true: CRLF, false: LF).
    const bool mWithCRLF;
    //@endcond
};
387
388/**
389 @brief
390 Stateful encoder class.
391
392 Stateful encoder class, modeled after QTextEncoder.
393*/
394class Encoder
395{
396 protected:
397 friend class Codec;
398 /**
399 Protected constructor. Use KMime::Codec::makeEncoder if you want one.
400
401 @param withCRLF if true, make the newlines @ref CRLF; else use @ref LF.
402 */
403 explicit Encoder( bool withCRLF=false )
404 : mOutputBufferCursor( 0 ), mWithCRLF( withCRLF ) {}
405
406 public:
407 /**
408 Destroys the encoder.
409 */
410 virtual ~Encoder() {}
411
412 /**
413 Encodes a chunk of data, maintaining state information between
414 calls. See KMime::Codec for calling conventions.
415
416 @param scursor is a pointer to the start of the input buffer.
417 @param send is a pointer to the end of the input buffer.
418 @param dcursor is a pointer to the start of the output buffer.
419 @param dend is a pointer to the end of the output buffer.
420 */
421 virtual bool encode( const char* &scursor, const char * const send,
422 char* &dcursor, const char * const dend ) = 0;
423
424 /**
425 Call this method to finalize the output stream. Writes all remaining
426 data and resets the encoder. See KMime::Codec for calling conventions.
427
428 @param dcursor is a pointer to the start of the output buffer.
429 @param dend is a pointer to the end of the output buffer.
430 */
431 virtual bool finish( char* &dcursor, const char * const dend ) = 0;
432
433 protected:
434 /**
435 The maximum number of characters permitted in the output buffer.
436 */
437 enum {
438 maxBufferedChars = 8 /**< Eight */
439 };
440
441 /**
442 Writes character @p ch to the output stream or the output buffer,
443 depending on whether or not the output stream has space left.
444
445 @param ch is the character to write.
446 @param dcursor is a pointer to the start of the output buffer.
447 @param dend is a pointer to the end of the output buffer.
448
449 @return true if written to the output stream; else false if buffered.
450 */
451 bool write( char ch, char* &dcursor, const char * const dend )
452 {
453 if ( dcursor != dend ) {
454 // if there's space in the output stream, write there:
455 *dcursor++ = ch;
456 return true;
457 } else {
458 // else buffer the output:
459 kFatal( mOutputBufferCursor >= maxBufferedChars )
460 << "KMime::Encoder: internal buffer overflow!";
461 mOutputBuffer[ mOutputBufferCursor++ ] = ch;
462 return false;
463 }
464 }
465
466 /**
467 Writes characters from the output buffer to the output stream.
468 Implementations of encode and finish should call this
469 at the very beginning and for each iteration of the while loop.
470
471 @param dcursor is a pointer to the start of the output buffer.
472 @param dend is a pointer to the end of the output buffer.
473
474 @return true if all chars could be written, false otherwise
475 */
476 bool flushOutputBuffer( char* &dcursor, const char * const dend );
477
478 /**
479 Convenience function. Outputs @ref LF or @ref CRLF, based on the
480 state of mWithCRLF.
481
482 @param dcursor is a pointer to the start of the output buffer.
483 @param dend is a pointer to the end of the output buffer.
484 */
485 bool writeCRLF( char* &dcursor, const char * const dend )
486 {
487 if ( mWithCRLF ) {
488 write( '\r', dcursor, dend );
489 }
490 return write( '\n', dcursor, dend );
491 }
492
493 private:
494 /**
495 An output buffer to simplify some codecs.
496 Used with write() and flushOutputBuffer().
497 */
498 //@cond PRIVATE
499 char mOutputBuffer[ maxBufferedChars ];
500 //@endcond
501
502 protected:
503 //@cond PRIVATE
504 uchar mOutputBufferCursor;
505 const bool mWithCRLF;
506 //@endcond
507};
508
509} // namespace KMime
510
511#endif // __KMIME_CODECS__
512