kmime_parsers.cpp [kdepimlibs/kmime/kmime_parsers.cpp]

1	/*
2	kmime_parsers.cpp
3
4	KMime, the KDE Internet mail/usenet news message library.
5	Copyright (c) 2001 the KMime authors.
6	See file AUTHORS for details
7
8	This library is free software; you can redistribute it and/or
9	modify it under the terms of the GNU Library General Public
10	License as published by the Free Software Foundation; either
11	version 2 of the License, or (at your option) any later version.
12
13	This library is distributed in the hope that it will be useful,
14	but WITHOUT ANY WARRANTY; without even the implied warranty of
15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16	Library General Public License for more details.
17
18	You should have received a copy of the GNU Library General Public License
19	along with this library; see the file COPYING.LIB. If not, write to
20	the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21	Boston, MA 02110-1301, USA.
22	*/
23	#include "kmime_parsers.h"
24
25	#include <QtCore/QRegExp>
26	#include <QtCore/QByteArray>
27
28	using namespace KMime::Parser;
29
30	namespace KMime {
31	namespace Parser {
32
33	MultiPart::MultiPart( const QByteArray &src, const QByteArray &boundary )
34	{
35	s_rc =src;
36	b_oundary =boundary;
37	}
38
39	bool MultiPart::parse()
40	{
41	QByteArray b = "--" + b_oundary, part;
42	int pos1=`0`, pos2=`0`, blen=b.length();
43
44	p_arts.clear();
45
46	//find the first valid boundary
47	while ( `1` ) {
48	if ( ( pos1 = s_rc.indexOf( b, pos1 ) ) == -`1` \|\| pos1 == `0` \|\|
49	s_rc [pos1-`1`] == '\n' ) { //valid boundary found or no boundary at all
50	break;
51	}
52	pos1 += blen; //boundary found but not valid => skip it;
53	}
54
55	if ( pos1 > -`1` ) {
56	pos1 += blen;
57	if ( s_rc [pos1] == '-' && s_rc [pos1+`1`] == '-' ) {
58	// the only valid boundary is the end-boundary
59	// this message is really* broken*
60	pos1 = -`1`; //we give up
61	} else if ( ( pos1 - blen ) > `1` ) { //preamble present
62	p_reamble = s_rc.left( pos1 - blen - `1` );
63	}
64	}
65
66	while ( pos1 > -`1` && pos2 > -`1` ) {
67
68	//skip the rest of the line for the first boundary - the message-part starts here
69	if ( ( pos1 = s_rc.indexOf( '\n', pos1 ) ) > -`1` ) {
70	//now search the next linebreak
71	//now find the next valid boundary
72	pos2=++pos1; //pos1 and pos2 point now to the beginning of the next line after the boundary
73	while ( `1` ) {
74	if ( ( pos2 = s_rc.indexOf( b, pos2 ) ) == -`1` \|\|
75	s_rc [pos2-`1`] == '\n' ) { //valid boundary or no more boundaries found
76	break;
77	}
78	pos2 += blen; //boundary is invalid => skip it;
79	}
80
81	if ( pos2 == -`1` ) { // no more boundaries found
82	part = s_rc.mid( pos1, s_rc.length() - pos1 ); //take the rest of the string
83	p_arts.append( part );
84	pos1 = -`1`;
85	pos2 = -`1`; //break;
86	} else {
87	part = s_rc.mid( pos1, pos2 - pos1 - `1` ); // pos2 - 1 (\n) is part of the boundary (see RFC 2046, section 5.1.1)
88	p_arts.append( part );
89	pos2 += blen; //pos2 points now to the first character after the boundary
90	if ( s_rc [pos2] == '-' && s_rc [pos2+`1`] == '-' ) { //end-boundary
91	pos1 = pos2 + `2`; //pos1 points now to the character directly after the end-boundary
92
93	if ( ( pos1 = s_rc.indexOf( '\n', pos1 ) ) > -`1` ) { //skip the rest of this line
94	//everything after the end-boundary is considered as the epilouge
95	e_pilouge = s_rc.mid( pos1 + `1`, s_rc.length() - pos1 - `1` );
96	}
97	pos1 = -`1`;
98	pos2 = -`1`; //break
99	} else {
100	pos1 = pos2; //the search continues ...
101	}
102	}
103	}
104	}
105
106	return !p_arts.isEmpty();
107	}
108
109	//=============================================================================
110
111	NonMimeParser::NonMimeParser( const QByteArray &src ) :
112	s_rc ( src ), p_artNr( -`1` ), t_otalNr( -`1` )
113	{
114	}
115
116	/**
117	* try to guess the mimetype from the file-extension
118	*/
119	QByteArray NonMimeParser::guessMimeType( const QByteArray &fileName )
120	{
121	QByteArray tmp, mimeType;
122	int pos;
123
124	if ( !fileName.isEmpty() ) {
125	pos = fileName.lastIndexOf( '.' );
126	if ( pos++ != -`1` ) {
127	tmp = fileName.mid( pos, fileName.length() - pos ).toUpper();
128	if ( tmp == "JPG" \|\| tmp == "JPEG" ) {
129	mimeType = "image/jpeg";
130	} else if ( tmp == "GIF" ) {
131	mimeType = "image/gif";
132	} else if ( tmp == "PNG" ) {
133	mimeType = "image/png";
134	} else if ( tmp == "TIFF" \|\| tmp == "TIF" ) {
135	mimeType = "image/tiff";
136	} else if ( tmp == "XPM" ) {
137	mimeType = "image/x-xpixmap";
138	} else if ( tmp == "XBM" ) {
139	mimeType = "image/x-xbitmap";
140	} else if ( tmp == "BMP" ) {
141	mimeType = "image/bmp";
142	} else if ( tmp == "TXT" \|\|
143	tmp == "ASC" \|\|
144	tmp == "H" \|\|
145	tmp == "C" \|\|
146	tmp == "CC" \|\|
147	tmp == "CPP" ) {
148	mimeType = "text/plain";
149	} else if ( tmp == "HTML" \|\| tmp == "HTM" ) {
150	mimeType = "text/html";
151	} else {
152	mimeType = "application/octet-stream";
153	}
154	} else {
155	mimeType = "application/octet-stream";
156	}
157	} else {
158	mimeType = "application/octet-stream";
159	}
160
161	return mimeType;
162	}
163
164	//==============================================================================
165
166	UUEncoded::UUEncoded( const QByteArray &src, const QByteArray &subject ) :
167	NonMimeParser ( src ), s_ubject ( subject )
168	{}
169
170	bool UUEncoded::parse()
171	{
172	int currentPos=`0`;
173	bool success=true, firstIteration=true;
174
175	while ( success ) {
176	int beginPos=currentPos, uuStart=currentPos, endPos=`0`, lineCount=`0`, MCount=`0`, pos=`0`, len=`0`;
177	bool containsBegin=false, containsEnd=false;
178	QByteArray tmp, fileName;
179
180	if ( ( beginPos = QString::fromLatin1( s_rc ).indexOf( QRegExp( QLatin1String ( "begin [0-9][0-9][0-9]" ) ),
181	currentPos ) ) > -`1` &&
182	( beginPos == `0` \|\| s_rc.at( beginPos - `1` ) == '\n' ) ) {
183	containsBegin = true;
184	uuStart = s_rc.indexOf( '\n', beginPos );
185	if ( uuStart == -`1` ) {//no more line breaks found, we give up
186	success = false;
187	break;
188	} else {
189	uuStart++; //points now at the beginning of the next line
190	}
191	} else {
192	beginPos=currentPos;
193	}
194
195	if ( ( endPos = s_rc.indexOf( "\nend", ( uuStart > `0` ) ? uuStart - `1` : `0` ) ) == -`1` ) {
196	endPos = s_rc.length(); //no end found
197	} else {
198	containsEnd = true;
199	}
200
201	if ( ( containsBegin && containsEnd ) \|\| firstIteration ) {
202
203	//printf("beginPos=%d , uuStart=%d , endPos=%d\n", beginPos, uuStart, endPos);
204	//all lines in a uuencoded text start with 'M'
205	for ( int idx=uuStart; idx<endPos; idx++ ) {
206	if ( s_rc [idx] == '\n' ) {
207	lineCount++;
208	if ( idx + `1` < endPos && s_rc [idx + `1`] == 'M' ) {
209	idx++;
210	MCount++;
211	}
212	}
213	}
214
215	//printf("lineCount=%d , MCount=%d\n", lineCount, MCount);
216	if ( MCount == `0` \|\| ( lineCount - MCount ) > `10` \|\|
217	( ( !containsBegin \|\| !containsEnd ) && ( MCount < `15` ) ) ) {
218	// harder check for split-articles
219	success = false;
220	break; //too many "non-M-Lines" found, we give up
221	}
222
223	if ( ( !containsBegin \|\| !containsEnd ) && !s_ubject.isNull() ) {
224	// message may be split up => parse subject
225	QRegExp rx( QLatin1String ( "[0-9]+/[0-9]+" ) );
226	pos = rx.indexIn( QLatin1String ( s_ubject ), `0` );
227	len = rx.matchedLength();
228	if ( pos != -`1` ) {
229	tmp = s_ubject.mid( pos, len );
230	pos = tmp.indexOf( '/' );
231	p_artNr = tmp.left( pos ).toInt();
232	t_otalNr = tmp.right( tmp.length() - pos - `1` ).toInt();
233	} else {
234	success = false;
235	break; //no "part-numbers" found in the subject, we give up
236	}
237	}
238
239	//everything before "begin" is text
240	if ( beginPos > `0` ) {
241	t_ext.append( s_rc.mid( currentPos, beginPos - currentPos ) );
242	}
243
244	if ( containsBegin ) {
245	//everything between "begin ### " and the next LF is considered as the filename
246	fileName = s_rc.mid( beginPos + `10`, uuStart - beginPos - `11` );
247	} else {
248	fileName = "";
249	}
250	f_ilenames.append( fileName );
251	//everything beetween "begin" and "end" is uuencoded
252	b_ins.append( s_rc.mid( uuStart, endPos - uuStart + `1` ) );
253	m_imeTypes.append( guessMimeType( fileName ) );
254	firstIteration = false;
255
256	int next = s_rc.indexOf( '\n', endPos + `1` );
257	if ( next == -`1` ) { //no more line breaks found, we give up
258	success = false;
259	break;
260	} else {
261	next++; //points now at the beginning of the next line
262	}
263	currentPos = next;
264
265	} else {
266	success = false;
267	}
268	}
269
270	// append trailing text part of the article
271	t_ext.append( s_rc.right( s_rc.length() - currentPos ) );
272
273	return ( ( b_ins.count() > `0` ) \|\| isPartial() );
274	}
275
276	//==============================================================================
277
278	YENCEncoded::YENCEncoded( const QByteArray &src ) :
279	NonMimeParser ( src )
280	{
281	}
282
283	bool YENCEncoded::yencMeta( QByteArray &src, const QByteArray &name, int *value )
284	{
285	bool found = false;
286	QByteArray sought=name + '=';
287
288	int iPos = src.indexOf( sought );
289	if ( iPos > -`1` ) {
290	int pos1 = src.indexOf( ' ', iPos );
291	int pos2 = src.indexOf( '\r', iPos );
292	int pos3 = src.indexOf( '\t', iPos );
293	int pos4 = src.indexOf( '\n', iPos );
294	if ( pos2 >= `0` && ( pos1 < `0` \|\| pos1 > pos2 ) ) {
295	pos1 = pos2;
296	}
297	if ( pos3 >= `0` && ( pos1 < `0` \|\| pos1 > pos3 ) ) {
298	pos1 = pos3;
299	}
300	if ( pos4 >= `0` && ( pos1 < `0` \|\| pos1 > pos4 ) ) {
301	pos1 = pos4;
302	}
303	iPos=src.lastIndexOf( '=', pos1 ) + `1`;
304	if ( iPos < pos1 ) {
305	char c = src.at( iPos );
306	if ( c>='0' && c<='9' ) {
307	found = true;
308	*value = src.mid( iPos, pos1 - iPos ).toInt();
309	}
310	}
311	}
312	return found;
313	}
314
315	bool YENCEncoded::parse()
316	{
317	int currentPos=`0`;
318	bool success=true;
319
320	while ( success ) {
321	int beginPos=currentPos, yencStart=currentPos;
322	bool containsPart=false;
323	QByteArray fileName, mimeType;
324
325	if ( ( beginPos = s_rc.indexOf( "=ybegin ", currentPos ) ) > -`1` &&
326	( beginPos == `0` \|\| s_rc.at( beginPos - `1` ) == '\n' ) ) {
327	yencStart = s_rc.indexOf( '\n', beginPos );
328	if ( yencStart == -`1` ) { // no more line breaks found, give up
329	success = false;
330	break;
331	} else {
332	yencStart++;
333	if ( s_rc.indexOf( "=ypart", yencStart ) == yencStart ) {
334	containsPart = true;
335	yencStart = s_rc.indexOf( '\n', yencStart );
336	if ( yencStart == -`1` ) {
337	success = false;
338	break;
339	}
340	yencStart++;
341	}
342	}
343	// Try to identify yenc meta data
344
345	// Filenames can contain any embedded chars until end of line
346	QByteArray meta = s_rc.mid( beginPos, yencStart - beginPos );
347	int namePos = meta.indexOf( "name=" );
348	if ( namePos == -`1` ) {
349	success = false;
350	break;
351	}
352	int eolPos = meta.indexOf( '\r', namePos );
353	if ( eolPos == -`1` ) {
354	eolPos = meta.indexOf( '\n', namePos );
355	}
356	if ( eolPos == -`1` ) {
357	success = false;
358	break;
359	}
360	fileName = meta.mid( namePos + `5`, eolPos - ( namePos + `5` ) );
361
362	// Other metadata is integer
363	int yencLine;
364	if ( !yencMeta( meta, "line", &yencLine ) ) {
365	success = false;
366	break;
367	}
368	int yencSize;
369	if ( !yencMeta( meta, "size", &yencSize ) ) {
370	success = false;
371	break;
372	}
373
374	int partBegin, partEnd;
375	if ( containsPart ) {
376	if ( !yencMeta( meta, "part", &p_artNr ) ) {
377	success = false;
378	break;
379	}
380	if ( !yencMeta( meta, "begin", &partBegin ) \|\|
381	!yencMeta( meta, "end", &partEnd ) ) {
382	success = false;
383	break;
384	}
385	if ( !yencMeta( meta, "total", &t_otalNr ) ) {
386	t_otalNr = p_artNr + `1`;
387	}
388	if ( yencSize == partEnd - partBegin + `1` ) {
389	t_otalNr = `1`;
390	} else {
391	yencSize = partEnd - partBegin + `1`;
392	}
393	}
394
395	// We have a valid yenc header; now we extract the binary data
396	int totalSize = `0`;
397	int pos = yencStart;
398	int len = s_rc.length();
399	bool lineStart = true;
400	int lineLength = `0`;
401	bool containsEnd = false;
402	QByteArray binary;
403	binary.resize( yencSize );
404	while ( pos < len ) {
405	int ch = s_rc.at( pos );
406	if ( ch < `0` ) {
407	ch += `256`;
408	}
409	if ( ch == '\r' ) {
410	if ( lineLength != yencLine && totalSize != yencSize ) {
411	break;
412	}
413	pos++;
414	}
415	else if ( ch == '\n' ) {
416	lineStart = true;
417	lineLength = `0`;
418	pos++;
419	} else {
420	if ( ch == '=' ) {
421	if ( pos + `1` < len ) {
422	ch = s_rc.at( pos + `1` );
423	if ( lineStart && ch == 'y' ) {
424	containsEnd = true;
425	break;
426	}
427	pos += `2`;
428	ch -= `64`+`42`;
429	if ( ch < `0` ) {
430	ch += `256`;
431	}
432	if ( totalSize >= yencSize ) {
433	break;
434	}
435	binary [totalSize++] = ch;
436	lineLength++;
437	} else {
438	break;
439	}
440	} else {
441	ch -= `42`;
442	if ( ch < `0` ) {
443	ch += `256`;
444	}
445	if ( totalSize >= yencSize ) {
446	break;
447	}
448	binary [totalSize++] = ch;
449	lineLength++;
450	pos++;
451	}
452	lineStart = false;
453	}
454	}
455
456	if ( !containsEnd ) {
457	success = false;
458	break;
459	}
460	if ( totalSize != yencSize ) {
461	success = false;
462	break;
463	}
464
465	// pos now points to =yend; get end data
466	eolPos = s_rc.indexOf( '\n', pos );
467	if ( eolPos == -`1` ) {
468	success = false;
469	break;
470	}
471	meta = s_rc.mid( pos, eolPos - pos );
472	if ( !yencMeta( meta, "size", &totalSize ) ) {
473	success = false;
474	break;
475	}
476	if ( totalSize != yencSize ) {
477	success = false;
478	break;
479	}
480
481	f_ilenames.append( fileName );
482	m_imeTypes.append( guessMimeType( fileName ) );
483	b_ins.append( binary );
484
485	//everything before "begin" is text
486	if ( beginPos > `0` ) {
487	t_ext.append( s_rc.mid( currentPos, beginPos - currentPos ) );
488	}
489	currentPos = eolPos + `1`;
490
491	} else {
492	success = false;
493	}
494	}
495
496	// append trailing text part of the article
497	t_ext.append( s_rc.right( s_rc.length() - currentPos ) );
498
499	return b_ins.count()>`0`;
500	}
501
502	} // namespace Parser
503
504	} // namespace KMime
505