1/*
2 Copyright (c) 2002 Dave Corrie <kde@davecorrie.com>
3
4 This library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Library General Public
6 License as published by the Free Software Foundation; either
7 version 2 of the License, or (at your option) any later version.
8
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Library General Public License for more details.
13
14 You should have received a copy of the GNU Library General Public License
15 along with this library; see the file COPYING.LIB. If not, write to
16 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17 Boston, MA 02110-1301, USA.
18*/
19/**
20 @file
21 This file is part of the KDEPIM Utilities library and provides the
22 LinkLocator class.
23
24 @brief
25 Identifies URLs and email addresses embedded in plaintext.
26
27 @author Dave Corrie \<kde@davecorrie.com\>
28*/
29#include "linklocator.h"
30
31#include <KEmoticons>
32
33#include <QtCore/QCoreApplication>
34#include <QtCore/QFile>
35#include <QtCore/QRegExp>
36#include <QTextDocument>
37
38#include <climits>
39
40using namespace KPIMUtils;
41
42/**
43 Private class that helps to provide binary compatibility between releases.
44 @internal
45*/
46//@cond PRIVATE
47class KPIMUtils::LinkLocator::Private
48{
49 public:
50 int mMaxUrlLen;
51 int mMaxAddressLen;
52};
53//@endcond
54
// Shared KEmoticons instance, used by convertToHtml() when smileys are replaced.
// Use a static for this as calls to the KEmoticons constructor are expensive.
K_GLOBAL_STATIC( KEmoticons, sEmoticons )
57
58LinkLocator::LinkLocator( const QString &text, int pos )
59 : mText( text ), mPos( pos ), d( new KPIMUtils::LinkLocator::Private )
60{
61 d->mMaxUrlLen = 4096;
62 d->mMaxAddressLen = 255;
63
64 // If you change either of the above values for maxUrlLen or
65 // maxAddressLen, then please also update the documentation for
66 // setMaxUrlLen()/setMaxAddressLen() in the header file AND the
67 // default values used for the maxUrlLen/maxAddressLen parameters
68 // of convertToHtml().
69}
70
71LinkLocator::~LinkLocator()
72{
73 delete d;
74}
75
76void LinkLocator::setMaxUrlLen( int length )
77{
78 d->mMaxUrlLen = length;
79}
80
81int LinkLocator::maxUrlLen() const
82{
83 return d->mMaxUrlLen;
84}
85
86void LinkLocator::setMaxAddressLen( int length )
87{
88 d->mMaxAddressLen = length;
89}
90
91int LinkLocator::maxAddressLen() const
92{
93 return d->mMaxAddressLen;
94}
95
96QString LinkLocator::getUrl()
97{
98 QString url;
99 if ( atUrl() ) {
100 // NOTE: see http://tools.ietf.org/html/rfc3986#appendix-A and especially appendix-C
101 // Appendix-C mainly says, that when extracting URLs from plain text, line breaks shall
102 // be allowed and should be ignored when the URI is extracted.
103
104 // This implementation follows this recommendation and
105 // allows the URL to be enclosed within different kind of brackets/quotes
106 // If an URL is enclosed, whitespace characters are allowed and removed, otherwise
107 // the URL ends with the first whitespace
108 // Also, if the URL is enclosed in brackets, the URL itself is not allowed
109 // to contain the closing bracket, as this would be detected as the end of the URL
110
111 QChar beforeUrl, afterUrl;
112
113 // detect if the url has been surrounded by brackets or quotes
114 if ( mPos > 0 ) {
115 beforeUrl = mText[mPos - 1];
116
117 /*if ( beforeUrl == '(' ) {
118 afterUrl = ')';
119 } else */if ( beforeUrl == QLatin1Char('[') ) {
120 afterUrl = QLatin1Char(']');
121 } else if ( beforeUrl == QLatin1Char('<') ) {
122 afterUrl = QLatin1Char('>');
123 } else if ( beforeUrl == QLatin1Char('>') ) { // for e.g. <link>http://.....</link>
124 afterUrl = QLatin1Char('<');
125 } else if ( beforeUrl == QLatin1Char('"') ) {
126 afterUrl = QLatin1Char('"');
127 }
128 }
129
130 url.reserve( maxUrlLen() ); // avoid allocs
131 int start = mPos;
132 while ( ( mPos < (int)mText.length() ) &&
133 ( mText[mPos].isPrint() || mText[mPos].isSpace() ) &&
134 ( ( afterUrl.isNull() && !mText[mPos].isSpace() ) ||
135 ( !afterUrl.isNull() && mText[mPos] != afterUrl ) ) ) {
136 if ( !mText[mPos].isSpace() ) { // skip whitespace
137 url.append( mText[mPos] );
138 if ( url.length() > maxUrlLen() ) {
139 break;
140 }
141 }
142
143 mPos++;
144 }
145
146 if ( isEmptyUrl( url ) || ( url.length() > maxUrlLen() ) ) {
147 mPos = start;
148 url.clear();
149 } else {
150 --mPos;
151 }
152 }
153
154 // HACK: This is actually against the RFC. However, most people don't properly escape the URL in
155 // their text with "" or <>. That leads to people writing an url, followed immediatley by
156 // a dot to finish the sentence. That would lead the parser to include the dot in the url,
157 // even though that is not wanted. So work around that here.
158 // Most real-life URLs hopefully don't end with dots or commas.
159 QList<QChar> wordBoundaries;
160 wordBoundaries << QLatin1Char('.') << QLatin1Char(',') << QLatin1Char(':') << QLatin1Char('!') << QLatin1Char('?') << QLatin1Char(')') << QLatin1Char('>');
161 if ( url.length() > 1 ) {
162 do {
163 if ( wordBoundaries.contains( url.at( url.length() - 1 ) ) ) {
164 url.chop( 1 );
165 --mPos;
166 } else {
167 break;
168 }
169 } while( url.length() > 1 );
170 }
171
172 return url;
173}
174
175// keep this in sync with KMMainWin::slotUrlClicked()
176bool LinkLocator::atUrl() const
177{
178 // the following characters are allowed in a dot-atom (RFC 2822):
179 // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~
180 const QString allowedSpecialChars = QLatin1String( ".!#$%&'*+-/=?^_`{|}~" );
181
182 // the character directly before the URL must not be a letter, a number or
183 // any other character allowed in a dot-atom (RFC 2822).
184 if ( ( mPos > 0 ) &&
185 ( mText[mPos-1].isLetterOrNumber() ||
186 ( allowedSpecialChars.indexOf( mText[mPos-1] ) != -1 ) ) ) {
187 return false;
188 }
189
190 QChar ch = mText[mPos];
191 return
192 ( ch == QLatin1Char('h') && ( mText.mid( mPos, 7 ) == QLatin1String( "http://" ) ||
193 mText.mid( mPos, 8 ) == QLatin1String( "https://" ) ) ) ||
194 ( ch == QLatin1Char('v') && mText.mid( mPos, 6 ) == QLatin1String( "vnc://" ) ) ||
195 ( ch == QLatin1Char('f') && ( mText.mid( mPos, 7 ) == QLatin1String( "fish://" ) ||
196 mText.mid( mPos, 6 ) == QLatin1String( "ftp://" ) ||
197 mText.mid( mPos, 7 ) == QLatin1String( "ftps://" ) ) ) ||
198 ( ch == QLatin1Char('s') && ( mText.mid( mPos, 7 ) == QLatin1String( "sftp://" ) ||
199 mText.mid( mPos, 6 ) == QLatin1String( "smb://" ) ) ) ||
200 ( ch == QLatin1Char('m') && mText.mid( mPos, 7 ) == QLatin1String( "mailto:" ) ) ||
201 ( ch == QLatin1Char('w') && mText.mid( mPos, 4 ) == QLatin1String( "www." ) ) ||
202 ( ch == QLatin1Char('f') && ( mText.mid( mPos, 4 ) == QLatin1String( "ftp." ) ||
203 mText.mid( mPos, 7 ) == QLatin1String( "file://" ) ) )||
204 ( ch == QLatin1Char('n') && mText.mid( mPos, 5 ) == QLatin1String( "news:" ) );
205}
206
207bool LinkLocator::isEmptyUrl( const QString &url ) const
208{
209 return url.isEmpty() ||
210 url == QLatin1String( "http://" ) ||
211 url == QLatin1String( "https://" ) ||
212 url == QLatin1String( "fish://" ) ||
213 url == QLatin1String( "ftp://" ) ||
214 url == QLatin1String( "ftps://" ) ||
215 url == QLatin1String( "sftp://" ) ||
216 url == QLatin1String( "smb://" ) ||
217 url == QLatin1String( "vnc://" ) ||
218 url == QLatin1String( "mailto" ) ||
219 url == QLatin1String( "www" ) ||
220 url == QLatin1String( "ftp" ) ||
221 url == QLatin1String( "news" ) ||
222 url == QLatin1String( "news://" );
223}
224
225QString LinkLocator::getEmailAddress()
226{
227 QString address;
228
229 if ( mText[mPos] == QLatin1Char('@') ) {
230 // the following characters are allowed in a dot-atom (RFC 2822):
231 // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~
232 const QString allowedSpecialChars = QLatin1String( ".!#$%&'*+-/=?^_`{|}~" );
233
234 // determine the local part of the email address
235 int start = mPos - 1;
236 while ( start >= 0 && mText[start].unicode() < 128 &&
237 ( mText[start].isLetterOrNumber() ||
238 mText[start] == QLatin1Char('@') || // allow @ to find invalid email addresses
239 allowedSpecialChars.indexOf( mText[start] ) != -1 ) ) {
240 if ( mText[start] == QLatin1Char('@') ) {
241 return QString(); // local part contains '@' -> no email address
242 }
243 --start;
244 }
245 ++start;
246 // we assume that an email address starts with a letter or a digit
247 while ( ( start < mPos ) && !mText[start].isLetterOrNumber() ) {
248 ++start;
249 }
250 if ( start == mPos ) {
251 return QString(); // local part is empty -> no email address
252 }
253
254 // determine the domain part of the email address
255 int dotPos = INT_MAX;
256 int end = mPos + 1;
257 while ( end < (int)mText.length() &&
258 ( mText[end].isLetterOrNumber() ||
259 mText[end] == QLatin1Char('@') || // allow @ to find invalid email addresses
260 mText[end] == QLatin1Char('.') ||
261 mText[end] == QLatin1Char('-') ) ) {
262 if ( mText[end] == QLatin1Char('@') ) {
263 return QString(); // domain part contains '@' -> no email address
264 }
265 if ( mText[end] == QLatin1Char('.') ) {
266 dotPos = qMin( dotPos, end ); // remember index of first dot in domain
267 }
268 ++end;
269 }
270 // we assume that an email address ends with a letter or a digit
271 while ( ( end > mPos ) && !mText[end - 1].isLetterOrNumber() ) {
272 --end;
273 }
274 if ( end == mPos ) {
275 return QString(); // domain part is empty -> no email address
276 }
277 if ( dotPos >= end ) {
278 return QString(); // domain part doesn't contain a dot
279 }
280
281 if ( end - start > maxAddressLen() ) {
282 return QString(); // too long -> most likely no email address
283 }
284 address = mText.mid( start, end - start );
285
286 mPos = end - 1;
287 }
288 return address;
289}
290
291QString LinkLocator::convertToHtml( const QString &plainText, int flags,
292 int maxUrlLen, int maxAddressLen )
293{
294 LinkLocator locator( plainText );
295 locator.setMaxUrlLen( maxUrlLen );
296 locator.setMaxAddressLen( maxAddressLen );
297
298 QString str;
299 QString result( (QChar*)0, (int)locator.mText.length() * 2 );
300 QChar ch;
301 int x;
302 bool startOfLine = true;
303
304 for ( locator.mPos = 0, x = 0; locator.mPos < (int)locator.mText.length();
305 locator.mPos++, x++ ) {
306 ch = locator.mText[locator.mPos];
307 if ( flags & PreserveSpaces ) {
308 if ( ch == QLatin1Char(' ') ) {
309 if ( locator.mPos + 1 < locator.mText.length() ) {
310 if ( locator.mText[locator.mPos + 1] != QLatin1Char(' ') ) {
311
312 // A single space, make it breaking if not at the start or end of the line
313 const bool endOfLine = locator.mText[locator.mPos + 1] == QLatin1Char('\n');
314 if ( !startOfLine && !endOfLine ) {
315 result += QLatin1Char(' ');
316 } else {
317 result += QLatin1String("&nbsp;");
318 }
319 } else {
320
321 // Whitespace of more than one space, make it all non-breaking
322 while ( locator.mPos < locator.mText.length() && locator.mText[locator.mPos] == QLatin1Char(' ') ) {
323 result += QLatin1String("&nbsp;");
324 locator.mPos++;
325 x++;
326 }
327
328 // We incremented once to often, undo that
329 locator.mPos--;
330 x--;
331 }
332 } else {
333 // Last space in the text, it is non-breaking
334 result += QLatin1String("&nbsp;");
335 }
336
337 if ( startOfLine ) {
338 startOfLine = false;
339 }
340 continue;
341 } else if ( ch == QLatin1Char('\t') ) {
342 do {
343 result += QLatin1String("&nbsp;");
344 x++;
345 } while ( ( x & 7 ) != 0 );
346 x--;
347 startOfLine = false;
348 continue;
349 }
350 }
351 if ( ch == QLatin1Char('\n') ) {
352 result += QLatin1String("<br />\n"); // Keep the \n, so apps can figure out the quoting levels correctly.
353 startOfLine = true;
354 x = -1;
355 continue;
356 }
357
358 startOfLine = false;
359 if ( ch == QLatin1Char('&') ) {
360 result += QLatin1String("&amp;");
361 } else if ( ch == QLatin1Char('"') ) {
362 result += QLatin1String("&quot;");
363 } else if ( ch == QLatin1Char('<') ) {
364 result += QLatin1String("&lt;");
365 } else if ( ch == QLatin1Char('>') ) {
366 result += QLatin1String("&gt;");
367 } else {
368 const int start = locator.mPos;
369 if ( !( flags & IgnoreUrls ) ) {
370 str = locator.getUrl();
371 if ( !str.isEmpty() ) {
372 QString hyperlink;
373 if ( str.left( 4 ) == QLatin1String("www.") ) {
374 hyperlink = QLatin1String("http://") + str;
375 } else if ( str.left( 4 ) == QLatin1String("ftp.") ) {
376 hyperlink = QLatin1String("ftp://") + str;
377 } else {
378 hyperlink = str;
379 }
380
381 result += QLatin1String("<a href=\"") + hyperlink + QLatin1String("\">") + Qt::escape( str ) + QLatin1String("</a>");
382 x += locator.mPos - start;
383 continue;
384 }
385 str = locator.getEmailAddress();
386 if ( !str.isEmpty() ) {
387 // len is the length of the local part
388 int len = str.indexOf( QLatin1Char('@') );
389 QString localPart = str.left( len );
390
391 // remove the local part from the result (as '&'s have been expanded to
392 // &amp; we have to take care of the 4 additional characters per '&')
393 result.truncate( result.length() -
394 len - ( localPart.count( QLatin1Char('&') ) * 4 ) );
395 x -= len;
396
397 result += QLatin1String("<a href=\"mailto:") + str + QLatin1String("\">") + str + QLatin1String("</a>");
398 x += str.length() - 1;
399 continue;
400 }
401 }
402 if ( flags & HighlightText ) {
403 str = locator.highlightedText();
404 if ( !str.isEmpty() ) {
405 result += str;
406 x += locator.mPos - start;
407 continue;
408 }
409 }
410 result += ch;
411 }
412 }
413
414 if ( flags & ReplaceSmileys ) {
415 QStringList exclude;
416 exclude << QLatin1String("(c)") << QLatin1String("(C)") << QLatin1String("&gt;:-(") << QLatin1String("&gt;:(") << QLatin1String("(B)") << QLatin1String("(b)") << QLatin1String("(P)") << QLatin1String("(p)");
417 exclude << QLatin1String("(O)") << QLatin1String("(o)") << QLatin1String("(D)") << QLatin1String("(d)") << QLatin1String("(E)") << QLatin1String("(e)") << QLatin1String("(K)")<< QLatin1String("(k)");
418 exclude << QLatin1String("(I)") << QLatin1String("(i)") << QLatin1String("(L)") << QLatin1String("(l)") << QLatin1String("(8)") << QLatin1String("(T)") << QLatin1String("(t)") << QLatin1String("(G)");
419 exclude << QLatin1String("(g)") << QLatin1String("(F)") << QLatin1String("(f)") << QLatin1String("(H)");
420 exclude << QLatin1String("8)") << QLatin1String("(N)") << QLatin1String("(n)") << QLatin1String("(Y)") << QLatin1String("(y)" )<< QLatin1String("(U)") << QLatin1String("(u)") << QLatin1String("(W)") << QLatin1String("(w)");
421 static QString cachedEmoticonsThemeName;
422 if ( cachedEmoticonsThemeName.isEmpty() ) {
423 cachedEmoticonsThemeName = KEmoticons::currentThemeName();
424 }
425 result =
426 sEmoticons->theme( cachedEmoticonsThemeName ).parseEmoticons(
427 result, KEmoticonsTheme::StrictParse | KEmoticonsTheme::SkipHTML, exclude );
428 }
429
430 return result;
431}
432
433QString LinkLocator::pngToDataUrl( const QString &iconPath )
434{
435 if ( iconPath.isEmpty() ) {
436 return QString();
437 }
438
439 QFile pngFile( iconPath );
440 if ( !pngFile.open( QIODevice::ReadOnly | QIODevice::Unbuffered ) ) {
441 return QString();
442 }
443
444 QByteArray ba = pngFile.readAll();
445 pngFile.close();
446 return QString::fromLatin1( "data:image/png;base64,%1" ).arg( QLatin1String(ba.toBase64().constData()) );
447}
448
449QString LinkLocator::highlightedText()
450{
451 // formating symbols must be prepended with a whitespace
452 if ( ( mPos > 0 ) && !mText[mPos-1].isSpace() ) {
453 return QString();
454 }
455
456 const QChar ch = mText[mPos];
457 if ( ch != QLatin1Char('/') && ch != QLatin1Char('*') && ch != QLatin1Char('_') && ch != QLatin1Char('-') ) {
458 return QString();
459 }
460
461 QRegExp re =
462 QRegExp( QString::fromLatin1( "\\%1((\\w+)([\\s-']\\w+)*( ?[,.:\\?!;])?)\\%2" ).arg( ch ).arg( ch ) );
463 re.setMinimal( true );
464 if ( re.indexIn( mText, mPos ) == mPos ) {
465 int length = re.matchedLength();
466 // there must be a whitespace after the closing formating symbol
467 if ( mPos + length < mText.length() && !mText[mPos + length].isSpace() ) {
468 return QString();
469 }
470 mPos += length - 1;
471 switch ( ch.toLatin1() ) {
472 case '*':
473 return QLatin1String("<b>*") + re.cap( 1 ) + QLatin1String("*</b>");
474 case '_':
475 return QLatin1String("<u>_") + re.cap( 1 ) + QLatin1String("_</u>");
476 case '/':
477 return QLatin1String("<i>/") + re.cap( 1 ) + QLatin1String("/</i>");
478 case '-':
479 return QLatin1String("<strike>-") + re.cap( 1 ) + QLatin1String("-</strike>");
480 }
481 }
482 return QString();
483}
484