1 | /* |
2 | Copyright (c) 2002 Dave Corrie <kde@davecorrie.com> |
3 | |
4 | This library is free software; you can redistribute it and/or |
5 | modify it under the terms of the GNU Library General Public |
6 | License as published by the Free Software Foundation; either |
7 | version 2 of the License, or (at your option) any later version. |
8 | |
9 | This library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Library General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU Library General Public License |
15 | along with this library; see the file COPYING.LIB. If not, write to |
16 | the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
17 | Boston, MA 02110-1301, USA. |
18 | */ |
19 | /** |
20 | @file |
21 | This file is part of the KDEPIM Utilities library and provides the |
22 | LinkLocator class. |
23 | |
24 | @brief |
25 | Identifies URLs and email addresses embedded in plaintext. |
26 | |
27 | @author Dave Corrie \<kde@davecorrie.com\> |
28 | */ |
29 | #include "linklocator.h" |
30 | |
31 | #include <KEmoticons> |
32 | |
33 | #include <QtCore/QCoreApplication> |
34 | #include <QtCore/QFile> |
35 | #include <QtCore/QRegExp> |
36 | #include <QTextDocument> |
37 | |
38 | #include <climits> |
39 | |
40 | using namespace KPIMUtils; |
41 | |
42 | /** |
43 | Private class that helps to provide binary compatibility between releases. |
44 | @internal |
45 | */ |
46 | //@cond PRIVATE |
class KPIMUtils::LinkLocator::Private
{
  public:
    // Limit consulted by getUrl(); written by setMaxUrlLen() and read
    // back through maxUrlLen().
    int mMaxUrlLen;
    // Limit consulted by getEmailAddress(); written by setMaxAddressLen()
    // and read back through maxAddressLen().
    int mMaxAddressLen;
};
53 | //@endcond |
54 | |
// Use a static for this as calls to the KEmoticons constructor are expensive.
// convertToHtml() fetches the emoticon theme from this shared instance when
// the ReplaceSmileys flag is set.
K_GLOBAL_STATIC( KEmoticons, sEmoticons )
57 | |
58 | LinkLocator::LinkLocator( const QString &text, int pos ) |
59 | : mText( text ), mPos( pos ), d( new KPIMUtils::LinkLocator::Private ) |
60 | { |
61 | d->mMaxUrlLen = 4096; |
62 | d->mMaxAddressLen = 255; |
63 | |
64 | // If you change either of the above values for maxUrlLen or |
65 | // maxAddressLen, then please also update the documentation for |
66 | // setMaxUrlLen()/setMaxAddressLen() in the header file AND the |
67 | // default values used for the maxUrlLen/maxAddressLen parameters |
68 | // of convertToHtml(). |
69 | } |
70 | |
// Releases the private data object allocated by the constructor.
LinkLocator::~LinkLocator()
{
  delete d;
}
75 | |
// Sets the maximum URL length; getUrl() discards any candidate URL that
// grows longer than @p length characters.
void LinkLocator::setMaxUrlLen( int length )
{
  d->mMaxUrlLen = length;
}
80 | |
// Returns the current maximum URL length (see setMaxUrlLen()).
int LinkLocator::maxUrlLen() const
{
  return d->mMaxUrlLen;
}
85 | |
// Sets the maximum email address length; getEmailAddress() rejects any
// candidate address longer than @p length characters.
void LinkLocator::setMaxAddressLen( int length )
{
  d->mMaxAddressLen = length;
}
90 | |
// Returns the current maximum email address length (see setMaxAddressLen()).
int LinkLocator::maxAddressLen() const
{
  return d->mMaxAddressLen;
}
95 | |
96 | QString LinkLocator::getUrl() |
97 | { |
98 | QString url; |
99 | if ( atUrl() ) { |
100 | // NOTE: see http://tools.ietf.org/html/rfc3986#appendix-A and especially appendix-C |
101 | // Appendix-C mainly says, that when extracting URLs from plain text, line breaks shall |
102 | // be allowed and should be ignored when the URI is extracted. |
103 | |
104 | // This implementation follows this recommendation and |
105 | // allows the URL to be enclosed within different kind of brackets/quotes |
106 | // If an URL is enclosed, whitespace characters are allowed and removed, otherwise |
107 | // the URL ends with the first whitespace |
108 | // Also, if the URL is enclosed in brackets, the URL itself is not allowed |
109 | // to contain the closing bracket, as this would be detected as the end of the URL |
110 | |
111 | QChar beforeUrl, afterUrl; |
112 | |
113 | // detect if the url has been surrounded by brackets or quotes |
114 | if ( mPos > 0 ) { |
115 | beforeUrl = mText[mPos - 1]; |
116 | |
117 | /*if ( beforeUrl == '(' ) { |
118 | afterUrl = ')'; |
119 | } else */if ( beforeUrl == QLatin1Char('[') ) { |
120 | afterUrl = QLatin1Char(']'); |
121 | } else if ( beforeUrl == QLatin1Char('<') ) { |
122 | afterUrl = QLatin1Char('>'); |
123 | } else if ( beforeUrl == QLatin1Char('>') ) { // for e.g. <link>http://.....</link> |
124 | afterUrl = QLatin1Char('<'); |
125 | } else if ( beforeUrl == QLatin1Char('"') ) { |
126 | afterUrl = QLatin1Char('"'); |
127 | } |
128 | } |
129 | |
130 | url.reserve( maxUrlLen() ); // avoid allocs |
131 | int start = mPos; |
132 | while ( ( mPos < (int)mText.length() ) && |
133 | ( mText[mPos].isPrint() || mText[mPos].isSpace() ) && |
134 | ( ( afterUrl.isNull() && !mText[mPos].isSpace() ) || |
135 | ( !afterUrl.isNull() && mText[mPos] != afterUrl ) ) ) { |
136 | if ( !mText[mPos].isSpace() ) { // skip whitespace |
137 | url.append( mText[mPos] ); |
138 | if ( url.length() > maxUrlLen() ) { |
139 | break; |
140 | } |
141 | } |
142 | |
143 | mPos++; |
144 | } |
145 | |
146 | if ( isEmptyUrl( url ) || ( url.length() > maxUrlLen() ) ) { |
147 | mPos = start; |
148 | url.clear(); |
149 | } else { |
150 | --mPos; |
151 | } |
152 | } |
153 | |
154 | // HACK: This is actually against the RFC. However, most people don't properly escape the URL in |
155 | // their text with "" or <>. That leads to people writing an url, followed immediatley by |
156 | // a dot to finish the sentence. That would lead the parser to include the dot in the url, |
157 | // even though that is not wanted. So work around that here. |
158 | // Most real-life URLs hopefully don't end with dots or commas. |
159 | QList<QChar> wordBoundaries; |
160 | wordBoundaries << QLatin1Char('.') << QLatin1Char(',') << QLatin1Char(':') << QLatin1Char('!') << QLatin1Char('?') << QLatin1Char(')') << QLatin1Char('>'); |
161 | if ( url.length() > 1 ) { |
162 | do { |
163 | if ( wordBoundaries.contains( url.at( url.length() - 1 ) ) ) { |
164 | url.chop( 1 ); |
165 | --mPos; |
166 | } else { |
167 | break; |
168 | } |
169 | } while( url.length() > 1 ); |
170 | } |
171 | |
172 | return url; |
173 | } |
174 | |
175 | // keep this in sync with KMMainWin::slotUrlClicked() |
176 | bool LinkLocator::atUrl() const |
177 | { |
178 | // the following characters are allowed in a dot-atom (RFC 2822): |
179 | // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~ |
180 | const QString allowedSpecialChars = QLatin1String( ".!#$%&'*+-/=?^_`{|}~" ); |
181 | |
182 | // the character directly before the URL must not be a letter, a number or |
183 | // any other character allowed in a dot-atom (RFC 2822). |
184 | if ( ( mPos > 0 ) && |
185 | ( mText[mPos-1].isLetterOrNumber() || |
186 | ( allowedSpecialChars.indexOf( mText[mPos-1] ) != -1 ) ) ) { |
187 | return false; |
188 | } |
189 | |
190 | QChar ch = mText[mPos]; |
191 | return |
192 | ( ch == QLatin1Char('h') && ( mText.mid( mPos, 7 ) == QLatin1String( "http://" ) || |
193 | mText.mid( mPos, 8 ) == QLatin1String( "https://" ) ) ) || |
194 | ( ch == QLatin1Char('v') && mText.mid( mPos, 6 ) == QLatin1String( "vnc://" ) ) || |
195 | ( ch == QLatin1Char('f') && ( mText.mid( mPos, 7 ) == QLatin1String( "fish://" ) || |
196 | mText.mid( mPos, 6 ) == QLatin1String( "ftp://" ) || |
197 | mText.mid( mPos, 7 ) == QLatin1String( "ftps://" ) ) ) || |
198 | ( ch == QLatin1Char('s') && ( mText.mid( mPos, 7 ) == QLatin1String( "sftp://" ) || |
199 | mText.mid( mPos, 6 ) == QLatin1String( "smb://" ) ) ) || |
200 | ( ch == QLatin1Char('m') && mText.mid( mPos, 7 ) == QLatin1String( "mailto:" ) ) || |
201 | ( ch == QLatin1Char('w') && mText.mid( mPos, 4 ) == QLatin1String( "www." ) ) || |
202 | ( ch == QLatin1Char('f') && ( mText.mid( mPos, 4 ) == QLatin1String( "ftp." ) || |
203 | mText.mid( mPos, 7 ) == QLatin1String( "file://" ) ) )|| |
204 | ( ch == QLatin1Char('n') && mText.mid( mPos, 5 ) == QLatin1String( "news:" ) ); |
205 | } |
206 | |
207 | bool LinkLocator::isEmptyUrl( const QString &url ) const |
208 | { |
209 | return url.isEmpty() || |
210 | url == QLatin1String( "http://" ) || |
211 | url == QLatin1String( "https://" ) || |
212 | url == QLatin1String( "fish://" ) || |
213 | url == QLatin1String( "ftp://" ) || |
214 | url == QLatin1String( "ftps://" ) || |
215 | url == QLatin1String( "sftp://" ) || |
216 | url == QLatin1String( "smb://" ) || |
217 | url == QLatin1String( "vnc://" ) || |
218 | url == QLatin1String( "mailto" ) || |
219 | url == QLatin1String( "www" ) || |
220 | url == QLatin1String( "ftp" ) || |
221 | url == QLatin1String( "news" ) || |
222 | url == QLatin1String( "news://" ); |
223 | } |
224 | |
225 | QString LinkLocator::getEmailAddress() |
226 | { |
227 | QString address; |
228 | |
229 | if ( mText[mPos] == QLatin1Char('@') ) { |
230 | // the following characters are allowed in a dot-atom (RFC 2822): |
231 | // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~ |
232 | const QString allowedSpecialChars = QLatin1String( ".!#$%&'*+-/=?^_`{|}~" ); |
233 | |
234 | // determine the local part of the email address |
235 | int start = mPos - 1; |
236 | while ( start >= 0 && mText[start].unicode() < 128 && |
237 | ( mText[start].isLetterOrNumber() || |
238 | mText[start] == QLatin1Char('@') || // allow @ to find invalid email addresses |
239 | allowedSpecialChars.indexOf( mText[start] ) != -1 ) ) { |
240 | if ( mText[start] == QLatin1Char('@') ) { |
241 | return QString(); // local part contains '@' -> no email address |
242 | } |
243 | --start; |
244 | } |
245 | ++start; |
246 | // we assume that an email address starts with a letter or a digit |
247 | while ( ( start < mPos ) && !mText[start].isLetterOrNumber() ) { |
248 | ++start; |
249 | } |
250 | if ( start == mPos ) { |
251 | return QString(); // local part is empty -> no email address |
252 | } |
253 | |
254 | // determine the domain part of the email address |
255 | int dotPos = INT_MAX; |
256 | int end = mPos + 1; |
257 | while ( end < (int)mText.length() && |
258 | ( mText[end].isLetterOrNumber() || |
259 | mText[end] == QLatin1Char('@') || // allow @ to find invalid email addresses |
260 | mText[end] == QLatin1Char('.') || |
261 | mText[end] == QLatin1Char('-') ) ) { |
262 | if ( mText[end] == QLatin1Char('@') ) { |
263 | return QString(); // domain part contains '@' -> no email address |
264 | } |
265 | if ( mText[end] == QLatin1Char('.') ) { |
266 | dotPos = qMin( dotPos, end ); // remember index of first dot in domain |
267 | } |
268 | ++end; |
269 | } |
270 | // we assume that an email address ends with a letter or a digit |
271 | while ( ( end > mPos ) && !mText[end - 1].isLetterOrNumber() ) { |
272 | --end; |
273 | } |
274 | if ( end == mPos ) { |
275 | return QString(); // domain part is empty -> no email address |
276 | } |
277 | if ( dotPos >= end ) { |
278 | return QString(); // domain part doesn't contain a dot |
279 | } |
280 | |
281 | if ( end - start > maxAddressLen() ) { |
282 | return QString(); // too long -> most likely no email address |
283 | } |
284 | address = mText.mid( start, end - start ); |
285 | |
286 | mPos = end - 1; |
287 | } |
288 | return address; |
289 | } |
290 | |
291 | QString LinkLocator::convertToHtml( const QString &plainText, int flags, |
292 | int maxUrlLen, int maxAddressLen ) |
293 | { |
294 | LinkLocator locator( plainText ); |
295 | locator.setMaxUrlLen( maxUrlLen ); |
296 | locator.setMaxAddressLen( maxAddressLen ); |
297 | |
298 | QString str; |
299 | QString result( (QChar*)0, (int)locator.mText.length() * 2 ); |
300 | QChar ch; |
301 | int x; |
302 | bool startOfLine = true; |
303 | |
304 | for ( locator.mPos = 0, x = 0; locator.mPos < (int)locator.mText.length(); |
305 | locator.mPos++, x++ ) { |
306 | ch = locator.mText[locator.mPos]; |
307 | if ( flags & PreserveSpaces ) { |
308 | if ( ch == QLatin1Char(' ') ) { |
309 | if ( locator.mPos + 1 < locator.mText.length() ) { |
310 | if ( locator.mText[locator.mPos + 1] != QLatin1Char(' ') ) { |
311 | |
312 | // A single space, make it breaking if not at the start or end of the line |
313 | const bool endOfLine = locator.mText[locator.mPos + 1] == QLatin1Char('\n'); |
314 | if ( !startOfLine && !endOfLine ) { |
315 | result += QLatin1Char(' '); |
316 | } else { |
317 | result += QLatin1String(" " ); |
318 | } |
319 | } else { |
320 | |
321 | // Whitespace of more than one space, make it all non-breaking |
322 | while ( locator.mPos < locator.mText.length() && locator.mText[locator.mPos] == QLatin1Char(' ') ) { |
323 | result += QLatin1String(" " ); |
324 | locator.mPos++; |
325 | x++; |
326 | } |
327 | |
328 | // We incremented once to often, undo that |
329 | locator.mPos--; |
330 | x--; |
331 | } |
332 | } else { |
333 | // Last space in the text, it is non-breaking |
334 | result += QLatin1String(" " ); |
335 | } |
336 | |
337 | if ( startOfLine ) { |
338 | startOfLine = false; |
339 | } |
340 | continue; |
341 | } else if ( ch == QLatin1Char('\t') ) { |
342 | do { |
343 | result += QLatin1String(" " ); |
344 | x++; |
345 | } while ( ( x & 7 ) != 0 ); |
346 | x--; |
347 | startOfLine = false; |
348 | continue; |
349 | } |
350 | } |
351 | if ( ch == QLatin1Char('\n') ) { |
352 | result += QLatin1String("<br />\n" ); // Keep the \n, so apps can figure out the quoting levels correctly. |
353 | startOfLine = true; |
354 | x = -1; |
355 | continue; |
356 | } |
357 | |
358 | startOfLine = false; |
359 | if ( ch == QLatin1Char('&') ) { |
360 | result += QLatin1String("&" ); |
361 | } else if ( ch == QLatin1Char('"') ) { |
362 | result += QLatin1String(""" ); |
363 | } else if ( ch == QLatin1Char('<') ) { |
364 | result += QLatin1String("<" ); |
365 | } else if ( ch == QLatin1Char('>') ) { |
366 | result += QLatin1String(">" ); |
367 | } else { |
368 | const int start = locator.mPos; |
369 | if ( !( flags & IgnoreUrls ) ) { |
370 | str = locator.getUrl(); |
371 | if ( !str.isEmpty() ) { |
372 | QString hyperlink; |
373 | if ( str.left( 4 ) == QLatin1String("www." ) ) { |
374 | hyperlink = QLatin1String("http://" ) + str; |
375 | } else if ( str.left( 4 ) == QLatin1String("ftp." ) ) { |
376 | hyperlink = QLatin1String("ftp://" ) + str; |
377 | } else { |
378 | hyperlink = str; |
379 | } |
380 | |
381 | result += QLatin1String("<a href=\"" ) + hyperlink + QLatin1String("\">" ) + Qt::escape( str ) + QLatin1String("</a>" ); |
382 | x += locator.mPos - start; |
383 | continue; |
384 | } |
385 | str = locator.getEmailAddress(); |
386 | if ( !str.isEmpty() ) { |
387 | // len is the length of the local part |
388 | int len = str.indexOf( QLatin1Char('@') ); |
389 | QString localPart = str.left( len ); |
390 | |
391 | // remove the local part from the result (as '&'s have been expanded to |
392 | // & we have to take care of the 4 additional characters per '&') |
393 | result.truncate( result.length() - |
394 | len - ( localPart.count( QLatin1Char('&') ) * 4 ) ); |
395 | x -= len; |
396 | |
397 | result += QLatin1String("<a href=\"mailto:" ) + str + QLatin1String("\">" ) + str + QLatin1String("</a>" ); |
398 | x += str.length() - 1; |
399 | continue; |
400 | } |
401 | } |
402 | if ( flags & HighlightText ) { |
403 | str = locator.highlightedText(); |
404 | if ( !str.isEmpty() ) { |
405 | result += str; |
406 | x += locator.mPos - start; |
407 | continue; |
408 | } |
409 | } |
410 | result += ch; |
411 | } |
412 | } |
413 | |
414 | if ( flags & ReplaceSmileys ) { |
415 | QStringList exclude; |
416 | exclude << QLatin1String("(c)" ) << QLatin1String("(C)" ) << QLatin1String(">:-(" ) << QLatin1String(">:(" ) << QLatin1String("(B)" ) << QLatin1String("(b)" ) << QLatin1String("(P)" ) << QLatin1String("(p)" ); |
417 | exclude << QLatin1String("(O)" ) << QLatin1String("(o)" ) << QLatin1String("(D)" ) << QLatin1String("(d)" ) << QLatin1String("(E)" ) << QLatin1String("(e)" ) << QLatin1String("(K)" )<< QLatin1String("(k)" ); |
418 | exclude << QLatin1String("(I)" ) << QLatin1String("(i)" ) << QLatin1String("(L)" ) << QLatin1String("(l)" ) << QLatin1String("(8)" ) << QLatin1String("(T)" ) << QLatin1String("(t)" ) << QLatin1String("(G)" ); |
419 | exclude << QLatin1String("(g)" ) << QLatin1String("(F)" ) << QLatin1String("(f)" ) << QLatin1String("(H)" ); |
420 | exclude << QLatin1String("8)" ) << QLatin1String("(N)" ) << QLatin1String("(n)" ) << QLatin1String("(Y)" ) << QLatin1String("(y)" )<< QLatin1String("(U)" ) << QLatin1String("(u)" ) << QLatin1String("(W)" ) << QLatin1String("(w)" ); |
421 | static QString cachedEmoticonsThemeName; |
422 | if ( cachedEmoticonsThemeName.isEmpty() ) { |
423 | cachedEmoticonsThemeName = KEmoticons::currentThemeName(); |
424 | } |
425 | result = |
426 | sEmoticons->theme( cachedEmoticonsThemeName ).parseEmoticons( |
427 | result, KEmoticonsTheme::StrictParse | KEmoticonsTheme::SkipHTML, exclude ); |
428 | } |
429 | |
430 | return result; |
431 | } |
432 | |
433 | QString LinkLocator::pngToDataUrl( const QString &iconPath ) |
434 | { |
435 | if ( iconPath.isEmpty() ) { |
436 | return QString(); |
437 | } |
438 | |
439 | QFile pngFile( iconPath ); |
440 | if ( !pngFile.open( QIODevice::ReadOnly | QIODevice::Unbuffered ) ) { |
441 | return QString(); |
442 | } |
443 | |
444 | QByteArray ba = pngFile.readAll(); |
445 | pngFile.close(); |
446 | return QString::fromLatin1( "data:image/png;base64,%1" ).arg( QLatin1String(ba.toBase64().constData()) ); |
447 | } |
448 | |
449 | QString LinkLocator::highlightedText() |
450 | { |
451 | // formating symbols must be prepended with a whitespace |
452 | if ( ( mPos > 0 ) && !mText[mPos-1].isSpace() ) { |
453 | return QString(); |
454 | } |
455 | |
456 | const QChar ch = mText[mPos]; |
457 | if ( ch != QLatin1Char('/') && ch != QLatin1Char('*') && ch != QLatin1Char('_') && ch != QLatin1Char('-') ) { |
458 | return QString(); |
459 | } |
460 | |
461 | QRegExp re = |
462 | QRegExp( QString::fromLatin1( "\\%1((\\w+)([\\s-']\\w+)*( ?[,.:\\?!;])?)\\%2" ).arg( ch ).arg( ch ) ); |
463 | re.setMinimal( true ); |
464 | if ( re.indexIn( mText, mPos ) == mPos ) { |
465 | int length = re.matchedLength(); |
466 | // there must be a whitespace after the closing formating symbol |
467 | if ( mPos + length < mText.length() && !mText[mPos + length].isSpace() ) { |
468 | return QString(); |
469 | } |
470 | mPos += length - 1; |
471 | switch ( ch.toLatin1() ) { |
472 | case '*': |
473 | return QLatin1String("<b>*" ) + re.cap( 1 ) + QLatin1String("*</b>" ); |
474 | case '_': |
475 | return QLatin1String("<u>_" ) + re.cap( 1 ) + QLatin1String("_</u>" ); |
476 | case '/': |
477 | return QLatin1String("<i>/" ) + re.cap( 1 ) + QLatin1String("/</i>" ); |
478 | case '-': |
479 | return QLatin1String("<strike>-" ) + re.cap( 1 ) + QLatin1String("-</strike>" ); |
480 | } |
481 | } |
482 | return QString(); |
483 | } |
484 | |